In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso


In [None]:
# Load the compiled MN county dataset directly from the project's GitHub repository.
# NOTE: despite the "_dir" suffix, this variable holds the URL of a single CSV file.
compiledcountydata_dir = "https://raw.githubusercontent.com/jaredbarkley0/PythonGroupProject/master/MN%20County%20Data_032721.csv"
compiledcountydata = pd.read_csv(compiledcountydata_dir)
# Preview only the first rows rather than dumping the whole frame into the cell output.
compiledcountydata.head()

In [None]:
# For the facility-related columns a missing value is equivalent to 0 in this
# dataset, so replace NaN with 0 in exactly those columns (other columns are untouched).
facility_columns = [
    'Casesinfacility',
    'FacilitiesinCounty',
    'Percent Cases in Facilities',
    'Cases per number of Facilities',
]
fillfacilitiesdata = dict.fromkeys(facility_columns, 0)
compiledcountydata = compiledcountydata.fillna(value=fillfacilitiesdata)

In [None]:
# Discard every row that still contains a missing value in ANY column
# (this looks at all columns of the frame, not only the ones modelled later).
compiledcountydata = compiledcountydata.dropna(axis=0, how="any")

In [None]:
# Print a summary of the frame after the fillna/dropna steps above
# (columns, non-null counts and dtypes) to confirm no missing values remain.
compiledcountydata.info()

In [None]:
# Predictor columns and response variable for the regression models below.
# The predictors were selected outside this notebook: a mixed stepwise regression
# in JMP, followed by removal of multicollinear variables.
target = "Total Deaths"
columns = [
    "Total Positive Cases",
    "Pop Density",
    "Mean Household Size",
    "Cases per number of Nursing Home",
]

# Feature matrix (one column per predictor) and response series.
X = compiledcountydata.loc[:, columns]
y = compiledcountydata.loc[:, target]

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split identical from run to run.
split_parts = train_test_split(X, y, test_size=0.30, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Single-feature view of X; it is used as the x-axis of the scatter plot
# in the last cell of the notebook.
X_Plot = X.loc[:, "Total Positive Cases"]
X_Plot

In [None]:
# Create the (not yet fitted) linear regression estimator and bind it to `lr`;
# the bare name on the last line shows the estimator's repr as the cell output.
lr = LinearRegression()
lr

In [None]:
# Fit the linear regression on the training split only.
lr.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the linear regression on the training rows.
lr.score(X=X_train, y=y_train)

In [None]:
# R^2 of the linear regression on the held-out test rows.
lr.score(X=X_test, y=y_test)

In [None]:
# Predicted target values for each row of the test split.
lr.predict(X=X_test)

In [None]:
# Summarise the fitted linear model: test-set R^2, one coefficient per
# predictor column, and the intercept.
lr_test_r2 = lr.score(X_test, y_test)
lr_slopes = lr.coef_
lr_intercept = lr.intercept_

print(f'My R-squared value is {lr_test_r2}')
print(f'My slope coefficients are {lr_slopes}')
print(f'My intercept is {lr_intercept}')

In [None]:
# Show the linear-regression predictions for the test rows again, for reference.
# (Nothing is plotted here; the array is only displayed as the cell output.)

lr.predict(X=X_test)

In [None]:
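# Commented out for now because it does not work: X_test holds four feature columns,
# so plt.scatter(X_test, y_test) fails (x and y must be the same size). Plotting
# against a single feature column would be needed; a plot would still be nice to have.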

# import matplotlib.pyplot as plt

# fig, ax = plt.subplots(figsize = (12,12))
# y_pred = lr.predict(X_test)

# plt.scatter(X_test, y_test,  color='black')
# plt.plot(X_test, y_pred, color='blue', linewidth=3)
# plt.show()

In [None]:
# Example with Lasso
# Commented out: these were example cells taken from the scikit-learn tutorials

# clf = Lasso(alpha=0.1)
# clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
# Lasso(alpha=0.1)
# print(clf.coef_)

In [None]:
# Commented out: these were example cells taken from the scikit-learn tutorials

# print(clf.intercept_)

In [None]:
# Same pattern as for the linear regression above (the original note also mentions
# ridge, which does not appear in the visible cells): create the estimator object first.
# Here it is a Lasso model with regularisation strength alpha=0.1, bound to `clf`.

clf = Lasso(alpha=0.1)

In [None]:
# Same workflow as for the linear regression: fit on the training split,
# then score on the training and test splits in the following cells.
clf.fit(X=X_train, y=y_train)

In [None]:
# R^2 of the Lasso model on the training rows; the original author observed
# that it matches the linear-regression score.
clf.score(X=X_train, y=y_train)

In [None]:
# R^2 of the Lasso model on the held-out test rows; the original author observed
# that it matches the linear-regression score.
clf.score(X=X_test, y=y_test)

In [None]:
# Display the test-split feature rows for reference (intended as input for a later plot).
X_test

In [None]:
# Lasso predictions for each row of the test split.
clf.predict(X=X_test)

In [None]:
# Fitted Lasso coefficients, one per predictor column in `columns`.
clf.coef_

In [None]:
# Fitted Lasso intercept term.
clf.intercept_

In [None]:

# Plot predicted deaths (blue) and recorded deaths (red) against total positive cases.
# The original author's observation: this regression appears to track very closely.
# Notes:
#   * The predictions come from `lr` (plain linear regression), not from the Lasso
#     model `clf` fitted in the cells above.
#   * `lr.predict(X)` covers every row of X, i.e. the training rows as well as the
#     test rows, so the visual agreement is not a pure out-of-sample check.

fig, ax = plt.subplots(figsize=(12, 12))
y_pred = lr.predict(X)

# Draw on the Axes object created above instead of the implicit pyplot state.
ax.scatter(X_Plot, y_pred, color='blue', label='Predicted (linear regression)')
ax.scatter(
    compiledcountydata["Total Positive Cases"],
    compiledcountydata["Total Deaths"],
    color='red',
    label='Actual',
)

# Label the figure so it can be read on its own.
ax.set_xlabel("Total Positive Cases")
ax.set_ylabel("Total Deaths")
ax.set_title("Linear regression: predicted vs. actual Total Deaths")
ax.legend()
plt.show()