In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [3]:
# Despite the "_dir" suffix, this holds the raw GitHub URL of the compiled county CSV.
compiledcountydata_dir = (
    "https://raw.githubusercontent.com/jaredbarkley0/PythonGroupProject/master/MN%20County%20Data_032721.csv"
)
# read_csv accepts a URL directly, so no separate download step is needed.
compiledcountydata = pd.read_csv(filepath_or_buffer=compiledcountydata_dir)
# Bare name as the last expression: the notebook renders the loaded frame.
compiledcountydata

Unnamed: 0.1,Unnamed: 0,County,Total Positive Cases,Total Deaths,Population,Rate of Infection,Pop Density,Mean Household Size,Congregate Living Population,No. of Cases in Nursing Homes,No. of Nursing Homes in County,"Nursing Home Cases, % of Total",Cases per number of Nursing Home
0,0,Aitkin,1191,33,15870.0,0.075047,8.0,2.097233,254.0,118.0,4.0,0.099076,29.500000
1,1,Anoka,33391,390,362648.0,0.092076,753.9,2.712716,3753.0,1675.0,35.0,0.050163,47.857143
2,2,Becker,3131,42,34545.0,0.090635,22.8,2.393929,477.0,294.0,8.0,0.093900,36.750000
3,3,Beltrami,3386,51,47184.0,0.071762,14.8,2.487013,1990.0,264.0,11.0,0.077968,24.000000
4,4,Benton,4482,90,40895.0,0.109598,94.4,2.426614,1050.0,557.0,11.0,0.124275,50.636364
...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,82,Watonwan,1176,8,10923.0,0.107663,25.4,2.397507,151.0,36.0,3.0,0.030612,12.000000
83,83,Wilkin,677,11,6226.0,0.108738,8.7,2.238850,152.0,36.0,2.0,0.053176,18.000000
84,84,Winona,4193,49,50830.0,0.082491,79.9,2.352632,3855.0,277.0,9.0,0.066062,30.777778
85,85,Wright,12554,116,138531.0,0.090622,178.3,2.796739,1158.0,687.0,20.0,0.054724,34.350000


In [4]:
# Cook County has 0 long-term care facilities, but we want to keep it in the regression model.
# Fill NaN values in the facility-related columns with 0, which is an equivalent value in this case.
# The keys must match the DataFrame's column labels exactly: fillna does not raise on a key that
# is not a column, it simply fills nothing for it.
fillfacilitiesdata = {
    "No. of Cases in Nursing Homes": 0,
    "No. of Nursing Homes in County": 0,
    "Nursing Home Cases, % of Total": 0,
    "Cases per number of Nursing Home": 0,
}

# Rebind instead of inplace=True so the cell reads as "input frame -> filled frame".
compiledcountydata = compiledcountydata.fillna(fillfacilitiesdata)

In [6]:
# dropna() returns a new DataFrame and leaves the caller untouched, so the result has to be
# bound back to the name for the rows to actually be removed.
compiledcountydata = compiledcountydata.dropna()
# Summarise dtypes and non-null counts to confirm no missing values remain.
compiledcountydata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 13 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        87 non-null     int64  
 1   County                            87 non-null     object 
 2   Total Positive Cases              87 non-null     int64  
 3   Total Deaths                      87 non-null     int64  
 4   Population                        87 non-null     float64
 5   Rate of Infection                 87 non-null     float64
 6   Pop Density                       87 non-null     float64
 7   Mean Household Size               87 non-null     float64
 8   Congregate Living Population      87 non-null     float64
 9   No. of Cases in Nursing Homes     87 non-null     float64
 10  No. of Nursing Homes in County    87 non-null     float64
 11  Nursing Home Cases, % of Total    87 non-null     float64
 12  Cases per 

In [7]:
# Response variable for the regression models.
target = "Rate of Infection"

# Predictor columns fed to the regression models.
columns = [
    "Pop Density",
    "Congregate Living Population",
    "No. of Nursing Homes in County",
]

# Feature matrix (DataFrame) and target vector (Series) taken from the county frame.
y = compiledcountydata[target]
X = compiledcountydata[columns]

In [8]:
# Fix the seed so the split, and every score computed from it further down, is the same
# on each Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SPLIT_SEED
)

# Sanity check: feature and target partitions must have matching lengths.
print(f"Length of X_train (feature training set): {len(X_train)}")
print(f"Length of X_test (feature test set): {len(X_test)}")
print(f"Length of y_train (target training set): {len(y_train)}")
print(f"Length of y_test (target test set): {len(y_test)}")

Length of X_train (feature training set): 65
Length of X_test (feature test set): 22
Length of y_train (target training set): 65
Length of y_test (target training set): 22


In [9]:
# Linear regression estimator built with no arguments, i.e. with the library defaults.
lr = LinearRegression()
# Bare name as the last expression so the notebook echoes the estimator's repr.
lr

LinearRegression()

In [10]:
# Fit the linear model on the training partition only; the test rows stay unseen.
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [11]:
# Score of the linear model on the same rows it was fitted on (in-sample fit).
lr.score(X=X_train, y=y_train)

0.016889189458166753

In [13]:
# Score of the linear model on the held-out rows (out-of-sample fit).
lr.score(X=X_test, y=y_test)

-0.03266003517711358

In [14]:
# Predicted target values for the held-out rows, echoed for inspection.
lr.predict(X=X_test)

array([0.09169349, 0.0917393 , 0.09234343, 0.09250395, 0.09250544,
       0.08742699, 0.09191258, 0.09147833, 0.09002255, 0.09152899,
       0.09220235, 0.09230899, 0.09356038, 0.092146  , 0.09143375,
       0.089264  , 0.09236466, 0.09232128, 0.09449776, 0.09159754,
       0.09268243, 0.09134253])

In [15]:
import math
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

def printMetrics(test, predictions, digits=2):
    """Print explained variance, MAE, RMSE and R^2 for a set of predictions.

    Parameters
    ----------
    test : array-like
        True target values.
    predictions : array-like
        Predicted values, in the same order as ``test``.
    digits : int, default 2
        Number of decimal places to print. The default reproduces the
        original two-decimal output; pass a larger value when the target is
        on a small scale and two decimals would round away the differences.

    Returns
    -------
    None
        The four metrics are written to stdout, one per line.
    """
    # Labels are kept as originally printed; "Score" is the explained variance.
    metrics = {
        "Score": explained_variance_score(test, predictions),
        "MAE": mean_absolute_error(test, predictions),
        # RMSE is derived as the square root of the mean squared error.
        "RMSE": math.sqrt(mean_squared_error(test, predictions)),
        "r2": r2_score(test, predictions),
    }
    for label, value in metrics.items():
        print(f"{label}: {value:.{digits}f}")

In [16]:
# Evaluate the linear model: predict the held-out rows, then report the metrics.
predictions = lr.predict(X=X_test)
printMetrics(test=y_test, predictions=predictions)

Score: -0.01
MAE: 0.01
RMSE: 0.02
r2: -0.03


## Ridge

In [17]:
# Ridge regression with the SVD solver selected explicitly; no alpha is passed,
# so the estimator's default regularization strength applies.
rr = Ridge(solver="svd")
# Bare name as the last expression so the notebook echoes the estimator's repr.
rr

Ridge(solver='svd')

In [18]:
# Fit the ridge model on the same training partition used for the linear model.
rr.fit(X=X_train, y=y_train)

Ridge(solver='svd')

In [19]:
# Score of the ridge model on the rows it was fitted on (in-sample fit).
rr.score(X=X_train, y=y_train)

0.016889189142414884

In [20]:
# Score of the ridge model on the held-out rows (out-of-sample fit).
rr.score(X=X_test, y=y_test)

-0.03265416803311272

In [21]:
# Evaluate the ridge model: predict the held-out rows, then report the metrics.
# Note: this rebinds `predictions`, replacing the linear model's values.
predictions = rr.predict(X=X_test)
printMetrics(test=y_test, predictions=predictions)

Score: -0.01
MAE: 0.01
RMSE: 0.02
r2: -0.03
