# BAIS 6040 Data Programming in Python

## Spring 2021

## Week 9 Homework

## Group 3: Jared Barkley, Tyler Zaruba, Netaji Boggarapu, Michael Noonan

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

In [None]:
# Raw GitHub URL of the compiled county CSV ("MN County Data_032721.csv").
# Despite the "_dir" suffix this is a URL, not a local directory.
compiledcountydata_dir = "https://raw.githubusercontent.com/jaredbarkley0/PythonGroupProject/master/MN%20County%20Data_032721.csv"
# pandas fetches the CSV over HTTP, so this cell needs network access.
compiledcountydata = pd.read_csv(compiledcountydata_dir)
# Bare expression as the last line of the cell so the notebook renders the DataFrame.
compiledcountydata

In [None]:
# Adding column for 'Death Rate' to create a new target for analysis

# compiledcountydata['Death Rate'] = compiledcountydata['Total Deaths']/compiledcountydata['Total Positive Cases']

In [None]:
# Check info on dataframe: column names, non-null counts and dtypes
compiledcountydata.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
compiledcountydata.describe()

In [None]:
# Create a correlation matrix as a heat map

# Correlate only the numeric (and boolean) columns. Older pandas silently
# dropped non-numeric columns inside corr(), but pandas >= 2.0 defaults to
# numeric_only=False and raises if any non-numeric column is present.
# select_dtypes() gives the same result on both old and new pandas versions.
corr = compiledcountydata.select_dtypes(include=["number", "bool"]).corr()
f, ax = plt.subplots(figsize=(11, 9))
# Mask the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr, dtype=bool))
cmap = sns.diverging_palette(220, 20, as_cmap=True)
# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes" state.
sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

### Analysis of the correlation matrix indicates high multicollinearity among our variables. Population, Total Positive Cases, Total Deaths and No. of Cases in Nursing Homes are almost perfectly correlated with one another, suggesting that they are essentially measuring the same thing. Targets which may be interesting to analyze are the Infection Rate, Death Rate, and Total Deaths. Since Population, Total Deaths, Total Positive Cases and No. of Cases in Nursing Homes all have pairwise correlations greater than 0.98, I will only include Total Deaths in my analysis as a target, and will not use the others as explanatory variables.

In [None]:
# Commenting this out because the seaborn version is much more powerful

# #Define a function for examining relationships between two variables

# def twovariablelinreg(dataframe,xvar,yvar):
#     columns = xvar
#     target = yvar
    
#     X = dataframe[[columns]]
#     y = dataframe[[target]]
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
#     lr = LinearRegression()
#     lr.fit(X_train, y_train)
#     print(f"The score of my two-variable linear regression is {lr.score(X_test, y_test)}")
#     print(f'My R-squared value is {lr.score(X_test, y_test)}')
#     print(f'My slope coefficients are {lr.coef_}')
#     print(f'My intercept is {lr.intercept_}')
#     ypred = lr.predict(X)
#     fig, ax = plt.subplots(figsize = (4,4))
#     #Keeping the figure size small since I plan to do many of these
#     plt.scatter(X, ypred, color='purple')
#     plt.scatter(X,y, color='green')
#     ax.set(title = f'{yvar} by {xvar}', xlabel = xvar, ylabel=yvar)
#     plt.show()

# Section 1

## Investigation of individual variables to try to identify base correlations
## I'm interested in Total Positive Cases, Rate of Infection, and Total Deaths as possible targets, so I will create plots
## of each possible target versus the relevant candidate features for initial analysis

In [None]:
# Helper function that allows me to select a target variable and multiple feature variables
# then plot out scatterplots for visual analysis

def plotpairgrid(dataframe, columns, target):
    """Plot each target column against each feature column with a fitted line.

    Parameters
    ----------
    dataframe : DataFrame containing every column named in ``target`` and
        ``columns``.
    columns : list of column names drawn along the x axes.
    target : list of column names drawn along the y axes.

    Nothing is returned; the figure is produced as a side effect.
    """
    # Both arguments are lists, so "+" builds a single column selection.
    plotdata = dataframe[target + columns]
    grid = sns.PairGrid(data=plotdata, x_vars=columns, y_vars=target, height=5)
    # First pass draws the regression fit in green ...
    grid.map(sns.regplot, color="green")
    # ... second pass overlays the raw observations on the same axes.
    grid.map(sns.scatterplot)

In [None]:
# Examine relationships between rate of infection and other aggregate variables

# Feature columns; this same list is reused by the later plotting and regression cells.
columns = ['Pop Density','Mean Household Size','Nursing Home Cases, % of Total']
# Targets are kept as one-element lists because plotpairgrid concatenates
# them with `columns` using "+".
rateofinfectiontarget = ['Rate of Infection']

plotpairgrid(compiledcountydata, columns, rateofinfectiontarget)

### Rate of infection appears to be negatively impacted by pop density, and moderately positively impacted by the mean household size and nursing home case rate.

In [None]:
# Examine relationships between Total Deaths and population factors

# Reuses the `columns` feature list defined in the rate-of-infection cell.
totaldeathtarget = ['Total Deaths']

plotpairgrid(compiledcountydata, columns, totaldeathtarget)

### Total Deaths appears to have the strongest relationship with Population Density.  Mean Household size appears to have a slightly positive impact, while nursing home case rate does not appear to have a significant relationship.

In [None]:
# Examine relationship between Death Rate and population factors

# NOTE(review): this assumes the CSV already provides a 'Death Rate' column;
# the cell that would derive it (Total Deaths / Total Positive Cases) is
# commented out near the top of the notebook. Confirm against the data.
deathratetarget = ['Death Rate']

plotpairgrid(compiledcountydata, columns, deathratetarget)

### The death rate appears to be most strongly correlated with the concentration of cases that are in nursing homes, which makes sense.  Population density seems to have zero impact on the rate of death, while mean household size actually seems to have a slightly negative relationship

# Section 2, multiple linear regression

In [None]:
# Create function for linear regression that uses columns variable and different target variables defined above

def linreg(dataframe, columns, target, *, test_size=0.25, random_state=1):
    """Fit a linear regression on a train split and print its test-set results.

    Parameters
    ----------
    dataframe : DataFrame containing the feature and target columns.
    columns : list of feature column names.
    target : list of target column names (the notebook passes one-element
        lists, so ``y`` is a single-column DataFrame).
    test_size : fraction of rows held out for scoring; defaults to the 0.25
        that was previously hard-coded.
    random_state : seed for the train/test shuffle; defaults to the
        previously hard-coded 1 so existing results are reproduced.

    Prints the R-squared on the held-out rows, the fitted coefficients and
    the intercept. Returns nothing.
    """
    X = dataframe[columns]
    y = dataframe[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    # score() is evaluated on the held-out split, not on the training rows.
    print(f'My R-squared value is {lr.score(X_test, y_test)}')
    print(f'My slope coefficients are {lr.coef_}')
    print(f'My intercept is {lr.intercept_}')

## 2A - Rate of infection, Linear Regression

In [None]:
# Linear regression of 'Rate of Infection' on the shared feature list in `columns`
linreg(compiledcountydata, columns, rateofinfectiontarget)

In [None]:
# Overall model is not effective at predicting rate of infection

## 2B - Total Deaths

In [None]:
# Linear regression of 'Total Deaths' on the shared feature list in `columns`
linreg(compiledcountydata,columns,totaldeathtarget)

In [None]:
# Overall model is better at predicting total number of deaths.  Pop Density correlates strongly with total population and total death

## 2C - Death Rate

In [None]:
# Linear regression of 'Death Rate' on the shared feature list in `columns`
linreg(compiledcountydata,columns,deathratetarget)

In [None]:
# This model is slightly better at predicting death rate than the infection rate, however it is still 
# not a very good predictor overall.

# Section 3 - Regression with Lasso

In [None]:
# Define a function for lasso analysis

def lassoreg(dataframe, columns, target, *, alpha=0.1, test_size=0.25, random_state=1):
    """Fit a Lasso regression on a train split and print its test-set results.

    Parameters
    ----------
    dataframe : DataFrame containing the feature and target columns.
    columns : list of feature column names.
    target : list of target column names.
    alpha : L1 penalty strength passed to ``Lasso``; defaults to the 0.1
        that was previously hard-coded.
    test_size : fraction of rows held out for scoring (default 0.25, as before).
    random_state : seed for the train/test shuffle (default 1, as before).

    Prints the R-squared on the held-out rows, the fitted coefficients and
    the intercept. Returns nothing.

    NOTE(review): features and target are used unscaled, so the same alpha
    penalises targets on very different scales (counts vs. rates) very
    differently -- worth checking before comparing results across targets.
    """
    X = dataframe[columns]
    y = dataframe[target]
    # Lasso is a regressor, so the local is named for the model rather than "clf".
    lasso = Lasso(alpha=alpha)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    lasso.fit(X_train, y_train)
    # score() is evaluated on the held-out split, not on the training rows.
    print(f'My R-squared value is {lasso.score(X_test, y_test)}')
    print(f'My slope coefficients are {lasso.coef_}')
    print(f'My intercept is {lasso.intercept_}')

## 3A - Lasso on Rate of Infection

In [None]:
# Lasso regression of 'Rate of Infection' on the shared feature list in `columns`
lassoreg(compiledcountydata,columns,rateofinfectiontarget)

In [None]:
# Lasso analysis appears to be much worse than linear regression for this variable

## 3B - Lasso on Total Deaths

In [None]:
# Lasso regression of 'Total Deaths' on the shared feature list in `columns`
lassoreg(compiledcountydata,columns,totaldeathtarget)

In [None]:
# Again, regression appears better at predicting Total Deaths rather than rate of death or rate of infection.
# This is a pretty good value but not as good as the linear regression

## 3C - Lasso on Death Rate

In [None]:
# Lasso regression of 'Death Rate' on the shared feature list in `columns`
lassoreg(compiledcountydata,columns,deathratetarget)

In [None]:
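# Lasso analysis appears to be even less effective at predicting death rate than linear regression.
# I was not sure how to interpret a negative R-squared value: on held-out data it means the model
# predicts worse than a constant model that always predicts the mean of the test targets.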

## Summary

### Comparing linear regression with lasso on this dataset, linear regression appears to be slightly more effective at predicting the variables of interest. However, both methods of regression struggle to explain either the death rate or the rate of infection with our available data. The visual analysis helps us understand that a little better, since we can see that the points for those two targets are tightly clustered along the population density axis, but scattered fairly randomly against the other two features.