In [1]:
# Import dependencies
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine

from config import user, db_password, host, database

In [2]:
# Create the database engine
# "postgresql" is the dialect name SQLAlchemy recognises; the legacy "postgres"
# alias is rejected by SQLAlchemy 1.4 and later. The credentials are URL-escaped
# so that characters such as "@", ":" or "/" in them cannot corrupt the URL.
engine = create_engine(
    f"postgresql://{quote_plus(str(user))}:{quote_plus(str(db_password))}"
    f"@{host}:5432/{database}"
)

In [3]:
# Load every source table from the database, inner-joined on the state name,
# into a single dataframe.
# NOTE(review): the quoted "State" identifiers for pdbs and urbs appear to carry
# a leading invisible character (presumably a byte-order mark left over from the
# original import) - the literal must stay exactly as written; confirm against
# the table schemas.
merge_query = """select * from population_density_by_state pdbs 
                                    inner join uninsured_rates_by_state urbs on 
                                    pdbs."﻿State" = urbs."﻿State"
                                    inner join cleaned_gbp_csv cgc on
                                    pdbs."﻿State" = cgc.state_name 
                                    inner join pci_by_state pbs on 
                                    pdbs."﻿State" = pbs."State" 
                                    inner join flu_percentages fp on 
                                    pdbs."﻿State" = fp.state;"""

merged_df = pd.read_sql_query(sql=merge_query, con=engine)

# Preview the first rows of the joined result
merged_df.head()

Unnamed: 0,﻿State,xx,2016 Population Density (persons/square mile),2017 Population Density (persons/square mile),2018 Population Density (persons/square mile),2019 Population Density (persons/square mile),﻿State.1,Uninsured Rate 2016,Uninsured Rate 2017,Uninsured Rate 2018,...,State,2016,2017,2018,2019,state,2016_cases_percent,2017_cases_percent,2018_cases_percent,2019_cases_percent
0,Alabama,50645.39,96.03,96.25,96.51,96.81,Alabama,9.1,9.4,10.0,...,Alabama,39536.0,41030.0,42710.0,44145.0,Alabama,0.237399828,0.551401727,1.309393964,1.40161956
1,Alaska,570640.61,1.3,1.3,1.29,1.28,Alaska,14.0,13.7,12.0,...,Alaska,56302.0,57394.0,60355.0,62806.0,Alaska,0.143366565,0.446937948,1.136111674,1.127613476
2,Arizona,113593.91,61.1,62.01,63.01,64.08,Arizona,10.0,10.1,10.0,...,Arizona,40801.0,42590.0,44597.0,46058.0,Arizona,0.322226884,0.320030869,0.343307036,0.281697997
3,Arkansas,52035.35,57.46,57.68,57.84,58.0,Arkansas,7.9,7.9,8.0,...,Arkansas,40385.0,41657.0,43325.0,44629.0,Arkansas,0.096424049,0.174888258,0.193738116,0.196798732
4,California,155779.03,251.43,252.66,253.32,253.64,California,7.3,7.2,7.0,...,California,58048.0,60549.0,63720.0,66619.0,California,0.112745087,0.117120834,0.132161433,0.132136327


In [4]:
# Clean merged dataframe
# Keep only the modelling columns, in a fixed order. Selecting them explicitly
# discards the duplicated join-key columns and every other unused column
# without mutating the frame in place, so this cell can be re-run safely
# (an in-place drop would raise KeyError on the second run).
cols_order = ['state', '2016 Population Density (persons/square mile)', 
              '2017 Population Density (persons/square mile)','2018 Population Density (persons/square mile)', 
              '2019 Population Density (persons/square mile)', 'Uninsured Rate 2016', 'Uninsured Rate 2017', 
              'Uninsured Rate 2018', 'Uninsured Rate 2019', 'gdp_2015', 'gdp_2016', 'gdp_2017', 'gdp_2018', 
              'gdp_2019', '2016', '2017', '2018', '2019','2016_cases_percent','2017_cases_percent', 
              '2018_cases_percent', '2019_cases_percent']

merged_df = merged_df[cols_order]

# Preview the cleaned frame
merged_df.head()

Unnamed: 0,state,2016 Population Density (persons/square mile),2017 Population Density (persons/square mile),2018 Population Density (persons/square mile),2019 Population Density (persons/square mile),Uninsured Rate 2016,Uninsured Rate 2017,Uninsured Rate 2018,Uninsured Rate 2019,gdp_2015,...,gdp_2018,gdp_2019,2016,2017,2018,2019,2016_cases_percent,2017_cases_percent,2018_cases_percent,2019_cases_percent
0,Alabama,96.03,96.25,96.51,96.81,9.1,9.4,10.0,9.7,189428,...,198053,200829,39536.0,41030.0,42710.0,44145.0,0.237399828,0.551401727,1.309393964,1.40161956
1,Alaska,1.3,1.3,1.29,1.28,14.0,13.7,12.0,12.2,54015,...,52928,53255,56302.0,57394.0,60355.0,62806.0,0.143366565,0.446937948,1.136111674,1.127613476
2,Arizona,61.1,62.01,63.01,64.08,10.0,10.1,10.0,11.3,281935,...,314016,323597,40801.0,42590.0,44597.0,46058.0,0.322226884,0.320030869,0.343307036,0.281697997
3,Arkansas,57.46,57.68,57.84,58.0,7.9,7.9,8.0,9.1,112938,...,116698,117447,40385.0,41657.0,43325.0,44629.0,0.096424049,0.174888258,0.193738116,0.196798732
4,California,251.43,252.66,253.32,253.64,7.3,7.2,7.0,7.7,2437366,...,2708966,2800505,58048.0,60549.0,63720.0,66619.0,0.112745087,0.117120834,0.132161433,0.132136327


In [5]:
# Separate the prediction targets from the model inputs

# targets: the four yearly *_cases_percent columns
target_cols = ["2016_cases_percent", "2017_cases_percent", "2018_cases_percent", "2019_cases_percent"]
y = merged_df[target_cols]

# features: every remaining column except the state label
# (.copy() gives X its own data, independent of merged_df)
X = merged_df.drop(columns=["state", *target_cols]).copy()

In [6]:
# Create a scaler
scaler = StandardScaler()

In [7]:
# Fit Scaler
X_scaled = scaler.fit_transform(X)

In [8]:
# Train test split the feature dataframe
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled, y, random_state=42)

In [9]:
# Instantiate an ordinary least-squares regressor (intercept fitted, which is
# the scikit-learn default spelled out explicitly)
lr_model = LinearRegression(fit_intercept=True)

In [10]:
# Train the regressor on the scaled training features and their targets;
# fit() returns the estimator itself, which the cell displays
lr_model.fit(X=X_train_scaled, y=y_train)

LinearRegression()

In [11]:
# Predict the targets for the scaled test features
y_pred = lr_model.predict(X=X_test_scaled)

In [13]:
# Print coefficients (one row of feature weights per target column)
print("\n")
print("Coefficients: ", lr_model.coef_)
print("\n")

# Print mean squared error on the test split.
# With several target columns, scikit-learn averages the per-target scores
# uniformly by default; the same applies to r2_score below.
print(f"Mean squared error: {mean_squared_error(y_test, y_pred):.2f}")
print("\n")

# Print coefficient of determination (R^2) on the test split
print(f"Coefficient of determination: {r2_score(y_test, y_pred):.2f}")
print("\n")



Coefficients:  [[-1.44288968e+01  9.74596626e+00  4.76602176e+01 -4.28686707e+01
  -3.03367548e-01  3.92521589e-01 -4.03494918e-01  3.96806417e-01
   5.19416605e-01 -9.74786225e+00 -4.91169876e-01  2.60409938e+01
  -1.64538001e+01 -1.81111190e-01  1.74278274e-01 -3.26335912e-01
   2.12976902e-01]
 [-1.87055639e+01  1.48141485e+01  5.95071248e+01 -5.54783395e+01
  -1.72762654e-01  5.19677315e-01 -5.52431614e-01  3.48272488e-01
  -1.16251471e+00 -3.83551382e+00 -1.30451221e+01  3.87739520e+01
  -2.09402949e+01 -4.09087680e-02  4.23296055e-01  6.34218773e-01
  -1.16317921e+00]
 [-2.70354882e+01  7.41040665e+01 -4.39307713e+01 -2.98524661e+00
  -3.28086768e-01  1.07296534e+00 -5.06054192e-01 -8.74167975e-02
   2.44167484e+00 -2.70327345e+01  2.46228062e+01  2.34448681e+01
  -2.35818364e+01  9.53210772e-02 -5.81710300e-01 -1.28324582e+00
   1.65852240e+00]
 [-3.49317930e+01  1.13967906e+02 -1.05312639e+02  2.64146628e+01
  -6.40328684e-01  1.28482964e+00 -2.44621502e-02 -5.47697197e-01
  

In [None]:
# Plot linear regression model
