In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sqlalchemy import create_engine
from config import user, db_password, host, database

In [2]:
# Create the database engine
engine = create_engine(f"postgres://{user}:{db_password}@{host}:5432/{database}")

In [3]:
# Create total features table from database tables
total_table_df = pd.read_sql_query("""SELECT * 
                                      FROM population_density_by_state as pop
                                         JOIN public.cleaned_flu_csv as flu
                                         ON 'pop.State' = flu.state""", con=engine)

total_table_df.head()

Unnamed: 0,﻿State,xx,2016 Population Density (persons/square mile),2017 Population Density (persons/square mile),2018 Population Density (persons/square mile),2019 Population Density (persons/square mile),state,2016,2017,2018,2019


In [4]:
#Import population density by state table and check column names

# Create population density dataframe
pop_df = pd.read_sql_query("SELECT * FROM population_density_by_state", con=engine)

# Create uninsured dataframe
uni_df = pd.read_sql_query("SELECT * FROM uninsured_rates_by_state", con=engine)

# Create gdp dataframe
gdp_df = pd.read_sql_query("SELECT * FROM cleaned_gbp_csv", con=engine)

# Create flu dataframe
flu_df = pd.read_sql_query("SELECT * FROM cleaned_flu_csv", con=engine)

In [5]:
# Check the pop_df out
pop_df.head()

pop_df.columns

Index(['﻿State', 'xx', '2016 Population Density (persons/square mile)',
       '2017 Population Density (persons/square mile)',
       '2018 Population Density (persons/square mile)',
       '2019 Population Density (persons/square mile)'],
      dtype='object')

In [6]:
# Check out the uninsured dataframe
uni_df.head()

uni_df.columns

Index(['﻿State', 'Uninsured Rate 2016', 'Uninsured Rate 2017',
       'Uninsured Rate 2018', 'Uninsured Rate 2019'],
      dtype='object')

In [7]:
# Check out the gdp dataframe
gdp_df.head()

gdp_df = gdp_df.drop(columns=["column1", "description", "gdp_2015"])

gdp_df.columns

Index(['state_name', 'gdp_2016', 'gdp_2017', 'gdp_2018', 'gdp_2019'], dtype='object')

In [8]:
# Check out the flu_df
flu_df.head()

flu_df.columns

Index(['state', '2016', '2017', '2018', '2019'], dtype='object')

In [9]:
# Create an index list
# pop_df.columns[0]

In [10]:
# Merge dataframes
flu_factors_df = pop_df.merge(uni_df, how="left")

flu_factors_df = pd.concat([flu_factors_df, gdp_df], axis=1)

flu_factors_df = flu_factors_df.drop(flu_factors_df.columns[10], axis = 1)

flu_factors_df.head()

Unnamed: 0,﻿State,xx,2016 Population Density (persons/square mile),2017 Population Density (persons/square mile),2018 Population Density (persons/square mile),2019 Population Density (persons/square mile),Uninsured Rate 2016,Uninsured Rate 2017,Uninsured Rate 2018,Uninsured Rate 2019,gdp_2016,gdp_2017,gdp_2018,gdp_2019
0,Alabama,50645.39,96.03,96.25,96.51,96.81,9.1,9.4,10.0,9.7,191523.0,193693.0,198053.0,200829.0
1,Alaska,570640.61,1.3,1.3,1.29,1.28,14.0,13.7,12.0,12.2,53289.0,52825.0,52928.0,53255.0
2,Arizona,113593.91,61.1,62.01,63.01,64.08,10.0,10.1,10.0,11.3,291259.0,302117.0,314016.0,323597.0
3,Arkansas,52035.35,57.46,57.68,57.84,58.0,7.9,7.9,8.0,9.1,113490.0,114950.0,116698.0,117447.0
4,California,155779.03,251.43,252.66,253.32,253.64,7.3,7.2,7.0,7.7,2519133.0,2628314.0,2708966.0,2800505.0
