### Imports

In [24]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### 2020 Census Population data

In [25]:
url = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010220'
res = requests.get(url)

In [26]:
soup = BeautifulSoup(res.content)

In [27]:
table = soup.find(attrs={'class':'qf-graph-scroll'})
# soup.find_all(attrs={'class':'qf-positive'})
tbody = table.find_all(attrs={'class':"qf-graph-geo"})
tr = tbody[1]
state = tr.find('a').attrs['data-title']
tr.find(attrs={'class':'qf-positive'}).attrs['data-value']

'29145505'

In [28]:
pop_2020 = []
for tr in tbody:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2020 Population': pop}
    pop_2020.append(state_pop)

In [29]:
pop_2020 = pd.DataFrame(pop_2020)

### 2010 Census Population Data

In [30]:
url2 = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010210'
res2 = requests.get(url2)

In [31]:
soup2 = BeautifulSoup(res2.content)
table2 = soup2.find(attrs={'class':'qf-graph-scroll'})
tbody2 = table2.find_all(attrs={'class':"qf-graph-geo"})

In [32]:
pop_2010 = []
for tr in tbody2:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2010 Population': pop}
    pop_2010.append(state_pop)

pop_2010 = pd.DataFrame(pop_2010)

In [33]:
population = pd.merge(pop_2010, pop_2020)
population.head()

Unnamed: 0,State,2010 Population,2020 Population
0,California,37253956,39538223
1,Texas,25145561,29145505
2,New York,19378102,20201249
3,Florida,18801310,21538187
4,Illinois,12830632,12812508


In [34]:
population.to_csv('Data/Population_data_2010_&_2020.csv')

### Excess Deaths Data

In [35]:
deaths = pd.read_csv('Ignore/Excess_Deaths_Associated_with_COVID-19.csv')
deaths.head(100)

Unnamed: 0,Week Ending Date,State,Observed Number,Upper Bound Threshold,Exceeds Threshold,Average Expected Count,Excess Estimate,Total Excess Estimate,Percent Excess Estimate,Year,Type,Outcome,Suppress,Note
0,2017-01-07,Alabama,1121.0,1136,False,1059,62,29601,5.8527,2017,Predicted (weighted),All causes,,
1,2017-01-14,Alabama,1130.0,1140,False,1067,63,29601,5.906102,2017,Predicted (weighted),All causes,,
2,2017-01-21,Alabama,1048.0,1142,False,1071,0,29601,0.0,2017,Predicted (weighted),All causes,,
3,2017-01-28,Alabama,1026.0,1142,False,1070,0,29601,0.0,2017,Predicted (weighted),All causes,,
4,2017-02-04,Alabama,1036.0,1142,False,1068,0,29601,0.0,2017,Predicted (weighted),All causes,,
5,2017-02-11,Alabama,1058.0,1136,False,1062,0,29601,0.0,2017,Predicted (weighted),All causes,,
6,2017-02-18,Alabama,1060.0,1132,False,1057,3,29601,0.283804,2017,Predicted (weighted),All causes,,
7,2017-02-25,Alabama,1099.0,1126,False,1051,48,29601,4.567542,2017,Predicted (weighted),All causes,,
8,2017-03-04,Alabama,1081.0,1119,False,1042,39,29601,3.743414,2017,Predicted (weighted),All causes,,
9,2017-03-11,Alabama,1011.0,1113,False,1036,0,29601,0.0,2017,Predicted (weighted),All causes,,


In [36]:
covid = deaths[(deaths['Type']=='Predicted (weighted)')&((deaths['Outcome'] == 'All causes') | (deaths['Outcome'] == 'All causes, excluding COVID-19')) & 
       (deaths['Year'] == 2020) | (deaths['Year'] == 2021) | (deaths['Year'] == 2022)]

In [37]:
covid['Type'].value_counts()

Predicted (weighted)    16956
Unweighted               5670
Name: Type, dtype: int64

In [38]:
covid['Outcome'].value_counts()

All causes                        14148
All causes, excluding COVID-19     8478
Name: Outcome, dtype: int64

In [39]:
def covid_deaths(data):
    results = []
    for year in covid['Year']:
        all_causes = 0
        covid = 0
        cause_nocovid = 0
        for state in covid['State'].unique():
            if covid['Outcome'] == 'All causes' and covid['Type'] == 'Predicted (weighted)':
                all_causes += covid['Total Excess Estimate']
            elif covid['Outcome'] == 'All causes, excluding COVID-19':
                cause_nocovid += covid['Total Excess Estimate']
            results.append({'Location': state, f'all_causes{year}':all_causes,
                            f'covid{year}':covid})
    return results

In [40]:
covid_deaths(covid)

UnboundLocalError: cannot access local variable 'covid' where it is not associated with a value

In [None]:
deaths['Exceeds Threshold'].dtypes

In [None]:
# getting sum of each states excess deaths 
def excess_deaths(deaths, start_year=2017, end_year=2022):
    state_totals = {state: [0] * (end_year - start_year + 1) for state in 
                    deaths['State'].unique()}
    
    for index, row in deaths.iterrows():
        state = row['State']
        year = row['Year']
        excess_estimate = row['Excess Estimate']
        
        if start_year <= year <= end_year:
            state_totals[state][year - start_year] += excess_estimate

    result = pd.DataFrame(state_totals).T.reset_index()
    result.columns = ['Location'] + [f'Exc_deaths_{year}' for year in 
                                  range(start_year, end_year + 1)]
    
    return result


excess_deaths = excess_deaths(deaths)
excess_deaths.head()


In [None]:
# Getting count for each year and each state where they exceed their threshold
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    result = pd.DataFrame()
    
    for year in range(start_year, end_year + 1):
        
        year_data = deaths[deaths['Year'] == year]
        year_counts = year_data.groupby('State')['Exceeds Threshold'].sum().reset_index()
        year_counts = year_counts.rename(columns={'Exceeds Threshold': f'Exc_count_{year}'})
        year_counts.set_index('State', inplace=True)
        
        if result.empty:
            result = year_counts
        else:
            result = result.merge(year_counts, left_index=True, right_index=True, how='outer')
    
    result = result.reset_index().fillna(0)
    
    return result

In [None]:
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    results = []
    
    for year in range(start_year, end_year + 1):
        year_counts = deaths[deaths['Year'] == year]
        state_counts = {}
        
        for state in year_counts['State'].unique():
            count = year_counts[(year_counts['State'] == state) & (year_counts['Exceeds Threshold'] == 'True')].shape[0]
            state_counts[state] = count
        
        results.append({'Year': year, **state_counts})
    
    return pd.DataFrame(results)

exceeds_threshold = count_exceeds_threshold(deaths, start_year=2017, end_year=2022)
exceeds_threshold

In [None]:
deaths["Week Ending Date"] = pd.to_datetime(deaths["Week Ending Date"])
deaths["Year"] = deaths["Week Ending Date"].dt.year
exceeds_threshold = deaths[deaths["Exceeds Threshold"] == True]
result = exceeds_threshold.groupby(["State", "Year"]).size().reset_index(name="Exceeds Threshold Count")

result

In [None]:
# Merging the data
#finaldeaths = pd.merge(excess_deaths, exceeds_threshold, how='left')
#finaldeaths.rename(columns={'State':'Location'}, inplace=True)
#finaldeaths.head()

In [None]:
finaldeaths.to_csv('Data/Excess_deaths&Exceeds_Threshold_data.csv')

### Population Density Data

In [41]:
url3 = 'https://wisevoter.com/state-rankings/population-density-by-state/'
res3 = requests.get(url3)
soup3 = BeautifulSoup(res3.content)

In [42]:
table3 = soup3.find('table', attrs={'id': 'shdb-on-page-table'})
tbody = table3.find('tbody')
trs = tbody.find_all('tr')

In [43]:
pop_density = []
for tr in trs:
    state = tr.find(attrs={'class':'shdb-on-page-table-body-Geo'}).text
    density = tr.find(attrs={'class':'shdb-on-page-table-body-Data'}).text.split()[0]
    
    state_density = {'State':state, 'Population Density per mi²': density}
    pop_density.append(state_density)
Pop_density = pd.DataFrame(pop_density)
Pop_density.head()

Unnamed: 0,State,Population Density per mi²
0,New Jersey,1283.4
1,Rhode Island,1074.3
2,Massachusetts,919.82
3,Connecticut,746.7
4,Maryland,648.84


In [44]:
Pop_density.to_csv('Data/Population_Density_data.csv')

### Mask county data

In [45]:
county_mask = pd.read_csv('Ignore/U.S._State_and_Territorial_Public_Mask_Mandates_From_April_10__2020_through_August_15__2021_by_County_by_Day.csv')
county_mask

In [46]:
# Put my code into Chatgpt who helped simplify to speed up run time
def mandate_length(data):
    county_counts = {}
    for index, row in data.iterrows():
        county_name = row['County_Name']
        mask_required = row['Face_Masks_Required_in_Public']
        state = row['State_Tribe_Territory']
        
        if mask_required == 'Yes':
            if state in county_counts:
                if county_name in county_counts[state]:
                    county_counts[state][county_name]['Count'] += 1
                else:
                    county_counts[state][county_name] = {'State': state, 'Count': 1}
            else:
                county_counts[state] = {county_name: {'State': state, 'Count': 1}}

    county_counts_list = [{'State': state, 'County': county_name, 'Count': county_counts[state][county_name]['Count']} for state in county_counts for county_name in county_counts[state]]

    return pd.DataFrame(county_counts_list)

county_counts_df = mandate_length(county_mask)

In [47]:
county_counts_df.head()

Unnamed: 0,State,County,Count
0,AL,Autauga County,267
1,AL,Baldwin County,267
2,AL,Barbour County,267
3,AL,Bibb County,267
4,AL,Blount County,267


In [48]:
county_counts_df.shape

(2423, 3)

In [49]:
county_counts_df['State'].value_counts()

TX    254
VA    133
KY    120
KS    105
IL    102
NC    100
IA     99
IN     92
OH     88
MN     87
MI     83
MS     82
PR     78
AR     75
WI     72
PA     67
AL     67
LA     64
CO     64
NY     62
CA     58
MT     56
WV     55
ND     53
WA     39
OR     36
NM     33
UT     29
MD     24
WY     23
NJ     21
NV     17
ME     16
MA     14
VT     14
NH     10
CT      8
RI      5
HI      5
AS      5
DE      3
VI      3
GU      1
DC      1
Name: State, dtype: int64

In [50]:
county_counts_df.to_csv('Data/county_mask_mandata.csv', index=False)

### Merging the data

In [83]:
def merge_dataframes():
    employ = pd.read_csv('Data/Cleaned/cleaned_employment.csv')
    income = pd.read_csv('Data/Cleaned/cleaned_income.csv')
    life = pd.read_csv('Data/Cleaned/cleaned_Life_Expentency.csv')
    insur = pd.read_csv('Data/Cleaned/cleaned_insur.csv')
    pop_dense = pd.read_csv('Data/Cleaned/cleaned_pop_dense.csv')
    pop_size = pd.read_csv('Data/Cleaned/cleaned_pop_size.csv')
    pre_cond = pd.read_csv('Data/Cleaned/cleaned_pre_condtions.csv')
    total_phys = pd.read_csv('Data/Cleaned/cleaned_total_physician.csv')
    
    merged_df = employ.merge(income, on='Location').merge(life, on='Location').merge(insur, on='Location') \
        .merge(pop_dense, on='Location').merge(pop_size, on='Location').merge(pre_cond, on='Location') \
        .merge(total_phys, on='Location').merge(excess_deaths, on='Location')

    return merged_df
merged_data = merge_dataframes()

In [84]:
merged_data.drop(columns=['Unnamed: 0'], inplace=True)
merged_data.head()


Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Population Density per mi²,2010 Population,2020 Population,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence,Physicians,Physicians Rate,Active MO,Active MO Rate,Active DO,Active DO Rate,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1,100.65,4779736,5024279,39.5,9.4,289.3,39.4,9.3,3.1,12.2,10983,224.0,10389,211.9,594,12.1,2649,4062,945,20726,29283,14805
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108,1.3,710231,733391,37.0,9.7,178.5,32.8,4.6,1.8,7.1,2101,287.2,1825,249.5,276,37.7,417,492,639,1422,3483,1783
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106,64.96,6392017,7151502,37.1,9.8,181.8,29.9,6.0,3.6,9.8,18343,252.0,15989,219.7,2353,32.3,3522,4281,1785,30955,40347,20653
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092,58.43,2915918,3011524,40.1,9.3,284.3,38.2,9.7,3.7,12.2,6500,215.4,6132,203.2,368,12.2,3054,2199,1434,11285,15391,8969
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,0.48,0.066,0.253,0.114,0.009,0.078,0.47,0.07,0.265,0.117,0.008,0.07,258.21,37253956,39538223,40.7,7.8,192.5,26.6,4.2,2.8,9.4,113718,287.8,107383,271.8,6333,16.0,17241,13434,1029,93875,133206,89195


In [93]:
vax_state = pd.read_csv('Data/Cleaned/vax_state.csv')
vax_state.head()

Unnamed: 0,Location,Dist_Per_100K_2021,Distributed_Per_100k_65Plus_2021,Admin_Per_100K_2021,Admin_Per_100k_65Plus_2021,Administered_Dose1_Pop_Pct_2021,Administered_Dose1_Recip_65PlusPop_Pct_2021,Series_Complete_Pop_Pct_2021,Series_Complete_65PlusPop_Pct_2021,Additional_Doses_Vax_Pct_2021,Additional_Doses_50Plus_Vax_Pct_2021,Additional_Doses_65Plus_Vax_Pct_2021,Dist_Per_100K_2022,Distributed_Per_100k_65Plus_2022,Admin_Per_100K_2022,Admin_Per_100k_65Plus_2022,Administered_Dose1_Pop_Pct_2022,Administered_Dose1_Recip_65PlusPop_Pct_2022,Series_Complete_Pop_Pct_2022,Series_Complete_65PlusPop_Pct_2022,Additional_Doses_Vax_Pct_2022,Additional_Doses_50Plus_Vax_Pct_2022,Additional_Doses_65Plus_Vax_Pct_2022,Second_Booster_50Plus_Vax_Pct,Second_Booster_65Plus_Vax_Pct,Bivalent_Booster_65Plus_Pop_Pct
0,KS,178349,1092740,140263,241990,69.4,95.0,57.1,88.2,33.8,48.5,58.3,273256,1674240,181329,327654,75.9,95.0,65.0,95.0,48.7,64.1,72.8,51.2,59.1,39.7
1,AL,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2
2,NJ,210259,1265690,168844,238218,83.9,95.0,70.6,89.9,34.2,48.6,58.1,317756,1912780,219584,324072,94.3,95.0,78.8,95.0,51.6,64.9,72.2,44.9,53.7,35.1
3,GA,172562,1207790,125950,217166,61.4,92.3,51.1,80.8,28.2,40.1,48.6,258794,1811340,158505,284715,68.1,95.0,57.1,86.4,43.7,58.3,67.1,42.1,48.7,26.6
4,PR,201561,947131,185830,223651,89.1,95.0,77.1,86.1,28.9,41.6,46.2,268619,1262240,248533,299619,95.0,95.0,86.7,92.8,63.4,73.8,75.6,33.1,38.6,12.3


In [94]:
state_name = {
    'AL':'Alabama',
    'AK':'Alaska',
    'AZ':'Arizona',
    'AR':'Arkansas',
    'CA':'California',
    'CO':'Colorado',
    'CT':'Connecticut',
    'DE':'Delaware',
    'DC':'District of Columbia',
    'FL':'Florida',
    'GA':'Georgia',
    'HI':'Hawaii',
    'ID':'Idaho',
    'IL':'Illinois',
    'IN':'Indiana',
    'IA':'Iowa',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'ME':'Maine',
    'MD':'Maryland',
    'MA':'Massachusetts',
    'MI':'Michigan',
    'MN':'Minnesota',
    'MS':'Mississippi',
    'MO':'Missouri',
    'MT':'Montana',
    'NE':'Nebraska',
    'NV':'Nevada',
    'NH':'New Hampshire',
    'NJ':'New Jersey',
    'NM':'New Mexico',
    'NY':'New York',
    'NC':'North Carolina',
    'ND':'North Dakota',
    'OH':'Ohio',
    'OK':'Oklahoma',
    'OR':'Oregon',
    'PA':'Pennsylvania',
    'RI':'Rhode Island',
    'SC':'South Carolina',
    'SD':'South Dakota',
    'TN':'Tennessee',
    'TX':'Texas',
    'UT':'Utah',
    'VT':'Vermont',
    'VA':'Virginia',
    'WA':'Washington',
    'WV':'West Virginia',
    'WI':'Wisconsin',
    'WY':'Wyoming'
}
vax_state['Location'] = vax_state['Location'].map(state_name)

In [106]:
state_mask = pd.read_csv('Data/Cleaned/cleaned_mask.csv')
merged_data = merged_data.merge(vax_state, on='Location').merge(state_mask, on='Location')
merged_data.head()

Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Population Density per mi²,2010 Population,2020 Population,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence,Physicians,Physicians Rate,Active MO,Active MO Rate,Active DO,Active DO Rate,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022,Dist_Per_100K_2021_x,Distributed_Per_100k_65Plus_2021_x,Admin_Per_100K_2021_x,Admin_Per_100k_65Plus_2021_x,Administered_Dose1_Pop_Pct_2021_x,Administered_Dose1_Recip_65PlusPop_Pct_2021_x,Series_Complete_Pop_Pct_2021_x,Series_Complete_65PlusPop_Pct_2021_x,Additional_Doses_Vax_Pct_2021_x,Additional_Doses_50Plus_Vax_Pct_2021_x,Additional_Doses_65Plus_Vax_Pct_2021_x,Dist_Per_100K_2022_x,Distributed_Per_100k_65Plus_2022_x,Admin_Per_100K_2022_x,Admin_Per_100k_65Plus_2022_x,Administered_Dose1_Pop_Pct_2022_x,Administered_Dose1_Recip_65PlusPop_Pct_2022_x,Series_Complete_Pop_Pct_2022_x,Series_Complete_65PlusPop_Pct_2022_x,Additional_Doses_Vax_Pct_2022_x,Additional_Doses_50Plus_Vax_Pct_2022_x,Additional_Doses_65Plus_Vax_Pct_2022_x,Second_Booster_50Plus_Vax_Pct_x,Second_Booster_65Plus_Vax_Pct_x,Bivalent_Booster_65Plus_Pop_Pct_x,astha_prevalence,Dist_Per_100K_2021_y,Distributed_Per_100k_65Plus_2021_y,Admin_Per_100K_2021_y,Admin_Per_100k_65Plus_2021_y,Administered_Dose1_Pop_Pct_2021_y,Administered_Dose1_Recip_65PlusPop_Pct_2021_y,Series_Complete_Pop_Pct_2021_y,Series_Complete_65PlusPop_Pct_2021_y,Additional_Doses_Vax_Pct_2021_y,Additional_Doses_50Plus_Vax_Pct_2021_y,Additional_Doses_65Plus_Vax_Pct_2021_y,Dist_Per_100K_2022_y,Distributed_Per_100k_65Plus_2022_y,Admin_Per_100K_2022_y,Admin_Per_100k_65Plus_2022_y,Administered_Dose1_Pop_Pct_2022_y,Administered_Dose1_Recip_65PlusPop_Pct_2022_y,Series_Complete_Pop_Pct_2022_y,Series_Complete_65PlusPop_Pct_2022_y,Additional_Doses_Vax_Pct_2022_y,Additional_Doses_50Plus_Vax_Pct_2022_y,Additional_Doses_65Plus_Vax_Pct_2022_y,Second_Booster_50Plus_Vax_Pct_y,Second_Booster_65Plus_Vax_Pct_y,Bivalent_Booster_65Plus_Pop_Pct_y,Mask_Mandate,Mandatory
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1,100.65,4779736,5024279,39.5,9.4,289.3,39.4,9.3,3.1,12.2,10983,224.0,10389,211.9,594,12.1,2649,4062,945,20726,29283,14805,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,False,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,07/16/2020,Yes
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108,1.3,710231,733391,37.0,9.7,178.5,32.8,4.6,1.8,7.1,2101,287.2,1825,249.5,276,37.7,417,492,639,1422,3483,1783,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,False,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,,No
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106,64.96,6392017,7151502,37.1,9.8,181.8,29.9,6.0,3.6,9.8,18343,252.0,15989,219.7,2353,32.3,3522,4281,1785,30955,40347,20653,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,False,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,,No
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092,58.43,2915918,3011524,40.1,9.3,284.3,38.2,9.7,3.7,12.2,6500,215.4,6132,203.2,368,12.2,3054,2199,1434,11285,15391,8969,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,False,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,07/20/2020,Yes
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,0.48,0.066,0.253,0.114,0.009,0.078,0.47,0.07,0.265,0.117,0.008,0.07,258.21,37253956,39538223,40.7,7.8,192.5,26.6,4.2,2.8,9.4,113718,287.8,107383,271.8,6333,16.0,17241,13434,1029,93875,133206,89195,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,False,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,07/18/2020,Yes


In [53]:
merged_data['Population Density per mi²'] = merged_data['Population Density per mi²'].str.replace(',', '', regex=True).astype(float)
merged_data['Physicians'] = merged_data['Physicians'].str.replace(',', '', regex=True).astype(float)
merged_data['Active DO'] = merged_data['Active DO'].str.replace(',', '', regex=True).astype(float)
merged_data['Active MO'] = merged_data['Active MO'].str.replace(',', '', regex=True).astype(float)


In [54]:
merged_data.dtypes

Location                       object
Employment_2020                 int64
Employment_2021                 int64
Employment_2022                 int64
Inc_Per_Cap_2020                int64
Inc_Per_Cap_2021                int64
Inc_Per_CAp_2022                int64
Life_Exp_2020                 float64
Life_Exp_2019                 float64
Life_Exp_2018                 float64
Employer_2019                 float64
Non-Group_2019                float64
Medicaid_2019                 float64
Medicare_2019                 float64
Military_2019                 float64
Uninsured_2019                float64
Employer_2021                 float64
Non-Group_2021                float64
Medicaid_2021                 float64
Medicare_2021                 float64
Military_2021                 float64
Uninsured_2021                float64
Population Density per mi²    float64
2010 Population                 int64
2020 Population                 int64
flu_vaccination_rate_2019     float64
asthma_preva

In [102]:
merged_data.isnull().sum()

Location                                       0
Employment_2020                                0
Employment_2021                                0
Employment_2022                                0
Inc_Per_Cap_2020                               0
Inc_Per_Cap_2021                               0
Inc_Per_CAp_2022                               0
Life_Exp_2020                                  0
Life_Exp_2019                                  0
Life_Exp_2018                                  0
Employer_2019                                  0
Non-Group_2019                                 0
Medicaid_2019                                  0
Medicare_2019                                  0
Military_2019                                  0
Uninsured_2019                                 0
Employer_2021                                  0
Non-Group_2021                                 0
Medicaid_2021                                  0
Medicare_2021                                  0
Military_2021       

In [103]:
merged_data['asthma_prevalence'].fillna(8.4, inplace=True)
merged_data['kidney_disease_prevalence'].fillna(52.1, inplace=True)
merged_data['flu_vaccination_rate_2019'].fillna(38, inplace=True)
merged_data['high_bp_prevalence'].fillna(33, inplace=True)
merged_data['copd_prevalence'].fillna(5.2, inplace =True)
merged_data['diabetes_prevalence'].fillna(9.5, inplace=True)

In [107]:
merged_data.to_csv('Data/Merged_state_data.csv', index=False)

### Modeling 

In [57]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor
from sklearn import metrics

In [58]:
merged_data.dropna(inplace=True)

#### Model 1 - Benchmark

In [59]:
X = merged_data.drop(columns=['Location','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022'])
y = merged_data[['Exc_deaths_2020','Exc_deaths_2021']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [60]:
lr = MultiOutputRegressor(LinearRegression()).fit(X_train, y_train)

In [61]:
lr.score(X_train, y_train)

1.0

In [62]:
lr.score(X_test, y_test)

0.793477988400882

In [63]:
lr1 = LinearRegression()
lr1.fit(X_train, y_train)

In [64]:
lr1.score(X_train, y_train)

1.0

In [65]:
lr1.score(X_test, y_test)

0.7934779884008776

#### Model 2

In [66]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [67]:
lr2 = LinearRegression()
lr2.fit(X_train_sc, y_train)

In [68]:
lr2.score(X_train_sc, y_train), lr2.score(X_test_sc, y_test)

(1.0, 0.7694275676597457)

#### Model 3

In [69]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn.score(X_train, y_train), knn.score(X_test, y_test)

(0.7597748656291643, 0.653027945365021)

#### Model 4

In [70]:
knn2 = KNeighborsRegressor()
knn2.fit(X_train_sc, y_train)
knn2.score(X_train_sc, y_train), knn2.score(X_test_sc, y_test)

(0.7752211504618127, 0.629732724651286)

#### Model 5

In [71]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree.score(X_train, y_train), tree.score(X_test, y_test)

(1.0, 0.7252620627434156)

#### Model 6

In [72]:
tree2 = DecisionTreeRegressor()
tree2.fit(X_train_sc, y_train)
tree2.score(X_train_sc, y_train), tree2.score(X_test_sc, y_test)

(1.0, 0.19014981299947975)

#### Model 7 

In [73]:
bag1 = BaggingRegressor()
bag1.fit(X_train , y_train)
bag1.score(X_train, y_train), bag1.score(X_test, y_test)

(0.9171158986604249, 0.6501179268850983)

#### Model 8

In [74]:
bag2 = BaggingRegressor()
bag2.fit(X_train_sc, y_train)
bag2.score(X_train_sc, y_train), bag2.score(X_test_sc, y_test)

(0.9343257761724155, 0.5798671247451838)

#### Model 9

In [75]:
rf1 = RandomForestRegressor()
rf1.fit(X_train, y_train)
rf1.score(X_train, y_train), rf1.score(X_test, y_test)

(0.964262902138256, 0.7628486851006646)

#### Model 10

In [76]:
rf2 = RandomForestRegressor()
rf2.fit(X_train_sc, y_train)
rf2.score(X_train_sc, y_train), rf2.score(X_test_sc,y_test)

(0.9490787867310821, 0.7168981406133752)