### Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### 2020 Census Population data

In [2]:
url = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010220'
res = requests.get(url)

In [3]:
soup = BeautifulSoup(res.content)

In [4]:
table = soup.find(attrs={'class':'qf-graph-scroll'})
# soup.find_all(attrs={'class':'qf-positive'})
tbody = table.find_all(attrs={'class':"qf-graph-geo"})
tr = tbody[1]
state = tr.find('a').attrs['data-title']
tr.find(attrs={'class':'qf-positive'}).attrs['data-value']

'29145505'

In [5]:
pop_2020 = []
for tr in tbody:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2020 Population': pop}
    pop_2020.append(state_pop)

In [6]:
pop_2020 = pd.DataFrame(pop_2020)

### 2010 Census Population Data

In [7]:
url2 = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010210'
res2 = requests.get(url2)

In [8]:
soup2 = BeautifulSoup(res2.content)
table2 = soup2.find(attrs={'class':'qf-graph-scroll'})
tbody2 = table2.find_all(attrs={'class':"qf-graph-geo"})

In [9]:
pop_2010 = []
for tr in tbody2:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2010 Population': pop}
    pop_2010.append(state_pop)

pop_2010 = pd.DataFrame(pop_2010)

In [10]:
population = pd.merge(pop_2010, pop_2020)
population

Unnamed: 0,State,2010 Population,2020 Population
0,California,37253956,39538223
1,Texas,25145561,29145505
2,New York,19378102,20201249
3,Florida,18801310,21538187
4,Illinois,12830632,12812508
5,Pennsylvania,12702379,13002700
6,Ohio,11536504,11799448
7,Michigan,9883640,10077331
8,Georgia,9687653,10711908
9,North Carolina,9535483,10439388


In [11]:
population.dtypes

State              object
2010 Population    object
2020 Population    object
dtype: object

In [12]:
population[['2010 Population', '2020 Population']] = population[['2010 Population',
                                                                 '2020 Population']].astype(int)

In [13]:
population.dtypes

State              object
2010 Population     int64
2020 Population     int64
dtype: object

In [14]:
population.to_csv('Data/Population_data_2010_&_2020.csv')

### Excess Deaths Data

In [15]:
deaths = pd.read_csv('Ignore/Excess_Deaths_Associated_with_COVID-19.csv')
deaths.head(100)

Unnamed: 0,Week Ending Date,State,Observed Number,Upper Bound Threshold,Exceeds Threshold,Average Expected Count,Excess Estimate,Total Excess Estimate,Percent Excess Estimate,Year,Type,Outcome,Suppress,Note
0,2017-01-07,Alabama,1121.0,1136,False,1059,62,29601,5.8527,2017,Predicted (weighted),All causes,,
1,2017-01-14,Alabama,1130.0,1140,False,1067,63,29601,5.906102,2017,Predicted (weighted),All causes,,
2,2017-01-21,Alabama,1048.0,1142,False,1071,0,29601,0.0,2017,Predicted (weighted),All causes,,
3,2017-01-28,Alabama,1026.0,1142,False,1070,0,29601,0.0,2017,Predicted (weighted),All causes,,
4,2017-02-04,Alabama,1036.0,1142,False,1068,0,29601,0.0,2017,Predicted (weighted),All causes,,
5,2017-02-11,Alabama,1058.0,1136,False,1062,0,29601,0.0,2017,Predicted (weighted),All causes,,
6,2017-02-18,Alabama,1060.0,1132,False,1057,3,29601,0.283804,2017,Predicted (weighted),All causes,,
7,2017-02-25,Alabama,1099.0,1126,False,1051,48,29601,4.567542,2017,Predicted (weighted),All causes,,
8,2017-03-04,Alabama,1081.0,1119,False,1042,39,29601,3.743414,2017,Predicted (weighted),All causes,,
9,2017-03-11,Alabama,1011.0,1113,False,1036,0,29601,0.0,2017,Predicted (weighted),All causes,,


In [16]:
covid = deaths[(deaths['Type']=='Predicted (weighted)')&((deaths['Outcome'] == 'All causes') | (deaths['Outcome'] == 'All causes, excluding COVID-19')) & 
       (deaths['Year'] == 2020) | (deaths['Year'] == 2021) | (deaths['Year'] == 2022)]

In [17]:
covid['Type'].value_counts()

Predicted (weighted)    16956
Unweighted               5670
Name: Type, dtype: int64

In [18]:
covid['Outcome'].value_counts()

All causes                        14148
All causes, excluding COVID-19     8478
Name: Outcome, dtype: int64

In [19]:
covid = deaths[(deaths['Type'] == 'Predicted (weighted)') & 
               ((deaths['Outcome'] == 'All causes') | (deaths['Outcome'] == 'All causes, excluding COVID-19')) & 
               (deaths['Year'].isin([2020, 2021, 2022]))]

def covid_deaths(data):
    grouped = data.groupby(['Year', 'State', 'Outcome'])['Excess Estimate'].sum().reset_index()
    pivoted = grouped.pivot(index=['Year', 'State'], columns='Outcome', 
                            values='Excess Estimate').reset_index()
    pivoted = pivoted.fillna(0)
    pivoted['covid'] = pivoted['All causes'] - pivoted['All causes, excluding COVID-19']
    pivoted.rename(columns={'Year': 'Location'}, inplace=True)
    results = pivoted.to_dict(orient='records')

    return results


In [20]:
deaths_covid = pd.DataFrame(covid_deaths(covid))

In [21]:
deaths_covid

Unnamed: 0,Location,State,All causes,"All causes, excluding COVID-19",covid
0,2020,Alabama,9021,2684,6337
1,2020,Alaska,545,332,213
2,2020,Arizona,13186,4583,8603
3,2020,Arkansas,4992,1301,3691
4,2020,California,41279,11317,29962
5,2020,Colorado,6426,1692,4734
6,2020,Connecticut,6015,549,5466
7,2020,Delaware,1338,414,924
8,2020,District of Columbia,1171,322,849
9,2020,Florida,26556,5979,20577


In [22]:
deaths_covid = deaths_covid.pivot(index='State', columns='Location', 
                                  values=['All causes', 'covid'])

deaths_covid.columns = ['_'.join(map(str, col)) for col in deaths_covid.columns]
deaths_covid = deaths_covid.reset_index()
deaths_covid.rename(columns={'State': 'Location'}, inplace=True)
deaths_covid.rename(columns={
    'All causes_2020': 'all_causes_2020',
    'All causes_2021': 'all_causes_2021',
    'All causes_2022': 'all_causes_2022',
    'covid_2020': 'covid_2020',
    'covid_2021': 'covid_2021',
    'covid_2022': 'covid_2022',
}, inplace=True)
deaths_covid.head()

Unnamed: 0,Location,all_causes_2020,all_causes_2021,all_causes_2022,covid_2020,covid_2021,covid_2022
0,Alabama,9021,13018,6246,6337,9771,3933
1,Alaska,545,1429,686,213,804,275
2,Arizona,13186,17961,8835,8603,13536,5849
3,Arkansas,4992,6908,3854,3691,5333,2593
4,California,41279,60680,36786,29962,48834,21158


In [23]:
deaths_covid.dtypes

Location           object
all_causes_2020     int64
all_causes_2021     int64
all_causes_2022     int64
covid_2020          int64
covid_2021          int64
covid_2022          int64
dtype: object

In [24]:
(deaths_covid[['covid_2020', 'covid_2021', 'covid_2022']].sum()).sum()

2045325

In [25]:
deaths_covid['Location_Upper'] = deaths_covid['Location'].str.upper()
population['State_Upper'] = population['State'].str.upper()

for year in [2020, 2021, 2022]:
    deaths_covid[f'Covid_pop_perce_{year}'] = (
        deaths_covid.apply(
            lambda row: (
                row[f'covid_{year}'] /
                population.loc[
                    population['State_Upper'] == row['Location_Upper'],
                    '2020 Population'
                ].values
            )[0] if row['Location_Upper'] in population['State_Upper'].values else None,
            axis=1
        )
    )

deaths_covid.head()

Unnamed: 0,Location,all_causes_2020,all_causes_2021,all_causes_2022,covid_2020,covid_2021,covid_2022,Location_Upper,Covid_pop_perce_2020,Covid_pop_perce_2021,Covid_pop_perce_2022
0,Alabama,9021,13018,6246,6337,9771,3933,ALABAMA,0.001261,0.001945,0.000783
1,Alaska,545,1429,686,213,804,275,ALASKA,0.00029,0.001096,0.000375
2,Arizona,13186,17961,8835,8603,13536,5849,ARIZONA,0.001203,0.001893,0.000818
3,Arkansas,4992,6908,3854,3691,5333,2593,ARKANSAS,0.001226,0.001771,0.000861
4,California,41279,60680,36786,29962,48834,21158,CALIFORNIA,0.000758,0.001235,0.000535


In [26]:
deaths_covid.dropna(inplace=True)
deaths_covid.drop(columns=['Location_Upper'], inplace=True)

In [27]:
deaths_covid.to_csv('Data/Cleaned/cleaned_covid_death_state.csv', index = False)

In [28]:
# getting sum of each states excess deaths 
def excess_deaths(deaths, start_year=2017, end_year=2022):
    state_totals = {state: [0] * (end_year - start_year + 1) for state in 
                    deaths['State'].unique()}
    
    for index, row in deaths.iterrows():
        state = row['State']
        year = row['Year']
        excess_estimate = row['Excess Estimate']
        
        if start_year <= year <= end_year:
            state_totals[state][year - start_year] += excess_estimate

    result = pd.DataFrame(state_totals).T.reset_index()
    result.columns = ['Location'] + [f'Exc_deaths_{year}' for year in 
                                  range(start_year, end_year + 1)]
    
    return result


excess_deaths = excess_deaths(deaths)
excess_deaths.head()


Unnamed: 0,Location,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022
0,Alabama,2649,4062,945,20726,29283,14805
1,Alaska,417,492,639,1422,3483,1783
2,Arizona,3522,4281,1785,30955,40347,20653
3,Arkansas,3054,2199,1434,11285,15391,8969
4,California,17241,13434,1029,93875,133206,89195


In [29]:
# Getting count for each year and each state where they exceed their threshold
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    result = pd.DataFrame()
    
    for year in range(start_year, end_year + 1):
        
        year_data = deaths[deaths['Year'] == year]
        year_counts = year_data.groupby('State')['Exceeds Threshold'].sum().reset_index()
        year_counts = year_counts.rename(columns={'Exceeds Threshold': f'Exc_count_{year}'})
        year_counts.set_index('State', inplace=True)
        
        if result.empty:
            result = year_counts
        else:
            result = result.merge(year_counts, left_index=True, right_index=True, how='outer')
    
    result = result.reset_index().fillna(0)
    
    return result

In [30]:
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    results = []
    
    for year in range(start_year, end_year + 1):
        year_counts = deaths[deaths['Year'] == year]
        state_counts = {}
        
        for state in year_counts['State'].unique():
            count = year_counts[(year_counts['State'] == state) & (year_counts['Exceeds Threshold'] == 'True')].shape[0]
            state_counts[state] = count
        
        results.append({'Year': year, **state_counts})
    
    return pd.DataFrame(results)

exceeds_threshold = count_exceeds_threshold(deaths, start_year=2017, end_year=2022)
exceeds_threshold

Unnamed: 0,Year,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District of Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,New York City,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Puerto Rico,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,United States,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
0,2017,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2018,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2019,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2020,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,2022,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [31]:
deaths["Week Ending Date"] = pd.to_datetime(deaths["Week Ending Date"])
deaths["Year"] = deaths["Week Ending Date"].dt.year
exceeds_threshold = deaths[deaths["Exceeds Threshold"] == True]
result = exceeds_threshold.groupby(["State", "Year"]).size().reset_index(name="Exceeds Threshold Count")

result

Unnamed: 0,State,Year,Exceeds Threshold Count
0,Alabama,2017,3
1,Alabama,2018,15
2,Alabama,2020,99
3,Alabama,2021,106
4,Alabama,2022,87
5,Alabama,2023,21
6,Alaska,2017,6
7,Alaska,2018,6
8,Alaska,2019,3
9,Alaska,2020,34


In [32]:
# Merging the data
#finaldeaths = pd.merge(excess_deaths, exceeds_threshold, how='left')
#finaldeaths.rename(columns={'State':'Location'}, inplace=True)
#finaldeaths.head()

In [33]:
#finaldeaths.to_csv('Data/Excess_deaths&Exceeds_Threshold_data.csv')

### Population Density Data

In [34]:
url3 = 'https://wisevoter.com/state-rankings/population-density-by-state/'
res3 = requests.get(url3)
soup3 = BeautifulSoup(res3.content)

In [35]:
table3 = soup3.find('table', attrs={'id': 'shdb-on-page-table'})
tbody = table3.find('tbody')
trs = tbody.find_all('tr')

In [36]:
pop_density = []
for tr in trs:
    state = tr.find(attrs={'class':'shdb-on-page-table-body-Geo'}).text
    density = tr.find(attrs={'class':'shdb-on-page-table-body-Data'}).text.split()[0]
    
    state_density = {'State':state, 'Population Density per mi²': density}
    pop_density.append(state_density)
Pop_density = pd.DataFrame(pop_density)
Pop_density.head()

Unnamed: 0,State,Population Density per mi²
0,New Jersey,1283.4
1,Rhode Island,1074.3
2,Massachusetts,919.82
3,Connecticut,746.7
4,Maryland,648.84


In [37]:
Pop_density.to_csv('Data/Population_Density_data.csv')

### Mask county data

In [38]:
county_mask = pd.read_csv('Ignore/U.S._State_and_Territorial_Public_Mask_Mandates_From_April_10__2020_through_August_15__2021_by_County_by_Day.csv')
county_mask

In [39]:
# Put my code into Chatgpt who helped simplify to speed up run time
def mandate_length(data):
    county_counts = {}
    for index, row in data.iterrows():
        county_name = row['County_Name']
        mask_required = row['Face_Masks_Required_in_Public']
        state = row['State_Tribe_Territory']
        
        if mask_required == 'Yes':
            if state in county_counts:
                if county_name in county_counts[state]:
                    county_counts[state][county_name]['Count'] += 1
                else:
                    county_counts[state][county_name] = {'State': state, 'Count': 1}
            else:
                county_counts[state] = {county_name: {'State': state, 'Count': 1}}

    county_counts_list = [{'State': state, 'County': county_name, 'Count': county_counts[state][county_name]['Count']} for state in county_counts for county_name in county_counts[state]]

    return pd.DataFrame(county_counts_list)

county_counts_df = mandate_length(county_mask)

In [40]:
county_counts_df.head()

Unnamed: 0,State,County,Count
0,AL,Autauga County,267
1,AL,Baldwin County,267
2,AL,Barbour County,267
3,AL,Bibb County,267
4,AL,Blount County,267


In [41]:
county_counts_df.shape

(2423, 3)

In [42]:
county_counts_df['State'].value_counts()

TX    254
VA    133
KY    120
KS    105
IL    102
NC    100
IA     99
IN     92
OH     88
MN     87
MI     83
MS     82
PR     78
AR     75
WI     72
PA     67
AL     67
LA     64
CO     64
NY     62
CA     58
MT     56
WV     55
ND     53
WA     39
OR     36
NM     33
UT     29
MD     24
WY     23
NJ     21
NV     17
ME     16
MA     14
VT     14
NH     10
CT      8
RI      5
HI      5
AS      5
DE      3
VI      3
GU      1
DC      1
Name: State, dtype: int64

In [43]:
county_counts_df.to_csv('Data/county_mask_mandata.csv', index=False)

### Merging the data

In [44]:
def merge_dataframes():
    employ = pd.read_csv('Data/Cleaned/cleaned_employment.csv')
    income = pd.read_csv('Data/Cleaned/cleaned_income.csv')
    life = pd.read_csv('Data/Cleaned/cleaned_Life_Expentency.csv')
    insur = pd.read_csv('Data/Cleaned/cleaned_insur.csv')
    pop_dense = pd.read_csv('Data/Cleaned/cleaned_pop_dense.csv')
    pop_size = pd.read_csv('Data/Cleaned/cleaned_pop_size.csv')
    pre_cond = pd.read_csv('Data/Cleaned/cleaned_pre_condtions.csv')
    total_phys = pd.read_csv('Data/Cleaned/cleaned_total_physician.csv')
    
    merged_df = employ.merge(income, on='Location').merge(life, on='Location').merge(insur, on='Location') \
        .merge(pop_dense, on='Location').merge(pop_size, on='Location').merge(pre_cond, on='Location') \
        .merge(total_phys, on='Location').merge(excess_deaths, on='Location')

    return merged_df
merged_data = merge_dataframes()

In [45]:
merged_data.drop(columns=['Unnamed: 0'], inplace=True)
merged_data.head()


Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Population Density per mi²,2010 Population,2020 Population,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence,Physicians,Physicians Rate,Active MO,Active MO Rate,Active DO,Active DO Rate,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1,100.65,4779736,5024279,39.5,9.4,289.3,39.4,9.3,3.1,12.2,10983,224.0,10389,211.9,594,12.1,2649,4062,945,20726,29283,14805
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108,1.3,710231,733391,37.0,9.7,178.5,32.8,4.6,1.8,7.1,2101,287.2,1825,249.5,276,37.7,417,492,639,1422,3483,1783
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106,64.96,6392017,7151502,37.1,9.8,181.8,29.9,6.0,3.6,9.8,18343,252.0,15989,219.7,2353,32.3,3522,4281,1785,30955,40347,20653
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092,58.43,2915918,3011524,40.1,9.3,284.3,38.2,9.7,3.7,12.2,6500,215.4,6132,203.2,368,12.2,3054,2199,1434,11285,15391,8969
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,0.48,0.066,0.253,0.114,0.009,0.078,0.47,0.07,0.265,0.117,0.008,0.07,258.21,37253956,39538223,40.7,7.8,192.5,26.6,4.2,2.8,9.4,113718,287.8,107383,271.8,6333,16.0,17241,13434,1029,93875,133206,89195


In [46]:
vax_state = pd.read_csv('Data/Cleaned/vax_state.csv')
vax_state.head()

Unnamed: 0,Location,Dist_Per_100K_2021,Distributed_Per_100k_65Plus_2021,Admin_Per_100K_2021,Admin_Per_100k_65Plus_2021,Administered_Dose1_Pop_Pct_2021,Administered_Dose1_Recip_65PlusPop_Pct_2021,Series_Complete_Pop_Pct_2021,Series_Complete_65PlusPop_Pct_2021,Additional_Doses_Vax_Pct_2021,Additional_Doses_50Plus_Vax_Pct_2021,Additional_Doses_65Plus_Vax_Pct_2021,Dist_Per_100K_2022,Distributed_Per_100k_65Plus_2022,Admin_Per_100K_2022,Admin_Per_100k_65Plus_2022,Administered_Dose1_Pop_Pct_2022,Administered_Dose1_Recip_65PlusPop_Pct_2022,Series_Complete_Pop_Pct_2022,Series_Complete_65PlusPop_Pct_2022,Additional_Doses_Vax_Pct_2022,Additional_Doses_50Plus_Vax_Pct_2022,Additional_Doses_65Plus_Vax_Pct_2022,Second_Booster_50Plus_Vax_Pct,Second_Booster_65Plus_Vax_Pct,Bivalent_Booster_65Plus_Pop_Pct
0,KS,178349,1092740,140263,241990,69.4,95.0,57.1,88.2,33.8,48.5,58.3,273256,1674240,181329,327654,75.9,95.0,65.0,95.0,48.7,64.1,72.8,51.2,59.1,39.7
1,AL,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2
2,NJ,210259,1265690,168844,238218,83.9,95.0,70.6,89.9,34.2,48.6,58.1,317756,1912780,219584,324072,94.3,95.0,78.8,95.0,51.6,64.9,72.2,44.9,53.7,35.1
3,GA,172562,1207790,125950,217166,61.4,92.3,51.1,80.8,28.2,40.1,48.6,258794,1811340,158505,284715,68.1,95.0,57.1,86.4,43.7,58.3,67.1,42.1,48.7,26.6
4,PR,201561,947131,185830,223651,89.1,95.0,77.1,86.1,28.9,41.6,46.2,268619,1262240,248533,299619,95.0,95.0,86.7,92.8,63.4,73.8,75.6,33.1,38.6,12.3


In [47]:
state_name = {
    'AL':'Alabama',
    'AK':'Alaska',
    'AZ':'Arizona',
    'AR':'Arkansas',
    'CA':'California',
    'CO':'Colorado',
    'CT':'Connecticut',
    'DE':'Delaware',
    'DC':'District of Columbia',
    'FL':'Florida',
    'GA':'Georgia',
    'HI':'Hawaii',
    'ID':'Idaho',
    'IL':'Illinois',
    'IN':'Indiana',
    'IA':'Iowa',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'ME':'Maine',
    'MD':'Maryland',
    'MA':'Massachusetts',
    'MI':'Michigan',
    'MN':'Minnesota',
    'MS':'Mississippi',
    'MO':'Missouri',
    'MT':'Montana',
    'NE':'Nebraska',
    'NV':'Nevada',
    'NH':'New Hampshire',
    'NJ':'New Jersey',
    'NM':'New Mexico',
    'NY':'New York',
    'NC':'North Carolina',
    'ND':'North Dakota',
    'OH':'Ohio',
    'OK':'Oklahoma',
    'OR':'Oregon',
    'PA':'Pennsylvania',
    'RI':'Rhode Island',
    'SC':'South Carolina',
    'SD':'South Dakota',
    'TN':'Tennessee',
    'TX':'Texas',
    'UT':'Utah',
    'VT':'Vermont',
    'VA':'Virginia',
    'WA':'Washington',
    'WV':'West Virginia',
    'WI':'Wisconsin',
    'WY':'Wyoming'
}
vax_state['Location'] = vax_state['Location'].map(state_name)

In [48]:
state_mask = pd.read_csv('Data/Cleaned/cleaned_mask.csv')
merged_data = merged_data.merge(vax_state, on='Location').merge(state_mask, on='Location')
merged_data.head()

Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Population Density per mi²,2010 Population,2020 Population,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence,Physicians,Physicians Rate,Active MO,Active MO Rate,Active DO,Active DO Rate,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022,Dist_Per_100K_2021,Distributed_Per_100k_65Plus_2021,Admin_Per_100K_2021,Admin_Per_100k_65Plus_2021,Administered_Dose1_Pop_Pct_2021,Administered_Dose1_Recip_65PlusPop_Pct_2021,Series_Complete_Pop_Pct_2021,Series_Complete_65PlusPop_Pct_2021,Additional_Doses_Vax_Pct_2021,Additional_Doses_50Plus_Vax_Pct_2021,Additional_Doses_65Plus_Vax_Pct_2021,Dist_Per_100K_2022,Distributed_Per_100k_65Plus_2022,Admin_Per_100K_2022,Admin_Per_100k_65Plus_2022,Administered_Dose1_Pop_Pct_2022,Administered_Dose1_Recip_65PlusPop_Pct_2022,Series_Complete_Pop_Pct_2022,Series_Complete_65PlusPop_Pct_2022,Additional_Doses_Vax_Pct_2022,Additional_Doses_50Plus_Vax_Pct_2022,Additional_Doses_65Plus_Vax_Pct_2022,Second_Booster_50Plus_Vax_Pct,Second_Booster_65Plus_Vax_Pct,Bivalent_Booster_65Plus_Pop_Pct,Mask_Mandate,Mandatory
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1,100.65,4779736,5024279,39.5,9.4,289.3,39.4,9.3,3.1,12.2,10983,224.0,10389,211.9,594,12.1,2649,4062,945,20726,29283,14805,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,07/16/2020,Yes
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108,1.3,710231,733391,37.0,9.7,178.5,32.8,4.6,1.8,7.1,2101,287.2,1825,249.5,276,37.7,417,492,639,1422,3483,1783,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,,No
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106,64.96,6392017,7151502,37.1,9.8,181.8,29.9,6.0,3.6,9.8,18343,252.0,15989,219.7,2353,32.3,3522,4281,1785,30955,40347,20653,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,,No
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092,58.43,2915918,3011524,40.1,9.3,284.3,38.2,9.7,3.7,12.2,6500,215.4,6132,203.2,368,12.2,3054,2199,1434,11285,15391,8969,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,07/20/2020,Yes
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,0.48,0.066,0.253,0.114,0.009,0.078,0.47,0.07,0.265,0.117,0.008,0.07,258.21,37253956,39538223,40.7,7.8,192.5,26.6,4.2,2.8,9.4,113718,287.8,107383,271.8,6333,16.0,17241,13434,1029,93875,133206,89195,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,07/18/2020,Yes


In [49]:
merged_data['Population Density per mi²'] = merged_data['Population Density per mi²'].str.replace(',', '', regex=True).astype(float)
merged_data['Physicians'] = merged_data['Physicians'].str.replace(',', '', regex=True).astype(float)
merged_data['Active DO'] = merged_data['Active DO'].str.replace(',', '', regex=True).astype(float)
merged_data['Active MO'] = merged_data['Active MO'].str.replace(',', '', regex=True).astype(float)
merged_data['Mandatory'] = merged_data['Mandatory'].map({'Yes':1, 'No':0})

In [50]:
merged_data.dtypes

Location                                        object
Employment_2020                                  int64
Employment_2021                                  int64
Employment_2022                                  int64
Inc_Per_Cap_2020                                 int64
Inc_Per_Cap_2021                                 int64
Inc_Per_CAp_2022                                 int64
Life_Exp_2020                                  float64
Life_Exp_2019                                  float64
Life_Exp_2018                                  float64
Employer_2019                                  float64
Non-Group_2019                                 float64
Medicaid_2019                                  float64
Medicare_2019                                  float64
Military_2019                                  float64
Uninsured_2019                                 float64
Employer_2021                                  float64
Non-Group_2021                                 float64
Medicaid_2

In [51]:
merged_data.isnull().sum()

Location                                        0
Employment_2020                                 0
Employment_2021                                 0
Employment_2022                                 0
Inc_Per_Cap_2020                                0
Inc_Per_Cap_2021                                0
Inc_Per_CAp_2022                                0
Life_Exp_2020                                   0
Life_Exp_2019                                   0
Life_Exp_2018                                   0
Employer_2019                                   0
Non-Group_2019                                  0
Medicaid_2019                                   0
Medicare_2019                                   0
Military_2019                                   0
Uninsured_2019                                  0
Employer_2021                                   0
Non-Group_2021                                  0
Medicaid_2021                                   0
Medicare_2021                                   0


In [52]:
merged_data['asthma_prevalence'].fillna(8.4, inplace=True)
merged_data['kidney_disease_prevalence'].fillna(52.1, inplace=True)
merged_data['flu_vaccination_rate_2019'].fillna(38, inplace=True)
merged_data['high_bp_prevalence'].fillna(33, inplace=True)
merged_data['copd_prevalence'].fillna(5.2, inplace =True)
merged_data['diabetes_prevalence'].fillna(9.5, inplace=True)

In [53]:
merged_data.to_csv('Data/Merged_state_data.csv', index=False)

# Modelling to pick Y

### Modeling -  Y = Excess Deaths

In [250]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
from sklearn import metrics

In [55]:
merged_data.dropna(inplace=True)

#### Model 1 - Benchmark

In [56]:
X = merged_data.drop(columns=['Location', 'Mask_Mandate','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022'])
y = merged_data[['Exc_deaths_2021']]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [57]:
lr1 = LinearRegression()
lr1.fit(X_train, y_train)

In [58]:
lr1.score(X_train, y_train)

1.0

In [59]:
lr1.score(X_test, y_test)

-1.9669490347275214

#### Model 2

In [60]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [61]:
lr2 = LinearRegression()
lr2.fit(X_train_sc, y_train)

In [62]:
lr2.score(X_train_sc, y_train), lr2.score(X_test_sc, y_test)

(1.0, 0.3771377100198612)

#### Model 3

In [63]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
knn.score(X_train, y_train), knn.score(X_test, y_test)

(0.7061358653697374, 0.7070898739868592)

#### Model 4

In [64]:
knn2 = KNeighborsRegressor()
knn2.fit(X_train_sc, y_train)
knn2.score(X_train_sc, y_train), knn2.score(X_test_sc, y_test)

(0.606112479485353, 0.3430513444405179)

#### Model 5

In [65]:
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree.score(X_train, y_train), tree.score(X_test, y_test)

(1.0, 0.8856348202618944)

#### Model 6

In [66]:
tree2 = DecisionTreeRegressor()
tree2.fit(X_train_sc, y_train)
tree2.score(X_train_sc, y_train), tree2.score(X_test_sc, y_test)

(1.0, 0.8846859164405231)

#### Model 7 

In [67]:
bag1 = BaggingRegressor()
bag1.fit(X_train , y_train)
bag1.score(X_train, y_train), bag1.score(X_test, y_test)

  return column_or_1d(y, warn=True)


(0.9018458363849775, 0.676719659204668)

#### Model 8

In [68]:
bag2 = BaggingRegressor()
bag2.fit(X_train_sc, y_train)
bag2.score(X_train_sc, y_train), bag2.score(X_test_sc, y_test)

  return column_or_1d(y, warn=True)


(0.8907646106595325, 0.7536519166852861)

#### Model 9

In [69]:
rf1 = RandomForestRegressor()
rf1.fit(X_train, y_train)
rf1.score(X_train, y_train), rf1.score(X_test, y_test)

  return fit_method(estimator, *args, **kwargs)


(0.9217019092312632, 0.6946731463832816)

#### Model 10

In [70]:
rf2 = RandomForestRegressor()
rf2.fit(X_train_sc, y_train)
rf2.score(X_train_sc, y_train), rf2.score(X_test_sc,y_test)

  return fit_method(estimator, *args, **kwargs)


(0.9284383216318816, 0.589911823812978)

## Modelling - Y = Covid deaths

In [134]:
df = pd.read_csv('Data/merged_state_with_health.csv')
df.head()

Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Population Density per mi²,2010 Population,2020 Population,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence,Physicians,Physicians Rate,Active MO,Active MO Rate,Active DO,Active DO Rate,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022,Dist_Per_100K_2021_x,Distributed_Per_100k_65Plus_2021_x,Admin_Per_100K_2021_x,Admin_Per_100k_65Plus_2021_x,Administered_Dose1_Pop_Pct_2021_x,Administered_Dose1_Recip_65PlusPop_Pct_2021_x,Series_Complete_Pop_Pct_2021_x,Series_Complete_65PlusPop_Pct_2021_x,Additional_Doses_Vax_Pct_2021_x,Additional_Doses_50Plus_Vax_Pct_2021_x,Additional_Doses_65Plus_Vax_Pct_2021_x,Dist_Per_100K_2022_x,Distributed_Per_100k_65Plus_2022_x,Admin_Per_100K_2022_x,Admin_Per_100k_65Plus_2022_x,Administered_Dose1_Pop_Pct_2022_x,Administered_Dose1_Recip_65PlusPop_Pct_2022_x,Series_Complete_Pop_Pct_2022_x,Series_Complete_65PlusPop_Pct_2022_x,Additional_Doses_Vax_Pct_2022_x,Additional_Doses_50Plus_Vax_Pct_2022_x,Additional_Doses_65Plus_Vax_Pct_2022_x,Second_Booster_50Plus_Vax_Pct_x,Second_Booster_65Plus_Vax_Pct_x,Bivalent_Booster_65Plus_Pop_Pct_x,astha_prevalence,Dist_Per_100K_2021_y,Distributed_Per_100k_65Plus_2021_y,Admin_Per_100K_2021_y,Admin_Per_100k_65Plus_2021_y,Administered_Dose1_Pop_Pct_2021_y,Administered_Dose1_Recip_65PlusPop_Pct_2021_y,Series_Complete_Pop_Pct_2021_y,Series_Complete_65PlusPop_Pct_2021_y,Additional_Doses_Vax_Pct_2021_y,Additional_Doses_50Plus_Vax_Pct_2021_y,Additional_Doses_65Plus_Vax_Pct_2021_y,Dist_Per_100K_2022_y,Distributed_Per_100k_65Plus_2022_y,Admin_Per_100K_2022_y,Admin_Per_100k_65Plus_2022_y,Administered_Dose1_Pop_Pct_2022_y,Administered_Dose1_Recip_65PlusPop_Pct_2022_y,Series_Complete_Pop_Pct_2022_y,Series_Complete_65PlusPop_Pct_2022_y,Additional_Doses_Vax_Pct_2022_y,Additional_Doses_50Plus_Vax_Pct_2022_y,Additional_Doses_65Plus_Vax_Pct_2022_y,Second_Booster_50Plus_Vax_Pct_y,Second_Booster_65Plus_Vax_Pct_y,Bivalent_Booster_65Plus_Pop_Pct_y,Mask_Mandate,Mandatory,Current President,Your State Governor,Go to work,Go to the gym,Go visit a friend,"Go to a cafe, bar, or restaurant",Go to a doctor or visit a hospital,Go to church or another place of worship,"Take mass transit (e.g. subway, bus or train)",Avoiding contact with other people,Avoiding public or crowded places,Frequently washing hands,Wearing a face mask when outside of your home,Been in a room with someone outside of \n household in the past 24 hours,"Yes, 5-10 people","Yes, 11-50 people","Yes, 50 or more people"
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1,100.65,4779736,5024279,39.5,9.4,289.3,39.4,9.3,3.1,12.2,10983,224.0,10389,211.9,594,12.1,2649,4062,945,20726,29283,14805,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,False,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,07/16/2020,Yes,54.371124,58.903232,26.31,2.61,11.48,8.44,6.96,3.23,0.42,67.99,76.28,80.84,51.94,30.52,6.7,1.22,1.0
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108,1.3,710231,733391,37.0,9.7,178.5,32.8,4.6,1.8,7.1,2101,287.2,1825,249.5,276,37.7,417,492,639,1422,3483,1783,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,False,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,,No,56.060775,64.20881,28.26,3.44,16.69,1.18,4.28,5.07,0.0,51.31,71.37,76.71,46.17,44.03,5.44,1.39,0.0
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106,64.96,6392017,7151502,37.1,9.8,181.8,29.9,6.0,3.6,9.8,18343,252.0,15989,219.7,2353,32.3,3522,4281,1785,30955,40347,20653,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,False,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,,No,43.079904,57.217666,32.02,1.48,6.21,6.0,6.81,1.69,2.76,65.18,71.42,79.7,49.7,33.24,8.51,1.14,2.89
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092,58.43,2915918,3011524,40.1,9.3,284.3,38.2,9.7,3.7,12.2,6500,215.4,6132,203.2,368,12.2,3054,2199,1434,11285,15391,8969,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,False,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,07/20/2020,Yes,53.716654,65.260685,29.37,0.72,11.52,5.41,5.81,2.08,0.11,58.48,68.32,80.59,44.19,30.78,7.73,1.49,1.0
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,0.48,0.066,0.253,0.114,0.009,0.078,0.47,0.07,0.265,0.117,0.008,0.07,258.21,37253956,39538223,40.7,7.8,192.5,26.6,4.2,2.8,9.4,113718,287.8,107383,271.8,6333,16.0,17241,13434,1029,93875,133206,89195,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,False,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,07/18/2020,Yes,35.634465,72.007486,25.46,1.83,6.79,8.18,7.48,0.77,1.98,75.91,78.55,80.42,72.73,20.95,3.63,1.23,1.18


In [135]:
df = df.merge(deaths_covid, on='Location')
df.head()

Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Population Density per mi²,2010 Population,2020 Population,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence,Physicians,Physicians Rate,Active MO,Active MO Rate,Active DO,Active DO Rate,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022,Dist_Per_100K_2021_x,Distributed_Per_100k_65Plus_2021_x,Admin_Per_100K_2021_x,Admin_Per_100k_65Plus_2021_x,Administered_Dose1_Pop_Pct_2021_x,Administered_Dose1_Recip_65PlusPop_Pct_2021_x,Series_Complete_Pop_Pct_2021_x,Series_Complete_65PlusPop_Pct_2021_x,Additional_Doses_Vax_Pct_2021_x,Additional_Doses_50Plus_Vax_Pct_2021_x,Additional_Doses_65Plus_Vax_Pct_2021_x,Dist_Per_100K_2022_x,Distributed_Per_100k_65Plus_2022_x,Admin_Per_100K_2022_x,Admin_Per_100k_65Plus_2022_x,Administered_Dose1_Pop_Pct_2022_x,Administered_Dose1_Recip_65PlusPop_Pct_2022_x,Series_Complete_Pop_Pct_2022_x,Series_Complete_65PlusPop_Pct_2022_x,Additional_Doses_Vax_Pct_2022_x,Additional_Doses_50Plus_Vax_Pct_2022_x,Additional_Doses_65Plus_Vax_Pct_2022_x,Second_Booster_50Plus_Vax_Pct_x,Second_Booster_65Plus_Vax_Pct_x,Bivalent_Booster_65Plus_Pop_Pct_x,astha_prevalence,Dist_Per_100K_2021_y,Distributed_Per_100k_65Plus_2021_y,Admin_Per_100K_2021_y,Admin_Per_100k_65Plus_2021_y,Administered_Dose1_Pop_Pct_2021_y,Administered_Dose1_Recip_65PlusPop_Pct_2021_y,Series_Complete_Pop_Pct_2021_y,Series_Complete_65PlusPop_Pct_2021_y,Additional_Doses_Vax_Pct_2021_y,Additional_Doses_50Plus_Vax_Pct_2021_y,Additional_Doses_65Plus_Vax_Pct_2021_y,Dist_Per_100K_2022_y,Distributed_Per_100k_65Plus_2022_y,Admin_Per_100K_2022_y,Admin_Per_100k_65Plus_2022_y,Administered_Dose1_Pop_Pct_2022_y,Administered_Dose1_Recip_65PlusPop_Pct_2022_y,Series_Complete_Pop_Pct_2022_y,Series_Complete_65PlusPop_Pct_2022_y,Additional_Doses_Vax_Pct_2022_y,Additional_Doses_50Plus_Vax_Pct_2022_y,Additional_Doses_65Plus_Vax_Pct_2022_y,Second_Booster_50Plus_Vax_Pct_y,Second_Booster_65Plus_Vax_Pct_y,Bivalent_Booster_65Plus_Pop_Pct_y,Mask_Mandate,Mandatory,Current President,Your State Governor,Go to work,Go to the gym,Go visit a friend,"Go to a cafe, bar, or restaurant",Go to a doctor or visit a hospital,Go to church or another place of worship,"Take mass transit (e.g. subway, bus or train)",Avoiding contact with other people,Avoiding public or crowded places,Frequently washing hands,Wearing a face mask when outside of your home,Been in a room with someone outside of \n household in the past 24 hours,"Yes, 5-10 people","Yes, 11-50 people","Yes, 50 or more people",all_causes_2020,all_causes_2021,all_causes_2022,covid_2020,covid_2021,covid_2022,Covid_pop_perce_2020,Covid_pop_perce_2021,Covid_pop_perce_2022
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1,100.65,4779736,5024279,39.5,9.4,289.3,39.4,9.3,3.1,12.2,10983,224.0,10389,211.9,594,12.1,2649,4062,945,20726,29283,14805,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,False,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,07/16/2020,Yes,54.371124,58.903232,26.31,2.61,11.48,8.44,6.96,3.23,0.42,67.99,76.28,80.84,51.94,30.52,6.7,1.22,1.0,9021,13018,6246,6337,9771,3933,0.001261,0.001945,0.000783
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108,1.3,710231,733391,37.0,9.7,178.5,32.8,4.6,1.8,7.1,2101,287.2,1825,249.5,276,37.7,417,492,639,1422,3483,1783,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,False,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,,No,56.060775,64.20881,28.26,3.44,16.69,1.18,4.28,5.07,0.0,51.31,71.37,76.71,46.17,44.03,5.44,1.39,0.0,545,1429,686,213,804,275,0.00029,0.001096,0.000375
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106,64.96,6392017,7151502,37.1,9.8,181.8,29.9,6.0,3.6,9.8,18343,252.0,15989,219.7,2353,32.3,3522,4281,1785,30955,40347,20653,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,False,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,,No,43.079904,57.217666,32.02,1.48,6.21,6.0,6.81,1.69,2.76,65.18,71.42,79.7,49.7,33.24,8.51,1.14,2.89,13186,17961,8835,8603,13536,5849,0.001203,0.001893,0.000818
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092,58.43,2915918,3011524,40.1,9.3,284.3,38.2,9.7,3.7,12.2,6500,215.4,6132,203.2,368,12.2,3054,2199,1434,11285,15391,8969,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,False,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,07/20/2020,Yes,53.716654,65.260685,29.37,0.72,11.52,5.41,5.81,2.08,0.11,58.48,68.32,80.59,44.19,30.78,7.73,1.49,1.0,4992,6908,3854,3691,5333,2593,0.001226,0.001771,0.000861
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,0.48,0.066,0.253,0.114,0.009,0.078,0.47,0.07,0.265,0.117,0.008,0.07,258.21,37253956,39538223,40.7,7.8,192.5,26.6,4.2,2.8,9.4,113718,287.8,107383,271.8,6333,16.0,17241,13434,1029,93875,133206,89195,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,False,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,07/18/2020,Yes,35.634465,72.007486,25.46,1.83,6.79,8.18,7.48,0.77,1.98,75.91,78.55,80.42,72.73,20.95,3.63,1.23,1.18,41279,60680,36786,29962,48834,21158,0.000758,0.001235,0.000535


In [136]:
#df.drop('Mask_Mandate', inplace = True)
df['Population Density per mi²'] = df['Population Density per mi²'].str.replace(',', '', regex=True).astype(float)
df['Physicians'] = df['Physicians'].str.replace(',', '', regex=True).astype(float)
df['Active DO'] = df['Active DO'].str.replace(',', '', regex=True).astype(float)
df['Active MO'] = df['Active MO'].str.replace(',', '', regex=True).astype(float)
df['Mandatory'] = df['Mandatory'].map({'Yes':1, 'No':0})

In [74]:
X1 = df.drop(columns=['Location', 'Mask_Mandate','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022',
                      'all_causes_2020', 'all_causes_2021', 'all_causes_2022', 'covid_2020',
                      'covid_2021', 'covid_2022', 'Covid_pop_perce_2020', 'Covid_pop_perce_2021',
                     'Covid_pop_perce_2022'])
y1 = df[['covid_2020', 'covid_2021', 'covid_2022']]

In [75]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42)

In [76]:
sc = StandardScaler()
X1_train_sc = sc.fit_transform(X1_train)
X1_test_sc = sc.transform(X1_test)

#### Model 1

In [77]:
lr3 = LinearRegression()
lr3.fit(X1_train, y1_train)
lr3.score(X1_train, y1_train), lr3.score(X1_test, y1_test)

(1.0, 0.8804713015012814)

#### Model 2

In [78]:
lr4 = LinearRegression()
lr4.fit(X1_train_sc, y1_train)
lr4.score(X1_train_sc, y1_train), lr4.score(X1_test_sc, y1_test)

(1.0, 0.9133207097355621)

#### Model 3

In [79]:
knn3 = KNeighborsRegressor()
knn3.fit(X1_train, y1_train)
knn3.score(X1_train, y1_train), knn3.score(X1_test, y1_test)

(0.7791424748373533, 0.6836087417832482)

#### Model 4

In [80]:
knn4 = KNeighborsRegressor()
knn4.fit(X1_train_sc, y1_train)
knn4.score(X1_train_sc, y1_train), knn4.score(X1_test_sc, y1_test)

(0.6966822172725827, 0.4383678724517098)

#### Model 5

In [81]:
bag3 = BaggingRegressor()
bag3.fit(X1_train, y1_train)
bag3.score(X1_train, y1_train), bag3.score(X1_test, y1_test)

(0.9521687067303682, 0.6996976023899467)

#### Model 6

In [82]:
bag4 = BaggingRegressor()
bag4.fit(X1_train_sc, y1_train)
bag4.score(X1_train_sc, y1_train), bag4.score(X1_test_sc, y1_test)

(0.9086042950564988, 0.635067554907139)

#### Model 7

In [83]:
rf3 = RandomForestRegressor()
rf3.fit(X1_train, y1_train)
rf3.score(X1_train, y1_train), rf3.score(X1_test, y1_test)

(0.9395362659142088, 0.7178368263503238)

#### Model 8

In [84]:
rf4 = RandomForestRegressor()
rf4.fit(X1_train_sc, y1_train)
rf4.score(X1_train_sc, y1_train), rf4.score(X1_test_sc, y1_test)

(0.9418015871880029, 0.7435688596555051)

## Modeling - Y= Covid Deaths /population *100

In [85]:
X2 = df.drop(columns=['Location', 'Mask_Mandate','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022',
                      'all_causes_2020', 'all_causes_2021', 'all_causes_2022', 'covid_2020',
                      'covid_2021', 'covid_2022', 'Covid_pop_perce_2020', 'Covid_pop_perce_2021',
                     'Covid_pop_perce_2022'])
y2 = df[['Covid_pop_perce_2020', 'Covid_pop_perce_2021','Covid_pop_perce_2022']]

In [86]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

In [87]:
sc2 = StandardScaler()
X2_train_sc = sc2.fit_transform(X2_train)
X2_test_sc = sc2.transform(X2_test)

#### Model 1

In [88]:
lr5 = LinearRegression()
lr5.fit(X2_train, y2_train)
lr5.score(X2_train, y2_train), lr5.score(X2_test, y2_test)

(1.0, -3.7498268302016293)

#### Model 2

In [89]:
lr6 = LinearRegression()
lr6.fit(X2_train_sc, y2_train)
lr6.score(X2_train_sc, y2_train), lr6.score(X2_test_sc, y2_test)

(1.0, -1.5764986241421983)

#### Model 3

In [90]:
knn5 = KNeighborsRegressor()
knn5.fit(X2_train, y2_train)
knn5.score(X2_train, y2_train), knn5.score(X2_test, y2_test)

(0.28285203212024396, -0.5384576466021559)

#### Model 4

In [91]:
knn6 = KNeighborsRegressor()
knn6.fit(X2_train_sc, y2_train)
knn6.score(X2_train_sc, y2_train), knn6.score(X2_test_sc, y2_test)

(0.6721761969412126, 0.15969089531059724)

#### Model 5

In [92]:
bag5 = BaggingRegressor()
bag5.fit(X2_train, y2_train)
bag5.score(X2_train, y2_train), bag5.score(X2_test, y2_test)

(0.8876063457098837, -0.05792069455125034)

#### Model 6

In [93]:
bag6 = BaggingRegressor()
bag6.fit(X2_train_sc, y2_train)
bag6.score(X2_train_sc, y2_train), bag6.score(X2_test_sc, y2_test)

(0.8813350713306285, 0.11297234394180163)

#### Model 7

In [94]:
rf5 = RandomForestRegressor()
rf5.fit(X2_train, y2_train)
rf5.score(X2_train, y2_train), rf5.score(X2_test, y2_test)

(0.9144070559104729, 0.11060293025620109)

#### Model 8

In [95]:
rf6 = RandomForestRegressor()
rf6.fit(X2_train_sc, y2_train)
rf6.score(X2_train_sc, y2_train), rf6.score(X2_test_sc, y2_test)

(0.9176042557209413, 0.09439234458967076)

# Model Tuning

## Y variable is covid deaths

In [96]:
df.head()

Unnamed: 0,Location,Employment_2020,Employment_2021,Employment_2022,Inc_Per_Cap_2020,Inc_Per_Cap_2021,Inc_Per_CAp_2022,Life_Exp_2020,Life_Exp_2019,Life_Exp_2018,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Population Density per mi²,2010 Population,2020 Population,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence,Physicians,Physicians Rate,Active MO,Active MO Rate,Active DO,Active DO Rate,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022,Dist_Per_100K_2021_x,Distributed_Per_100k_65Plus_2021_x,Admin_Per_100K_2021_x,Admin_Per_100k_65Plus_2021_x,Administered_Dose1_Pop_Pct_2021_x,Administered_Dose1_Recip_65PlusPop_Pct_2021_x,Series_Complete_Pop_Pct_2021_x,Series_Complete_65PlusPop_Pct_2021_x,Additional_Doses_Vax_Pct_2021_x,Additional_Doses_50Plus_Vax_Pct_2021_x,Additional_Doses_65Plus_Vax_Pct_2021_x,Dist_Per_100K_2022_x,Distributed_Per_100k_65Plus_2022_x,Admin_Per_100K_2022_x,Admin_Per_100k_65Plus_2022_x,Administered_Dose1_Pop_Pct_2022_x,Administered_Dose1_Recip_65PlusPop_Pct_2022_x,Series_Complete_Pop_Pct_2022_x,Series_Complete_65PlusPop_Pct_2022_x,Additional_Doses_Vax_Pct_2022_x,Additional_Doses_50Plus_Vax_Pct_2022_x,Additional_Doses_65Plus_Vax_Pct_2022_x,Second_Booster_50Plus_Vax_Pct_x,Second_Booster_65Plus_Vax_Pct_x,Bivalent_Booster_65Plus_Pop_Pct_x,astha_prevalence,Dist_Per_100K_2021_y,Distributed_Per_100k_65Plus_2021_y,Admin_Per_100K_2021_y,Admin_Per_100k_65Plus_2021_y,Administered_Dose1_Pop_Pct_2021_y,Administered_Dose1_Recip_65PlusPop_Pct_2021_y,Series_Complete_Pop_Pct_2021_y,Series_Complete_65PlusPop_Pct_2021_y,Additional_Doses_Vax_Pct_2021_y,Additional_Doses_50Plus_Vax_Pct_2021_y,Additional_Doses_65Plus_Vax_Pct_2021_y,Dist_Per_100K_2022_y,Distributed_Per_100k_65Plus_2022_y,Admin_Per_100K_2022_y,Admin_Per_100k_65Plus_2022_y,Administered_Dose1_Pop_Pct_2022_y,Administered_Dose1_Recip_65PlusPop_Pct_2022_y,Series_Complete_Pop_Pct_2022_y,Series_Complete_65PlusPop_Pct_2022_y,Additional_Doses_Vax_Pct_2022_y,Additional_Doses_50Plus_Vax_Pct_2022_y,Additional_Doses_65Plus_Vax_Pct_2022_y,Second_Booster_50Plus_Vax_Pct_y,Second_Booster_65Plus_Vax_Pct_y,Bivalent_Booster_65Plus_Pop_Pct_y,Mask_Mandate,Mandatory,Current President,Your State Governor,Go to work,Go to the gym,Go visit a friend,"Go to a cafe, bar, or restaurant",Go to a doctor or visit a hospital,Go to church or another place of worship,"Take mass transit (e.g. subway, bus or train)",Avoiding contact with other people,Avoiding public or crowded places,Frequently washing hands,Wearing a face mask when outside of your home,Been in a room with someone outside of \n household in the past 24 hours,"Yes, 5-10 people","Yes, 11-50 people","Yes, 50 or more people",all_causes_2020,all_causes_2021,all_causes_2022,covid_2020,covid_2021,covid_2022,Covid_pop_perce_2020,Covid_pop_perce_2021,Covid_pop_perce_2022
0,Alabama,2671005,2769464,2869931,45887,50059,50916,73.2,75.2,75.1,0.472,0.055,0.195,0.16,0.021,0.097,0.466,0.06,0.192,0.162,0.021,0.1,100.65,4779736,5024279,39.5,9.4,289.3,39.4,9.3,3.1,12.2,10983.0,224.0,10389.0,211.9,594.0,12.1,2649,4062,945,20726,29283,14805,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,False,164304,947963,115022,205476,58.6,92.8,47.7,79.7,27.4,38.9,49.0,242643,1399940,141328,259917,64.8,95.0,53.0,85.0,39.5,53.1,62.7,37.9,43.5,20.2,07/16/2020,1,54.371124,58.903232,26.31,2.61,11.48,8.44,6.96,3.23,0.42,67.99,76.28,80.84,51.94,30.52,6.7,1.22,1.0,9021,13018,6246,6337,9771,3933,0.001261,0.001945,0.000783
1,Alaska,430840,443047,457687,61898,65662,68635,76.6,77.7,78.0,0.484,0.035,0.213,0.1,0.053,0.115,0.433,0.041,0.257,0.107,0.053,0.108,1.3,710231,733391,37.0,9.7,178.5,32.8,4.6,1.8,7.1,2101.0,287.2,1825.0,249.5,276.0,37.7,417,492,639,1422,3483,1783,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,False,180516,1441840,140316,225626,65.1,93.3,56.4,84.2,34.8,52.8,64.1,282456,2256080,178247,307195,72.8,95.0,64.9,89.2,49.0,67.5,78.1,51.8,59.8,37.6,,0,56.060775,64.20881,28.26,3.44,16.69,1.18,4.28,5.07,0.0,51.31,71.37,76.71,46.17,44.03,5.44,1.39,0.0,545,1429,686,213,804,275,0.00029,0.001096,0.000375
2,Arizona,3920033,4086802,4287595,52133,56420,58442,76.3,78.8,78.7,0.451,0.052,0.21,0.161,0.015,0.111,0.45,0.054,0.213,0.162,0.015,0.106,64.96,6392017,7151502,37.1,9.8,181.8,29.9,6.0,3.6,9.8,18343.0,252.0,15989.0,219.7,2353.0,32.3,3522,4281,1785,30955,40347,20653,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,False,173088,962728,144603,236197,67.4,95.0,57.1,83.6,30.4,44.0,51.7,261578,1454920,196497,332661,77.1,95.0,65.8,90.3,49.5,65.0,71.8,51.8,58.9,34.6,,0,43.079904,57.217666,32.02,1.48,6.21,6.0,6.81,1.69,2.76,65.18,71.42,79.7,49.7,33.24,8.51,1.14,2.89,13186,17961,8835,8603,13536,5849,0.001203,0.001893,0.000818
3,Arkansas,1639829,1686444,1755536,47147,51636,52618,73.8,75.7,75.6,0.42,0.054,0.262,0.159,0.014,0.091,0.411,0.056,0.27,0.156,0.015,0.092,58.43,2915918,3011524,40.1,9.3,284.3,38.2,9.7,3.7,12.2,6500.0,215.4,6132.0,203.2,368.0,12.2,3054,2199,1434,11285,15391,8969,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,False,165324,952344,125929,211199,62.7,92.6,51.2,78.3,30.2,44.3,55.4,266250,1533720,159188,282457,69.6,95.0,56.7,83.8,44.2,59.8,69.9,46.0,53.0,29.7,07/20/2020,1,53.716654,65.260685,29.37,0.72,11.52,5.41,5.81,2.08,0.11,58.48,68.32,80.59,44.19,30.78,7.73,1.49,1.0,4992,6908,3854,3691,5333,2593,0.001226,0.001771,0.000861
4,California,23154091,23934549,25300974,70061,76991,77036,79.0,80.9,80.8,0.48,0.066,0.253,0.114,0.009,0.078,0.47,0.07,0.265,0.117,0.008,0.07,258.21,37253956,39538223,40.7,7.8,192.5,26.6,4.2,2.8,9.4,113718.0,287.8,107383.0,271.8,6333.0,16.0,17241,13434,1029,93875,133206,89195,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,False,193006,1306260,167055,246904,82.9,95.0,66.2,86.8,33.5,48.6,58.0,293125,1983860,219183,338897,84.3,95.0,74.4,92.7,58.4,73.6,81.2,54.9,63.0,41.8,07/18/2020,1,35.634465,72.007486,25.46,1.83,6.79,8.18,7.48,0.77,1.98,75.91,78.55,80.42,72.73,20.95,3.63,1.23,1.18,41279,60680,36786,29962,48834,21158,0.000758,0.001235,0.000535


In [97]:
X1 = df.drop(columns=['Location', 'Mask_Mandate','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022',
                      'all_causes_2020', 'all_causes_2021', 'all_causes_2022', 'covid_2020',
                      'covid_2021', 'covid_2022', 'Covid_pop_perce_2020', 'Covid_pop_perce_2021',
                     'Covid_pop_perce_2022'])
y1 = df[['covid_2020', 'covid_2021', 'covid_2022']]

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=42)

#### Model 1 - SS & Poly (RandomForest)

In [99]:
poly = PolynomialFeatures()
X_trainpoly = poly.fit_transform(X_train)
X_testpoly = poly.transform(X_test)

ss = StandardScaler()
X_train_polysc = ss.fit_transform(X_trainpoly)
X_test_polysc = ss.transform(X_testpoly)

In [100]:
rf = RandomForestRegressor()
rf.fit(X_train_polysc, y_train)
rf.score(X_train_polysc, y_train), rf.score(X_test_polysc, y_test)

(0.950687977951414, 0.7612151449978831)

#### Model 2 - SS & Poly (bagging)

In [101]:
bag = BaggingRegressor()
bag.fit(X_train_polysc, y_train)
bag.score(X_train_polysc, y_train), bag.score(X_test_polysc, y_test)

(0.9194340164271573, 0.7105364675682658)

#### Model 3 - SS & Poly (KNN)

In [297]:
knn = KNeighborsRegressor()
knn.fit(X_train_polysc, y_train)
knn.score(X_train_polysc, y_train), knn.score(X_test_polysc, y_test)

(0.7436503805578066, 0.5625614945672842)

#### Model 4 - SS, Poly, RF, Boosting + Gridsearch

In [321]:
base_rf = RandomForestRegressor()
adaboost_rf = AdaBoostRegressor(base_rf)
multi = MultiOutputRegressor(adaboost_rf)

params = {
    'estimator__n_estimators': [10, 100, 200], 
    'estimator__base_estimator__n_estimators': [50, 100], 
    'estimator__learning_rate': [0.1, 1, 10]  
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
gs = GridSearchCV(multi, params, cv=kf, n_jobs=-1)

In [322]:
%%time
gs.fit(X_train_polysc, y_train)

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

CPU times: user 1min 8s, sys: 402 ms, total: 1min 8s
Wall time: 2h 4min 25s


In [323]:
gs.score(X_train_polysc, y_train), gs.score(X_test_polysc, y_test)

(0.9615903661255262, 0.7842375910678697)

In [324]:
gs.best_params_

{'estimator__base_estimator__n_estimators': 100,
 'estimator__learning_rate': 0.1,
 'estimator__n_estimators': 10}

#### Model 5 - Poly, SS, Bagging, GradBoost & GridSearch

In [328]:
grad = GradientBoostingRegressor()
multi1 = MultiOutputRegressor(grad)

pgrid = {
    'estimator__learning_rate': [0.1, 1, 10],
    'estimator__n_estimators': [10, 100],
    'estimator__max_depth': [None, 1, 2, 3]
}

kf1 = KFold(n_splits=10, shuffle=True, random_state=42)
gs1 = GridSearchCV(multi1, pgrid, cv=kf1, n_jobs=-1)

In [329]:
%%time
gs1.fit(X_train_polysc, y_train)

  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


CPU times: user 3.88 s, sys: 110 ms, total: 3.99 s
Wall time: 2min 29s


In [330]:
gs1.score(X_train_polysc, y_train), gs1.score(X_test_polysc, y_test)

(0.9996712212524659, 0.9034508644566325)

#### Model 5

In [332]:
params = {
    'max_features': np.arange(1, X.shape[1] + 1), 
    'max_depth': np.append(np.arange(1, 10), None), 
    'min_samples_leaf': np.arange(1, 31) 
}
kf = KFold(n_splits=10, shuffle=True, random_state=2023)
ranfor = RandomForestRegressor(
    n_estimators=100,
    random_state=2023
)

gs2 = GridSearchCV(ranfor, params, cv=kf, n_jobs=6)

In [334]:
%%time
gs2.fit(X_train_polysc, y_train)

CPU times: user 1min 57s, sys: 18.1 s, total: 2min 15s
Wall time: 33min 59s


In [335]:
gs2.score(X_train_polysc, y_train), gs2.score(X_test_polysc, y_test)

(0.7930247825616247, 0.6592708754598661)

In [115]:
lr = LinearRegression()
lr.fit(X_train_polysc, y_train)
lr.score(X_train_polysc, y_train), lr.score(X_test_polysc, y_test)

(1.0, 0.9170538077072194)

### Y is Excess Deaths

In [102]:
X2 = df.drop(columns=['Location', 'Mask_Mandate','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022',
                      'all_causes_2020', 'all_causes_2021', 'all_causes_2022', 'covid_2020',
                      'covid_2021', 'covid_2022', 'Covid_pop_perce_2020', 'Covid_pop_perce_2021',
                     'Covid_pop_perce_2022'])
y2 = df[['Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022']]

In [103]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

poly = PolynomialFeatures()
X2_train_poly = poly.fit_transform(X2_train)
X2_test_poly = poly.transform(X2_test)

sc = StandardScaler()
X2_train_polysc = sc.fit_transform(X2_train_poly)
X2_test_polysc = sc.transform(X2_test_poly)

#### Model 1

In [104]:
rf = RandomForestRegressor()
rf.fit(X2_train_polysc, y2_train)
rf.score(X2_train_polysc, y2_train), rf.score(X2_test_polysc, y2_test)

(0.9556624783950433, 0.6596390175802633)

#### Model 2

In [105]:
bag = BaggingRegressor()
bag.fit(X2_train_polysc, y2_train)
bag.score(X2_train_polysc, y2_train), bag.score(X2_test_polysc, y2_test)

(0.9668165467988512, 0.7742026379311312)

#### Model 3

In [112]:
base_rf = RandomForestRegressor()
adaboost_rf = AdaBoostRegressor(base_rf)
multi = MultiOutputRegressor(adaboost_rf)

params = {
    'estimator__n_estimators': [10, 100, 200], 
    'estimator__base_estimator__n_estimators': [50, 100], 
    'estimator__learning_rate': [0.1, 1, 10]  
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
gs = GridSearchCV(multi, params, cv=kf, n_jobs=-1)

In [113]:
%%time
gs.fit(X_train_polysc, y_train)


  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_params[key].set_params(**sub_params)
  valid_pa

CPU times: user 33.2 s, sys: 309 ms, total: 33.5 s
Wall time: 2h 16min 19s


In [114]:
gs.score(X_train_polysc, y_train), gs.score(X_test_polysc, y_test)

(0.9581031719495128, 0.7994069264987734)

#### Model 4

In [106]:
grad = GradientBoostingRegressor()
multi1 = MultiOutputRegressor(grad)

pgrid = {
    'estimator__learning_rate': [0.1, 1, 10],
    'estimator__n_estimators': [10, 100],
    'estimator__max_depth': [None, 1, 2, 3]
}

kf1 = KFold(n_splits=10, shuffle=True, random_state=42)
gs1 = GridSearchCV(multi1, pgrid, cv=kf1, n_jobs=-1)

In [107]:
%%time
gs1.fit(X_train_polysc, y_train)

  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


CPU times: user 3.96 s, sys: 117 ms, total: 4.08 s
Wall time: 2min 39s


In [108]:
gs1.score(X_train_polysc, y_train), gs1.score(X_test_polysc, y_test)

(0.9996712212524659, 0.9039638428509988)

#### Model 5

In [109]:
params = {
    'max_features': np.arange(1, X.shape[1] + 1), 
    'max_depth': np.append(np.arange(1, 10), None), 
    'min_samples_leaf': np.arange(1, 31) 
}
kf = KFold(n_splits=10, shuffle=True, random_state=2023)
ranfor = RandomForestRegressor(
    n_estimators=100,
    random_state=2023
)

gs2 = GridSearchCV(ranfor, params, cv=kf, n_jobs=-1)

In [110]:
%%time
gs2.fit(X_train_polysc, y_train)

CPU times: user 1min 1s, sys: 9.24 s, total: 1min 11s
Wall time: 21min 3s


In [111]:
gs2.score(X_train_polysc, y_train), gs2.score(X_test_polysc, y_test)

(0.7930247825616247, 0.6592708754598661)

#### Base LR Models

In [137]:
X1 = df.drop(columns=['Location', 'Mask_Mandate','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022',
                      'all_causes_2020', 'all_causes_2021', 'all_causes_2022', 'covid_2020',
                      'covid_2021', 'covid_2022', 'Covid_pop_perce_2020', 'Covid_pop_perce_2021',
                     'Covid_pop_perce_2022'])
y1 = df[['covid_2020', 'covid_2021', 'covid_2022']]

In [138]:
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state =42)

In [147]:
las = Lasso(max_iter=1000, alpha=0.1)
las.fit(X1_train, y1_train)
las.score(X1_train, y1_train), las.score(X1_test, y1_test)

(0.9999991400245168, 0.9226665134912698)

In [247]:
coef = pd.DataFrame(las.coef_.T).round(2)
var = pd.DataFrame(X1.columns).rename(columns={0:'Variables'})
pd.concat([var, coef], axis=1).rename(columns={0:2020, 1:2021, 2:2022})

Unnamed: 0,Variables,2020,2021,2022
0,Employment_2020,0.0,0.0,0.0
1,Employment_2021,0.0,0.0,0.0
2,Employment_2022,-0.0,0.0,-0.0
3,Inc_Per_Cap_2020,0.01,-0.17,-0.05
4,Inc_Per_Cap_2021,-0.0,0.04,-0.0
5,Inc_Per_CAp_2022,0.03,0.07,0.03
6,Life_Exp_2020,-536.68,-864.95,-472.01
7,Life_Exp_2019,-148.22,-1488.71,-662.15
8,Life_Exp_2018,-125.09,113.67,5.74
9,Employer_2019,-0.0,-0.0,0.0


In [153]:
rid = Ridge(max_iter=1000, alpha=0.1)
rid.fit(X1_train, y1_train)
rid.score(X1_train, y1_train), rid.score(X1_test, y1_test)

  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


(0.9999991019303627, 0.8806077277677167)

In [154]:
X2 = df.drop(columns=['Location', 'Mask_Mandate','Exc_deaths_2017', 'Exc_deaths_2018','Exc_deaths_2019',
                              'Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022',
                      'all_causes_2020', 'all_causes_2021', 'all_causes_2022', 'covid_2020',
                      'covid_2021', 'covid_2022', 'Covid_pop_perce_2020', 'Covid_pop_perce_2021',
                     'Covid_pop_perce_2022'])
y2 = df[['Exc_deaths_2020', 'Exc_deaths_2021','Exc_deaths_2022']]

In [155]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

In [164]:
las1 = Lasso(max_iter=1000, alpha=0.1)
las1.fit(X2_train, y2_train)
las1.score(X2_train, y2_train), las1.score(X2_test, y2_test)

(0.9999983098378628, 0.8926730135813484)

In [248]:
coef1 = pd.DataFrame(las1.coef_.T).round(2)
var1 = pd.DataFrame(X2.columns).rename(columns={0:'Variables'})
pd.concat([var1, coef1], axis=1).rename(columns={0:2020, 1:2021, 2:2022})

Unnamed: 0,Variables,2020,2021,2022
0,Employment_2020,0.01,0.01,0.0
1,Employment_2021,0.0,0.0,0.0
2,Employment_2022,0.0,0.0,-0.0
3,Inc_Per_Cap_2020,-0.12,-0.5,-0.31
4,Inc_Per_Cap_2021,0.05,0.1,0.01
5,Inc_Per_CAp_2022,0.07,0.14,0.07
6,Life_Exp_2020,-1970.66,-2929.77,-1317.09
7,Life_Exp_2019,-150.63,-3468.06,-2189.81
8,Life_Exp_2018,-145.98,321.23,59.82
9,Employer_2019,0.0,-65931.23,0.0


In [170]:
rid1 = Ridge(max_iter=8000, alpha=0.1)
rid1.fit(X2_train, y2_train)
rid1.score(X2_train, y2_train), rid1.score(X2_test, y2_test)

  dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)


(0.999999594752488, 0.8031595859109605)

## OLS

#### Covid Deaths

In [258]:
yols2020 = df['covid_2020']

ols12020 = sm.OLS((yols2020.astype(float)), (X2.astype(float))).fit()
ols12020.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,covid_2020,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,
Time:,02:21:29,Log-Likelihood:,830.34
No. Observations:,50,AIC:,-1561.0
Df Residuals:,0,BIC:,-1465.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Employment_2020,-0.0049,inf,-0,,,
Employment_2021,0.0378,inf,0,,,
Employment_2022,-0.0300,inf,-0,,,
Inc_Per_Cap_2020,-0.4989,inf,-0,,,
Inc_Per_Cap_2021,1.3181,inf,0,,,
Inc_Per_CAp_2022,-0.5156,inf,-0,,,
Life_Exp_2020,-357.0972,inf,-0,,,
Life_Exp_2019,132.7673,inf,0,,,
Life_Exp_2018,87.4442,inf,0,,,

0,1,2,3
Omnibus:,22.072,Durbin-Watson:,1.762
Prob(Omnibus):,0.0,Jarque-Bera (JB):,53.257
Skew:,1.136,Prob(JB):,2.73e-12
Kurtosis:,7.517,Cond. No.,66900000.0


In [259]:
yols2021 = df['covid_2021']

ols12021 = sm.OLS((yols2021.astype(float)), (X2.astype(float))).fit()
ols12021.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,covid_2021,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,
Time:,02:21:32,Log-Likelihood:,870.24
No. Observations:,50,AIC:,-1640.0
Df Residuals:,0,BIC:,-1545.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Employment_2020,0.0149,inf,0,,,
Employment_2021,-0.0138,inf,-0,,,
Employment_2022,0.0015,inf,0,,,
Inc_Per_Cap_2020,0.4708,inf,0,,,
Inc_Per_Cap_2021,-1.0716,inf,-0,,,
Inc_Per_CAp_2022,0.6027,inf,0,,,
Life_Exp_2020,-165.4145,inf,-0,,,
Life_Exp_2019,-97.2284,inf,-0,,,
Life_Exp_2018,-158.1464,inf,-0,,,

0,1,2,3
Omnibus:,7.446,Durbin-Watson:,1.69
Prob(Omnibus):,0.024,Jarque-Bera (JB):,7.729
Skew:,-0.566,Prob(JB):,0.021
Kurtosis:,4.558,Cond. No.,66900000.0


In [260]:
yols2022 = df['covid_2022']

ols12022 = sm.OLS((yols2022.astype(float)), (X2.astype(float))).fit()
ols12022.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,covid_2022,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,
Time:,02:22:03,Log-Likelihood:,879.0
No. Observations:,50,AIC:,-1658.0
Df Residuals:,0,BIC:,-1562.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Employment_2020,-0.0013,inf,-0,,,
Employment_2021,0.0088,inf,0,,,
Employment_2022,-0.0026,inf,-0,,,
Inc_Per_Cap_2020,-0.1804,inf,-0,,,
Inc_Per_Cap_2021,0.5634,inf,0,,,
Inc_Per_CAp_2022,-0.3658,inf,-0,,,
Life_Exp_2020,-12.5379,inf,-0,,,
Life_Exp_2019,-100.5055,inf,-0,,,
Life_Exp_2018,-98.8060,inf,-0,,,

0,1,2,3
Omnibus:,38.165,Durbin-Watson:,1.054
Prob(Omnibus):,0.0,Jarque-Bera (JB):,104.887
Skew:,2.156,Prob(JB):,1.68e-23
Kurtosis:,8.635,Cond. No.,66900000.0


#### Excess Deaths

In [261]:
y2ols2020 = df['Exc_deaths_2020']
ols12020 = sm.OLS((y2ols2020.astype(float)), (X2.astype(float))).fit()
ols12020.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,Exc_deaths_2020,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,
Time:,02:25:15,Log-Likelihood:,776.44
No. Observations:,50,AIC:,-1453.0
Df Residuals:,0,BIC:,-1357.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Employment_2020,-0.0287,inf,-0,,,
Employment_2021,0.1253,inf,0,,,
Employment_2022,-0.1044,inf,-0,,,
Inc_Per_Cap_2020,-1.4281,inf,-0,,,
Inc_Per_Cap_2021,0.7480,inf,0,,,
Inc_Per_CAp_2022,1.2454,inf,0,,,
Life_Exp_2020,-1048.3467,inf,-0,,,
Life_Exp_2019,383.9420,inf,0,,,
Life_Exp_2018,198.9968,inf,0,,,

0,1,2,3
Omnibus:,21.075,Durbin-Watson:,1.759
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.381
Skew:,1.055,Prob(JB):,4.22e-12
Kurtosis:,7.549,Cond. No.,66900000.0


In [262]:
y2ols2021 = df['Exc_deaths_2021']
ols12021 = sm.OLS((y2ols2021.astype(float)), (X2.astype(float))).fit()
ols12021.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,Exc_deaths_2021,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,
Time:,02:25:55,Log-Likelihood:,808.57
No. Observations:,50,AIC:,-1517.0
Df Residuals:,0,BIC:,-1422.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Employment_2020,0.0048,inf,0,,,
Employment_2021,0.0456,inf,0,,,
Employment_2022,-0.0496,inf,-0,,,
Inc_Per_Cap_2020,1.6164,inf,0,,,
Inc_Per_Cap_2021,-4.1706,inf,-0,,,
Inc_Per_CAp_2022,2.4134,inf,0,,,
Life_Exp_2020,-81.2404,inf,-0,,,
Life_Exp_2019,-95.1028,inf,-0,,,
Life_Exp_2018,-293.8956,inf,-0,,,

0,1,2,3
Omnibus:,33.661,Durbin-Watson:,1.409
Prob(Omnibus):,0.0,Jarque-Bera (JB):,103.838
Skew:,-1.749,Prob(JB):,2.83e-23
Kurtosis:,9.133,Cond. No.,66900000.0


In [263]:
y2ols2022 = df['Exc_deaths_2022']
ols12022 = sm.OLS((y2ols2022.astype(float)), (X2.astype(float))).fit()
ols12022.summary()

  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid


0,1,2,3
Dep. Variable:,Exc_deaths_2022,R-squared:,1.0
Model:,OLS,Adj. R-squared:,
Method:,Least Squares,F-statistic:,
Date:,"Tue, 07 Nov 2023",Prob (F-statistic):,
Time:,02:26:09,Log-Likelihood:,826.79
No. Observations:,50,AIC:,-1554.0
Df Residuals:,0,BIC:,-1458.0
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Employment_2020,0.0184,inf,0,,,
Employment_2021,0.0070,inf,0,,,
Employment_2022,-0.0202,inf,-0,,,
Inc_Per_Cap_2020,-0.3531,inf,-0,,,
Inc_Per_Cap_2021,-0.2706,inf,-0,,,
Inc_Per_CAp_2022,0.5733,inf,0,,,
Life_Exp_2020,56.4651,inf,0,,,
Life_Exp_2019,-127.3124,inf,-0,,,
Life_Exp_2018,-234.7820,inf,-0,,,

0,1,2,3
Omnibus:,52.752,Durbin-Watson:,1.607
Prob(Omnibus):,0.0,Jarque-Bera (JB):,267.261
Skew:,-2.782,Prob(JB):,9.23e-59
Kurtosis:,12.865,Cond. No.,66900000.0
