### Imports

In [216]:
import pandas as pd
import numpy as np

# State Level Data

## Pre-Existing Health Conditions Datasets:

#### For pre-existing conditions & immunizations:
1. choose crude or adj
2. drop old index
3. rename data value to question
4. drop year and question columns
5. For total number of incident data -> after concatening with population data, convert to rate

*Crude are the raw numbers. Adj are these numbers adjusted/ standardized based on population's age distribution. We can retain the crude numbers however the age adjusted numbers will probably be better for cross comparison*

In [217]:
asthma = pd.read_csv('../Data/Raw/asthma_adj.csv')
asthma.rename(columns={
    'DataValue':'asthma_prevalence',
    'LocationDesc':'Location'}, inplace=True)

In [218]:
asthma.drop(columns=['YearStart', 'Question'], inplace=True)
asthma.head()

Unnamed: 0,Location,asthma_prevalence
0,Alaska,9.7
1,Alabama,9.4
2,Arkansas,9.3
3,Arizona,9.8
4,California,7.8


In [219]:
high_bp = pd.read_csv('../Data/Raw/high_blood_pressure_adj.csv')
high_bp.head()

Unnamed: 0,YearStart,LocationDesc,Question,DataValue
0,2019,Alaska,Awareness of high blood pressure among adults ...,32.8
1,2019,Alabama,Awareness of high blood pressure among adults ...,39.4
2,2019,Arkansas,Awareness of high blood pressure among adults ...,38.2
3,2019,Arizona,Awareness of high blood pressure among adults ...,29.9
4,2019,California,Awareness of high blood pressure among adults ...,26.6


In [220]:
high_bp.rename(columns={
    'DataValue':'high_bp_prevalence',
    'LocationDesc':'Location'}, inplace=True)
high_bp.drop(columns=['YearStart', 'Question'], inplace=True)
high_bp.head()

Unnamed: 0,Location,high_bp_prevalence
0,Alaska,32.8
1,Alabama,39.4
2,Arkansas,38.2
3,Arizona,29.9
4,California,26.6


In [221]:
cardiac_mortality = pd.read_csv('../Data/Raw/cardiac_mortality_adj.csv')
cardiac_mortality.rename(columns={
    'DataValue':'cardiac_mortality_rate',
    'LocationDesc':'Location'}, inplace=True)
cardiac_mortality.drop(columns=['YearStart', 'Question'], inplace=True)
cardiac_mortality.head()

Unnamed: 0,Location,cardiac_mortality_rate
0,Delaware,214.3
1,Georgia,234.3
2,Alaska,178.5
3,Kentucky,253.8
4,Arkansas,284.3


In [222]:
diabetes = pd.read_csv('../Data/Raw/diabetes_adj.csv')
diabetes.head()
diabetes.rename(columns={
    'DataValue':'diabetes_prevalence',
    'LocationDesc':'Location'}, inplace=True)

In [223]:
diabetes.drop(columns=['YearStart', 'Question'], inplace=True)
diabetes.head()

Unnamed: 0,Location,diabetes_prevalence
0,Alaska,7.1
1,Alabama,12.2
2,Arkansas,12.2
3,Arizona,9.8
4,California,9.4


In [224]:
kidney = pd.read_csv('../Data/Raw/kidney_adj.csv')
kidney.head()

Unnamed: 0,YearStart,LocationDesc,Question,DataValue
0,2019,Alaska,Prevalence of chronic kidney disease among adu...,1.8
1,2019,Alabama,Prevalence of chronic kidney disease among adu...,3.1
2,2019,Arkansas,Prevalence of chronic kidney disease among adu...,3.7
3,2019,California,Prevalence of chronic kidney disease among adu...,2.8
4,2019,Arizona,Prevalence of chronic kidney disease among adu...,3.6


In [225]:
kidney.rename(columns={
    'DataValue':'kidney_disease_prevalence',
    'LocationDesc':'Location'}, inplace=True)

In [226]:
kidney.drop(columns=['YearStart', 'Question'], inplace=True)
kidney.head()

Unnamed: 0,Location,kidney_disease_prevalence
0,Alaska,1.8
1,Alabama,3.1
2,Arkansas,3.7
3,California,2.8
4,Arizona,3.6


In [227]:
copd = pd.read_csv('../Data/Raw/copd_adj.csv')
copd.rename(columns={
    'DataValue':'copd_prevalence',
    'LocationDesc':'Location'}, inplace=True)
copd.drop(columns=['YearStart', 'Question'], inplace=True)
copd.head()

Unnamed: 0,Location,copd_prevalence
0,Alaska,4.6
1,Alabama,9.3
2,Arkansas,9.7
3,California,4.2
4,Arizona,6.0


In [228]:
immun = pd.read_csv('../Data/Raw/immun_adj.csv')
immun.rename(columns={
    'DataValue':'flu_vaccination_rate_2019',
    'LocationDesc':'Location'}, inplace=True)
immun.drop(columns=['YearStart', 'Question'], inplace=True)
immun.head()

Unnamed: 0,Location,flu_vaccination_rate_2019
0,Alaska,37.0
1,Alabama,39.5
2,Arkansas,40.1
3,Arizona,37.1
4,California,40.7


In [229]:
pre_con = pd.merge(immun, asthma, on='Location', how='inner')

dataframes_to_merge = [cardiac_mortality, high_bp, copd, kidney, diabetes]

for df in dataframes_to_merge:
    pre_con = pd.merge(pre_con, df, on='Location', how='inner')

In [230]:
pre_con.head()

Unnamed: 0,Location,flu_vaccination_rate_2019,asthma_prevalence,cardiac_mortality_rate,high_bp_prevalence,copd_prevalence,kidney_disease_prevalence,diabetes_prevalence
0,Alaska,37.0,9.7,178.5,32.8,4.6,1.8,7.1
1,Alabama,39.5,9.4,289.3,39.4,9.3,3.1,12.2
2,Arkansas,40.1,9.3,284.3,38.2,9.7,3.7,12.2
3,Arizona,37.1,9.8,181.8,29.9,6.0,3.6,9.8
4,California,40.7,7.8,192.5,26.6,4.2,2.8,9.4


In [231]:
pre_con.to_csv('../Data/Cleaned/cleaned_pre_condtions.csv', index = True)

## Insurance rates by state

In [232]:
insur_2019 = pd.read_csv('../Data/Raw/2019_insurance.csv')
insur_2019.head()

Unnamed: 0,Location,Employer,Non-Group,Medicaid,Medicare,Military,Uninsured,Year
0,United States,0.496,0.059,0.198,0.142,0.014,0.092,2019
1,Alabama,0.472,0.055,0.195,0.16,0.021,0.097,2019
2,Alaska,0.484,0.035,0.213,0.1,0.053,0.115,2019
3,Arizona,0.451,0.052,0.21,0.161,0.015,0.111,2019
4,Arkansas,0.42,0.054,0.262,0.159,0.014,0.091,2019


In [233]:
insur_2021 = pd.read_csv('../Data/Raw/2021_insurance.csv')

In [234]:
insur = pd.merge(insur_2019, insur_2021, how ='inner', on ='Location', suffixes=('_2019', '_2021'))
insur.head()

Unnamed: 0,Location,Employer_2019,Non-Group_2019,Medicaid_2019,Medicare_2019,Military_2019,Uninsured_2019,Year_2019,Employer_2021,Non-Group_2021,Medicaid_2021,Medicare_2021,Military_2021,Uninsured_2021,Year_2021
0,United States,0.496,0.059,0.198,0.142,0.014,0.092,2019,0.485,0.061,0.211,0.143,0.013,0.086,2021
1,Alabama,0.472,0.055,0.195,0.16,0.021,0.097,2019,0.466,0.06,0.192,0.162,0.021,0.1,2021
2,Alaska,0.484,0.035,0.213,0.1,0.053,0.115,2019,0.433,0.041,0.257,0.107,0.053,0.108,2021
3,Arizona,0.451,0.052,0.21,0.161,0.015,0.111,2019,0.45,0.054,0.213,0.162,0.015,0.106,2021
4,Arkansas,0.42,0.054,0.262,0.159,0.014,0.091,2019,0.411,0.056,0.27,0.156,0.015,0.092,2021


In [235]:
insur.drop(columns =['Year_2019', 'Year_2021'], inplace=True)

In [236]:
insur.to_csv('../Data/Cleaned/cleaned_insur.csv', index = False)

## Covid Vaccines by State

In [237]:
ea = pd.read_csv('../Data/Raw/Executive Approval.csv')

In [238]:
hb = pd.read_csv('../Data/Raw/Health Behavior.csv')

In [239]:
phb = pd.read_csv('../Data/Raw/Public Health Measures.csv')
phb.head()

Unnamed: 0,Wave_time,State,StateFIPS,Start_Date,End_Date,N_State,Requiring everyone to get a COVID-19 vaccine,Requiring a COVID-19 vaccine to get on an airplane,Requiring children to get a COVID-19 vaccine in order to be allowed in school,Requiring college students to get a COVID-19 vaccine in order to go back to university
0,December,United States,,2020-12-16,2021-01-11,25640,55.5,63.0,56.0,
1,December,Alaska,2.0,2020-12-16,2021-01-11,405,50.7,53.5,48.9,
2,December,Alabama,1.0,2020-12-16,2021-01-11,448,48.4,60.0,51.0,
3,December,Arkansas,5.0,2020-12-16,2021-01-11,469,39.0,46.1,40.6,
4,December,Arizona,4.0,2020-12-16,2021-01-11,469,54.2,58.3,54.3,


In [240]:
ea_2020 = ea[ea['Start_Date'] == '2020-04-16'].copy()
ea_2020.head()

Unnamed: 0,Wave_time,State,StateFIPS,Start_Date,End_Date,N_State,Current President,Your State Governor
0,Late Apr,United States,,2020-04-16,2020-04-30,21405,43.098933,65.083608
1,Late Apr,Alaska,2.0,2020-04-16,2020-04-30,103,56.060775,64.20881
2,Late Apr,Alabama,1.0,2020-04-16,2020-04-30,465,54.371124,58.903232
3,Late Apr,Arkansas,5.0,2020-04-16,2020-04-30,460,53.716654,65.260685
4,Late Apr,Arizona,4.0,2020-04-16,2020-04-30,484,43.079904,57.217666


In [241]:
ea_2020.shape

(51, 8)

In [242]:
ea_2020.drop(columns=['Wave_time', 'StateFIPS', 'Start_Date', 'End_Date', 'N_State'], inplace=True)

In [243]:
hb_2020 = hb[hb['Start_Date'] == '2020-04-16'].copy()
hb_2020.drop(columns=['Wave_time', 'StateFIPS', 'Start_Date', 'End_Date', 'N_state'], inplace=True)
hb_2020.head()

Unnamed: 0,State,Go to work,Go to the gym,Go visit a friend,"Go to a cafe, bar, or restaurant",Go to a doctor or visit a hospital,Go to church or another place of worship,"Take mass transit (e.g. subway, bus or train)",Avoiding contact with other people,Avoiding public or crowded places,Frequently washing hands,Wearing a face mask when outside of your home,Been in a room with someone outside of \r\n household in the past 24 hours,"Yes, 5-10 people","Yes, 11-50 people","Yes, 50 or more people"
0,United States,25.71,1.36,8.31,5.78,5.54,1.29,1.82,68.77,75.23,80.36,55.96,26.27,5.17,1.42,0.83
1,Alaska,28.26,3.44,16.69,1.18,4.28,5.07,0.0,51.31,71.37,76.71,46.17,44.03,5.44,1.39,0.0
2,Alabama,26.31,2.61,11.48,8.44,6.96,3.23,0.42,67.99,76.28,80.84,51.94,30.52,6.7,1.22,1.0
3,Arkansas,29.37,0.72,11.52,5.41,5.81,2.08,0.11,58.48,68.32,80.59,44.19,30.78,7.73,1.49,1.0
4,Arizona,32.02,1.48,6.21,6.0,6.81,1.69,2.76,65.18,71.42,79.7,49.7,33.24,8.51,1.14,2.89


In [244]:
phb_2020 = phb[phb['Start_Date'] == '2020-12-16'].copy()
phb_2020.drop(columns=['Wave_time', 'StateFIPS', 'Start_Date', 'End_Date', 'N_State'], inplace=True)
phb_2020.head()

Unnamed: 0,State,Requiring everyone to get a COVID-19 vaccine,Requiring a COVID-19 vaccine to get on an airplane,Requiring children to get a COVID-19 vaccine in order to be allowed in school,Requiring college students to get a COVID-19 vaccine in order to go back to university
0,United States,55.5,63.0,56.0,
1,Alaska,50.7,53.5,48.9,
2,Alabama,48.4,60.0,51.0,
3,Arkansas,39.0,46.1,40.6,
4,Arizona,54.2,58.3,54.3,


In [245]:
print(ea_2020.shape)
print(phb_2020.shape)
print(hb_2020.shape)

(51, 3)
(52, 5)
(52, 16)


In [246]:
health_behavior = pd.merge(ea_2020, hb_2020, how = 'left', on = 'State')

In [247]:
health_behavior.head()

Unnamed: 0,State,Current President,Your State Governor,Go to work,Go to the gym,Go visit a friend,"Go to a cafe, bar, or restaurant",Go to a doctor or visit a hospital,Go to church or another place of worship,"Take mass transit (e.g. subway, bus or train)",Avoiding contact with other people,Avoiding public or crowded places,Frequently washing hands,Wearing a face mask when outside of your home,Been in a room with someone outside of \r\n household in the past 24 hours,"Yes, 5-10 people","Yes, 11-50 people","Yes, 50 or more people"
0,United States,43.098933,65.083608,25.71,1.36,8.31,5.78,5.54,1.29,1.82,68.77,75.23,80.36,55.96,26.27,5.17,1.42,0.83
1,Alaska,56.060775,64.20881,28.26,3.44,16.69,1.18,4.28,5.07,0.0,51.31,71.37,76.71,46.17,44.03,5.44,1.39,0.0
2,Alabama,54.371124,58.903232,26.31,2.61,11.48,8.44,6.96,3.23,0.42,67.99,76.28,80.84,51.94,30.52,6.7,1.22,1.0
3,Arkansas,53.716654,65.260685,29.37,0.72,11.52,5.41,5.81,2.08,0.11,58.48,68.32,80.59,44.19,30.78,7.73,1.49,1.0
4,Arizona,43.079904,57.217666,32.02,1.48,6.21,6.0,6.81,1.69,2.76,65.18,71.42,79.7,49.7,33.24,8.51,1.14,2.89


In [248]:
health_behavior.shape

(51, 18)

In [249]:
health_behavior.rename(columns={'State' : 'Location'}, inplace = True)

In [250]:
health_behavior.to_csv('../Data/Cleaned/cleaned_health_behavior.csv', index=False)
health_behavior.head()

Unnamed: 0,Location,Current President,Your State Governor,Go to work,Go to the gym,Go visit a friend,"Go to a cafe, bar, or restaurant",Go to a doctor or visit a hospital,Go to church or another place of worship,"Take mass transit (e.g. subway, bus or train)",Avoiding contact with other people,Avoiding public or crowded places,Frequently washing hands,Wearing a face mask when outside of your home,Been in a room with someone outside of \r\n household in the past 24 hours,"Yes, 5-10 people","Yes, 11-50 people","Yes, 50 or more people"
0,United States,43.098933,65.083608,25.71,1.36,8.31,5.78,5.54,1.29,1.82,68.77,75.23,80.36,55.96,26.27,5.17,1.42,0.83
1,Alaska,56.060775,64.20881,28.26,3.44,16.69,1.18,4.28,5.07,0.0,51.31,71.37,76.71,46.17,44.03,5.44,1.39,0.0
2,Alabama,54.371124,58.903232,26.31,2.61,11.48,8.44,6.96,3.23,0.42,67.99,76.28,80.84,51.94,30.52,6.7,1.22,1.0
3,Arkansas,53.716654,65.260685,29.37,0.72,11.52,5.41,5.81,2.08,0.11,58.48,68.32,80.59,44.19,30.78,7.73,1.49,1.0
4,Arizona,43.079904,57.217666,32.02,1.48,6.21,6.0,6.81,1.69,2.76,65.18,71.42,79.7,49.7,33.24,8.51,1.14,2.89


### Total Physicians 

In [251]:
df = pd.read_csv('../data/Raw/total physician.csv')

In [252]:
df.drop(columns=['Unnamed: 9'],inplace=True)

In [253]:
df.drop(columns=['Unnamed: 4'], inplace=True)

In [254]:
df = df.drop(df.index[-1])

In [255]:
df = df.drop(df.index[:3])

In [256]:
df = df.reset_index(drop=True)

In [257]:
df = df.drop(df.index[39])

In [258]:
df = df.reset_index(drop=True)

In [259]:
# rankings_pd.rename(columns = {'test':'TEST'}, inplace = True)
df.rename(columns={
    'Unnamed: 0':'Location',
    'Unnamed: 1':'Population',
    'Unnamed: 2':'Physicians',
    'Unnamed: 3':'Physicians Rate',
    'Unnamed: 5':'Active MO',
    'Unnamed: 6':'Active MO Rate',
    'Unnamed: 7':'Active DO',
    'Unnamed: 8':'Active DO Rate'
}, inplace=True)

In [260]:
df.drop(columns=['Population'],inplace=True)

In [261]:
df = pd.DataFrame(df)

In [262]:
df.to_csv('../Data/Cleaned/cleaned_total_physician.csv',index=False)

### Cleaning Income per Capita

In [263]:
income = pd.read_csv('../Data/Raw/Income per capita.csv')

In [264]:
income = income.drop(income.index[0])

In [265]:
income.drop(columns=['State or DC'],inplace=True)

In [266]:
income.rename(columns={
    'Unnamed: 1':'Location',
    'Unnamed: 2':'Inc_Per_Cap_2020',
    'Unnamed: 3':'Inc_Per_Cap_2021',
    'Unnamed: 4':'Inc_Per_CAp_2022'
},inplace=True)

In [267]:
income = income.reset_index(drop=True);

In [268]:
income.to_csv('../Data/Cleaned/cleaned_income.csv',index=False)

### Cleaning Total Employment

In [269]:
employment = pd.read_csv('../Data/Raw/total employment.csv')

In [270]:
employment = employment.drop(employment.index[0])

In [271]:
employment.drop(columns=['State or DC'],inplace=True)

In [272]:
employment.rename(columns={
    'Unnamed: 1':'Location',
    'Unnamed: 2':'Employment_2020',
    'Unnamed: 3':'Employment_2021',
    'Unnamed: 4':'Employment_2022'
},inplace=True)

In [273]:
employment = employment.reset_index(drop=True);

In [274]:
employment.to_csv('../Data/Cleaned/cleaned_employment.csv',index=False)

### Cleaning Life Expectancy

In [275]:
life = pd.read_csv('../Data/Raw/life_expectancy.csv')

In [276]:
life.drop(columns=['URL'],inplace=True)

In [277]:
life.STATE.unique()

array(['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI',
       'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI',
       'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC',
       'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
       'VT', 'VA', 'WA', 'WV', 'WI', 'WY'], dtype=object)

In [278]:
state_name = {
    'AL':'Alabama',
    'AK':'Alaska',
    'AZ':'Arizona',
    'AR':'Arkansas',
    'CA':'California',
    'CO':'Colorado',
    'CT':'Connecticut',
    'DE':'Delaware',
    'DC':'District of Columbia',
    'FL':'Florida',
    'GA':'Georgia',
    'HI':'Hawaii',
    'ID':'Idaho',
    'IL':'Illinois',
    'IN':'Indiana',
    'IA':'Iowa',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'ME':'Maine',
    'MD':'Maryland',
    'MA':'Massachusetts',
    'MI':'Michigan',
    'MN':'Minnesota',
    'MS':'Mississippi',
    'MO':'Missouri',
    'MT':'Montana',
    'NE':'Nebraska',
    'NV':'Nevada',
    'NH':'New Hampshire',
    'NJ':'New Jersey',
    'NM':'New Mexico',
    'NY':'New York',
    'NC':'North Carolina',
    'ND':'North Dakota',
    'OH':'Ohio',
    'OK':'Oklahoma',
    'OR':'Oregon',
    'PA':'Pennsylvania',
    'RI':'Rhode Island',
    'SC':'South Carolina',
    'SD':'South Dakota',
    'TN':'Tennessee',
    'TX':'Texas',
    'UT':'Utah',
    'VT':'Vermont',
    'VA':'Virginia',
    'WA':'Washington',
    'WV':'West Virginia',
    'WI':'Wisconsin',
    'WY':'Wyoming'
}

In [279]:
life['States'] = life['STATE'].map(state_name)

In [280]:
life.drop(columns=['STATE'],inplace=True)

In [281]:
life_2020 = life.head(50)

In [282]:
life_2020.drop(columns=['YEAR'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  life_2020.drop(columns=['YEAR'],inplace=True)


In [283]:
life_2020 = life_2020[['States', 'RATE']]

In [284]:
life_2020 = life_2020.reset_index(drop=True);

In [285]:
life_2020.rename(columns={'RATE':'Life_Exp_2020'},inplace=True)

In [286]:
life_2019 = life.iloc[50:100]

In [287]:
life_2019.drop(columns=['YEAR'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  life_2019.drop(columns=['YEAR'],inplace=True)


In [288]:
life_2019 = life_2019[['States', 'RATE']]

In [289]:
life_2019 = life_2019.reset_index(drop=True);

In [290]:
life_2019.rename(columns={'RATE':'Life_Exp_2019'},inplace=True)

In [291]:
life_2018 = life.iloc[100:150]

In [292]:
life_2018.drop(columns=['YEAR'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  life_2018.drop(columns=['YEAR'],inplace=True)


In [293]:
life_2018 = life_2018[['States', 'RATE']]

In [294]:
life_2018 = life_2018.reset_index(drop=True);

In [295]:
life_2018.rename(columns={'RATE':'Life_Exp_2018'},inplace=True)

In [296]:
life_2018.head()

Unnamed: 0,States,Life_Exp_2018
0,Alabama,75.1
1,Alaska,78.0
2,Arizona,78.7
3,Arkansas,75.6
4,California,80.8


In [297]:
life_exp = pd.concat([life_2020, life_2019, life_2018], axis=1)

In [298]:
life_exp = life_exp.loc[:, ~life_exp.columns.duplicated()]

In [299]:
# Add a new row for the "District of Columbia"
new_row = pd.DataFrame({'States': 'District of Columbia'}, index=[len(life_exp)])
life_exp = pd.concat([life_exp.iloc[:8], new_row, life_exp.iloc[8:]])

In [300]:
life_exp.reset_index(drop=True);

In [301]:
life_exp.rename(columns={'States':'Location'},inplace=True)

In [302]:
life_exp.to_csv('../Data/Cleaned/cleaned_Life_Expentency.csv',index=False)

## Merging the Cleaned State Data into 1 df

In [None]:
deaths_covid.to_csv('Data/Cleaned/cleaned_covid_death_state.csv', index = False)

In [None]:
# getting sum of each states excess deaths 
def excess_deaths(deaths, start_year=2017, end_year=2022):
    state_totals = {state: [0] * (end_year - start_year + 1) for state in 
                    deaths['State'].unique()}
    
    for index, row in deaths.iterrows():
        state = row['State']
        year = row['Year']
        excess_estimate = row['Excess Estimate']
        
        if start_year <= year <= end_year:
            state_totals[state][year - start_year] += excess_estimate

    result = pd.DataFrame(state_totals).T.reset_index()
    result.columns = ['Location'] + [f'Exc_deaths_{year}' for year in 
                                  range(start_year, end_year + 1)]
    
    return result


excess_deaths = excess_deaths(deaths)
excess_deaths.head()


In [304]:
def merge_dataframes():
    employ = pd.read_csv('../Data/Cleaned/cleaned_employment.csv')
    income = pd.read_csv('../Data/Cleaned/cleaned_income.csv')
    life = pd.read_csv('../Data/Cleaned/cleaned_Life_Expentency.csv')
    insur = pd.read_csv('../Data/Cleaned/cleaned_insur.csv')
    pop_dense = pd.read_csv('../Data/Cleaned/cleaned_pop_dense.csv')
    pop_size = pd.read_csv('../Data/Cleaned/cleaned_pop_size.csv')
    pre_cond = pd.read_csv('../Data/Cleaned/cleaned_pre_condtions.csv')
    total_phys = pd.read_csv('../Data/Cleaned/cleaned_total_physician.csv')
    
    merged_df = employ.merge(income, on='Location').merge(life, on='Location').merge(insur, on='Location') \
        .merge(pop_dense, on='Location').merge(pop_size, on='Location').merge(pre_cond, on='Location') \
        .merge(total_phys, on='Location').merge(excess_deaths, on='Location')

    return merged_df
merged_data = merge_dataframes()

NameError: name 'excess_deaths' is not defined

In [None]:
merged_data.drop(columns=['Unnamed: 0'], inplace=True)
merged_data.head()

In [None]:
vax_state = pd.read_csv('Data/Cleaned/vax_state.csv')
vax_state.head()

In [None]:
state_name = {
    'AL':'Alabama',
    'AK':'Alaska',
    'AZ':'Arizona',
    'AR':'Arkansas',
    'CA':'California',
    'CO':'Colorado',
    'CT':'Connecticut',
    'DE':'Delaware',
    'DC':'District of Columbia',
    'FL':'Florida',
    'GA':'Georgia',
    'HI':'Hawaii',
    'ID':'Idaho',
    'IL':'Illinois',
    'IN':'Indiana',
    'IA':'Iowa',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'ME':'Maine',
    'MD':'Maryland',
    'MA':'Massachusetts',
    'MI':'Michigan',
    'MN':'Minnesota',
    'MS':'Mississippi',
    'MO':'Missouri',
    'MT':'Montana',
    'NE':'Nebraska',
    'NV':'Nevada',
    'NH':'New Hampshire',
    'NJ':'New Jersey',
    'NM':'New Mexico',
    'NY':'New York',
    'NC':'North Carolina',
    'ND':'North Dakota',
    'OH':'Ohio',
    'OK':'Oklahoma',
    'OR':'Oregon',
    'PA':'Pennsylvania',
    'RI':'Rhode Island',
    'SC':'South Carolina',
    'SD':'South Dakota',
    'TN':'Tennessee',
    'TX':'Texas',
    'UT':'Utah',
    'VT':'Vermont',
    'VA':'Virginia',
    'WA':'Washington',
    'WV':'West Virginia',
    'WI':'Wisconsin',
    'WY':'Wyoming'
}
vax_state['Location'] = vax_state['Location'].map(state_name)

In [None]:
state_mask = pd.read_csv('Data/Cleaned/cleaned_mask.csv')
merged_data = merged_data.merge(vax_state, on='Location').merge(state_mask, on='Location')
merged_data.head()

In [None]:
merged_data['Population Density per mi²'] = merged_data['Population Density per mi²'].str.replace(',', '', regex=True).astype(float)
merged_data['Physicians'] = merged_data['Physicians'].str.replace(',', '', regex=True).astype(float)
merged_data['Active DO'] = merged_data['Active DO'].str.replace(',', '', regex=True).astype(float)
merged_data['Active MO'] = merged_data['Active MO'].str.replace(',', '', regex=True).astype(float)
merged_data['Mandatory'] = merged_data['Mandatory'].map({'Yes':1, 'No':0})

In [None]:
merged_data.dtypes

In [None]:
merged_data.isnull().sum()

In [None]:
merged_data['asthma_prevalence'].fillna(8.4, inplace=True)
merged_data['kidney_disease_prevalence'].fillna(52.1, inplace=True)
merged_data['flu_vaccination_rate_2019'].fillna(38, inplace=True)
merged_data['high_bp_prevalence'].fillna(33, inplace=True)
merged_data['copd_prevalence'].fillna(5.2, inplace =True)
merged_data['diabetes_prevalence'].fillna(9.5, inplace=True)

In [None]:
merged_data.to_csv('Data/Merged_state_data.csv', index=False)

In [None]:
merged = pd.read_csv('Data/Merged_state_data.csv')
merged.head()

In [None]:
merged = pd.merge(merged, health_behavior, how = 'left', on= 'Location')

In [None]:
merged = pd.read_csv('Data/merged_state_with_health.csv')

In [None]:
df = pd.read_csv('Data/merged_state_with_health.csv')
df.head()

In [None]:
df = df.merge(deaths_covid, on='Location')
df.head()

In [None]:
#df.drop('Mask_Mandate', inplace = True)
df['Population Density per mi²'] = df['Population Density per mi²'].str.replace(',', '', regex=True).astype(float)
df['Physicians'] = df['Physicians'].str.replace(',', '', regex=True).astype(float)
df['Active DO'] = df['Active DO'].str.replace(',', '', regex=True).astype(float)
df['Active MO'] = df['Active MO'].str.replace(',', '', regex=True).astype(float)
df['Mandatory'] = df['Mandatory'].map({'Yes':1, 'No':0})

In [None]:
df.to_csv('Data/merged_state_final.csv', index=False)

# County level Data

### Further Cleaning County Level Data

In [None]:
df = pd.read_csv('../Data/Cleaned/county_df2.csv')
df.head()

In [None]:
# Drop rows that we will not be using 'Segregation index black/white',
df.drop(columns = ['County', 'YPLL Rate (Black)', 'YPLL Rate (Hispanic)', 'YPLL Rate (White)', 'Number Uninsured', 'Number Primary Care Physicians', 'FIPS_y', 'Number pre-mature Deaths',
                        'Preventable Hosp. Rate (Black)', 'Preventable Hosp. Rate (Hispanic)', 'Preventable Hosp. Rate (White)',  'Percent Vaccinated Flu (Black)', 'Percent Uninsured',
                        'Percent  Vaccinated (Hispanic) Flu', 'Percent Vaccinated (White) Flu', 'Number Some College', 'Number Unemployed', 'Labor Force', 'PCP Ratio', 
                        '80th Percentile Income', '20th Percentile Income', '95% CI - Low', '95% CI - High', 'Life Expectancy (Black)', 'Life Expectancy (Hispanic)', 
                        'Life Expectancy (White)', 'Number HIV Cases', 'Household income (Black)', 'Household income (Hispanic)', 'Household income (White)'], inplace = True)

In [None]:
# Make FIPS index 
df.set_index('FIPS_x', inplace=True)

# Dummify  Presence of water violation
df['water'] = df['Presence of water violation'].map({'No': 0, 'Yes': 1})
df.drop(columns = ['Presence of water violation', 'State'], inplace = True)

# Set case and deaths
df['cases'] = df['cases_2022']
df['deaths'] = df['deaths_2022']
df.drop(columns = ['cases_2020', 'cases_2021', 'cases_2022', 'deaths_2020', 'deaths_2021', 'deaths_2022'], inplace = True)

# Drop NA values
df.dropna(inplace=True)
df.shape

In [None]:
df.to_csv('../Data/Cleaned/county_df3.csv')

# Exploratory Data Analysis

## State Level

In [None]:
state_df = pd.read_csv('Data/merged_state_final.csv')

In [None]:
covid_by_pop = state_df['covid_2020'] + state_df['covid_2021'] + state_df['covid_2022']
state_df['covid_deaths_by_population'] = (covid_by_pop / state_df['2020 Population']) * 10000


In [None]:
state_df.head()

In [None]:
top_10_states = state_df.sort_values(by='covid_deaths_by_population', ascending=True).head(10)

In [None]:
bottom_10_states = state_df.sort_values(by='covid_deaths_by_population', ascending=False).head(10)
bottom_10_states

In [None]:
plt.figure(figsize=(10, 6))

# Scatter plot for 2020
plt.scatter(state_df['Inc_Per_Cap_2020'], state_df.covid_deaths_by_population, label='2020', alpha=0.6)

# Scatter plot for 2021
plt.scatter(state_df['Inc_Per_Cap_2021'], state_df.covid_deaths_by_population, label='2021', alpha=0.6)

# Scatter plot for 2022
plt.scatter(state_df['Inc_Per_CAp_2022'], state_df.covid_deaths_by_population, label='2022', alpha=0.6)

plt.title('Scatter Plot of Income Per Capita by Covid Deaths (2020-2022)')
plt.xlabel('Income Per Capita')
plt.ylabel('covid_deaths_by_population')
plt.legend()

## County Level

In [None]:
county_df = pd.read_csv('Data/Cleaned/county_df2.csv')
county_df

In [None]:
covid_by_pop = county_df['deaths_2020'] + county_df['deaths_2021'] + county_df['deaths_2022']
county_df['covid_deaths_by_population'] = (covid_by_pop / county_df['Population']) * 10000

In [None]:
county_df.head()

In [None]:
plt.scatter(county_df['% Physically Inactive'], county_df.covid_deaths_by_population, label='Physically Inactive', alpha=0.25)

plt.title('Scatter Plot of % Physically Inactive by Covid Deaths (2020-2022)')
plt.xlabel('% Physically Inactive')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['Percent Unemployed'], county_df.covid_deaths_by_population, label='Percent Unemployed', alpha=0.25)

plt.title('Scatter Plot of Percent Unemployed by Covid Deaths (2020-2022)')
plt.xlabel('% Percent Unemployed')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['Average Daily PM2.5'], county_df.covid_deaths_by_population, label='Average Daily PM2.5', alpha=0.25)

plt.title('Scatter Plot of Average Daily PM2.5 by Covid Deaths (2020-2022)')
plt.xlabel('Average Daily PM2.5')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['Percent Insufficient Sleep'], county_df.covid_deaths_by_population, label='Percent Insufficient Sleep', alpha=0.25)

plt.title('Scatter Plot of Percent Insufficient Sleep by Covid Deaths (2020-2022)')
plt.xlabel('Percent Insufficient Sleep')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['Percent Uninsured Adults'], county_df.covid_deaths_by_population, label='Percent Uninsured Adults', alpha=0.25)
plt.title('Scatter Plot of Percent Uninsured Adults by Covid Deaths (2020-2022)')
plt.xlabel('Percent Uninsured Adults')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['Population'], county_df.covid_deaths_by_population, label='Population', alpha=0.25)

plt.title('Scatter Plot of Population by Covid Deaths (2020-2022)')
plt.xlabel('Population')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['percent Asian'], county_df.covid_deaths_by_population, label='percent Asian', alpha=0.25)

plt.title('Scatter Plot of percent Asian by Covid Deaths (2020-2022)')
plt.xlabel('percent Asian')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['Masks'], county_df.covid_deaths_by_population, label='Masks', alpha=0.25)

plt.title('Scatter Plot of Masks by Covid Deaths (2020-2022)')
plt.xlabel('Masks')
plt.ylabel('covid_deaths_by_population')
plt.legend()

In [None]:
plt.scatter(county_df['Administered_Dose1_Pop_Pct'], county_df.covid_deaths_by_population, label='Administered_Dose1_Pop_Pct', alpha=0.25)
plt.title('Scatter Plot of Administered_Dose1_Pop_Pct by Covid Deaths (2020-2022)')
plt.xlabel('Administered_Dose1_Pop_Pct')
plt.ylabel('covid_deaths_by_population')
plt.legend()