### Imports

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

# State Level Data

### Census Population

##### 2020
Used webscraping to obtain population data for the years 2010 and 2020 from the US census cite.

In [12]:
# Census site url
url = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010220'
res = requests.get(url)

In [13]:
soup = BeautifulSoup(res.content)
table = soup.find(attrs={'class':'qf-graph-scroll'})
# soup.find_all(attrs={'class':'qf-positive'})
tbody = table.find_all(attrs={'class':"qf-graph-geo"})
tr = tbody[1]
state = tr.find('a').attrs['data-title']
tr.find(attrs={'class':'qf-positive'}).attrs['data-value']

'29145505'

In [14]:
pop_2020 = []
for tr in tbody:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2020 Population': pop}
    pop_2020.append(state_pop)

pop_2020 = pd.DataFrame(pop_2020)
pop_2020.head()

Unnamed: 0,State,2020 Population
0,California,39538223
1,Texas,29145505
2,Florida,21538187
3,New York,20201249
4,Pennsylvania,13002700


##### 2010
The same process was followed for the year 2010.

In [15]:
url2 = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010210'
res2 = requests.get(url2)

In [16]:
soup2 = BeautifulSoup(res2.content)
table2 = soup2.find(attrs={'class':'qf-graph-scroll'})
tbody2 = table2.find_all(attrs={'class':"qf-graph-geo"})

In [17]:
pop_2010 = []
for tr in tbody2:
    state = tr.find('a').attrs['data-title']
    pop = tr.find(attrs={'class':'qf-positive'}).attrs['data-value']
    
    state_pop = {'State': state, '2010 Population': pop}
    pop_2010.append(state_pop)

pop_2010 = pd.DataFrame(pop_2010)

population = pd.merge(pop_2010, pop_2020)
population.head()

Unnamed: 0,State,2010 Population,2020 Population
0,California,37253956,39538223
1,Texas,25145561,29145505
2,New York,19378102,20201249
3,Florida,18801310,21538187
4,Illinois,12830632,12812508


In [8]:
population.dtypes

State              object
2010 Population    object
2020 Population    object
dtype: object

In [18]:
# 
population[['2010 Population', '2020 Population']] = population[['2010 Population',
                                                                 '2020 Population']].astype(int)
population.to_csv('../Data/Population_data_2010_&_2020.csv', index=False)

### Excess Deaths Data

In [3]:
deaths = pd.read_csv('../Ignore/Excess_Deaths_Associated_with_COVID-19.csv')
deaths.head(100)

Unnamed: 0,Week Ending Date,State,Observed Number,Upper Bound Threshold,Exceeds Threshold,Average Expected Count,Excess Estimate,Total Excess Estimate,Percent Excess Estimate,Year,Type,Outcome,Suppress,Note
0,2017-01-07,Alabama,1121.0,1136,False,1059,62,29601,5.8527,2017,Predicted (weighted),All causes,,
1,2017-01-14,Alabama,1130.0,1140,False,1067,63,29601,5.906102,2017,Predicted (weighted),All causes,,
2,2017-01-21,Alabama,1048.0,1142,False,1071,0,29601,0.0,2017,Predicted (weighted),All causes,,
3,2017-01-28,Alabama,1026.0,1142,False,1070,0,29601,0.0,2017,Predicted (weighted),All causes,,
4,2017-02-04,Alabama,1036.0,1142,False,1068,0,29601,0.0,2017,Predicted (weighted),All causes,,
5,2017-02-11,Alabama,1058.0,1136,False,1062,0,29601,0.0,2017,Predicted (weighted),All causes,,
6,2017-02-18,Alabama,1060.0,1132,False,1057,3,29601,0.283804,2017,Predicted (weighted),All causes,,
7,2017-02-25,Alabama,1099.0,1126,False,1051,48,29601,4.567542,2017,Predicted (weighted),All causes,,
8,2017-03-04,Alabama,1081.0,1119,False,1042,39,29601,3.743414,2017,Predicted (weighted),All causes,,
9,2017-03-11,Alabama,1011.0,1113,False,1036,0,29601,0.0,2017,Predicted (weighted),All causes,,


In [4]:
covid = deaths[(deaths['Type']=='Predicted (weighted)')&((deaths['Outcome'] == 'All causes') | (deaths['Outcome'] == 'All causes, excluding COVID-19')) & 
       (deaths['Year'] == 2020) | (deaths['Year'] == 2021) | (deaths['Year'] == 2022)]

In [5]:
covid['Type'].value_counts()

Predicted (weighted)    16956
Unweighted               5670
Name: Type, dtype: int64

In [6]:
covid['Outcome'].value_counts()

All causes                        14148
All causes, excluding COVID-19     8478
Name: Outcome, dtype: int64

In [7]:
covid = deaths[(deaths['Type'] == 'Predicted (weighted)') & 
               ((deaths['Outcome'] == 'All causes') | (deaths['Outcome'] == 'All causes, excluding COVID-19')) & 
               (deaths['Year'].isin([2020, 2021, 2022]))]

def covid_deaths(data):
    grouped = data.groupby(['Year', 'State', 'Outcome'])['Excess Estimate'].sum().reset_index()
    pivoted = grouped.pivot(index=['Year', 'State'], columns='Outcome', 
                            values='Excess Estimate').reset_index()
    pivoted = pivoted.fillna(0)
    pivoted['covid'] = pivoted['All causes'] - pivoted['All causes, excluding COVID-19']
    pivoted.rename(columns={'Year': 'Location'}, inplace=True)
    results = pivoted.to_dict(orient='records')

    return results

deaths_covid = pd.DataFrame(covid_deaths(covid))

In [8]:
deaths_covid = deaths_covid.pivot(index='State', columns='Location', 
                                  values=['All causes', 'covid'])

deaths_covid.columns = ['_'.join(map(str, col)) for col in deaths_covid.columns]
deaths_covid = deaths_covid.reset_index()
deaths_covid.rename(columns={'State': 'Location'}, inplace=True)
deaths_covid.rename(columns={
    'All causes_2020': 'all_causes_2020',
    'All causes_2021': 'all_causes_2021',
    'All causes_2022': 'all_causes_2022',
    'covid_2020': 'covid_2020',
    'covid_2021': 'covid_2021',
    'covid_2022': 'covid_2022',
}, inplace=True)
deaths_covid.head()

Unnamed: 0,Location,all_causes_2020,all_causes_2021,all_causes_2022,covid_2020,covid_2021,covid_2022
0,Alabama,9021,13018,6246,6337,9771,3933
1,Alaska,545,1429,686,213,804,275
2,Arizona,13186,17961,8835,8603,13536,5849
3,Arkansas,4992,6908,3854,3691,5333,2593
4,California,41279,60680,36786,29962,48834,21158


In [9]:
deaths_covid.dtypes

Location           object
all_causes_2020     int64
all_causes_2021     int64
all_causes_2022     int64
covid_2020          int64
covid_2021          int64
covid_2022          int64
dtype: object

In [10]:
(deaths_covid[['covid_2020', 'covid_2021', 'covid_2022']].sum()).sum()

2045325

In [19]:
deaths_covid['Location_Upper'] = deaths_covid['Location'].str.upper()
population['State_Upper'] = population['State'].str.upper()

for year in [2020, 2021, 2022]:
    deaths_covid[f'Covid_pop_perce_{year}'] = (
        deaths_covid.apply(
            lambda row: (
                row[f'covid_{year}'] /
                population.loc[
                    population['State_Upper'] == row['Location_Upper'],
                    '2020 Population'
                ].values
            )[0] if row['Location_Upper'] in population['State_Upper'].values else None,
            axis=1
        )
    )

deaths_covid.head()

Unnamed: 0,Location,all_causes_2020,all_causes_2021,all_causes_2022,covid_2020,covid_2021,covid_2022,Location_Upper,Covid_pop_perce_2020,Covid_pop_perce_2021,Covid_pop_perce_2022
0,Alabama,9021,13018,6246,6337,9771,3933,ALABAMA,0.001261,0.001945,0.000783
1,Alaska,545,1429,686,213,804,275,ALASKA,0.00029,0.001096,0.000375
2,Arizona,13186,17961,8835,8603,13536,5849,ARIZONA,0.001203,0.001893,0.000818
3,Arkansas,4992,6908,3854,3691,5333,2593,ARKANSAS,0.001226,0.001771,0.000861
4,California,41279,60680,36786,29962,48834,21158,CALIFORNIA,0.000758,0.001235,0.000535


In [20]:
deaths_covid.dropna(inplace=True)
deaths_covid.drop(columns=['Location_Upper'], inplace=True)

In [22]:
deaths_covid.to_csv('../Data/Cleaned/cleaned_covid_death_state.csv', index = False)

In [23]:
# getting sum of each states excess deaths 
def excess_deaths(deaths, start_year=2017, end_year=2022):
    state_totals = {state: [0] * (end_year - start_year + 1) for state in 
                    deaths['State'].unique()}
    
    for index, row in deaths.iterrows():
        state = row['State']
        year = row['Year']
        excess_estimate = row['Excess Estimate']
        
        if start_year <= year <= end_year:
            state_totals[state][year - start_year] += excess_estimate

    result = pd.DataFrame(state_totals).T.reset_index()
    result.columns = ['Location'] + [f'Exc_deaths_{year}' for year in 
                                  range(start_year, end_year + 1)]
    
    return result


excess_deaths = excess_deaths(deaths)
excess_deaths.head()


Unnamed: 0,Location,Exc_deaths_2017,Exc_deaths_2018,Exc_deaths_2019,Exc_deaths_2020,Exc_deaths_2021,Exc_deaths_2022
0,Alabama,2649,4062,945,20726,29283,14805
1,Alaska,417,492,639,1422,3483,1783
2,Arizona,3522,4281,1785,30955,40347,20653
3,Arkansas,3054,2199,1434,11285,15391,8969
4,California,17241,13434,1029,93875,133206,89195


In [25]:
excess_deaths.to_csv('../Data/Cleaned/excess_deaths.csv', index = False)

In [None]:
# Getting count for each year and each state where they exceed their threshold
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    result = pd.DataFrame()
    
    for year in range(start_year, end_year + 1):
        
        year_data = deaths[deaths['Year'] == year]
        year_counts = year_data.groupby('State')['Exceeds Threshold'].sum().reset_index()
        year_counts = year_counts.rename(columns={'Exceeds Threshold': f'Exc_count_{year}'})
        year_counts.set_index('State', inplace=True)
        
        if result.empty:
            result = year_counts
        else:
            result = result.merge(year_counts, left_index=True, right_index=True, how='outer')
    
    result = result.reset_index().fillna(0)
    
    return result

In [None]:
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    results = []
    
    for year in range(start_year, end_year + 1):
        year_counts = deaths[deaths['Year'] == year]
        state_counts = {}
        
        for state in year_counts['State'].unique():
            count = year_counts[(year_counts['State'] == state) & (year_counts['Exceeds Threshold'] == 'True')].shape[0]
            state_counts[state] = count
        
        results.append({'Year': year, **state_counts})
    
    return pd.DataFrame(results)

exceeds_threshold = count_exceeds_threshold(deaths, start_year=2017, end_year=2022)
exceeds_threshold

In [None]:
deaths["Week Ending Date"] = pd.to_datetime(deaths["Week Ending Date"])
deaths["Year"] = deaths["Week Ending Date"].dt.year
exceeds_threshold = deaths[deaths["Exceeds Threshold"] == True]
result = exceeds_threshold.groupby(["State", "Year"]).size().reset_index(name="Exceeds Threshold Count")

result

In [None]:
# Merging the data
finaldeaths = pd.merge(excess_deaths, exceeds_threshold, how='left')
finaldeaths.rename(columns={'State':'Location'}, inplace=True)
finaldeaths.head()

In [None]:
#finaldeaths.to_csv('Data/Excess_deaths&Exceeds_Threshold_data.csv')

In [None]:
url3 = 'https://wisevoter.com/state-rankings/population-density-by-state/'
res3 = requests.get(url3)
soup3 = BeautifulSoup(res3.content)

In [None]:
table3 = soup3.find('table', attrs={'id': 'shdb-on-page-table'})
tbody = table3.find('tbody')
trs = tbody.find_all('tr')

In [None]:
pop_density = []
for tr in trs:
    state = tr.find(attrs={'class':'shdb-on-page-table-body-Geo'}).text
    density = tr.find(attrs={'class':'shdb-on-page-table-body-Data'}).text.split()[0]
    
    state_density = {'State':state, 'Population Density per mi²': density}
    pop_density.append(state_density)
Pop_density = pd.DataFrame(pop_density)
Pop_density.head()

In [None]:
Pop_density.to_csv('Data/Population_Density_data.csv')

## Pre-Existing Medical Conditions

### Asthma

CSV files with information on asthma, diabetes, cancer, copd, heart disease, kidney disease incident rates/ prevalence rates were downloaded from the CDC website. These csv files are each approximately 40 MB with responses to various questions, rates by demographic as well as overall population totals for more than a decade. Each dataset must be cleaned and unecessary information dropped to reduce the files to a manageable size.

In [35]:
asthma = pd.read_csv('../Ignore/Asthma.csv')
asthma.shape

(80342, 33)

In [36]:
asthma.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,5,AST,AST3_1,NMBR,GENDER,GENM,,,,
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,8,AST,AST3_1,NMBR,OVERALL,OVR,,,,
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,11,AST,AST3_1,NMBR,OVERALL,OVR,,,,
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,13,AST,AST3_1,NMBR,GENDER,GENF,,,,
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,,Number,...,26,AST,AST3_1,NMBR,RACE,HIS,,,,


In [37]:
asthma.Question.value_counts()

Asthma mortality rate                                                                      13497
Current asthma prevalence among adults aged >= 18 years                                     9570
Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma        9570
Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma        9570
Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma     9570
Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma     9570
Hospitalizations for asthma                                                                 7812
Emergency department visit rate for asthma                                                  7608
Asthma prevalence among women aged 18-44 years                                              3575
Name: Question, dtype: int64

In [38]:
asthma = asthma[asthma['Question']== "Current asthma prevalence among adults aged >= 18 years"]

In [39]:
asthma.describe()

Unnamed: 0,YearStart,YearEnd,Response,DataValue,DataValueAlt,LowConfidenceLimit,HighConfidenceLimit,StratificationCategory2,Stratification2,StratificationCategory3,Stratification3,ResponseID,LocationID,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
count,9570.0,9570.0,0.0,7662.0,7662.0,7662.0,7662.0,0.0,0.0,0.0,0.0,0.0,9570.0,0.0,0.0,0.0,0.0
mean,2016.0,2016.0,,10.195184,10.195184,8.039585,13.059867,,,,,,31.542529,,,,
std,3.162443,3.162443,,3.745644,3.745644,2.676902,5.999667,,,,,,18.26542,,,,
min,2011.0,2011.0,,1.9,1.9,1.1,3.3,,,,,,1.0,,,,
25%,2013.0,2013.0,,7.7,7.7,6.1,9.3,,,,,,17.0,,,,
50%,2016.0,2016.0,,9.6,9.6,8.0,11.4,,,,,,31.0,,,,
75%,2019.0,2019.0,,11.9,11.9,9.8,14.8,,,,,,45.0,,,,
max,2021.0,2021.0,,44.1,44.1,26.9,66.6,,,,,,78.0,,,,


In [40]:
# We only need data for pre-existing conditions for 1 year
asthma = asthma[asthma['YearStart'] == 2019]

In [41]:
asthma.shape

(870, 33)

In [42]:
asthma.YearStart.value_counts()

2019    870
Name: YearStart, dtype: int64

In [43]:
asthma = asthma[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue',
                 'Stratification1', 'DataValueType']]
asthma.head(), asthma.shape

(       YearStart LocationAbbr LocationDesc  \
 28919       2019           GU         Guam   
 28927       2019           AK       Alaska   
 28932       2019           AK       Alaska   
 28977       2019           AK       Alaska   
 28983       2019           AK       Alaska   
 
                                                 Question  DataValue  \
 28919  Current asthma prevalence among adults aged >=...        7.5   
 28927  Current asthma prevalence among adults aged >=...       10.0   
 28932  Current asthma prevalence among adults aged >=...       12.6   
 28977  Current asthma prevalence among adults aged >=...        7.0   
 28983  Current asthma prevalence among adults aged >=...        7.0   
 
            Stratification1            DataValueType  
 28919               Female  Age-adjusted Prevalence  
 28927  White, non-Hispanic         Crude Prevalence  
 28932               Female  Age-adjusted Prevalence  
 28977                 Male  Age-adjusted Prevalence  
 28983 

In [44]:
asthma['LocationDesc'].nunique()

55

In [45]:
asthma.drop_duplicates(inplace=True)
asthma = asthma[asthma['Stratification1'] == 'Overall']
asthma.shape

(110, 7)

### Defining a cleaning function for all pre-existing medical condition data
Data for a range of medical conditions at the state level were all collected from the CDC site. Information for each disease was downloaded as a CSV with similar columns and structure to the asthma dataset. Using the steps above, each dataset was cleaned with a cleaning function.

In [46]:
def cleaning(df):
    print(df.shape)
    df = df[df['YearStart'] == 2019]
    print(df.shape)
    df = df[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue', 'Stratification1', 'DataValueType']]
    print(df.shape)
    df = df[df['Stratification1'] == 'Overall']
    return df  
               

The pre-existing conditions are:
- cardiovascular disease
- high blood pressure
- COPD
- Diabetes
- Kidney Disease

These conditions were chosen as these conditions are known to exacerbate the impacts of coronavirus and individuals with these conditions were more likely to have more severe cases and to die if infected with coronavirus. 

Immunization data including influenza vaccination rates was also collected. 

In [47]:
heart = pd.read_csv('../Ignore/Cardiovascular.csv')
heart.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2010,2010,OR,Oregon,NVSS,Cardiovascular Disease,Mortality from heart failure,,,Number,...,41,CVD,CVD1_4,NMBR,RACE,AIAN,,,,
1,2013,2013,IN,Indiana,NVSS,Cardiovascular Disease,Mortality from heart failure,,"cases per 100,000",Crude Rate,...,18,CVD,CVD1_4,CRDRATE,GENDER,GENM,,,,
2,2017,2017,CA,California,NVSS,Cardiovascular Disease,Mortality from heart failure,,,Number,...,6,CVD,CVD1_4,NMBR,GENDER,GENF,,,,
3,2010,2010,OK,Oklahoma,NVSS,Cardiovascular Disease,Mortality from coronary heart disease,,"cases per 100,000",Crude Rate,...,40,CVD,CVD1_3,CRDRATE,GENDER,GENF,,,,
4,2020,2020,OK,Oklahoma,NVSS,Cardiovascular Disease,Mortality from heart failure,,"cases per 100,000",Age-adjusted Rate,...,40,CVD,CVD1_4,AGEADJRATE,RACE,BLK,,,,


In [48]:
# Splitting cardiac dataset into deaths from cardio diseases and diagnosed prevalence of hypertension
cardiac_mortality = heart[heart['Question'] == 'Mortality from total cardiovascular diseases']
print(cardiac_mortality.shape)

high_blood_pressure = heart[heart['Question']== 'Awareness of high blood pressure among adults aged >= 18 years']
print(high_blood_pressure.shape)

(13497, 33)
(5220, 33)


In [49]:
copd = pd.read_csv('../Ignore/COPD.csv')
copd.shape

(152874, 33)

In [50]:
copd.Question.value_counts()
copd = copd[copd['Question']== 'Prevalence of chronic obstructive pulmonary disease among adults >= 18']
copd.shape

(9570, 33)

In [51]:
diabetes = pd.read_csv('../Ignore/Diabetes.csv')
diabetes.shape

(156808, 33)

In [52]:
diabetes['Question'].value_counts()
diabetes = diabetes[diabetes['Question']== 'Prevalence of diagnosed diabetes among adults aged >= 18 years']
diabetes.shape

(9570, 33)

In [53]:
kidney = pd.read_csv('../Ignore/Kidney.csv')

In [54]:
kidney['Question'].value_counts()
kidney = kidney[kidney['Question']== 'Prevalence of chronic kidney disease among adults aged >= 18 years']
kidney.shape

(9570, 33)

In [55]:
immun = pd.read_csv('../Ignore/Immunization.csv')

In [56]:
immun.shape

(9570, 33)

In [57]:
cardiac_mortality = cleaning(cardiac_mortality)
high_blood_pressure = cleaning(high_blood_pressure)
copd = cleaning(copd)
diabetes = cleaning(diabetes)
immun = cleaning(immun)
kidney = cleaning(kidney)

(13497, 33)
(1227, 33)
(1227, 7)
(5220, 33)
(870, 33)
(870, 7)
(9570, 33)
(870, 33)
(870, 7)
(9570, 33)
(870, 33)
(870, 7)
(9570, 33)
(870, 33)
(870, 7)
(9570, 33)
(870, 33)
(870, 7)


In [58]:
#Examining the size of each dataset to ensure cleaning function worked

print(asthma.shape)
#print(cancer.shape)
print(cardiac_mortality.shape)
print(high_blood_pressure.shape)
print(copd.shape)
print(diabetes.shape)
print(immun.shape)
print(kidney.shape)

(110, 7)
(156, 7)
(110, 7)
(110, 7)
(110, 7)
(110, 7)
(110, 7)


In [59]:
def adj(df):
    adj_df = df[df['DataValueType'] == 'Age-adjusted Prevalence'].copy()
    adj_df.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'], inplace=True)
    return adj_df

def crude(df):
    crude_df = df[df['DataValueType'] == 'Prevalence'].copy()
    crude_df.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'], inplace=True)
    return crude_df

# Cardiac mortality is 'Age-adjusted Rate' and 'Crude Rate' not 'Prevalence'- fix this in final code

### Saving the datasets to the Data folder
Once the file size of each dataset was reduced enough and the features were reduced to those pertaining to the problem, the pre-existing conditions were saved as csvs separated as raw and age-adjusted numbers. 

In [60]:
def process_dataframes(dataframe_list):
    results = {} 
    for name, df in dataframe_list.items():
        adj_result = adj(df)
        crude_result = crude(df)
        results[name] = {'adj': adj_result, 'crude': crude_result}
    return results

# Create a list of DataFrames 
dataframe_list = {
    'copd': copd,
    'asthma': asthma,
    'high_blood_pressure': high_blood_pressure,
    'diabetes': diabetes,
    'immun': immun,
    'kidney': kidney
}
# Process the list of DataFrames
results = process_dataframes(dataframe_list)

# Save the results to CSV
for name, result in results.items():
    result['adj'].to_csv(f'../Data/Raw/{name}_adj.csv', index = False)
    result['crude'].to_csv(f'../Data/Raw/{name}_crude.csv', index = False)

In [61]:
cardiac_mortality = cardiac_mortality[cardiac_mortality['DataValueType'] == 'Age-adjusted Rate']
cardiac_mortality.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'], inplace=True)
cardiac_mortality.to_csv('../Data/Raw/cardiac_mortality_adj.csv', index = False)

## Vaccination Rates by Stae

In [63]:
vax = pd.read_csv('../Ignore/COVID-19_Vaccinations_in_the_United_States_Jurisdiction_20231101.csv')

In [64]:
vax_2021 = vax[vax['Date'] == '12/31/2021'].copy()

In [65]:
vax_2021.isna().sum()

Date                                       0
MMWR_week                                  0
Location                                   0
Dist_Per_100K                              0
Distributed_Per_100k_65Plus                0
Admin_Per_100K                             0
Admin_Per_100k_65Plus                      0
Administered_Dose1_Pop_Pct                 0
Administered_Dose1_Recip_65PlusPop_Pct     0
Series_Complete_Pop_Pct                    0
Series_Complete_65PlusPop_Pct              0
Additional_Doses_Vax_Pct                   0
Additional_Doses_50Plus_Vax_Pct            0
Additional_Doses_65Plus_Vax_Pct            0
Second_Booster                            64
Second_Booster_50Plus_Vax_Pct             64
Second_Booster_65Plus_Vax_Pct             64
Bivalent_Booster_65Plus_Pop_Pct           64
dtype: int64

In [66]:
vax_2021.drop(columns = ['MMWR_week', 'Date', 'Second_Booster', 'Second_Booster_50Plus_Vax_Pct', 'Second_Booster_65Plus_Vax_Pct', 'Bivalent_Booster_65Plus_Pop_Pct'], inplace=True) 

In [67]:
vax_2022 = vax[vax['Date'] == '12/28/2022'].copy()

In [68]:
vax_2022.drop(columns = ['MMWR_week', 'Date', 'Second_Booster'], inplace =True)

In [69]:
vax_2022.isna().sum()

Location                                  0
Dist_Per_100K                             0
Distributed_Per_100k_65Plus               0
Admin_Per_100K                            0
Admin_Per_100k_65Plus                     0
Administered_Dose1_Pop_Pct                0
Administered_Dose1_Recip_65PlusPop_Pct    0
Series_Complete_Pop_Pct                   0
Series_Complete_65PlusPop_Pct             0
Additional_Doses_Vax_Pct                  0
Additional_Doses_50Plus_Vax_Pct           0
Additional_Doses_65Plus_Vax_Pct           0
Second_Booster_50Plus_Vax_Pct             0
Second_Booster_65Plus_Vax_Pct             0
Bivalent_Booster_65Plus_Pop_Pct           0
dtype: int64

In [70]:
vax_state = pd.merge(vax_2021, vax_2022, how ='inner', on='Location', suffixes=('_2021', '_2022'))

In [71]:
vax_state.isna().sum()

Location                                       0
Dist_Per_100K_2021                             0
Distributed_Per_100k_65Plus_2021               0
Admin_Per_100K_2021                            0
Admin_Per_100k_65Plus_2021                     0
Administered_Dose1_Pop_Pct_2021                0
Administered_Dose1_Recip_65PlusPop_Pct_2021    0
Series_Complete_Pop_Pct_2021                   0
Series_Complete_65PlusPop_Pct_2021             0
Additional_Doses_Vax_Pct_2021                  0
Additional_Doses_50Plus_Vax_Pct_2021           0
Additional_Doses_65Plus_Vax_Pct_2021           0
Dist_Per_100K_2022                             0
Distributed_Per_100k_65Plus_2022               0
Admin_Per_100K_2022                            0
Admin_Per_100k_65Plus_2022                     0
Administered_Dose1_Pop_Pct_2022                0
Administered_Dose1_Recip_65PlusPop_Pct_2022    0
Series_Complete_Pop_Pct_2022                   0
Series_Complete_65PlusPop_Pct_2022             0
Additional_Doses_Vax

In [73]:
vax_state.to_csv('../Data/Cleaned/vax_state.csv', index=False)

# County Level Data

### Mask Data

In [5]:
county_mask = pd.read_csv('../Ignore/U.S._State_and_Territorial_Public_Mask_Mandates_From_April_10__2020_through_August_15__2021_by_County_by_Day.csv')
county_mask.head()

Unnamed: 0,State_Tribe_Territory,County_Name,FIPS_State,FIPS_County,date,order_code,Face_Masks_Required_in_Public,Source_of_Action,URL,Citation
0,AL,Autauga County,1,1,4/10/2020,2,,,,
1,AL,Autauga County,1,1,4/11/2020,2,,,,
2,AL,Autauga County,1,1,4/12/2020,2,,,,
3,AL,Autauga County,1,1,4/13/2020,2,,,,
4,AL,Autauga County,1,1,4/14/2020,2,,,,


In [6]:
# Chat GPT was consulted to create more efficient code which ran faster
def mandate_length(data):
    county_counts = {}
    for index, row in data.iterrows():
        county_name = row['County_Name']
        mask_required = row['Face_Masks_Required_in_Public']
        state = row['State_Tribe_Territory']
        
        if mask_required == 'Yes':
            if county_name in county_counts:
                county_counts[county_name]['Count'] += 1
            else:
                county_counts[county_name] = {'State': state, 'Count': 1}

    county_counts_list = [{'State': v['State'], 'County': k, 'Count': v['Count']} for k, v in county_counts.items()]

    return pd.DataFrame(county_counts_list)

county_counts_df = mandate_length(county_mask)

In [7]:
county_counts_df.to_csv('../Data/Raw/county_mask_mandata.csv', index=False)

## Covid Vaccination Rates by county

In [78]:
# The initial dataset was downloaded from the CDC site and stored in the Ignore Folder
covid_vax = pd.read_csv('../Ignore/COVID-19_Vaccinations_by_county.csv')
covid_vax.shape

(871062, 15)

In [79]:
covid_vax.head()

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct,Series_Complete_Pop_Pct,Series_Complete_65PlusPop_Pct,Booster_Doses_Vax_Pct,Booster_Doses_50Plus_Vax_Pct,Booster_Doses_65Plus_Vax_Pct,Metro_status,Census2019,Census2019_65PlusPop
0,12/28/2022,2013,52,Aleutians East Borough,AK,86.9,55.0,74.9,49.9,41.4,59.0,80.6,Non-metro,3337.0,351.0
1,12/28/2022,2016,52,Aleutians West Census Area,AK,77.8,69.9,64.2,59.9,38.6,59.8,76.1,Non-metro,5634.0,419.0
2,12/28/2022,2020,52,Anchorage Municipality,AK,79.5,95.0,70.8,95.0,49.8,67.9,77.7,Metro,288000.0,33757.0
3,12/28/2022,2050,52,Bethel Census Area,AK,74.0,89.3,68.7,86.0,51.8,74.9,82.9,Non-metro,18386.0,1448.0
4,12/28/2022,2060,52,Bristol Bay Borough,AK,95.0,95.0,95.0,91.2,41.0,67.6,88.7,Non-metro,836.0,136.0


In [80]:
# keeping 9/18/2021 & 12/28/2022 for point in time data 
early_vax = covid_vax[covid_vax['Date'] == '9/18/2021'].copy()
late_vax = covid_vax[covid_vax['Date'] == '12/28/2022'].copy()

In [81]:
early_vax.drop(columns=['Series_Complete_Pop_Pct',
                        'Series_Complete_65PlusPop_Pct'], inplace = True)
early_vax.head()

Unnamed: 0,Date,FIPS,MMWR_week,Recip_County,Recip_State,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct,Booster_Doses_Vax_Pct,Booster_Doses_50Plus_Vax_Pct,Booster_Doses_65Plus_Vax_Pct,Metro_status,Census2019,Census2019_65PlusPop
866518,9/18/2021,2013,37,Aleutians East Borough,AK,74.1,47.3,,,,Non-metro,3337.0,
866519,9/18/2021,2016,37,Aleutians West Census Area,AK,64.9,65.2,,,,Non-metro,5634.0,
866520,9/18/2021,2020,37,Anchorage Municipality,AK,60.7,92.3,,,,Metro,288000.0,
866521,9/18/2021,2050,37,Bethel Census Area,AK,60.4,87.2,,,,Non-metro,18386.0,
866522,9/18/2021,2060,37,Bristol Bay Borough,AK,99.9,90.4,,,,Non-metro,836.0,


In [82]:
early_vax.dropna(inplace=True)

In [83]:
late_vax.drop(columns=['Date', 'MMWR_week', 'Series_Complete_65PlusPop_Pct', 'Census2019', 'Census2019_65PlusPop'], inplace=True)
late_vax.head()

Unnamed: 0,FIPS,Recip_County,Recip_State,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct,Series_Complete_Pop_Pct,Booster_Doses_Vax_Pct,Booster_Doses_50Plus_Vax_Pct,Booster_Doses_65Plus_Vax_Pct,Metro_status
0,2013,Aleutians East Borough,AK,86.9,55.0,74.9,41.4,59.0,80.6,Non-metro
1,2016,Aleutians West Census Area,AK,77.8,69.9,64.2,38.6,59.8,76.1,Non-metro
2,2020,Anchorage Municipality,AK,79.5,95.0,70.8,49.8,67.9,77.7,Metro
3,2050,Bethel Census Area,AK,74.0,89.3,68.7,51.8,74.9,82.9,Non-metro
4,2060,Bristol Bay Borough,AK,95.0,95.0,95.0,41.0,67.6,88.7,Non-metro


In [86]:
# Saving the smaller csv files into the raw data folder
early_vax.to_csv('../Data/Raw/early_vax_2021.csv', index=False)
late_vax.to_csv('../Data/Raw/late_vax_2022.csv', index=False)

## Population

In [3]:
# Reading in the population data downloaded from usafacts
pop = pd.read_csv('../Ignore/covid_county_population_usafacts.csv')
pop.describe()

Unnamed: 0,countyFIPS,population
count,3195.0,3195.0
mean,29882.22097,102735.4
std,15524.479302,330947.7
min,0.0,0.0
25%,18098.0,10399.5
50%,29123.0,25127.0
75%,45054.0,66941.5
max,56045.0,10039110.0


In [4]:
pop.head()

Unnamed: 0,countyFIPS,County Name,State,population
0,0,Statewide Unallocated,AL,0
1,1001,Autauga County,AL,55869
2,1003,Baldwin County,AL,223234
3,1005,Barbour County,AL,24686
4,1007,Bibb County,AL,22394


In [5]:
# States dictionary
state_name = {
    'AL':'Alabama',
    'AK':'Alaska',
    'AZ':'Arizona',
    'AR':'Arkansas',
    'CA':'California',
    'CO':'Colorado',
    'CT':'Connecticut',
    'DE':'Delaware',
    'DC':'District of Columbia',
    'FL':'Florida',
    'GA':'Georgia',
    'HI':'Hawaii',
    'ID':'Idaho',
    'IL':'Illinois',
    'IN':'Indiana',
    'IA':'Iowa',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'ME':'Maine',
    'MD':'Maryland',
    'MA':'Massachusetts',
    'MI':'Michigan',
    'MN':'Minnesota',
    'MS':'Mississippi',
    'MO':'Missouri',
    'MT':'Montana',
    'NE':'Nebraska',
    'NV':'Nevada',
    'NH':'New Hampshire',
    'NJ':'New Jersey',
    'NM':'New Mexico',
    'NY':'New York',
    'NC':'North Carolina',
    'ND':'North Dakota',
    'OH':'Ohio',
    'OK':'Oklahoma',
    'OR':'Oregon',
    'PA':'Pennsylvania',
    'RI':'Rhode Island',
    'SC':'South Carolina',
    'SD':'South Dakota',
    'TN':'Tennessee',
    'TX':'Texas',
    'UT':'Utah',
    'VT':'Vermont',
    'VA':'Virginia',
    'WA':'Washington',
    'WV':'West Virginia',
    'WI':'Wisconsin',
    'WY':'Wyoming'
}

# List of states that can be used in cleaning functions
state_abbreviations = list(state_name.keys())

In [6]:
# Creating a df of populations for each county
state_pop_sum = {}
for index, row in pop.iterrows():
    state_abbr = row['State'] 
    tot_pop = row['population']  
    
    if state_abbr in state_pop_sum:
        state_pop_sum[state_abbr] += tot_pop
    else:
        state_pop_sum[state_abbr] = tot_pop

for state, pop_sum in state_pop_sum.items():
    print(f"{state}: {pop_sum}")

AL: 4903185
AK: 731545
AZ: 7278717
AR: 3017804
CA: 39512223
CO: 5758736
CT: 3565287
DE: 973764
DC: 705749
FL: 21477737
GA: 10617423
HI: 1415872
ID: 1787065
IL: 12671821
IN: 6732219
IA: 3155070
KS: 2913314
KY: 4467673
LA: 4648794
ME: 1344212
MD: 6045680
MA: 6892503
MI: 9986857
MN: 5639632
MS: 2976149
MO: 6137428
MT: 1068778
NE: 1934408
NV: 3080156
NH: 1359711
NJ: 8882190
NM: 2096829
NY: 19453561
NC: 10488084
ND: 762062
OH: 11689100
OK: 3956971
OR: 4217737
PA: 12801989
RI: 1059361
SC: 5148714
SD: 884659
TN: 6829174
TX: 28995881
UT: 3205958
VT: 623989
VA: 8535519
WA: 7614893
WV: 1792147
WI: 5822434
WY: 578759


In [7]:
tot_pop = pd.DataFrame(list(state_pop_sum.items()), columns=['State', 'Total Population'])
tot_pop.head()

Unnamed: 0,State,Total Population
0,AL,4903185
1,AK,731545
2,AZ,7278717
3,AR,3017804
4,CA,39512223


In [8]:
merged_pop = pd.merge(pop, tot_pop, on='State')
merged_pop.head()

Unnamed: 0,countyFIPS,County Name,State,population,Total Population
0,0,Statewide Unallocated,AL,0,4903185
1,1001,Autauga County,AL,55869,4903185
2,1003,Baldwin County,AL,223234,4903185
3,1005,Barbour County,AL,24686,4903185
4,1007,Bibb County,AL,22394,4903185


In [9]:
merged_pop['pop %'] = (merged_pop['population'] / merged_pop['Total Population'])*100
merged_pop.head()

Unnamed: 0,countyFIPS,County Name,State,population,Total Population,pop %
0,0,Statewide Unallocated,AL,0,4903185,0.0
1,1001,Autauga County,AL,55869,4903185,1.139443
2,1003,Baldwin County,AL,223234,4903185,4.552837
3,1005,Barbour County,AL,24686,4903185,0.503469
4,1007,Bibb County,AL,22394,4903185,0.456724


In [11]:
# Saving pop percent 
merged_pop.to_csv('../Data/Cleaned/population_w_percent.csv', index = False)

## Covid Cases by County

In [90]:
cases = pd.read_csv('../Ignore/covid_confirmed_usafacts.csv')
cases.head()

cases['County'] = cases['County Name'].str.replace(r'\bCounty\b', '', regex=True).str.strip()

new_pop = pd.read_csv('../Data/Cleaned/population_w_percent.csv')
new_pop['pop %'] = new_pop['pop %']/100
pop_percent = new_pop.drop(columns=['County Name', 'State', 'population', 'Total Population'])


In [91]:
cases = pd.merge(cases, pop_percent, how = 'left', on='countyFIPS').copy()

In [92]:
cases_2020 = cases[['countyFIPS', 'County', 'State', 'StateFIPS', '2020-12-31', 'pop %']].copy()
cases_2020.rename(columns = {'2020-12-31': 'cases_2020', 'pop %': 'pop_per'}, inplace =True)

In [8]:
for state_abbr in state_abbreviations:
    # Calculate total_unallocated for the current state
    total_unallocated = cases_2020.loc[
        (cases_2020['State'] == state_abbr) & (cases_2020['County'] == 'Statewide Unallocated'),
        'cases_2020'
    ].values[0]
    
    county_list = cases_2020.loc[cases_2020['State'] == state_abbr, 'County'].unique()
    
    for county in county_list:
        cases_2020.loc[
            (cases_2020['State'] == state_abbr) & (cases_2020['County'] == county),
            'cases_2020'
        ] += total_unallocated * cases_2020.loc[
            (cases_2020['State'] == state_abbr) & (cases_2020['County'] == county),
            'pop_per'
        ]
        
cases_2020 = cases_2020.drop_duplicates()

NameError: name 'state_abbreviations' is not defined

In [95]:
cases_2021 = cases[['countyFIPS', 'County', 'State', 'StateFIPS', '2021-12-31', 'pop %']].copy()
cases_2021.rename(columns = {'2021-12-31': 'cases_2021', 'pop %': 'pop_per'}, inplace =True)

for state_abbr in state_abbreviations:
    # Calculate total_unallocated for the current state
    total_unallocated = cases_2021.loc[
        (cases_2021['State'] == state_abbr) & (cases_2021['County'] == 'Statewide Unallocated'),
        'cases_2021'
    ].values[0]
    
    county_list = cases_2021.loc[cases_2021['State'] == state_abbr, 'County'].unique()
    
    for county in county_list:
        cases_2021.loc[
            (cases_2021['State'] == state_abbr) & (cases_2021['County'] == county),
            'cases_2021'
        ] += total_unallocated * cases_2021.loc[
            (cases_2021['State'] == state_abbr) & (cases_2021['County'] == county),
            'pop_per'
        ]

cases_2021 = cases_2021.drop_duplicates()       

In [96]:
cases_2022 = cases[['countyFIPS', 'County', 'State', 'StateFIPS', '2022-12-31', 'pop %']].copy()
cases_2022.rename(columns = {'2022-12-31': 'cases_2022', 'pop %': 'pop_per'}, inplace =True)

In [97]:
for state_abbr in state_abbreviations:
    # Calculate total_unallocated for the current state
    total_unallocated = cases_2022.loc[
        (cases_2022['State'] == state_abbr) & (cases_2022['County'] == 'Statewide Unallocated'),
        'cases_2022'
    ].values[0]
    
    county_list = cases_2022.loc[cases_2022['State'] == state_abbr, 'County'].unique()
    
    for county in county_list:
        cases_2022.loc[
            (cases_2022['State'] == state_abbr) & (cases_2022['County'] == county),
            'cases_2022'
        ] += total_unallocated * cases_2022.loc[
            (cases_2022['State'] == state_abbr) & (cases_2022['County'] == county),
            'pop_per'
        ]
cases_2022 = cases_2022.drop_duplicates()

merged_cases = pd.merge(cases_2020, cases_2021, how='left', on='countyFIPS')
county_cases = pd.merge(merged_cases, cases_2022, how='left', on='countyFIPS')

county_cases.drop(columns = ['County','State', 'StateFIPS', 'pop_per_x', 'County_y', 'State_y', 'StateFIPS_y',
                  'pop_per_y'], inplace=True)
county_cases.head()

Unnamed: 0,countyFIPS,County_x,State_x,StateFIPS_x,cases_2020,cases_2021,cases_2022,pop_per
0,0,Statewide Unallocated,AL,1,0,0.0,0.0,0.0
1,0,Statewide Unallocated,AL,1,0,0.0,0.0,0.0
2,0,Statewide Unallocated,AL,1,0,0.0,1000.0,0.0
3,0,Statewide Unallocated,AL,1,0,0.0,3033.0,0.0
4,0,Statewide Unallocated,AL,1,0,0.0,4479.0,0.0


In [98]:
columns_to_check_duplicates = ['countyFIPS', 'County_x', 'State_x', 'StateFIPS_x']
county_cases.drop_duplicates(subset=columns_to_check_duplicates, keep='first', inplace=True)
county_cases.rename(columns = {'County_x' : 'County', 'State_x': 'State', 'StateFIPS_x': 'StateFIPS'}, inplace =True)


In [99]:
county_cases = county_cases.drop_duplicates()
county_cases.shape

(3193, 8)

#### Reading in the county Health Rankings Data

In [101]:
rank = pd.read_csv('../Data/Cleaned/2019 County Health Rankings Data - cleaned.csv')

## Deaths by County

In [102]:
deaths = pd.read_csv('../Ignore/covid_deaths_usafacts.csv')
deaths

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2023-07-14,2023-07-15,2023-07-16,2023-07-17,2023-07-18,2023-07-19,2023-07-20,2023-07-21,2023-07-22,2023-07-23
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,0,0,0,0,0,0,...,235,235,235,235,235,235,235,235,235,235
2,1003,Baldwin County,AL,1,0,0,0,0,0,0,...,731,731,731,731,731,731,731,731,731,731
3,1005,Barbour County,AL,1,0,0,0,0,0,0,...,104,104,104,104,104,104,104,104,104,104
4,1007,Bibb County,AL,1,0,0,0,0,0,0,...,111,111,111,111,111,111,111,111,111,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,56037,Sweetwater County,WY,56,0,0,0,0,0,0,...,142,142,142,142,142,142,142,142,142,142
3189,56039,Teton County,WY,56,0,0,0,0,0,0,...,16,16,16,16,16,16,16,16,16,16
3190,56041,Uinta County,WY,56,0,0,0,0,0,0,...,43,43,43,43,43,43,43,43,43,43
3191,56043,Washakie County,WY,56,0,0,0,0,0,0,...,51,51,51,51,51,51,51,51,51,51


In [103]:
deaths['County'] = deaths['County Name'].str.replace(r'\bCounty\b', '', regex=True).str.strip()
deaths = pd.merge(deaths, pop_percent, how = 'left', on='countyFIPS').copy()


In [104]:
deaths_2020 = deaths[['countyFIPS', 'County', 'State', 'StateFIPS', '2020-12-31', 'pop %']].copy()
deaths_2020.rename(columns = {'2020-12-31': 'deaths_2020', 'pop %': 'pop_per'}, inplace =True)
for state_abbr in state_abbreviations:
    # Calculate total_unallocated for the current state
    total_unallocated = deaths_2020.loc[
        (deaths_2020['State'] == state_abbr) & (deaths_2020['County'] == 'Statewide Unallocated'),
        'deaths_2020'
    ].values[0]
    
    county_list = deaths_2020.loc[deaths_2020['State'] == state_abbr, 'County'].unique()
    
    for county in county_list:
        deaths_2020.loc[
            (deaths_2020['State'] == state_abbr) & (deaths_2020['County'] == county),
            'deaths_2020'
        ] += total_unallocated * deaths_2020.loc[
            (deaths_2020['State'] == state_abbr) & (deaths_2020['County'] == county),
            'pop_per'
        ]

In [105]:
deaths_2020 = deaths_2020.drop_duplicates()

deaths_2021 = deaths[['countyFIPS', 'County', 'State', 'StateFIPS', '2021-12-31', 'pop %']].copy()
deaths_2021.rename(columns = {'2021-12-31': 'deaths_2021', 'pop %': 'pop_per'}, inplace =True)
for state_abbr in state_abbreviations:
    # Calculate total_unallocated for the current state
    total_unallocated = deaths_2021.loc[
        (deaths_2021['State'] == state_abbr) & (deaths_2021['County'] == 'Statewide Unallocated'),
        'deaths_2021'
    ].values[0]
    
    county_list = deaths_2021.loc[deaths_2021['State'] == state_abbr, 'County'].unique()
    
    for county in county_list:
        deaths_2021.loc[
            (deaths_2021['State'] == state_abbr) & (deaths_2021['County'] == county),
            'deaths_2021'
        ] += total_unallocated * deaths_2021.loc[
            (deaths_2021['State'] == state_abbr) & (deaths_2021['County'] == county),
            'pop_per'
        ]

deaths_2021 = deaths_2021.drop_duplicates()


In [106]:
deaths_2022 = deaths[['countyFIPS', 'County', 'State', 'StateFIPS', '2022-12-31', 'pop %']].copy()
deaths_2022.rename(columns = {'2022-12-31': 'deaths_2022', 'pop %': 'pop_per'}, inplace =True)
for state_abbr in state_abbreviations:
    # Calculate total_unallocated for the current state
    total_unallocated = deaths_2022.loc[
        (deaths_2022['State'] == state_abbr) & (deaths_2022['County'] == 'Statewide Unallocated'),
        'deaths_2022'
    ].values[0]
    
    county_list = deaths_2022.loc[deaths_2022['State'] == state_abbr, 'County'].unique()
    
    for county in county_list:
        deaths_2022.loc[
            (deaths_2022['State'] == state_abbr) & (deaths_2022['County'] == county),
            'deaths_2022'
        ] += total_unallocated * deaths_2022.loc[
            (deaths_2022['State'] == state_abbr) & (deaths_2022['County'] == county),
            'pop_per'
        ]
        
deaths_2022 = deaths_2022.drop_duplicates()


In [107]:
merged_deaths = pd.merge(deaths_2020, deaths_2021, how='left', on='countyFIPS')
county_deaths = pd.merge(merged_deaths, deaths_2022, how='left', on='countyFIPS')
county_deaths.drop(columns = ['County','State', 'StateFIPS', 'pop_per_x', 'County_y', 'State_y', 'StateFIPS_y',
                  'pop_per_y'], inplace=True)
county_deaths.head()

Unnamed: 0,countyFIPS,County_x,State_x,StateFIPS_x,deaths_2020,deaths_2021,deaths_2022,pop_per
0,0,Statewide Unallocated,AL,1,0.0,0.0,0.0,0.0
1,0,Statewide Unallocated,AL,1,0.0,0.0,0.0,0.0
2,0,Statewide Unallocated,AL,1,0.0,0.0,0.0,0.0
3,0,Statewide Unallocated,AL,1,0.0,0.0,0.0,0.0
4,0,Statewide Unallocated,AL,1,0.0,0.0,0.0,0.0


In [108]:
columns_to_check_duplicates = ['countyFIPS', 'County_x', 'State_x', 'StateFIPS_x']

county_deaths.drop_duplicates(subset=columns_to_check_duplicates, keep='first', inplace=True)

county_deaths.rename(columns = {'County_x' : 'County', 'State_x': 'State', 'StateFIPS_x': 'StateFIPS'}, inplace =True)

county_deaths = county_deaths.drop_duplicates()
county_deaths.head()

Unnamed: 0,countyFIPS,County,State,StateFIPS,deaths_2020,deaths_2021,deaths_2022,pop_per
0,0,Statewide Unallocated,AL,1,0.0,0.0,0.0,0.0
2601,1001,Autauga,AL,1,48.0,160.0,230.0,0.011394
2602,1003,Baldwin,AL,1,161.0,593.0,719.0,0.045528
2603,1005,Barbour,AL,1,32.0,81.0,103.0,0.005035
2604,1007,Bibb,AL,1,46.0,95.0,108.0,0.004567


In [109]:
print(county_deaths.shape)
print(county_cases.shape)

(3193, 8)
(3193, 8)


In [110]:
county_cases.drop(columns = ['StateFIPS','pop_per'], inplace=True)
county_deaths.drop(columns = ['County', 'countyFIPS', 'State','StateFIPS','pop_per'], inplace=True)

In [111]:
cases_deaths = pd.concat([county_cases,county_deaths], axis=1, join='outer')


In [112]:
cases_deaths.rename(columns={'countyFIPS': 'FIPS'},inplace=True)

In [113]:
rcd = pd.merge(rank,cases_deaths, how='inner', on='FIPS')
rcd.shape

(3142, 81)

In [114]:
## County Mask Mandates

In [116]:
mask = pd.read_csv('../Data/Raw/county_mask_mandata.csv')
#We can't drop state, some states have counties with the same name...
mask = mask.rename(columns = {'Count' : 'Masks'})
mask['County'] = mask['County'].str.replace(r'\bCounty\b', '', regex=True).str.strip()


In [117]:
mask.head()

Unnamed: 0,State,County,Masks
0,AL,Autauga,267
1,AL,Baldwin,267
2,AL,Barbour,267
3,AL,Bibb,267
4,AL,Blount,267


In [118]:
rcd.isna().sum()

FIPS                                                      0
State_x                                                   0
County_x                                                  0
Years of Potential Life Lost Rate (premature death)     234
YPLL Rate (Black)                                      1791
                                                       ... 
cases_2021                                                0
cases_2022                                                0
deaths_2020                                            3142
deaths_2021                                            3142
deaths_2022                                            3142
Length: 81, dtype: int64

In [119]:
# Look at just the columns with missing data
missing_columns = rcd.columns[rcd.isnull().any()]
print(missing_columns)

# We will have to address each of these columns and drop

Index(['Years of Potential Life Lost Rate (premature death)',
       'YPLL Rate (Black)', 'YPLL Rate (Hispanic)', 'YPLL Rate (White)',
       'Food Environment Index', 'Number Uninsured', 'Percent Uninsured',
       'Number Primary Care Physicians', 'PCP Rate', 'PCP Ratio',
       'Preventable Hosp stays Rate', 'Preventable Hosp. Rate (Black)',
       'Preventable Hosp. Rate (Hispanic)', 'Preventable Hosp. Rate (White)',
       'Percent Vaccinated Flu', 'Percent Vaccinated Flu (Black)',
       'Percent  Vaccinated (Hispanic) Flu', 'Percent Vaccinated (White) Flu',
       'High School Graduation Rate', 'Number Unemployed', 'Labor Force',
       'Percent Unemployed', 'Average Daily PM2.5',
       'Presence of water violation', 'Life Expectancy', '95% CI - Low',
       '95% CI - High', 'Life Expectancy (Black)',
       'Life Expectancy (Hispanic)', 'Life Expectancy (White)',
       'Number pre-mature Deaths', 'Number HIV Cases', 'HIV Prevalence Rate',
       'Percent Uninsured Adults', 'P

In [120]:
rcd.rename(columns={'County_x': 'County'}, inplace=True)
rcd.rename(columns={'State_y': 'State'}, inplace=True)

In [121]:
county_data = pd.merge(rcd,mask, on=['State', 'County'], how='left')

In [122]:
county_data.State_x.nunique()

51

In [123]:
county_data['Masks'].isna().sum()

910

In [124]:
county_data['Masks'].fillna(0, inplace=True)

In [125]:
county_data['Masks'].isna().sum()

0

In [127]:
county_vax = pd.read_csv('../Data/Cleaned/county_vax_2021.csv')

In [128]:
county_vax.shape

(3274, 5)

In [129]:
county_vax.rename(columns={'Recip_County': 'County', 'Recip_State': 'State'}, inplace=True)

In [130]:
county_vax.head()

Unnamed: 0,FIPS,County,State,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct
0,2013,Aleutians East Borough,AK,74.1,47.3
1,2016,Aleutians West Census Area,AK,64.9,65.2
2,2020,Anchorage Municipality,AK,60.7,92.3
3,2050,Bethel Census Area,AK,60.4,87.2
4,2060,Bristol Bay Borough,AK,99.9,90.4


In [131]:
county_data['FIPS'] = county_data['FIPS'].astype('object')

In [132]:
county_data.drop(columns=['State_x', 'County_y'], inplace=True)

In [133]:
county_vax['County'] = county_vax['County'].str.replace(r'\bCounty\b', '', regex=True).str.strip()

In [134]:
df = pd.merge(county_data, county_vax, how='inner', on=['County', 'State'])
print(df.shape)
df.head()

(3001, 83)


Unnamed: 0,FIPS_x,County,Years of Potential Life Lost Rate (premature death),YPLL Rate (Black),YPLL Rate (Hispanic),YPLL Rate (White),% Fair/Poor Health,percent_smokers,percent_obese,Food Environment Index,...,cases_2020,cases_2021,cases_2022,deaths_2020,deaths_2021,deaths_2022,Masks,FIPS_y,Administered_Dose1_Pop_Pct,Administered_Dose1_Recip_65PlusPop_Pct
0,1001,Autauga,8824.0,10471.0,,8707.0,18,19,38,7.2,...,4190.0,11018.0,18961.0,,,,267.0,1001,42.2,73.8
1,1003,Baldwin,7225.0,10042.0,3087.0,7278.0,18,17,31,8.0,...,13601.0,39911.0,67496.0,,,,267.0,1003,53.2,89.9
2,1005,Barbour,9586.0,11333.0,,7310.0,26,22,44,5.6,...,1514.0,3860.0,7027.0,,,,267.0,1005,44.5,75.3
3,1007,Bibb,11784.0,14813.0,,11328.0,20,20,38,7.6,...,1834.0,4533.0,7692.0,,,,267.0,1007,36.6,64.2
4,1009,Blount,10908.0,,5620.0,11336.0,21,20,34,8.5,...,4641.0,11256.0,17731.0,,,,267.0,1009,31.9,56.6


In [136]:
df.to_csv('../Data/Cleaned/county_df2.csv', index=False)