### Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# State Level Data

### Census Population

##### 2020
Web scraping was used to obtain population data for the years 2010 and 2020 from the US Census site.

In [2]:
# Census site url
url = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010220'
res = requests.get(url)

In [3]:
soup = BeautifulSoup(res.content)
table = soup.find(attrs={'class':'qf-graph-scroll'})
# soup.find_all(attrs={'class':'qf-positive'})
tbody = table.find_all(attrs={'class':"qf-graph-geo"})
tr = tbody[1]
state = tr.find('a').attrs['data-title']
tr.find(attrs={'class':'qf-positive'}).attrs['data-value']

'29145505'

In [4]:
# Build one record per state row of the scraped table. The population is taken
# verbatim from the 'data-value' attribute, so it is still a string here.
records_2020 = [
    {
        'State': row.find('a').attrs['data-title'],
        '2020 Population': row.find(attrs={'class':'qf-positive'}).attrs['data-value'],
    }
    for row in tbody
]

pop_2020 = pd.DataFrame(records_2020)
pop_2020.head()

Unnamed: 0,State,2020 Population
0,California,39538223
1,Texas,29145505
2,Florida,21538187
3,New York,20201249
4,Pennsylvania,13002700


##### 2010
The same process was followed for the year 2010.

In [5]:
url2 = 'https://www.census.gov/quickfacts/quickfacts/geo/dashboard/US/POP010210'
res2 = requests.get(url2)

In [6]:
soup2 = BeautifulSoup(res2.content)
table2 = soup2.find(attrs={'class':'qf-graph-scroll'})
tbody2 = table2.find_all(attrs={'class':"qf-graph-geo"})

In [7]:
# Same extraction as for 2020: one record per state row, population kept as the
# raw 'data-value' string.
records_2010 = [
    {
        'State': row.find('a').attrs['data-title'],
        '2010 Population': row.find(attrs={'class':'qf-positive'}).attrs['data-value'],
    }
    for row in tbody2
]
pop_2010 = pd.DataFrame(records_2010)

# 'State' is the only column the two frames share, so this is the same inner
# join the default merge performs.
population = pd.merge(pop_2010, pop_2020, how='inner', on='State')
population.head()

Unnamed: 0,State,2010 Population,2020 Population
0,California,37253956,39538223
1,Texas,25145561,29145505
2,New York,19378102,20201249
3,Florida,18801310,21538187
4,Illinois,12830632,12812508


In [8]:
population.dtypes

State              object
2010 Population    object
2020 Population    object
dtype: object

In [10]:
# The scraped population columns are strings (dtype object); cast both to int before saving.
population[['2010 Population', '2020 Population']] = population[['2010 Population',
                                                                 '2020 Population']].astype(int)
# NOTE(review): this writes to '../Data', while most other cells in this notebook
# write under 'Data/' — confirm which directory is intended.
population.to_csv('../Data/Population_data_2010_&_2020.csv', index=False)

### Excess Deaths Data

In [None]:
deaths = pd.read_csv('Ignore/Excess_Deaths_Associated_with_COVID-19.csv')
deaths.head(100)

In [None]:
# Keep the weighted predictions for the two all-cause outcomes in 2020-2022.
# `&` binds tighter than `|`, so the three year tests must be one grouped
# condition; written as separate `| (Year == ...)` terms, every 2021 and 2022
# row would pass regardless of its Type or Outcome.
covid = deaths[
    (deaths['Type'] == 'Predicted (weighted)')
    & deaths['Outcome'].isin(['All causes', 'All causes, excluding COVID-19'])
    & deaths['Year'].isin([2020, 2021, 2022])
]

In [None]:
covid['Type'].value_counts()

In [None]:
covid['Outcome'].value_counts()

In [None]:
# Weighted predictions only, restricted to the two all-cause outcomes and the
# pandemic years 2020-2022.
is_weighted = deaths['Type'] == 'Predicted (weighted)'
is_all_cause = deaths['Outcome'].isin(['All causes', 'All causes, excluding COVID-19'])
is_pandemic_year = deaths['Year'].isin([2020, 2021, 2022])
covid = deaths[is_weighted & is_all_cause & is_pandemic_year]

def covid_deaths(data):
    """Sum excess-death estimates per year/state and derive the COVID share.

    Parameters
    ----------
    data : DataFrame with 'Year', 'State', 'Outcome' and 'Excess Estimate'
        columns, where 'Outcome' includes both 'All causes' and
        'All causes, excluding COVID-19'.

    Returns
    -------
    list of dict
        One record per (year, state). The year is stored under the key
        'Location' (the 'Year' column is renamed), next to 'State', the two
        summed outcome columns and 'covid' = all causes minus all causes
        excluding COVID-19. Missing outcome totals count as 0.
    """
    totals = (
        data.groupby(['Year', 'State', 'Outcome'])['Excess Estimate']
        .sum()
        .reset_index()
    )
    # One row per (Year, State), one column per outcome.
    wide = (
        totals.pivot(index=['Year', 'State'], columns='Outcome', values='Excess Estimate')
        .reset_index()
        .fillna(0)
    )
    wide['covid'] = wide['All causes'] - wide['All causes, excluding COVID-19']
    wide = wide.rename(columns={'Year': 'Location'})
    return wide.to_dict(orient='records')

deaths_covid = pd.DataFrame(covid_deaths(covid))

In [None]:
# Reshape to one row per state with a column per (measure, year). At this point
# 'Location' still holds the year, so it supplies the new column level.
wide = deaths_covid.pivot(index='State', columns='Location',
                          values=['All causes', 'covid'])

# Flatten the two-level column index into '<measure>_<year>' names.
wide.columns = [f'{measure}_{year}' for measure, year in wide.columns]

# 'State' becomes the 'Location' column; only the 'All causes_*' names need
# rewriting because the 'covid_*' names are already in their final form.
deaths_covid = (
    wide.reset_index()
    .rename(columns={'State': 'Location'})
    .rename(columns={f'All causes_{year}': f'all_causes_{year}'
                     for year in (2020, 2021, 2022)})
)
deaths_covid.head()

In [None]:
deaths_covid.dtypes

In [None]:
(deaths_covid[['covid_2020', 'covid_2021', 'covid_2022']].sum()).sum()

In [None]:
# Upper-case both state-name columns so the frames match case-insensitively.
deaths_covid['Location_Upper'] = deaths_covid['Location'].str.upper()
population['State_Upper'] = population['State'].str.upper()

# Build the state -> 2020 population lookup once instead of re-filtering
# `population` for every row and year. If a state name appears more than once,
# the first row wins.
pop_lookup = (
    population.drop_duplicates(subset='State_Upper')
    .set_index('State_Upper')['2020 Population']
)
# Locations with no matching state get NaN here and therefore NaN below.
state_pop = deaths_covid['Location_Upper'].map(pop_lookup)

# Every year is divided by the 2020 population. The result is a fraction of the
# population (not multiplied by 100), despite the 'perce' in the column name.
for year in [2020, 2021, 2022]:
    deaths_covid[f'Covid_pop_perce_{year}'] = deaths_covid[f'covid_{year}'] / state_pop

deaths_covid.head()

In [None]:
deaths_covid.dropna(inplace=True)
deaths_covid.drop(columns=['Location_Upper'], inplace=True)

In [None]:
deaths_covid.to_csv('Data/Cleaned/cleaned_covid_death_state.csv', index = False)

In [None]:
# getting sum of each states excess deaths 
def excess_deaths(deaths, start_year=2017, end_year=2022):
    """Total the 'Excess Estimate' column per state for each year in a range.

    Parameters
    ----------
    deaths : DataFrame with 'State', 'Year' and 'Excess Estimate' columns.
    start_year, end_year : int
        Inclusive range of years to report; rows outside it are ignored.

    Returns
    -------
    DataFrame
        One row per state (in order of first appearance in `deaths`) with a
        'Location' column followed by 'Exc_deaths_<year>' columns. A state/year
        with no rows is reported as 0. Missing estimates are skipped rather than
        turning the whole total into NaN.
    """
    years = list(range(start_year, end_year + 1))
    states = deaths['State'].unique()

    in_range = deaths[deaths['Year'].between(start_year, end_year)]
    totals = (
        in_range.groupby(['State', 'Year'])['Excess Estimate']
        .sum()
        .unstack('Year')
        # Restore every state and every requested year, even those with no data.
        .reindex(index=states, columns=years)
        .fillna(0)
    )
    totals.columns = [f'Exc_deaths_{year}' for year in years]
    return totals.rename_axis('Location').reset_index()


excess_deaths = excess_deaths(deaths)
excess_deaths.head()


In [None]:
# The closing quote was missing from the path, which made this cell a SyntaxError.
# NOTE(review): this writes to '../Data', while most other cells write under
# 'Data/' — confirm which directory is intended.
excess_deaths.to_csv('../Data/Cleaned/excess_deaths.csv')

In [None]:
# Getting count for each year and each state where they exceed their threshold
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    """Sum the 'Exceeds Threshold' column per state for each year in a range.

    NOTE: a later cell in this notebook defines another function with the same
    name, which replaces this one as soon as that cell runs.

    Parameters
    ----------
    deaths : DataFrame with 'Year', 'State' and 'Exceeds Threshold' columns.
    start_year, end_year : int
        Inclusive range of years to process.

    Returns
    -------
    DataFrame
        A 'State' column plus one 'Exc_count_<year>' column per merged year;
        missing state/year combinations are filled with 0.
    """
    result = pd.DataFrame()
    
    for year in range(start_year, end_year + 1):
        
        year_data = deaths[deaths['Year'] == year]
        # Summing works as a count only if the column is boolean/numeric
        # — TODO confirm the dtype read from the CSV.
        year_counts = year_data.groupby('State')['Exceeds Threshold'].sum().reset_index()
        year_counts = year_counts.rename(columns={'Exceeds Threshold': f'Exc_count_{year}'})
        year_counts.set_index('State', inplace=True)
        
        # While `result` is still empty it is replaced outright, so any leading
        # years without rows leave no column behind.
        if result.empty:
            result = year_counts
        else:
            # Outer join on the State index keeps states that appear in only some years.
            result = result.merge(year_counts, left_index=True, right_index=True, how='outer')
    
    result = result.reset_index().fillna(0)
    
    return result

In [None]:
def count_exceeds_threshold(deaths, start_year=2017, end_year=2022):
    """Count, per year and state, the rows whose 'Exceeds Threshold' flag is true.

    The flag is recognised whether the column holds booleans or the strings
    'True'/'False'. Comparing against the string 'True' alone silently counts
    nothing when pandas has parsed the column as bool.

    Parameters
    ----------
    deaths : DataFrame with 'Year', 'State' and 'Exceeds Threshold' columns.
    start_year, end_year : int
        Inclusive range of years to report.

    Returns
    -------
    DataFrame
        One row per year with a 'Year' column and one column per state seen in
        that year. A state absent from a given year is NaN in that row.
    """
    # Normalise the flag once so bool and string encodings behave the same.
    exceeded = deaths['Exceeds Threshold'].astype(str).str.strip().str.lower() == 'true'

    results = []
    for year in range(start_year, end_year + 1):
        in_year = deaths['Year'] == year
        states_in_year = deaths.loc[in_year, 'State'].unique()
        exceed_counts = deaths.loc[in_year & exceeded, 'State'].value_counts()
        # States present in the year but never flagged are reported as 0.
        state_counts = {state: int(exceed_counts.get(state, 0)) for state in states_in_year}
        results.append({'Year': year, **state_counts})

    return pd.DataFrame(results)

exceeds_threshold = count_exceeds_threshold(deaths, start_year=2017, end_year=2022)
exceeds_threshold

In [None]:
# Derive the calendar year from each row's week-ending date (this overwrites
# any existing 'Year' column in `deaths`).
deaths["Week Ending Date"] = pd.to_datetime(deaths["Week Ending Date"])
deaths["Year"] = deaths["Week Ending Date"].dt.year
# Keep only the rows flagged as exceeding the threshold, then count them per
# state and year. Note this rebinds `exceeds_threshold`, replacing the frame
# assigned in the previous cell.
exceeds_threshold = deaths[deaths["Exceeds Threshold"] == True]
result = exceeds_threshold.groupby(["State", "Year"]).size().reset_index(name="Exceeds Threshold Count")

result

In [None]:
# Merging the data
# NOTE(review): no `on=` is given, so pandas joins on whatever column names the
# two frames share. `excess_deaths` is keyed by 'Location' while
# `exceeds_threshold` is derived from `deaths`, which is keyed by 'State' —
# confirm the frames actually have a common key before relying on this merge.
finaldeaths = pd.merge(excess_deaths, exceeds_threshold, how='left')
finaldeaths.rename(columns={'State':'Location'}, inplace=True)
finaldeaths.head()

In [None]:
#finaldeaths.to_csv('Data/Excess_deaths&Exceeds_Threshold_data.csv')

In [None]:
url3 = 'https://wisevoter.com/state-rankings/population-density-by-state/'
res3 = requests.get(url3)
soup3 = BeautifulSoup(res3.content)

In [None]:
table3 = soup3.find('table', attrs={'id': 'shdb-on-page-table'})
tbody = table3.find('tbody')
trs = tbody.find_all('tr')

In [None]:
# One record per table row: the state name, and the first whitespace-separated
# token of the data cell as the density (still a string).
pop_density = [
    {
        'State': row.find(attrs={'class':'shdb-on-page-table-body-Geo'}).text,
        'Population Density per mi²': row.find(attrs={'class':'shdb-on-page-table-body-Data'}).text.split()[0],
    }
    for row in trs
]
Pop_density = pd.DataFrame(pop_density)
Pop_density.head()

In [None]:
Pop_density.to_csv('Data/Population_Density_data.csv')

## Pre-Existing Medical Conditions

### Asthma

CSV files with incidence and prevalence rates for asthma, diabetes, cancer, COPD, heart disease, and kidney disease were downloaded from the CDC website. Each CSV file is approximately 40 MB and contains responses to various questions, rates by demographic, and overall population totals for more than a decade. Each dataset must be cleaned, and unnecessary information dropped, to reduce the files to a manageable size.

In [None]:
asthma = pd.read_csv('Ignore/Asthma.csv')
asthma.shape

In [None]:
asthma.head()

In [None]:
asthma.Question.value_counts()

In [None]:
asthma.describe()

In [None]:
# We only need data for pre-existing conditions for 1 year
asthma = asthma[asthma['YearStart'] == 2019]

In [None]:
asthma.shape

In [None]:
asthma.YearStart.value_counts()

In [None]:
# Keep only the columns needed downstream.
asthma = asthma[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue',
                 'Stratification1', 'DataValueType']]
# Fixed the misspelled name `astha`, which raised NameError.
asthma.head(), asthma.shape

In [None]:
asthma['LocationDesc'].nunique()

In [None]:
# `asthma` is a filtered slice of the original frame at this point, so reassign
# instead of mutating in place (avoids SettingWithCopyWarning).
asthma = asthma.drop_duplicates()
# Keep only the whole-population rows, not the per-demographic breakdowns.
asthma = asthma[asthma['Stratification1'] == 'Overall']
asthma.shape

### Defining a cleaning function for all pre-existing medical condition data
Data for a range of medical conditions at the state level were all collected from the CDC site. Information for each disease was downloaded as a CSV with similar columns and structure to the asthma dataset. Using the steps above, each dataset was cleaned with a cleaning function.

In [None]:
def cleaning(df, year=2019):
    """Reduce a CDC condition dataset to one year's overall (non-stratified) rows.

    Prints the frame's shape before filtering, after the year filter and after
    the column selection, so the size reduction can be checked by eye.

    Parameters
    ----------
    df : DataFrame with at least the columns 'YearStart', 'LocationAbbr',
        'LocationDesc', 'Question', 'DataValue', 'Stratification1' and
        'DataValueType'.
    year : int, default 2019
        Value of 'YearStart' to keep.

    Returns
    -------
    DataFrame
        An independent copy containing only those seven columns, restricted to
        rows where 'YearStart' == `year` and 'Stratification1' == 'Overall'.
    """
    print(df.shape)
    df = df[df['YearStart'] == year]
    print(df.shape)
    df = df[['YearStart', 'LocationAbbr', 'LocationDesc', 'Question', 'DataValue', 'Stratification1', 'DataValueType']]
    print(df.shape)
    df = df[df['Stratification1'] == 'Overall']
    # Return a copy so that later in-place edits by the caller do not operate on
    # a slice of the input frame (SettingWithCopyWarning).
    return df.copy()
               

The pre-existing conditions are:
- cardiovascular disease
- high blood pressure
- COPD
- Diabetes
- Kidney Disease

These conditions were chosen because they are known to exacerbate the impacts of coronavirus: individuals with them were more likely to have severe cases and to die if infected.

Immunization data including influenza vaccination rates was also collected. 

In [None]:
heart = pd.read_csv('Ignore/Cardiovascular.csv')
heart.head()

In [None]:
# Splitting cardiac dataset into deaths from cardio diseases and diagnosed prevalence of hypertension
cardiac_mortality = heart[heart['Question'] == 'Mortality from total cardiovascular diseases']
print(cardiac_mortality.shape)

high_blood_pressure = heart[heart['Question']== 'Awareness of high blood pressure among adults aged >= 18 years']
print(high_blood_pressure.shape)

In [None]:
copd = pd.read_csv('Ignore/COPD.csv')
copd.shape

In [None]:
copd.Question.value_counts()
copd = copd[copd['Question']== 'Prevalence of chronic obstructive pulmonary disease among adults >= 18']
copd.shape

In [None]:
diabetes = pd.read_csv('Ignore/Diabetes.csv')
diabetes.shape

In [None]:
diabetes['Question'].value_counts()
diabetes = diabetes[diabetes['Question']== 'Prevalence of diagnosed diabetes among adults aged >= 18 years']
diabetes.shape

In [None]:
kidney = pd.read_csv('Ignore/Kidney.csv')

In [None]:
kidney['Question'].value_counts()
kidney = kidney[kidney['Question']== 'Prevalence of chronic kidney disease among adults aged >= 18 years']
kidney.shape

In [None]:
# Apply the shared cleaning function to each condition dataset.
cardiac_mortality = cleaning(cardiac_mortality)
high_blood_pressure = cleaning(high_blood_pressure)
copd = cleaning(copd)
diabetes = cleaning(diabetes)
# NOTE(review): no cell visible in this notebook loads `immun` (there is no
# read_csv for it), so this line would raise NameError on a fresh kernel —
# confirm where the immunization data should be read from.
immun = cleaning(immun)
kidney = cleaning(kidney)

In [None]:
#Examining the size of each dataset to ensure cleaning function worked

print(asthma.shape)
#print(cancer.shape)
print(cardiac_mortality.shape)
print(high_blood_pressure.shape)
print(copd.shape)
print(diabetes.shape)
print(immun.shape)
print(kidney.shape)

In [None]:
def adj(df, value_type='Age-adjusted Prevalence'):
    """Return the age-adjusted rows of a cleaned condition frame.

    Parameters
    ----------
    df : DataFrame with 'DataValueType', 'LocationAbbr' and 'Stratification1'
        columns.
    value_type : str, default 'Age-adjusted Prevalence'
        'DataValueType' label to keep. Pass e.g. 'Age-adjusted Rate' for
        datasets that report rates instead of prevalence.

    Returns
    -------
    DataFrame
        A new frame with the matching rows and without the 'LocationAbbr',
        'DataValueType' and 'Stratification1' columns. `df` is not modified.
    """
    selected = df[df['DataValueType'] == value_type]
    return selected.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'])

def crude(df, value_type='Prevalence'):
    """Return the crude (not age-adjusted) rows of a cleaned condition frame.

    Parameters
    ----------
    df : DataFrame with 'DataValueType', 'LocationAbbr' and 'Stratification1'
        columns.
    value_type : str, default 'Prevalence'
        'DataValueType' label to keep. Pass e.g. 'Crude Rate' for datasets that
        report rates instead of prevalence.

    Returns
    -------
    DataFrame
        A new frame with the matching rows and without the 'LocationAbbr',
        'DataValueType' and 'Stratification1' columns. `df` is not modified.
    """
    selected = df[df['DataValueType'] == value_type]
    return selected.drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'])

# Cardiac mortality is 'Age-adjusted Rate' and 'Crude Rate' not 'Prevalence'- fix this in final code

### Saving the datasets to the Data folder
Once the file size of each dataset was reduced enough and the features were reduced to those pertaining to the problem, the pre-existing conditions were saved as csvs separated as raw and age-adjusted numbers. 

In [None]:
def process_dataframes(dataframe_list):
    """Split every dataset into its age-adjusted and crude subsets.

    Parameters
    ----------
    dataframe_list : dict mapping a dataset name to its cleaned DataFrame
        (a dict, despite the parameter name).

    Returns
    -------
    dict
        {name: {'adj': adj(df), 'crude': crude(df)}} for each input entry.
    """
    return {
        name: {'adj': adj(frame), 'crude': crude(frame)}
        for name, frame in dataframe_list.items()
    }

# Map each dataset name to its cleaned DataFrame (a dict, despite the variable name).
# cardiac_mortality is not included here.
# NOTE(review): `immun` is not loaded in any cell visible in this notebook — confirm its source.
dataframe_list = {
    'copd': copd,
    'asthma': asthma,
    'high_blood_pressure': high_blood_pressure,
    'diabetes': diabetes,
    'immun': immun,
    'kidney': kidney
}
# Split every dataset into its age-adjusted and crude subsets
results = process_dataframes(dataframe_list)

# Write two CSVs per dataset: '<name>_adj.csv' and '<name>_crude.csv' under Data/Raw
for name, result in results.items():
    result['adj'].to_csv(f'Data/Raw/{name}_adj.csv', index = False)
    result['crude'].to_csv(f'Data/Raw/{name}_crude.csv', index = False)

In [None]:
# Cardiac mortality is labelled 'Age-adjusted Rate' rather than
# 'Age-adjusted Prevalence', so it is filtered here directly. The drop is chained
# instead of done in place, because the filtered frame is a slice and mutating
# it triggers SettingWithCopyWarning.
cardiac_mortality = (
    cardiac_mortality[cardiac_mortality['DataValueType'] == 'Age-adjusted Rate']
    .drop(columns=['LocationAbbr', 'DataValueType', 'Stratification1'])
)
cardiac_mortality.to_csv('Data/Raw/cardiac_mortality_adj.csv', index = False)

## Vaccination Rates by State

In [None]:
vax = pd.read_csv('Ignore/COVID-19_Vaccinations_in_the_United_States_Jurisdiction_20231101.csv')

In [None]:
vax_2021 = vax[vax['Date'] == '12/31/2021'].copy()

In [None]:
vax_2021.isna().sum()

In [None]:
vax_2021.drop(columns = ['MMWR_week', 'Date', 'Second_Booster', 'Second_Booster_50Plus_Vax_Pct', 'Second_Booster_65Plus_Vax_Pct', 'Bivalent_Booster_65Plus_Pop_Pct'], inplace=True) 

In [None]:
vax_2022 = vax[vax['Date'] == '12/28/2022'].copy()

In [None]:
vax_2022.drop(columns = ['MMWR_week', 'Date', 'Second_Booster'], inplace =True)

In [None]:
vax_2022.isna().sum()

In [None]:
vax_state = pd.merge(vax_2021, vax_2022, how ='inner', on='Location', suffixes=('_2021', '_2022'))

In [None]:
vax_state.isna().sum()

In [None]:
vax_state.to_csv('Data/Cleaned/vax_state.csv', index=False)

# County Level Data

### Mask Data

In [None]:
county_mask = pd.read_csv('Ignore/U.S._State_and_Territorial_Public_Mask_Mandates_From_April_10__2020_through_August_15__2021_by_County_by_Day.csv')
# 'display.max_rows' is set to None at the top of this notebook, so a bare frame
# would render every row of this county-by-day file; preview the head instead.
county_mask.head()

In [None]:
# Chat GPT was consulted to create more efficient code which ran faster
def mandate_length(data):
    """Count, per county, the rows on which public mask wearing was required.

    Counties are identified by (state, county name). Keying on the county name
    alone merges same-named counties from different states into one count and
    attributes it to whichever state appeared first.

    Parameters
    ----------
    data : DataFrame with 'State_Tribe_Territory', 'County_Name' and
        'Face_Masks_Required_in_Public' columns.

    Returns
    -------
    DataFrame
        Columns 'State', 'County' and 'Count', one row per county that has at
        least one 'Yes' row, in order of first appearance. Rows missing a state
        or county name are skipped.
    """
    required = data[data['Face_Masks_Required_in_Public'] == 'Yes']
    counts = (
        required.groupby(['State_Tribe_Territory', 'County_Name'], sort=False)
        .size()
        .reset_index(name='Count')
        .rename(columns={'State_Tribe_Territory': 'State', 'County_Name': 'County'})
    )
    return counts[['State', 'County', 'Count']]

county_counts_df = mandate_length(county_mask)

In [None]:
county_counts_df.to_csv('Data/Raw/county_mask_mandata.csv', index=False)

## Covid Vaccination Rates by county

In [None]:
# The initial dataset was downloaded from the CDC site and stored in the Ignore Folder
covid_vax = pd.read_csv('Ignore/COVID-19_Vaccinations_by_county.csv')
covid_vax.shape()

In [None]:
covid_vax.head()

In [None]:
# keeping 9/18/2021 & 12/28/2022 for point in time data 
early_vax = covid_vax[covid_vax['Date'] == '9/18/2021'].copy()
late_vax = covid_vax[covid_vax['Date'] == '12/28/2022'].copy()

In [None]:
early_vax.drop(columns=['Series_Complete_Pop_Pct',
                        'Series_Complete_65PlusPop_Pct'], inplace = True)
early_vax.head()

In [None]:
early_vax.dropna(inplace=True)

In [None]:
late_vax.drop(columns=['Date', 'MMWR_week', 'Series_Complete_65PlusPop_Pct', 'Census2019', 'Census2019_65PlusPop'], inplace=True)
late_vax.head()

In [None]:
# Saving the smaller csv files into the raw data folder
# NOTE(review): these paths point at '../Data', while most other cells write
# under 'Data/' — confirm which directory is intended.
early_vax.to_csv('../Data/Raw/early_vax_2021.csv', index=False)
# The method is `to_csv`; `tocsv` raised AttributeError.
late_vax.to_csv('../Data/Raw/late_vax_2022.csv', index=False)

## Population

In [1]:
# Reading in the population data downloaded from usafacts
pop = pd.read_csv('./Ignore/covid_county_population_usafacts.csv')
pop.describe()

NameError: name 'pd' is not defined

In [None]:
pop.head()

In [None]:
# States dictionary
state_name = {
    'AL':'Alabama',
    'AK':'Alaska',
    'AZ':'Arizona',
    'AR':'Arkansas',
    'CA':'California',
    'CO':'Colorado',
    'CT':'Connecticut',
    'DE':'Delaware',
    'DC':'District of Columbia',
    'FL':'Florida',
    'GA':'Georgia',
    'HI':'Hawaii',
    'ID':'Idaho',
    'IL':'Illinois',
    'IN':'Indiana',
    'IA':'Iowa',
    'KS':'Kansas',
    'KY':'Kentucky',
    'LA':'Louisiana',
    'ME':'Maine',
    'MD':'Maryland',
    'MA':'Massachusetts',
    'MI':'Michigan',
    'MN':'Minnesota',
    'MS':'Mississippi',
    'MO':'Missouri',
    'MT':'Montana',
    'NE':'Nebraska',
    'NV':'Nevada',
    'NH':'New Hampshire',
    'NJ':'New Jersey',
    'NM':'New Mexico',
    'NY':'New York',
    'NC':'North Carolina',
    'ND':'North Dakota',
    'OH':'Ohio',
    'OK':'Oklahoma',
    'OR':'Oregon',
    'PA':'Pennsylvania',
    'RI':'Rhode Island',
    'SC':'South Carolina',
    'SD':'South Dakota',
    'TN':'Tennessee',
    'TX':'Texas',
    'UT':'Utah',
    'VT':'Vermont',
    'VA':'Virginia',
    'WA':'Washington',
    'WV':'West Virginia',
    'WI':'Wisconsin',
    'WY':'Wyoming'
}

# List of states that can be used in cleaning functions
state_abbreviations = list(state_name.keys())

In [None]:
# Total population per state, summed over that state's county rows.
# groupby replaces the row-by-row accumulation; sort=False keeps the states in
# order of first appearance, as the manual loop did.
state_pop_sum = pop.groupby('State', sort=False)['population'].sum().to_dict()

for state, pop_sum in state_pop_sum.items():
    print(f"{state}: {pop_sum}")

In [None]:
tot_pop = pd.DataFrame(list(state_pop_sum.items()), columns=['State', 'Total Population'])
tot_pop.head()

In [None]:
merged_pop = pd.merge(pop, tot_pop, on='State')
merged_pop.head()

In [None]:
merged_pop['pop %'] = (merged_pop['population'] / merged_pop['Total Population'])*100
merged_pop.head()

In [None]:
# Saving pop percent 
merged_pop.to_csv('./Data/Cleaned/population_w_percent.csv', index = False)

## Covid Cases by County

In [None]:
cases = pd.read_csv('Ignore/covid_confirmed_usafacts.csv')
cases.head()

cases['County'] = cases['County Name'].str.replace(r'\bCounty\b', '', regex=True).str.strip()

new_pop = pd.read_csv('./Data/Cleaned/population_w_percent.csv')
new_pop['pop %'] = new_pop['pop %']/100
pop_percent = new_pop.drop(columns=['County Name', 'State', 'population', 'Total Population'])


In [None]:
cases = pd.merge(cases, pop_percent, how = 'left', on='countyFIPS').copy()

In [None]:
cases_2020 = cases[['countyFIPS', 'County', 'State', 'StateFIPS', '2020-12-31', 'pop %']].copy()
cases_2020.rename(columns = {'2020-12-31': 'cases_2020', 'pop %': 'pop_per'}, inplace =True)

In [None]:
# Spread each state's 'Statewide Unallocated' case count over its rows in
# proportion to each row's share of the state population ('pop_per').
# Cast to float up front: the shares are fractional, and adding floats into an
# integer column relies on silent upcasting, which newer pandas deprecates.
cases_2020['cases_2020'] = cases_2020['cases_2020'].astype(float)

for state_abbr in state_abbreviations:
    in_state = cases_2020['State'] == state_abbr
    unallocated = cases_2020.loc[
        in_state & (cases_2020['County'] == 'Statewide Unallocated'),
        'cases_2020'
    ]
    if unallocated.empty:
        # No unallocated row for this state, so there is nothing to distribute.
        continue
    total_unallocated = unallocated.iloc[0]

    # One vectorised update per state instead of one filtered update per county.
    cases_2020.loc[in_state, 'cases_2020'] += (
        total_unallocated * cases_2020.loc[in_state, 'pop_per']
    )

cases_2020 = cases_2020.drop_duplicates()

In [None]:
# Cumulative confirmed cases as of 2021-12-31, with each county's population share.
cases_2021 = (
    cases[['countyFIPS', 'County', 'State', 'StateFIPS', '2021-12-31', 'pop %']]
    .rename(columns={'2021-12-31': 'cases_2021', 'pop %': 'pop_per'})
)
# Cast to float up front: the shares are fractional, and adding floats into an
# integer column relies on silent upcasting, which newer pandas deprecates.
cases_2021['cases_2021'] = cases_2021['cases_2021'].astype(float)

# Spread each state's 'Statewide Unallocated' count over its rows in proportion
# to population share.
for state_abbr in state_abbreviations:
    in_state = cases_2021['State'] == state_abbr
    unallocated = cases_2021.loc[
        in_state & (cases_2021['County'] == 'Statewide Unallocated'),
        'cases_2021'
    ]
    if unallocated.empty:
        # No unallocated row for this state, so there is nothing to distribute.
        continue
    total_unallocated = unallocated.iloc[0]

    # One vectorised update per state instead of one filtered update per county.
    cases_2021.loc[in_state, 'cases_2021'] += (
        total_unallocated * cases_2021.loc[in_state, 'pop_per']
    )

cases_2021 = cases_2021.drop_duplicates()

In [None]:
cases_2022 = cases[['countyFIPS', 'County', 'State', 'StateFIPS', '2022-12-31', 'pop %']].copy()
cases_2022.rename(columns = {'2022-12-31': 'cases_2022', 'pop %': 'pop_per'}, inplace =True)

In [None]:
# Cast to float up front: the shares are fractional, and adding floats into an
# integer column relies on silent upcasting, which newer pandas deprecates.
cases_2022['cases_2022'] = cases_2022['cases_2022'].astype(float)

# Spread each state's 'Statewide Unallocated' count over its rows in proportion
# to population share.
for state_abbr in state_abbreviations:
    in_state = cases_2022['State'] == state_abbr
    unallocated = cases_2022.loc[
        in_state & (cases_2022['County'] == 'Statewide Unallocated'),
        'cases_2022'
    ]
    if unallocated.empty:
        # No unallocated row for this state, so there is nothing to distribute.
        continue
    total_unallocated = unallocated.iloc[0]

    # One vectorised update per state instead of one filtered update per county.
    cases_2022.loc[in_state, 'cases_2022'] += (
        total_unallocated * cases_2022.loc[in_state, 'pop_per']
    )
cases_2022 = cases_2022.drop_duplicates()

# Join the three yearly frames on countyFIPS. The first merge suffixes the shared
# columns with _x/_y; the 2022 columns then come in unsuffixed.
merged_cases = pd.merge(cases_2020, cases_2021, how='left', on='countyFIPS')
county_cases = pd.merge(merged_cases, cases_2022, how='left', on='countyFIPS')

# Keep the 2020 (_x) identifiers and the unsuffixed 2022 'pop_per'; drop the
# redundant 2021 (_y) and 2022 identifier columns.
county_cases = county_cases.drop(
    columns=['County', 'State', 'StateFIPS', 'pop_per_x', 'County_y', 'State_y',
             'StateFIPS_y', 'pop_per_y']
)
county_cases.head()

In [None]:
columns_to_check_duplicates = ['countyFIPS', 'County_x', 'State_x', 'StateFIPS_x']
county_cases.drop_duplicates(subset=columns_to_check_duplicates, keep='first', inplace=True)
county_cases.rename(columns = {'County_x' : 'County', 'State_x': 'State', 'StateFIPS_x': 'StateFIPS'}, inplace =True)


In [None]:
county_cases = county_cases.drop_duplicates()
county_cases.shape

#### Reading in the county Health Rankings Data

In [None]:
rank = pd.read_csv('./Data/2019 County Health Rankings Data - cleaned.csv')

## Deaths by County

In [None]:
# Note: this rebinds `deaths`, which held the excess-deaths data earlier in the notebook.
deaths = pd.read_csv('./Ignore/covid_deaths_usafacts.csv')
# 'display.max_rows' is set to None at the top of this notebook, so a bare frame
# would render every row; preview the head instead.
deaths.head()

In [None]:
deaths['County'] = deaths['County Name'].str.replace(r'\bCounty\b', '', regex=True).str.strip()
deaths = pd.merge(deaths, pop_percent, how = 'left', on='countyFIPS').copy()


In [None]:
# Cumulative deaths as of 2020-12-31, with each county's population share.
deaths_2020 = (
    deaths[['countyFIPS', 'County', 'State', 'StateFIPS', '2020-12-31', 'pop %']]
    .rename(columns={'2020-12-31': 'deaths_2020', 'pop %': 'pop_per'})
)
# Cast to float up front: the shares are fractional, and adding floats into an
# integer column relies on silent upcasting, which newer pandas deprecates.
deaths_2020['deaths_2020'] = deaths_2020['deaths_2020'].astype(float)

# Spread each state's 'Statewide Unallocated' count over its rows in proportion
# to population share.
for state_abbr in state_abbreviations:
    in_state = deaths_2020['State'] == state_abbr
    unallocated = deaths_2020.loc[
        in_state & (deaths_2020['County'] == 'Statewide Unallocated'),
        'deaths_2020'
    ]
    if unallocated.empty:
        # No unallocated row for this state, so there is nothing to distribute.
        continue
    total_unallocated = unallocated.iloc[0]

    # One vectorised update per state instead of one filtered update per county.
    deaths_2020.loc[in_state, 'deaths_2020'] += (
        total_unallocated * deaths_2020.loc[in_state, 'pop_per']
    )

In [None]:
deaths_2020 = deaths_2020.drop_duplicates()

# Cumulative deaths as of 2021-12-31, with each county's population share.
deaths_2021 = (
    deaths[['countyFIPS', 'County', 'State', 'StateFIPS', '2021-12-31', 'pop %']]
    .rename(columns={'2021-12-31': 'deaths_2021', 'pop %': 'pop_per'})
)
# Cast to float up front: the shares are fractional, and adding floats into an
# integer column relies on silent upcasting, which newer pandas deprecates.
deaths_2021['deaths_2021'] = deaths_2021['deaths_2021'].astype(float)

# Spread each state's 'Statewide Unallocated' count over its rows in proportion
# to population share.
for state_abbr in state_abbreviations:
    in_state = deaths_2021['State'] == state_abbr
    unallocated = deaths_2021.loc[
        in_state & (deaths_2021['County'] == 'Statewide Unallocated'),
        'deaths_2021'
    ]
    if unallocated.empty:
        # No unallocated row for this state, so there is nothing to distribute.
        continue
    total_unallocated = unallocated.iloc[0]

    # One vectorised update per state instead of one filtered update per county.
    deaths_2021.loc[in_state, 'deaths_2021'] += (
        total_unallocated * deaths_2021.loc[in_state, 'pop_per']
    )

deaths_2021 = deaths_2021.drop_duplicates()


In [None]:
# Cumulative deaths as of 2022-12-31, with each county's population share.
deaths_2022 = (
    deaths[['countyFIPS', 'County', 'State', 'StateFIPS', '2022-12-31', 'pop %']]
    .rename(columns={'2022-12-31': 'deaths_2022', 'pop %': 'pop_per'})
)
# Cast to float up front: the shares are fractional, and adding floats into an
# integer column relies on silent upcasting, which newer pandas deprecates.
deaths_2022['deaths_2022'] = deaths_2022['deaths_2022'].astype(float)

# Spread each state's 'Statewide Unallocated' count over its rows in proportion
# to population share.
for state_abbr in state_abbreviations:
    in_state = deaths_2022['State'] == state_abbr
    unallocated = deaths_2022.loc[
        in_state & (deaths_2022['County'] == 'Statewide Unallocated'),
        'deaths_2022'
    ]
    if unallocated.empty:
        # No unallocated row for this state, so there is nothing to distribute.
        continue
    total_unallocated = unallocated.iloc[0]

    # One vectorised update per state instead of one filtered update per county.
    deaths_2022.loc[in_state, 'deaths_2022'] += (
        total_unallocated * deaths_2022.loc[in_state, 'pop_per']
    )

deaths_2022 = deaths_2022.drop_duplicates()


In [None]:
merged_deaths = pd.merge(deaths_2020, deaths_2021, how='left', on='countyFIPS')
county_deaths = pd.merge(merged_deaths, deaths_2022, how='left', on='countyFIPS')
county_deaths.drop(columns = ['County','State', 'StateFIPS', 'pop_per_x', 'County_y', 'State_y', 'StateFIPS_y',
                  'pop_per_y'], inplace=True)
county_deaths.head()

In [None]:
columns_to_check_duplicates = ['countyFIPS', 'County_x', 'State_x', 'StateFIPS_x']

county_deaths.drop_duplicates(subset=columns_to_check_duplicates, keep='first', inplace=True)

county_deaths.rename(columns = {'County_x' : 'County', 'State_x': 'State', 'StateFIPS_x': 'StateFIPS'}, inplace =True)

county_deaths = county_deaths.drop_duplicates()
county_deaths.head()

In [None]:
print(county_deaths.shape)
print(county_cases.shape)

In [None]:
county_cases.drop(columns = ['StateFIPS','pop_per'], inplace=True)
county_deaths.drop(columns = ['County', 'countyFIPS', 'State','StateFIPS','pop_per'], inplace=True)

In [None]:
# Place the case and death columns side by side. axis=1 aligns rows on the index,
# not on countyFIPS (which was dropped from county_deaths above).
# NOTE(review): this assumes both frames kept identical row indexes through the
# earlier merges and de-duplication — confirm, or join on a key instead.
cases_deaths = pd.concat([county_cases,county_deaths], axis=1, join='outer')


In [None]:
cases_deaths.rename(columns={'countyFIPS': 'FIPS'},inplace=True)

In [None]:
rcd = pd.merge(rank,cases_deaths, how='inner', on='FIPS')
rcd.shape

In [None]:
## County Mask Mandates

In [None]:
mask = pd.read_csv('./Data/Raw/county_mask_mandata.csv')
#We can't drop state, some states have counties with the same name...
mask = mask.rename(columns = {'Count' : 'Masks'})
mask['County'] = mask['County'].str.replace(r'\bCounty\b', '', regex=True).str.strip()


In [1]:
mask.head()

In [None]:
rcd.isna().sum()

In [None]:
# Look at just the columns with missing data
missing_columns = rcd.columns[rcd.isnull().any()]
print(missing_columns)

# We will have to address each of these columns and drop

In [None]:
rcd.rename(columns={'County_x': 'County'}, inplace=True)
rcd.rename(columns={'State_y': 'State'}, inplace=True)

In [None]:
county_data = pd.merge(rcd,mask, on=['State', 'County'], how='left')

In [None]:
county_data.State_x.nunique()

In [None]:
county_data['Masks'].isna().sum()

In [None]:
# Counties with no mask-mandate record get 0 days. Assign the result back:
# an in-place fillna on a selected column is chained assignment, which may act on
# a temporary copy and leave `county_data` unchanged under pandas Copy-on-Write.
county_data['Masks'] = county_data['Masks'].fillna(0)

In [None]:
county_data['Masks'].isna().sum()

In [None]:
county_vax = pd.read_csv('Data/Cleaned/county_vax_2021.csv')

In [None]:
county_vax.shape

In [None]:
county_vax.rename(columns={'Recip_County': 'County', 'Recip_State': 'State'}, inplace=True)

In [None]:
county_vax.head()

In [None]:
county_data['FIPS'] = county_data['FIPS'].astype('object')

In [None]:
county_data.drop(columns=['State_x', 'County_y'], inplace=True)

In [None]:
county_vax['County'] = county_vax['County'].str.replace(r'\bCounty\b', '', regex=True).str.strip()

In [None]:
df = pd.merge(county_data, county_vax, how='inner', on=['County', 'State'])
print(df.shape)
df.head()

In [None]:
df.to_csv('Data/Cleaned/county_df2.csv', index=False)