In [60]:
import pandas as pd

## Approach
We want to clean and combine all of our various datasets and export them as one CSV. This will primarily involve dropping columns that we aren't interested in and merging the rest onto a central Pandas DataFrame. After some exploratory analysis, we will determine which rows need to be dropped or interpolated. The primary key that we merge all these datasets on after cleaning is the FIPS code, a unique integer value representing each county/state.

### County Info
The vaccine hesitancy dataset has values for multiple segments (ethnicity, social vulnerability, vaccine hesitancy), which we will split out into separate variables and look at each. Our primary index of county codes is given by Federal Information Processing Standards (FIPS), which we extract with useful identifiers of the county (its name and state). For additional avenues for data exploration, we add in the census defined region and division values for each county.

In [61]:
# Master dataset: one row per county, keyed by its FIPS code.
vaccine_hesitancy_data = (
    pd.read_csv('datasets/raw/Vaccine_Hesitancy_for_COVID-19__County_and_local_estimates.csv', encoding='utf-8')
    .rename(columns={'FIPS Code': 'fips'})
)

# County identifiers only; state names are title-cased (e.g. 'District Of Columbia').
county = (
    vaccine_hesitancy_data[['fips', 'County Name', 'State']]
    .rename(columns={'County Name': 'county_name', 'State': 'state'})
    .assign(state=lambda frame: frame['state'].str.title())
)
county

Unnamed: 0,fips,county_name,state
0,1131,"Wilcox County, Alabama",Alabama
1,1129,"Washington County, Alabama",Alabama
2,1133,"Winston County, Alabama",Alabama
3,1127,"Walker County, Alabama",Alabama
4,2013,"Aleutians East Borough, Alaska",Alaska
...,...,...,...
3137,55079,"Milwaukee County, Wisconsin",Wisconsin
3138,55121,"Trempealeau County, Wisconsin",Wisconsin
3139,56001,"Albany County, Wyoming",Wyoming
3140,55067,"Langlade County, Wisconsin",Wisconsin


In [62]:
# Each entry is (states, division, region). State names are title-cased to
# match county['state'] (hence 'District Of Columbia').
regions = [
    (['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont'], 'New England', 'Northeast'),
    (['New Jersey', 'New York', 'Pennsylvania'], 'Midatlantic', 'Northeast'),
    (['Indiana', 'Illinois', 'Michigan', 'Ohio', 'Wisconsin'], 'East North Central', 'Midwest'),
    (['Iowa', 'Nebraska', 'Kansas', 'North Dakota', 'Minnesota', 'South Dakota', 'Missouri'], 'West North Central', 'Midwest'),
    (['Delaware', 'District Of Columbia', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia', 'West Virginia'], 'South Atlantic', 'South'),
    (['Alabama', 'Kentucky', 'Mississippi', 'Tennessee'], 'East South Central', 'South'),
    (['Arkansas', 'Louisiana', 'Oklahoma', 'Texas'], 'West South Central', 'South'),
    (['Arizona', 'Colorado', 'Idaho', 'New Mexico', 'Montana', 'Utah', 'Nevada', 'Wyoming'], 'Mountain', 'West'),
    (['Alaska', 'California', 'Hawaii', 'Oregon', 'Washington'], 'Pacific', 'West'),
]

# Flatten the groups into state -> division and state -> region lookups.
state_to_division = {state: division for states, division, _ in regions for state in states}
state_to_region = {state: region for states, _, region in regions for state in states}

# States absent from the lookups are left as NaN.
county['division'] = county['state'].map(state_to_division)
county['region'] = county['state'].map(state_to_region)
county

Unnamed: 0,fips,county_name,state,division,region
0,1131,"Wilcox County, Alabama",Alabama,East South Central,South
1,1129,"Washington County, Alabama",Alabama,East South Central,South
2,1133,"Winston County, Alabama",Alabama,East South Central,South
3,1127,"Walker County, Alabama",Alabama,East South Central,South
4,2013,"Aleutians East Borough, Alaska",Alaska,Pacific,West
...,...,...,...,...,...
3137,55079,"Milwaukee County, Wisconsin",Wisconsin,East North Central,Midwest
3138,55121,"Trempealeau County, Wisconsin",Wisconsin,East North Central,Midwest
3139,56001,"Albany County, Wyoming",Wyoming,Mountain,West
3140,55067,"Langlade County, Wisconsin",Wisconsin,East North Central,Midwest


### Ethnicity 
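The percentage of each ethnicity is given for each county. For readability and simplicity, we name each column after the most abundant group in its category (e.g. `ethnicity_hawaiian` for Native Hawaiian/Pacific Islander) and drop the "non-Hispanic" qualifier, which is implied for every group other than Hispanic.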

In [63]:
# Source column -> short name; the same mapping drives both the column
# selection and the rename.
ethnicity_columns = {
    'Percent Hispanic': 'ethnicity_hispanic',
    'Percent non-Hispanic American Indian/Alaska Native': 'ethnicity_native',
    'Percent non-Hispanic Asian': 'ethnicity_asian',
    'Percent non-Hispanic Black': 'ethnicity_black',
    'Percent non-Hispanic Native Hawaiian/Pacific Islander': 'ethnicity_hawaiian',
    'Percent non-Hispanic White': 'ethnicity_white',
}
ethnicity = vaccine_hesitancy_data[['fips', *ethnicity_columns]].rename(columns=ethnicity_columns)
ethnicity

Unnamed: 0,fips,ethnicity_hispanic,ethnicity_native,ethnicity_asian,ethnicity_black,ethnicity_hawaiian,ethnicity_white
0,1131,0.0053,0.0009,0.0003,0.6938,0.0000,0.2684
1,1129,0.0146,0.0731,0.0025,0.2354,0.0000,0.6495
2,1133,0.0315,0.0034,0.0016,0.0073,0.0005,0.9370
3,1127,0.0249,0.0015,0.0049,0.0617,0.0000,0.8895
4,2013,0.0901,0.4588,0.1968,0.0322,0.0100,0.1321
...,...,...,...,...,...,...,...
3137,55079,0.1500,0.0047,0.0428,0.2606,0.0002,0.5124
3138,55121,0.0840,0.0034,0.0043,0.0051,0.0000,0.8953
3139,56001,0.0953,0.0091,0.0327,0.0150,0.0003,0.8248
3140,55067,0.0197,0.0069,0.0022,0.0125,0.0002,0.9383


### Social Vulnerability

In [64]:
# Keep only the FIPS key and the SVI score, under a snake_case name.
svi_source_column = 'Social Vulnerability Index (SVI)'
social_vulnerability_index = (
    vaccine_hesitancy_data[['fips', svi_source_column]]
    .rename(columns={svi_source_column: 'social_vulnerability_index'})
)
social_vulnerability_index

Unnamed: 0,fips,social_vulnerability_index
0,1131,0.93
1,1129,0.73
2,1133,0.70
3,1127,0.75
4,2013,0.58
...,...,...
3137,55079,0.81
3138,55121,0.28
3139,56001,0.25
3140,55067,0.35


### Vaccine Hesitancy

In [65]:
vaccine_hesitancy = (
    vaccine_hesitancy_data[['fips', 'Estimated hesitant', 'Estimated strongly hesitant']]
    .rename(columns={
        'Estimated hesitant': 'vaccine_hesitant',
        'Estimated strongly hesitant': 'vaccine_hesitant_strong',
    })
)

# Bucket counties by how far their hesitancy sits from the mean:
# more than one standard deviation above -> 'High', more than one below -> 'Low',
# everything else (including missing values) -> 'Medium'.
vaccine_hesitant_mean = vaccine_hesitancy['vaccine_hesitant'].mean()
vaccine_hesitant_std = vaccine_hesitancy['vaccine_hesitant'].std()
is_high = vaccine_hesitancy['vaccine_hesitant'] > vaccine_hesitant_mean + vaccine_hesitant_std
is_low = vaccine_hesitancy['vaccine_hesitant'] < vaccine_hesitant_mean - vaccine_hesitant_std

vaccine_hesitancy['vaccine_hesitant_category'] = 'Medium'
vaccine_hesitancy.loc[is_high, 'vaccine_hesitant_category'] = 'High'
vaccine_hesitancy.loc[is_low, 'vaccine_hesitant_category'] = 'Low'
vaccine_hesitancy

Unnamed: 0,fips,vaccine_hesitant,vaccine_hesitant_strong,vaccine_hesitant_category
0,1131,0.23,0.11,Medium
1,1129,0.23,0.11,Medium
2,1133,0.22,0.11,Medium
3,1127,0.23,0.11,Medium
4,2013,0.26,0.12,High
...,...,...,...,...
3137,55079,0.18,0.11,Medium
3138,55121,0.18,0.10,Medium
3139,56001,0.30,0.16,High
3140,55067,0.17,0.10,Medium


### Education

We keep four columns from the education dataset representing percentage of the entire adult population with specific educational attainment signifiers (less than high school degree, high school diploma only, some post-secondary degree, completion of bachelor's degree) and make sure to represent all the percentages as decimal values.

In [66]:
# Source column -> short name for the attainment shares we keep.
education_attainment_columns = {
    'Percent of adults with less than a high school diploma, 2015-19': 'education_high_school_less',
    'Percent of adults with a high school diploma only, 2015-19': 'education_high_school_only',
    "Percent of adults completing some college or associate's degree, 2015-19": 'education_degree_some',
    "Percent of adults with a bachelor's degree or higher, 2015-19": 'education_bachelors_degree',
}
education_cols = list(education_attainment_columns.values())

education = pd.read_csv('datasets/raw/Education.csv', encoding='iso-8859-1')
education = (
    education[['FIPS Code', *education_attainment_columns]]
    .rename(columns={'FIPS Code': 'fips', **education_attainment_columns})
)
# Percentages (0-100) -> decimal fractions (0-1).
education[education_cols] = education[education_cols] / 100
education

Unnamed: 0,fips,education_high_school_less,education_high_school_only,education_degree_some,education_bachelors_degree
0,0,0.120,0.270,0.289,0.321
1,1000,0.138,0.308,0.299,0.255
2,1001,0.115,0.336,0.284,0.266
3,1003,0.092,0.277,0.313,0.319
4,1005,0.268,0.356,0.260,0.116
...,...,...,...,...,...
3278,72145,0.284,0.262,0.241,0.212
3279,72147,0.288,0.392,0.140,0.180
3280,72149,0.220,0.384,0.197,0.199
3281,72151,0.290,0.257,0.272,0.180


#### Finding missing data
Now that we've extracted the useful data from that dataset, we check whether it is missing any counties that appear in our master vaccine hesitancy dataset of county info. Using pandas `isin` as a boolean mask on the two FIPS key columns, we effectively compute the set difference between them. Luckily, our education data is complete and no counties are missing.

In [67]:
# Counties in the master table whose FIPS code has no row in the education dataset
county.loc[~county['fips'].isin(education['fips'])]

Unnamed: 0,fips,county_name,state,division,region


### Poverty
From the Poverty Estimate dataset, we keep the column of values for the percentage of the entire population that is in poverty in 2019. The data is in a narrow format, so we use pivot and make sure to represent the percentage as a decimal.

In [68]:
# The raw file is in long format (one row per FIPS/Attribute pair); pivot it
# so that every attribute becomes its own column.
poverty_wide = (
    pd.read_csv('datasets/raw/PovertyEstimates.csv')
    [['FIPStxt', 'Attribute', 'Value']]
    .pivot(index='FIPStxt', columns='Attribute', values='Value')
    .reset_index()
)
poverty = (
    poverty_wide[['FIPStxt', 'PCTPOVALL_2019']]
    .rename(columns={'FIPStxt': 'fips', 'PCTPOVALL_2019': 'poverty'})
    # Percentage (0-100) -> decimal fraction (0-1).
    .assign(poverty=lambda frame: frame['poverty'] / 100)
)
poverty

Attribute,fips,poverty
0,0,0.123
1,1000,0.156
2,1001,0.121
3,1003,0.101
4,1005,0.271
...,...,...
3188,56037,0.083
3189,56039,0.060
3190,56041,0.085
3191,56043,0.111


In [69]:
# Counties in the master table whose FIPS code has no row in the poverty dataset
county.loc[~county['fips'].isin(poverty['fips'])]

Unnamed: 0,fips,county_name,state,division,region
48,15005,"Kalawao County, Hawaii",Hawaii,Pacific,West


### Natality

In [70]:
# thousands=',' makes pandas parse comma-grouped population counts as integers;
# without it the population column is read as strings.
natality = (
    pd.read_csv('datasets/raw/PopulationEstimates.csv', thousands=',', encoding='iso-8859-1')
    [['FIPStxt', 'POP_ESTIMATE_2019', 'R_birth_2019']]
    .rename(columns={'FIPStxt': 'fips', 'POP_ESTIMATE_2019': 'population', 'R_birth_2019': 'birth_rate'})
    # NOTE(review): the raw R_birth_2019 values are around 10-12, which looks like
    # births per 1,000 people rather than a percentage. If so, dividing by 100 does
    # not yield a decimal fraction -- confirm the units against the data dictionary.
    .assign(birth_rate=lambda frame: frame['birth_rate'] / 100)
)
natality

Unnamed: 0,fips,population,birth_rate
0,0,328239523,
1,1000,4903185,0.117
2,1001,55869,0.112
3,1003,223234,0.104
4,1005,24686,0.103
...,...,...,...
3268,72145,50023,
3269,72147,8386,
3270,72149,21372,
3271,72151,32282,


In [71]:
# Counties in the master table whose FIPS code has no row in the natality dataset
county.loc[~county['fips'].isin(natality['fips'])]

Unnamed: 0,fips,county_name,state,division,region


### Elections
The original election dataset is given as Democrat, Republican and Other votes for each presidential election from 2008 to 2020. We simplify this data into something easier for us to analyze by finding the number of times the Democrat party has won in the past four presidential elections and similarly for the Republican party. This gives a sense of the political leaning for each county.

In [72]:
election_years = [2008, 2012, 2016, 2020]


def find_election_winner(row, year):
    """Return which party received more votes in ``row`` for ``year``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding the vote
            counts under the keys ``dem_<year>`` and ``gop_<year>``.
        year: Election year used to build those keys.

    Returns:
        'Democrat' when the Democratic count is strictly greater than the
        Republican count, otherwise 'Republican' (ties included).
    """
    dem_votes = row[f'dem_{year}']
    gop_votes = row[f'gop_{year}']
    return 'Democrat' if dem_votes > gop_votes else 'Republican'
# 2008-2016 results already use dem_<year>/gop_<year> column names; the 2020
# file is renamed to match before merging.
elections_data = pd.read_csv('datasets/raw/US_County_Level_Presidential_Results_08-16.csv').rename(columns={'fips_code': 'fips'})
elections_data_2020 = pd.read_csv('datasets/raw/2020_US_County_Level_Presidential_Results.csv').rename(columns={'votes_gop': 'gop_2020', 'votes_dem': 'dem_2020', 'county_fips': 'fips'})
# Default (inner) merge: only counties present in both files are kept.
elections_data = elections_data.merge(elections_data_2020[['fips', 'gop_2020', 'dem_2020']], on='fips').drop(columns=['county'])

# One winner column per election year, computed column-wise instead of with a
# row-by-row apply. The rule is unchanged: 'Democrat' only when the Democratic
# count is strictly greater; ties and missing counts resolve to 'Republican'.
elections_winner = elections_data[['fips']].copy()
for year in election_years:
    dem_ahead = elections_data['dem_' + str(year)] > elections_data['gop_' + str(year)]
    elections_winner['election_' + str(year)] = dem_ahead.map({True: 'Democrat', False: 'Republican'})

# Prefix every raw vote column with 'election_' while keeping the 'fips' key name.
elections_data = elections_data.rename(lambda col: 'election_' + col, axis='columns').rename(columns={'election_fips': 'fips'})
elections_winner

Unnamed: 0,fips,election_2008,election_2012,election_2016,election_2020
0,26041,Democrat,Republican,Republican,Republican
1,48295,Republican,Republican,Republican,Republican
2,1127,Republican,Republican,Republican,Republican
3,48389,Democrat,Democrat,Democrat,Republican
4,56017,Republican,Republican,Republican,Republican
...,...,...,...,...,...
3106,17115,Democrat,Republican,Republican,Republican
3107,29215,Republican,Republican,Republican,Republican
3108,46051,Republican,Republican,Republican,Republican
3109,17103,Republican,Republican,Republican,Republican


In [73]:
# Count, per county, how many of the tracked elections each party won.
winner_columns = ['election_' + str(year) for year in election_years]
yearly_winners = elections_winner[winner_columns]
elections_winner['election_democrat_wins'] = yearly_winners.eq('Democrat').sum(axis=1)
elections_winner['election_republican_wins'] = yearly_winners.eq('Republican').sum(axis=1)

# Attach the win counts to the raw vote totals.
elections = (
    elections_winner[['fips', 'election_democrat_wins', 'election_republican_wins']]
    .merge(elections_data, on='fips')
)
elections

Unnamed: 0,fips,election_democrat_wins,election_republican_wins,election_total_2008,election_dem_2008,election_gop_2008,election_oth_2008,election_total_2012,election_dem_2012,election_gop_2012,election_oth_2012,election_total_2016,election_dem_2016,election_gop_2016,election_oth_2016,election_gop_2020,election_dem_2020
0,26041,1,3,19064,9974,8763,327,18043,8330,9533,180,18467,6431,11112,924,13207,7606
1,48295,0,4,1256,155,1093,8,1168,119,1044,5,1322,135,1159,28,1205,131
2,1127,0,4,28652,7420,20722,510,28497,6551,21633,313,29243,4486,24208,549,26002,4834
3,48389,3,1,3077,1606,1445,26,2867,1649,1185,33,3184,1659,1417,108,2254,1395
4,56017,0,4,2546,619,1834,93,2495,523,1894,78,2535,400,1939,196,1999,482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,17115,1,3,51216,25487,24948,781,48742,22688,25249,805,47283,18211,26782,2290,28589,19847
3107,29215,0,4,10851,3410,7215,226,10764,2871,7618,275,10935,1728,8875,332,9478,1716
3108,46051,0,4,3830,1786,1951,93,3606,1493,2034,79,3562,970,2381,211,2618,1056
3109,17103,0,4,16318,7765,8258,295,15275,6932,8046,297,15215,5499,8597,1119,9630,6407


In [74]:
# Counties in the master table whose FIPS code has no row in the elections dataset
county.loc[~county['fips'].isin(elections['fips'])]

Unnamed: 0,fips,county_name,state,division,region
4,2013,"Aleutians East Borough, Alaska",Alaska,Pacific,West
5,2016,"Aleutians West Census Area, Alaska",Alaska,Pacific,West
24,2050,"Bethel Census Area, Alaska",Alaska,Pacific,West
48,15005,"Kalawao County, Hawaii",Hawaii,Pacific,West
371,2020,"Anchorage Municipality, Alaska",Alaska,Pacific,West
375,2198,"Prince of Wales-Hyder Census Area, Alaska",Alaska,Pacific,West
392,2275,"Wrangell City and Borough, Alaska",Alaska,Pacific,West
394,2122,"Kenai Peninsula Borough, Alaska",Alaska,Pacific,West
398,2180,"Nome Census Area, Alaska",Alaska,Pacific,West
402,2261,"Valdez-Cordova Census Area, Alaska",Alaska,Pacific,West


### Unemployment
From the Unemployment dataset, we have several useful data points involving geography (rural vs urban continuum code, urban influence code), income (median household, and represented as a percent of median state total) and unemployment rate.

In [75]:
# Long-format file pivoted to one column per attribute. This wide frame is the
# source for the geography, income and unemployment tables.
unemployment = (
    pd.read_csv('datasets/raw/Unemployment.csv')
    .pivot(index='fips_txt', columns='Attribute', values='Value')
    .reset_index()
    .rename(columns={'fips_txt': 'fips'})
)

geography_columns = {
    'Rural_urban_continuum_code_2013': 'rural_urban_code',
    'Urban_influence_code_2013': 'urban_influence_code',
}
geography = unemployment[['fips', *geography_columns]].rename(columns=geography_columns)
# TODO: convert urban/rural codes into z-scores
geography

Attribute,fips,rural_urban_code,urban_influence_code
0,0,,
1,1000,,
2,1001,2.0,2.0
3,1003,3.0,2.0
4,1005,6.0,6.0
...,...,...,...
3270,72145,1.0,1.0
3271,72147,7.0,12.0
3272,72149,2.0,2.0
3273,72151,1.0,1.0


In [76]:
# Counties in the master table whose FIPS code has no row in the geography dataset
county.loc[~county['fips'].isin(geography['fips'])]

Unnamed: 0,fips,county_name,state,division,region
48,15005,"Kalawao County, Hawaii",Hawaii,Pacific,West


### Income
To look at the county's economic factors, we keep two columns representing the estimated median household income in 2019 and the county household median income as a percent of the state total median household income. We represent this percent as a decimal.

In [77]:
income = (
    unemployment[['fips', 'Med_HH_Income_Percent_of_State_Total_2019', 'Median_Household_Income_2019']]
    .rename(columns={
        'Med_HH_Income_Percent_of_State_Total_2019': 'median_income_percent_state',
        'Median_Household_Income_2019': 'median_income',
    })
    # Percentage of the state median (100 == equal to the state) -> decimal ratio.
    .assign(median_income_percent_state=lambda frame: frame['median_income_percent_state'] / 100)
)
income

Attribute,fips,median_income_percent_state,median_income
0,0,,65712.0
1,1000,1.000000,51771.0
2,1001,1.124819,58233.0
3,1003,1.156458,59871.0
4,1005,0.694829,35972.0
...,...,...,...
3270,72145,,
3271,72147,,
3272,72149,,
3273,72151,,


In [78]:
# Counties in the master table whose FIPS code has no row in the income dataset
county.loc[~county['fips'].isin(income['fips'])]

Unnamed: 0,fips,county_name,state,division,region
48,15005,"Kalawao County, Hawaii",Hawaii,Pacific,West


### Unemployment

In [79]:
# NOTE: this rebinds `unemployment` from the wide pivoted frame to a two-column
# table, so the geography and income cells (which read the wide frame) must have
# run before this one, and this cell cannot be re-run on its own.
unemployment = (
    unemployment[['fips', 'Unemployment_rate_2019']]
    .rename(columns={'Unemployment_rate_2019': 'unemployment'})
    # Percentage (0-100) -> decimal fraction (0-1).
    .assign(unemployment=lambda frame: frame['unemployment'] / 100)
)
unemployment

Attribute,fips,unemployment
0,0,0.036694
1,1000,0.030000
2,1001,0.027000
3,1003,0.027000
4,1005,0.038000
...,...,...
3270,72145,0.096000
3271,72147,0.069000
3272,72149,0.159000
3273,72151,0.131000


In [80]:
# Counties in the master table whose FIPS code has no row in the unemployment dataset
county.loc[~county['fips'].isin(unemployment['fips'])]

Unnamed: 0,fips,county_name,state,division,region
48,15005,"Kalawao County, Hawaii",Hawaii,Pacific,West


### Religion

In [81]:
# Source column -> short name for the adherence-rate columns we keep.
religion_columns = {
    'FIPS': 'fips',
    'TOTRATE': 'religion_total',
    'EVANRATE': 'religion_evangelical',
    'MPRTRATE': 'religion_mainline_protestant',
}
religion = pd.read_csv('datasets/raw/U.S. Religion Census Religious Congregations and Membership Study, 2010 (County File).csv')
# NOTE(review): the *RATE values exceed 100 (e.g. ~677), so they are presumably
# per-1,000 rates rather than percentages; they are left unscaled here -- confirm
# the units before comparing them with the decimal-fraction columns.
religion = religion[list(religion_columns)].rename(columns=religion_columns)
# TODO: decide to drop black_protestant and/or orthodox columns
religion

Unnamed: 0,fips,religion_total,religion_evangelical,religion_mainline_protestant
0,1001,676.878889,503.990000,82.858889
1,1003,531.740000,318.138889,110.140000
2,1005,549.990000,320.250000,77.938889
3,1007,498.800000,443.328889,13.178889
4,1009,651.620000,509.800000,52.950000
...,...,...,...,...
3144,56037,477.220000,99.120000,31.230000
3145,56039,260.360000,42.030000,85.468889
3146,56041,606.830000,68.658889,23.488889
3147,56043,471.818889,155.870000,88.708889


In [82]:
# Counties in the master table whose FIPS code has no row in the religion dataset
county.loc[~county['fips'].isin(religion['fips'])]

Unnamed: 0,fips,county_name,state,division,region
456,2158,"Kusilvak Census Area, Alaska",Alaska,Pacific,West
2724,46102,"Oglala Lakota County, South Dakota",South Dakota,West North Central,Midwest


In [83]:
# Counties whose religion row exists but has a null religion_total value
county.loc[county['fips'].isin(religion.loc[religion['religion_total'].isnull(), 'fips'])]

Unnamed: 0,fips,county_name,state,division,region


In [84]:
# Counties whose religion row exists but has a null religion_evangelical value
county.loc[county['fips'].isin(religion.loc[religion['religion_evangelical'].isnull(), 'fips'])]

Unnamed: 0,fips,county_name,state,division,region
48,15005,"Kalawao County, Hawaii",Hawaii,Pacific,West
492,2164,"Lake and Peninsula Borough, Alaska",Alaska,Pacific,West
635,8047,"Gilpin County, Colorado",Colorado,Mountain,West
643,8023,"Costilla County, Colorado",Colorado,Mountain,West
889,16033,"Clark County, Idaho",Idaho,Mountain,West
1883,30069,"Petroleum County, Montana",Montana,Mountain,West
1976,31113,"Logan County, Nebraska",Nebraska,West North Central,Midwest
2043,31005,"Arthur County, Nebraska",Nebraska,West North Central,Midwest
2231,38087,"Slope County, North Dakota",North Dakota,West North Central,Midwest
2282,38065,"Oliver County, North Dakota",North Dakota,West North Central,Midwest


In [85]:
# Counties whose religion row exists but has a null religion_mainline_protestant value
county.loc[county['fips'].isin(religion.loc[religion['religion_mainline_protestant'].isnull(), 'fips'])]

Unnamed: 0,fips,county_name,state,division,region
4,2013,"Aleutians East Borough, Alaska",Alaska,Pacific,West
25,55078,"Menominee County, Wisconsin",Wisconsin,East North Central,Midwest
208,48301,"Loving County, Texas",Texas,West South Central,South
221,48311,"McMullen County, Texas",Texas,West South Central,South
290,51685,"Manassas Park city, Virginia",Virginia,South Atlantic,South
449,2060,"Bristol Bay Borough, Alaska",Alaska,Pacific,West
476,2068,"Denali Borough, Alaska",Alaska,Pacific,West
492,2164,"Lake and Peninsula Borough, Alaska",Alaska,Pacific,West
503,2282,"Yakutat City and Borough, Alaska",Alaska,Pacific,West
553,6003,"Alpine County, California",California,Pacific,West


## Data Completeness
As noted above, there is a recurring pattern of certain counties being missing from most of our datasets:
* Oglala Lakota County, South Dakota (FIPS 46102)
* Kalawao County, Hawaii (FIPS 15005)
* Various parts of Alaska, especially in the election dataset

Upon further investigation into those areas, we learned that Oglala Lakota County does not have a functioning county seat and remains unorganized, which explains the difficulty government surveyors would have with gathering data there. However, this county is entirely on an Indian reservation, which would give us valuable insight on Native American vaccine hesitancy.

Kalawao County, because of its small population, does not have many of the functions that a normal county would have.

While Alaska does administer using county divisions, for elections they use a different geographic boundary of boroughs, which do not conveniently align with counties. This makes a county level political correlation with vaccine hesitancy impossible for us. Similar to the note above, Alaska is home to a lot of Native Americans, 15% of the population, which means our analysis will lose insight into Native American vaccine hesitancy.

Because of the issues surrounding population size, county organization and governmental issues, we decide to drop those two data points and focus our analysis on the central parts of the USA, ignoring Alaska.

In [86]:
# Exclude Oglala Lakota County (46102), Kalawao County (15005) and every
# Alaskan county; the original row index of the remaining counties is kept.
excluded_fips = [46102, 15005]
is_excluded = county['fips'].isin(excluded_fips) | county['state'].eq('Alaska')
county = county[~is_excluded]
county

Unnamed: 0,fips,county_name,state,division,region
0,1131,"Wilcox County, Alabama",Alabama,East South Central,South
1,1129,"Washington County, Alabama",Alabama,East South Central,South
2,1133,"Winston County, Alabama",Alabama,East South Central,South
3,1127,"Walker County, Alabama",Alabama,East South Central,South
6,1125,"Tuscaloosa County, Alabama",Alabama,East South Central,South
...,...,...,...,...,...
3137,55079,"Milwaukee County, Wisconsin",Wisconsin,East North Central,Midwest
3138,55121,"Trempealeau County, Wisconsin",Wisconsin,East North Central,Midwest
3139,56001,"Albany County, Wyoming",Wyoming,Mountain,West
3140,55067,"Langlade County, Wisconsin",Wisconsin,East North Central,Midwest


## Aggregation
We set the fips as index for all of our dataframes and then concatenate them along it with an inner join.

In [87]:
# Index every per-topic table by FIPS code so they can be aligned column-wise.
dfs = [
    table.set_index('fips')
    for table in (
        county, vaccine_hesitancy, social_vulnerability_index, ethnicity,
        natality, unemployment, geography, income, poverty, education,
        religion, elections,
    )
]
# join='inner' keeps only the FIPS codes that appear in every table.
df = pd.concat(dfs, axis=1, join='inner').reset_index()
df

Unnamed: 0,fips,county_name,state,division,region,vaccine_hesitant,vaccine_hesitant_strong,vaccine_hesitant_category,social_vulnerability_index,ethnicity_hispanic,...,election_total_2012,election_dem_2012,election_gop_2012,election_oth_2012,election_total_2016,election_dem_2016,election_gop_2016,election_oth_2016,election_gop_2020,election_dem_2020
0,1131,"Wilcox County, Alabama",Alabama,East South Central,South,0.23,0.11,Medium,0.93,0.0053,...,6547,4867,1676,4,6095,4329,1737,29,1833,4048
1,1129,"Washington County, Alabama",Alabama,East South Central,South,0.23,0.11,Medium,0.73,0.0146,...,8761,2971,5749,41,8492,2366,6031,95,6564,2258
2,1133,"Winston County, Alabama",Alabama,East South Central,South,0.22,0.11,Medium,0.70,0.0315,...,9712,1286,8310,116,10255,871,9225,159,10195,974
3,1127,"Walker County, Alabama",Alabama,East South Central,South,0.23,0.11,Medium,0.75,0.0249,...,28497,6551,21633,313,29243,4486,24208,549,26002,4834
4,1125,"Tuscaloosa County, Alabama",Alabama,East South Central,South,0.22,0.10,Medium,0.63,0.0372,...,78495,32003,45703,789,81708,31746,47701,2261,51117,37765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3106,55079,"Milwaukee County, Wisconsin",Wisconsin,East North Central,Midwest,0.18,0.11,Medium,0.81,0.1500,...,490944,328090,158430,4424,434970,288986,126091,19893,134357,317270
3107,55121,"Trempealeau County, Wisconsin",Wisconsin,East North Central,Midwest,0.18,0.10,Medium,0.28,0.0840,...,13456,7601,5703,152,13581,5645,7370,566,8833,6285
3108,56001,"Albany County, Wyoming",Wyoming,Mountain,West,0.30,0.16,High,0.25,0.0953,...,16052,7445,7851,756,16420,6888,7601,1931,8579,9092
3109,55067,"Langlade County, Wisconsin",Wisconsin,East North Central,Midwest,0.17,0.10,Medium,0.35,0.0197,...,10482,4569,5810,103,10093,3260,6436,397,7330,3704


## Output

In [88]:
# Write the combined table to disk; index=False omits the row index column.
df.to_csv(path_or_buf='datasets/clean/interim_clean_dataset_2021-07-19.csv', index=False)