In [2]:
# Import necessary libraries
import pandas as pd

We want to clean and combine all of our various datasets and export them as one CSV. This will primarily involve dropping columns that we aren't interested in and merging the rest onto a central Pandas DataFrame. After some exploratory analysis, we will determine which rows will need to be dropped or interpolated.

In [26]:
# The vaccine hesitancy file also carries the ethnicity breakdown and the
# social vulnerability index, so it is read once and split into several tables
# that all share the 'fips' key.
hesitancy_source = pd.read_csv(
    'datasets/raw/Vaccine_Hesitancy_for_COVID-19__County_and_local_estimates.csv'
).rename(columns={'FIPS Code': 'fips'})

# Source column -> output column, one mapping per derived table. The keys also
# drive the column selection, so each source name is written only once.
ethnicity_names = {
    'Percent Hispanic': 'ethnicity_hispanic',
    'Percent non-Hispanic American Indian/Alaska Native': 'ethnicity_native',
    'Percent non-Hispanic Asian': 'ethnicity_asian',
    'Percent non-Hispanic Black': 'ethnicity_black',
    'Percent non-Hispanic Native Hawaiian/Pacific Islander': 'ethnicity_hawaiian',
    'Percent non-Hispanic White': 'ethnicity_white',
}
svi_names = {
    'Social Vulnerability Index (SVI)': 'social_vulnerability_index',
    'County Name': 'county_name',
    'State': 'state',
}
hesitancy_names = {
    'Estimated hesitant': 'vaccine_hesitant',
    'Estimated strongly hesitant': 'vaccine_hesitant_strong',
}

fips = hesitancy_source[['fips']]
ethnicity = hesitancy_source[['fips', *ethnicity_names]].rename(columns=ethnicity_names)
social_vulnerability_index = hesitancy_source[['fips', *svi_names]].rename(columns=svi_names)
vaccine_hesitancy = hesitancy_source[['fips', *hesitancy_names]].rename(columns=hesitancy_names)

In [25]:
# Keep only the FIPS code and the four 2015-19 educational attainment
# percentages, renamed to short snake_case column names.
education_names = {
    'FIPS Code': 'fips',
    'Percent of adults with less than a high school diploma, 2015-19': 'education_high_school_less',
    'Percent of adults with a high school diploma only, 2015-19': 'education_high_school_only',
    "Percent of adults completing some college or associate's degree, 2015-19": 'education_degree_some',
    "Percent of adults with a bachelor's degree or higher, 2015-19": 'education_bachelors_degree',
}
education = (
    pd.read_csv('datasets/raw/Education.csv')[list(education_names)]
    .rename(columns=education_names)
)
education

Unnamed: 0,fips,education_high_school_less,education_high_school_only,education_degree_some,education_bachelors_degree
0,0,12.0,27.0,28.9,32.1
1,1000,13.8,30.8,29.9,25.5
2,1001,11.5,33.6,28.4,26.6
3,1003,9.2,27.7,31.3,31.9
4,1005,26.8,35.6,26.0,11.6
...,...,...,...,...,...
3278,72145,28.4,26.2,24.1,21.2
3279,72147,28.8,39.2,14.0,18.0
3280,72149,22.0,38.4,19.7,19.9
3281,72151,29.0,25.7,27.2,18.0


In [11]:
# The poverty file is in long form (one row per FIPS code / attribute pair).
# Pivot it wide so each attribute becomes a column, then keep only the
# PCTPOVALL_2019 attribute.
# NOTE(review): the saved cell outputs in this notebook show this column as
# 'poverty_percent', while the code below names it 'poverty' -- re-run the
# notebook and confirm which name downstream code expects.
poverty = (
    pd.read_csv('datasets/raw/PovertyEstimates.csv')
    .loc[:, ['FIPStxt', 'Attribute', 'Value']]
    .pivot(index='FIPStxt', columns='Attribute', values='Value')
    .reset_index()
    .loc[:, ['FIPStxt', 'PCTPOVALL_2019']]
    .rename(columns={'FIPStxt': 'fips', 'PCTPOVALL_2019': 'poverty'})
)
poverty

Attribute,fips,poverty_percent
0,0,12.3
1,1000,15.6
2,1001,12.1
3,1003,10.1
4,1005,27.1
...,...,...
3188,56037,8.3
3189,56039,6.0
3190,56041,8.5
3191,56043,11.1


In [21]:
# The unemployment file is in long form; pivot it wide once and carve three
# tables out of it: geography codes, income, and the unemployment rate.
unemployment_wide = (
    pd.read_csv('datasets/raw/Unemployment.csv')
    .pivot(index='fips_txt', columns='Attribute', values='Value')
    .reset_index()
    .rename(columns={'fips_txt': 'fips'})
)

# Source attribute -> output column; the keys also drive the column selection.
geography_names = {
    'Rural_urban_continuum_code_2013': 'rural_urban_code',
    'Urban_influence_code_2013': 'urban_influence_code',
}
income_names = {
    'Med_HH_Income_Percent_of_State_Total_2019': 'median_income_percent_state',
    'Median_Household_Income_2019': 'median_income',
}
rate_names = {'Unemployment_rate_2019': 'unemployment'}

geography = unemployment_wide[['fips', *geography_names]].rename(columns=geography_names)
income = unemployment_wide[['fips', *income_names]].rename(columns=income_names)
unemployment = unemployment_wide[['fips', *rate_names]].rename(columns=rate_names)


In [27]:
# Index every table by its 'fips' column and join them side by side. The inner
# join keeps only the FIPS codes present in all eight tables; 'fips' is then
# restored as a regular column.
tables = [
    vaccine_hesitancy,
    social_vulnerability_index,
    ethnicity,
    unemployment,
    geography,
    income,
    poverty,
    education,
]
dfs = [table.set_index('fips') for table in tables]
df = pd.concat(dfs, join='inner', axis=1).reset_index()
df

Unnamed: 0,fips,vaccine_hesitant,vaccine_hesitant_strong,social_vulnerability_index,county_name,state,ethnicity_hispanic,ethnicity_native,ethnicity_asian,ethnicity_black,...,unemployment,rural_urban_code,urban_influence_code,median_income_percent_state,median_income,poverty_percent,education_high_school_less,education_high_school_only,education_degree_some,education_bachelors_degree
0,1131,0.23,0.11,0.93,"Wilcox County, Alabama",ALABAMA,0.0053,0.0009,0.0003,0.6938,...,7.1,9.0,10.0,59.875221,30998.0,32.5,23.5,39.5,24.5,12.5
1,1129,0.23,0.11,0.73,"Washington County, Alabama",ALABAMA,0.0146,0.0731,0.0025,0.2354,...,4.6,8.0,7.0,94.384888,48864.0,18.6,17.4,43.1,26.9,12.7
2,1133,0.22,0.11,0.70,"Winston County, Alabama",ALABAMA,0.0315,0.0034,0.0016,0.0073,...,3.3,6.0,4.0,78.860756,40827.0,16.7,21.2,38.2,27.8,12.8
3,1127,0.23,0.11,0.75,"Walker County, Alabama",ALABAMA,0.0249,0.0015,0.0049,0.0617,...,3.3,1.0,1.0,88.835449,45991.0,17.3,18.2,37.5,33.0,11.3
4,2013,0.26,0.12,0.58,"Aleutians East Borough, Alaska",ALASKA,0.0901,0.4588,0.1968,0.0322,...,2.8,9.0,12.0,86.684456,66923.0,14.8,14.5,43.5,30.5,11.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3136,55079,0.18,0.11,0.81,"Milwaukee County, Wisconsin",WISCONSIN,0.1500,0.0047,0.0428,0.2606,...,4.0,1.0,1.0,83.370987,53505.0,16.9,11.7,28.4,29.0,31.0
3137,55121,0.18,0.10,0.28,"Trempealeau County, Wisconsin",WISCONSIN,0.0840,0.0034,0.0043,0.0051,...,3.6,6.0,6.0,96.120102,61687.0,8.9,9.3,39.0,32.4,19.3
3138,56001,0.30,0.16,0.25,"Albany County, Wyoming",WYOMING,0.0953,0.0091,0.0327,0.0150,...,3.1,4.0,5.0,78.933365,52216.0,16.0,4.1,14.7,29.5,51.8
3139,55067,0.17,0.10,0.35,"Langlade County, Wisconsin",WISCONSIN,0.0197,0.0069,0.0022,0.0125,...,4.2,6.0,6.0,77.281578,49597.0,13.0,9.7,42.3,31.3,16.7
