In [2]:
# Import necessary libraries
import pandas as pd

We want to clean and combine all of our various datasets and export the result as one CSV. This will primarily involve dropping columns that we aren't interested in and merging the remaining columns onto a central Pandas DataFrame. After some exploratory analysis, we will determine which rows will need to be dropped or interpolated.

In [41]:
# Read the vaccine hesitancy estimates once; the frames below are all column
# subsets of this one file, keyed by the 'fips' column.
vaccine_hesitancy = pd.read_csv(
    'datasets/raw/Vaccine_Hesitancy_for_COVID-19__County_and_local_estimates.csv'
).rename(columns={'FIPS Code': 'fips'})

# Key-only frame holding just the FIPS codes.
fips = vaccine_hesitancy[['fips']]

# Each mapping is {raw column name: short name}; the same mapping drives both
# the column selection and the rename so the two cannot drift apart.
ethnicity_renames = {
    'Percent Hispanic': 'ethnicity_hispanic',
    'Percent non-Hispanic American Indian/Alaska Native': 'ethnicity_native',
    'Percent non-Hispanic Asian': 'ethnicity_asian',
    'Percent non-Hispanic Black': 'ethnicity_black',
    'Percent non-Hispanic Native Hawaiian/Pacific Islander': 'ethnicity_hawaiian',
    'Percent non-Hispanic White': 'ethnicity_white',
}
ethnicity = vaccine_hesitancy[['fips', *ethnicity_renames]].rename(columns=ethnicity_renames)

svi_renames = {
    'Social Vulnerability Index (SVI)': 'social_vulnerability_index',
    'County Name': 'county_name',
    'State': 'state',
}
social_vulnerability_index = vaccine_hesitancy[['fips', *svi_renames]].rename(columns=svi_renames)

# Finally narrow the main frame itself down to the two hesitancy estimates.
hesitancy_renames = {
    'Estimated hesitant': 'vaccine_hesitant',
    'Estimated strongly hesitant': 'vaccine_hesitant_strong',
}
vaccine_hesitancy = vaccine_hesitancy[['fips', *hesitancy_renames]].rename(columns=hesitancy_renames)
vaccine_hesitancy

Unnamed: 0,fips,vaccine_hesitant,vaccine_hesitant_strong
0,1131,0.23,0.11
1,1129,0.23,0.11
2,1133,0.22,0.11
3,1127,0.23,0.11
4,2013,0.26,0.12
...,...,...,...
3137,55079,0.18,0.11
3138,55121,0.18,0.10
3139,56001,0.30,0.16
3140,55067,0.17,0.10


In [35]:
# Raw column name -> short name for the FIPS key and the four 2015-19
# attainment percentages. The dict order is the output column order.
education_renames = {
    'FIPS Code': 'fips',
    'Percent of adults with less than a high school diploma, 2015-19': 'education_high_school_less',
    'Percent of adults with a high school diploma only, 2015-19': 'education_high_school_only',
    "Percent of adults completing some college or associate's degree, 2015-19": 'education_degree_some',
    "Percent of adults with a bachelor's degree or higher, 2015-19": 'education_bachelors_degree',
}
education = pd.read_csv('datasets/raw/Education.csv')[list(education_renames)].rename(columns=education_renames)

# Every renamed column except the key is a percentage; rescale it to a fraction.
education_cols = [short_name for short_name in education_renames.values() if short_name != 'fips']
education[education_cols] = education[education_cols] / 100
education

Unnamed: 0,fips,education_high_school_less,education_high_school_only,education_degree_some,education_bachelors_degree
0,0,0.120,0.270,0.289,0.321
1,1000,0.138,0.308,0.299,0.255
2,1001,0.115,0.336,0.284,0.266
3,1003,0.092,0.277,0.313,0.319
4,1005,0.268,0.356,0.260,0.116
...,...,...,...,...,...
3278,72145,0.284,0.262,0.241,0.212
3279,72147,0.288,0.392,0.140,0.180
3280,72149,0.220,0.384,0.197,0.199
3281,72151,0.290,0.257,0.272,0.180


In [36]:
# PovertyEstimates.csv is read in long form (FIPStxt / Attribute / Value).
# Pivot it so each Attribute becomes a column, keep only PCTPOVALL_2019,
# and rescale that percentage to a fraction.
poverty = (
    pd.read_csv('datasets/raw/PovertyEstimates.csv')[['FIPStxt', 'Attribute', 'Value']]
    .pivot(index='FIPStxt', columns='Attribute', values='Value')
    .reset_index()
    [['FIPStxt', 'PCTPOVALL_2019']]
    .rename(columns={'FIPStxt': 'fips', 'PCTPOVALL_2019': 'poverty'})
    .assign(poverty=lambda frame: frame['poverty'] / 100)
)
poverty

Attribute,fips,poverty
0,0,0.123
1,1000,0.156
2,1001,0.121
3,1003,0.101
4,1005,0.271
...,...,...
3188,56037,0.083
3189,56039,0.060
3190,56041,0.085
3191,56043,0.111


Attribute,fips,unemployment
0,0,0.036694
1,1000,0.030000
2,1001,0.027000
3,1003,0.027000
4,1005,0.038000
...,...,...
3270,72145,0.096000
3271,72147,0.069000
3272,72149,0.159000
3273,72151,0.131000


In [None]:
# Unemployment.csv is pivoted from long form (fips_txt / Attribute / Value) to
# one row per fips_txt value with one column per Attribute. Three separate
# tables are then cut from the wide frame before it is narrowed at the end.
unemployment = (
    pd.read_csv('datasets/raw/Unemployment.csv')
    .pivot(index='fips_txt', columns='Attribute', values='Value')
    .reset_index()
    .rename(columns={'fips_txt': 'fips'})
)

geography_renames = {
    'Rural_urban_continuum_code_2013': 'rural_urban_code',
    'Urban_influence_code_2013': 'urban_influence_code',
}
geography = unemployment[['fips', *geography_renames]].rename(columns=geography_renames)
# TODO: convert urban/rural codes into z-scores

income_renames = {
    'Med_HH_Income_Percent_of_State_Total_2019': 'median_income_percent_state',
    'Median_Household_Income_2019': 'median_income',
}
income = unemployment[['fips', *income_renames]].rename(columns=income_renames)
# Percent-of-state income is rescaled to a fraction; median_income is left as read.
income['median_income_percent_state'] = income['median_income_percent_state'] / 100

# Must come last: geography and income above read from the wide frame.
unemployment = unemployment[['fips', 'Unemployment_rate_2019']].rename(columns={'Unemployment_rate_2019': 'unemployment'})
unemployment['unemployment'] = unemployment['unemployment'] / 100
unemployment

In [44]:
# Tables to combine, in the order their columns should appear in the result.
source_tables = (
    vaccine_hesitancy,
    social_vulnerability_index,
    ethnicity,
    unemployment,
    geography,
    income,
    poverty,
    education,
)

# Index every table by 'fips' and align them column-wise. join='inner' keeps
# only the fips values that are present in all of the tables.
dfs = [table.set_index('fips') for table in source_tables]
df = pd.concat(dfs, axis=1, join='inner').reset_index()
df

Unnamed: 0,fips,vaccine_hesitant,vaccine_hesitant_strong,social_vulnerability_index,county_name,state,ethnicity_hispanic,ethnicity_native,ethnicity_asian,ethnicity_black,...,unemployment,rural_urban_code,urban_influence_code,median_income_percent_state,median_income,poverty,education_high_school_less,education_high_school_only,education_degree_some,education_bachelors_degree
0,1131,0.23,0.11,0.93,"Wilcox County, Alabama",ALABAMA,0.0053,0.0009,0.0003,0.6938,...,0.071,9.0,10.0,0.598752,30998.0,0.325,0.235,0.395,0.245,0.125
1,1129,0.23,0.11,0.73,"Washington County, Alabama",ALABAMA,0.0146,0.0731,0.0025,0.2354,...,0.046,8.0,7.0,0.943849,48864.0,0.186,0.174,0.431,0.269,0.127
2,1133,0.22,0.11,0.70,"Winston County, Alabama",ALABAMA,0.0315,0.0034,0.0016,0.0073,...,0.033,6.0,4.0,0.788608,40827.0,0.167,0.212,0.382,0.278,0.128
3,1127,0.23,0.11,0.75,"Walker County, Alabama",ALABAMA,0.0249,0.0015,0.0049,0.0617,...,0.033,1.0,1.0,0.888354,45991.0,0.173,0.182,0.375,0.330,0.113
4,2013,0.26,0.12,0.58,"Aleutians East Borough, Alaska",ALASKA,0.0901,0.4588,0.1968,0.0322,...,0.028,9.0,12.0,0.866845,66923.0,0.148,0.145,0.435,0.305,0.115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3136,55079,0.18,0.11,0.81,"Milwaukee County, Wisconsin",WISCONSIN,0.1500,0.0047,0.0428,0.2606,...,0.040,1.0,1.0,0.833710,53505.0,0.169,0.117,0.284,0.290,0.310
3137,55121,0.18,0.10,0.28,"Trempealeau County, Wisconsin",WISCONSIN,0.0840,0.0034,0.0043,0.0051,...,0.036,6.0,6.0,0.961201,61687.0,0.089,0.093,0.390,0.324,0.193
3138,56001,0.30,0.16,0.25,"Albany County, Wyoming",WYOMING,0.0953,0.0091,0.0327,0.0150,...,0.031,4.0,5.0,0.789334,52216.0,0.160,0.041,0.147,0.295,0.518
3139,55067,0.17,0.10,0.35,"Langlade County, Wisconsin",WISCONSIN,0.0197,0.0069,0.0022,0.0125,...,0.042,6.0,6.0,0.772816,49597.0,0.130,0.097,0.423,0.313,0.167
