In [89]:
# Dependancies
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

# File load
file1 = os.path.join('..', 'resources', '1-covid-data.csv')
file2 = os.path.join('..', 'resources', '2-vulnerability-index.csv')
file3 = os.path.join('..', 'resources', '3-median-income.csv')
file4 = os.path.join('..', 'resources', '4-population.csv')
file5 = os.path.join('..', 'resources', '5-uninsured-poor-health.csv')

# File read
covid_df = pd.read_csv(file1)
vulnerability_df = pd.read_csv(file2)
med_income_df = pd.read_csv(file3)
population_df = pd.read_csv(file4)
uninsured_df = pd.read_csv(file5)

In [86]:
# Cleaning of population data
pop_df = population_df
# Splitting county column into state and county
pop_df['state'] = pop_df['county'].str.split(',').str[1]
pop_df['county_'] = pop_df['county'].str.split(',').str[0]
pop_df = pop_df.drop(['county'], axis=1)
pop_df = pop_df.rename(columns = {'county_':'county'})

# Removing unneeded characters
pop_df['county'] = pop_df['county'].str.strip('.')
pop_df['county'] = pop_df['county'].str.strip(' County')

# Check for null values
pop_df.isnull().values.any()

False

In [87]:
# Final population DF
pop_df

Unnamed: 0,population,state,county
0,55869,Alabama,Autauga
1,223234,Alabama,Baldwi
2,24686,Alabama,Barbour
3,22394,Alabama,Bibb
4,57826,Alabama,Bl
...,...,...,...
3137,42343,Wyoming,Sweetwater
3138,23464,Wyoming,Te
3139,20226,Wyoming,Uinta
3140,7805,Wyoming,Washakie


In [71]:
# Cleaning vulnerability data
v_df = vulnerability_df
v_df = v_df.rename(columns = {'stateAbb': 'abbr',
                              'theme1SocioeconomicStatus': 'socioeconomic status', 
                              'theme2HouseholdCompositionAndDisability':'household composition and disability',
                              'THEME 3: \nMinority Status & Language':'minority status and language',
                              'THEME 4: \nHousing Type & Transportation': 'housing type and transportation',
                              'THEME 5: Epidemiological Factors': 'epidemiological factors',
                              'THEME 6: Healthcare System Factors': 'healthcare system factors' ,
                              'ccviScore': 'ccvi score'})

# Check for null values
v_df.isnull().values.any()

True

In [82]:
v_df.dropna(inplace = True)
v_df.count()

state                                   3141
abbr                                    3141
county                                  3141
fips                                    3141
socioeconomic status                    3141
household composition and disability    3141
minority status and language            3141
housing type and transportation         3141
epidemiological factors                 3141
healthcare system factors               3141
ccvi score                              3141
dtype: int64

In [73]:
# Final vulnerability DF
v_df

Unnamed: 0,state,abbr,county,fips,socioeconomic status,household composition and disability,minority status and language,housing type and transportation,epidemiological factors,healthcare system factors,ccvi score
0,Texas,TX,Brooks,48047,0.987,0.990,0.993,0.981,0.808,0.864,1.000
1,Texas,TX,Dimmit,48127,0.995,0.999,0.994,0.991,0.428,0.655,0.999
2,Texas,TX,Zavala,48507,0.988,0.997,0.996,0.981,0.380,0.834,0.999
3,Georgia,GA,Clay,13061,0.996,0.962,0.646,0.987,0.876,0.962,0.999
4,Texas,TX,Jim Hogg,48247,0.983,0.999,0.993,0.884,0.452,0.865,0.998
...,...,...,...,...,...,...,...,...,...,...,...
3136,Nebraska,NE,Rock,31149,0.010,0.059,0.228,0.235,0.075,0.067,0.001
3137,Vermont,VT,Grand Isle,50013,0.070,0.024,0.226,0.010,0.161,0.087,0.001
3138,North Dakota,ND,Steele,38091,0.003,0.105,0.096,0.012,0.007,0.330,0.001
3139,Wisconsin,WI,Ozaukee,55089,0.006,0.032,0.289,0.048,0.060,0.038,0.000


In [91]:
# Cleaning median income data
income_df = med_income_df

# Splitting county column into state and county
income_df['state'] = income_df['county'].str.split(',').str[1]
income_df['county_'] = income_df['county'].str.split(',').str[0]
income_df = income_df.drop(['county'], axis=1)
income_df = income_df.rename(columns = {'county_':'county'})

# Drop Nulls
income_df.dropna(inplace = True)
income_df.count()

# Drop state column to prevent future merging conflict 
income_df  = income_df.drop(['state'], axis=1 )

# Removing unneeded characters
income_df['county'] = income_df ['county'].str.strip(' County')

income_df

Unnamed: 0,fips,abbr,median-income-2018,county
2,1001,AL,59338,Autauga
3,1003,AL,57588,Baldwi
4,1005,AL,34382,Barbour
5,1007,AL,46064,Bibb
6,1009,AL,50412,Bl
...,...,...,...,...
3191,56037,WY,73315,Sweetwater
3192,56039,WY,99087,Te
3193,56041,WY,63401,Uinta
3194,56043,WY,55190,Washakie


In [64]:
pop_df.dropna(inplace = True)
pop_df.count()

population    3142
state         3142
county        3142
dtype: int64