In [1]:
# Dependancies
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

# File load
file1 = os.path.join('..', 'resources', '1-covid-data.csv')
file2 = os.path.join('..', 'resources', '2-vulnerability-index.csv')
file3 = os.path.join('..', 'resources', '3-median-income.csv')
file4 = os.path.join('..', 'resources', '4-population.csv')
file5 = os.path.join('..', 'resources', '5-uninsured-poor-health.csv')

# File read
covid_df = pd.read_csv(file1)
vulnerability_df = pd.read_csv(file2)
med_income_df = pd.read_csv(file3)
population_df = pd.read_csv(file4)
uninsured_df = pd.read_csv(file5)

In [2]:
c_df = covid_df
c_df.count()

date      586302
county    586302
state     586302
fips      580658
cases     586302
deaths    586302
dtype: int64

In [3]:
# Cleaning of covid data
c_df = covid_df
# Filter all rows relevant to last date of dataset - 9/30/2020
c_df = c_df.loc[c_df['date'] == '9/30/2020']

# Check for null values
c_df.isnull().values.any()

True

In [4]:
# Fill missing cells with 0 in fips column
c_df['fips'] = c_df['fips'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
# Drop Nan Values
c_df = c_df.dropna()
c_df

Unnamed: 0,date,county,state,fips,cases,deaths
583062,9/30/2020,Autauga,Alabama,1001.0,1791,27
583063,9/30/2020,Baldwin,Alabama,1003.0,5640,52
583064,9/30/2020,Barbour,Alabama,1005.0,896,7
583065,9/30/2020,Bibb,Alabama,1007.0,664,11
583066,9/30/2020,Blount,Alabama,1009.0,1629,15
...,...,...,...,...,...,...
586297,9/30/2020,Sweetwater,Wyoming,56037.0,339,2
586298,9/30/2020,Teton,Wyoming,56039.0,577,1
586299,9/30/2020,Uinta,Wyoming,56041.0,357,2
586300,9/30/2020,Washakie,Wyoming,56043.0,115,6


In [6]:
# Converting fips column to int 
c_df['fips'] = c_df['fips'].astype(int)

In [7]:
# Final covid DF
c_df

Unnamed: 0,date,county,state,fips,cases,deaths
583062,9/30/2020,Autauga,Alabama,1001,1791,27
583063,9/30/2020,Baldwin,Alabama,1003,5640,52
583064,9/30/2020,Barbour,Alabama,1005,896,7
583065,9/30/2020,Bibb,Alabama,1007,664,11
583066,9/30/2020,Blount,Alabama,1009,1629,15
...,...,...,...,...,...,...
586297,9/30/2020,Sweetwater,Wyoming,56037,339,2
586298,9/30/2020,Teton,Wyoming,56039,577,1
586299,9/30/2020,Uinta,Wyoming,56041,357,2
586300,9/30/2020,Washakie,Wyoming,56043,115,6


In [8]:
# Cleaning vulnerability data
v_df = vulnerability_df
v_df = v_df.rename(columns = {'stateAbb': 'abbr',
                              'theme1SocioeconomicStatus': 'socioeconomic status', 
                              'theme2HouseholdCompositionAndDisability':'household composition and disability',
                              'THEME 3: \nMinority Status & Language':'minority status and language',
                              'THEME 4: \nHousing Type & Transportation': 'housing type and transportation',
                              'THEME 5: Epidemiological Factors': 'epidemiological factors',
                              'THEME 6: Healthcare System Factors': 'healthcare system factors' ,
                              'ccviScore': 'ccvi score'})

# Check for null values
v_df.isnull().values.any()

True

In [9]:
# Drop Nulls
v_df.dropna(inplace = True)
v_df.count()

state                                   3141
abbr                                    3141
county                                  3141
fips                                    3141
socioeconomic status                    3141
household composition and disability    3141
minority status and language            3141
housing type and transportation         3141
epidemiological factors                 3141
healthcare system factors               3141
ccvi score                              3141
dtype: int64

In [10]:
# Final vulnerability DF
v_df

Unnamed: 0,state,abbr,county,fips,socioeconomic status,household composition and disability,minority status and language,housing type and transportation,epidemiological factors,healthcare system factors,ccvi score
0,Texas,TX,Brooks,48047,0.987,0.990,0.993,0.981,0.808,0.864,1.000
1,Texas,TX,Dimmit,48127,0.995,0.999,0.994,0.991,0.428,0.655,0.999
2,Texas,TX,Zavala,48507,0.988,0.997,0.996,0.981,0.380,0.834,0.999
3,Georgia,GA,Clay,13061,0.996,0.962,0.646,0.987,0.876,0.962,0.999
4,Texas,TX,Jim Hogg,48247,0.983,0.999,0.993,0.884,0.452,0.865,0.998
...,...,...,...,...,...,...,...,...,...,...,...
3136,Nebraska,NE,Rock,31149,0.010,0.059,0.228,0.235,0.075,0.067,0.001
3137,Vermont,VT,Grand Isle,50013,0.070,0.024,0.226,0.010,0.161,0.087,0.001
3138,North Dakota,ND,Steele,38091,0.003,0.105,0.096,0.012,0.007,0.330,0.001
3139,Wisconsin,WI,Ozaukee,55089,0.006,0.032,0.289,0.048,0.060,0.038,0.000


In [11]:
# Cleaning median income data
income_df = med_income_df

# Splitting county column into state and county
# Create function
def split_state_county(df_name, col, state_pos, county_pos):
    # Create new state column
    df_name['state'] = df_name[col].str.split(',').str[state_pos]
    # Create new county column
    df_name['county'] = df_name[col].str.split(',').str[county_pos]

# Apply function
split_state_county(income_df, 'county', 1, 0)

# Rename columns
income_df = income_df.rename(columns={'median-income-2018': 
                                      'median income 2018'})

# Drop Nulls
income_df.dropna(inplace = True)
income_df.count()

# Drop state column to prevent future merging conflict 
income_df  = income_df.drop(['state'], axis=1 )

# Removing unneeded characters
income_df['county'] = income_df ['county'].str.strip(' County')

# Final median income DF
income_df

Unnamed: 0,fips,abbr,county,median income 2018
2,1001,AL,Autauga,59338
3,1003,AL,Baldwi,57588
4,1005,AL,Barbour,34382
5,1007,AL,Bibb,46064
6,1009,AL,Bl,50412
...,...,...,...,...
3191,56037,WY,Sweetwater,73315
3192,56039,WY,Te,99087
3193,56041,WY,Uinta,63401
3194,56043,WY,Washakie,55190


In [12]:
# Cleaning of population data
pop_df = population_df
# Splitting county column into state and county
split_state_county(pop_df, 'county', 1, 0)

# Removing unneeded characters
pop_df['county'] = pop_df['county'].str.strip('.')
pop_df['county'] = pop_df['county'].str.strip(' County')

In [13]:
# Check for null values
pop_df.isnull().values.any()

False

In [14]:
# Final population DF
pop_df

Unnamed: 0,county,population,state
0,Autauga,55869,Alabama
1,Baldwi,223234,Alabama
2,Barbour,24686,Alabama
3,Bibb,22394,Alabama
4,Bl,57826,Alabama
...,...,...,...
3137,Sweetwater,42343,Wyoming
3138,Te,23464,Wyoming
3139,Uinta,20226,Wyoming
3140,Washakie,7805,Wyoming


In [15]:
# Cleaning uninsured data
unins_df = uninsured_df

# Drop null values
unins_df.dropna(inplace = True)

# Rename columns
unins_df = unins_df.rename(columns = {'UninsuredPerc': 'uninsured percent',
                                     'fair-or-poor-health-perc': 'fair or poor health percent'})

# Convert uninsuredperc column to decimal
unins_df.iloc[:, 2:4] = (unins_df.iloc[:, 2:4])/100 

# Final uninsured DF
unins_df

Unnamed: 0,state,county,uninsured percent,fair or poor health percent
1,Alabama,Autauga,0.09,0.21
2,Alabama,Baldwin,0.11,0.18
3,Alabama,Barbour,0.12,0.30
4,Alabama,Bibb,0.10,0.19
5,Alabama,Blount,0.13,0.22
...,...,...,...,...
554,Georgia,Wilkes,0.18,0.24
555,Georgia,Wilkinson,0.13,0.21
556,Georgia,Worth,0.17,0.20
558,Hawaii,Hawaii,0.06,0.14


In [16]:
# Final DFs
# c_df, v_df, income_df, pop_df, unins_df 
