In [1]:
# Dependancies
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

# File load
file1 = os.path.join('..', 'resources', '1-covid-data.csv')
file2 = os.path.join('..', 'resources', '2-vulnerability-index.csv')
file3 = os.path.join('..', 'resources', '3-median-income.csv')
file4 = os.path.join('..', 'resources', '4-population.csv')
file5 = os.path.join('..', 'resources', '5-uninsured-poor-health.csv')

# File read
covid_df = pd.read_csv(file1)
vulnerability_df = pd.read_csv(file2)
med_income_df = pd.read_csv(file3)
population_df = pd.read_csv(file4)
uninsured_df = pd.read_csv(file5)

In [2]:
c_df = covid_df
c_df.count()

date      586302
county    586302
state     586302
fips      580658
cases     586302
deaths    586302
dtype: int64

In [3]:
# Cleaning of covid data
c_df = covid_df
# Filter all rows relevant to last date of dataset - 9/30/2020
c_df = c_df.loc[c_df['date'] == '9/30/2020']

# Check for null values
c_df.isnull().values.any()

True

In [4]:
# Fill missing cells with 0 in fips column
c_df['fips'] = c_df['fips'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
# Drop Nan Values
c_df = c_df.dropna()
c_df

Unnamed: 0,date,county,state,fips,cases,deaths
583062,9/30/2020,Autauga,Alabama,1001.0,1791,27
583063,9/30/2020,Baldwin,Alabama,1003.0,5640,52
583064,9/30/2020,Barbour,Alabama,1005.0,896,7
583065,9/30/2020,Bibb,Alabama,1007.0,664,11
583066,9/30/2020,Blount,Alabama,1009.0,1629,15
...,...,...,...,...,...,...
586297,9/30/2020,Sweetwater,Wyoming,56037.0,339,2
586298,9/30/2020,Teton,Wyoming,56039.0,577,1
586299,9/30/2020,Uinta,Wyoming,56041.0,357,2
586300,9/30/2020,Washakie,Wyoming,56043.0,115,6


In [6]:
# Converting fips column to int 
c_df['fips'] = c_df['fips'].astype(int)

c_df

Unnamed: 0,date,county,state,fips,cases,deaths
583062,9/30/2020,Autauga,Alabama,1001,1791,27
583063,9/30/2020,Baldwin,Alabama,1003,5640,52
583064,9/30/2020,Barbour,Alabama,1005,896,7
583065,9/30/2020,Bibb,Alabama,1007,664,11
583066,9/30/2020,Blount,Alabama,1009,1629,15
...,...,...,...,...,...,...
586297,9/30/2020,Sweetwater,Wyoming,56037,339,2
586298,9/30/2020,Teton,Wyoming,56039,577,1
586299,9/30/2020,Uinta,Wyoming,56041,357,2
586300,9/30/2020,Washakie,Wyoming,56043,115,6


In [7]:
# Cleaning of population data
pop_df = population_df
# Splitting county column into state and county
pop_df['state'] = pop_df['county'].str.split(',').str[1]
pop_df['county_'] = pop_df['county'].str.split(',').str[0]
pop_df = pop_df.drop(['county'], axis=1)
pop_df = pop_df.rename(columns = {'county_':'county'})

# Removing unneeded characters
pop_df['county'] = pop_df['county'].str.strip('.')
pop_df['county'] = pop_df['county'].str.strip(' County')

# Check for null values
pop_df.isnull().values.any()

False

In [8]:
# Final population DF
pop_df.dtypes

population    object
state         object
county        object
dtype: object

In [9]:
# Cleaning vulnerability data
v_df = vulnerability_df
v_df = v_df.rename(columns = {'stateAbb': 'abbr',
                              'theme1SocioeconomicStatus': 'socioeconomic status', 
                              'theme2HouseholdCompositionAndDisability':'household composition and disability',
                              'THEME 3: \nMinority Status & Language':'minority status and language',
                              'THEME 4: \nHousing Type & Transportation': 'housing type and transportation',
                              'THEME 5: Epidemiological Factors': 'epidemiological factors',
                              'THEME 6: Healthcare System Factors': 'healthcare system factors' ,
                              'ccviScore': 'ccvi score'})

# Check for null values
v_df.isnull().values.any()

True

In [10]:
v_df.dropna(inplace = True)
v_df.count()

state                                   3141
abbr                                    3141
county                                  3141
fips                                    3141
socioeconomic status                    3141
household composition and disability    3141
minority status and language            3141
housing type and transportation         3141
epidemiological factors                 3141
healthcare system factors               3141
ccvi score                              3141
dtype: int64

In [11]:
# Final vulnerability DF
v_df.dtypes

state                                    object
abbr                                     object
county                                   object
fips                                      int64
socioeconomic status                    float64
household composition and disability    float64
minority status and language            float64
housing type and transportation         float64
epidemiological factors                 float64
healthcare system factors               float64
ccvi score                              float64
dtype: object

In [12]:
# Cleaning median income data
income_df = med_income_df

# Splitting county column into state and county
income_df['state'] = income_df['county'].str.split(',').str[1]
income_df['county_'] = income_df['county'].str.split(',').str[0]
income_df = income_df.drop(['county'], axis=1)
income_df = income_df.rename(columns = {'county_':'county'})

# Drop Nulls
income_df.dropna(inplace = True)
income_df.count()

# Drop state column to prevent future merging conflict 
income_df  = income_df.drop(['state'], axis=1 )

# Removing unneeded characters
income_df['county'] = income_df ['county'].str.strip(' County')

income_df.dtypes

fips                   int64
abbr                  object
median-income-2018    object
county                object
dtype: object

In [13]:
# pop_df.dropna(inplace = True)
# pop_df.count()