In [1]:
#Import dependencies 
import pandas as pd
import chardet
from sqlalchemy import create_engine
import os

COVID data cleaning 

In [2]:
#Import COVID CSV
with open("Resources/covid.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be
print(result)

data = pd.read_csv("Resources/covid.csv", encoding='UTF-8-SIG')

{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}


In [3]:
#View data table
data.head()

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k
0,1/21/2020,USA-53061,Snohomish,Washington,1,0.14,0.02,0,0.0,0.0
1,1/22/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
2,1/23/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
3,1/24/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
4,1/24/2020,USA-17031,Cook,Illinois,1,0.14,0.0,0,0.0,0.0


In [4]:
print(data.columns.tolist())

['date', 'geoid', 'county', 'state', 'cases', 'cases_avg', 'cases_avg_per_100k', 'deaths', 'deaths_avg', 'deaths_avg_per_100k']


In [5]:
# dropping null value columns to avoid errors
data.dropna(inplace = True)

In [6]:
# new data frame with split value columns
new = data["geoid"].str.split("-", n = 1, expand = True)
  
# making separate first name column from new data frame
data["drop_USA"]= new[0]
  
# making separate last name column from new data frame
data["FIPS_Code"]= new[1]
  
# Dropping old Name columns
data.drop(columns =["drop_USA"], inplace = True)
  
# df display
data

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k,FIPS_Code
0,1/21/2020,USA-53061,Snohomish,Washington,1,0.14,0.02,0,0.0,0.0,53061
1,1/22/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0,53061
2,1/23/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0,53061
3,1/24/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0,53061
4,1/24/2020,USA-17031,Cook,Illinois,1,0.14,0.00,0,0.0,0.0,17031
...,...,...,...,...,...,...,...,...,...,...,...
888550,12/31/2020,USA-69120,Tinian,Northern Mariana Islands,0,0.00,0.00,0,0.0,0.0,69120
888551,12/31/2020,USA-69110,Saipan,Northern Mariana Islands,0,0.57,1.19,0,0.0,0.0,69110
888553,12/31/2020,USA-78030,St. Thomas,Virgin Islands,8,3.71,7.19,0,0.0,0.0,78030
888554,12/31/2020,USA-78020,St. John,Virgin Islands,6,1.14,27.41,0,0.0,0.0,78020


In [7]:
#Transform COVID dataframe
data_columns = ["county", "state", "cases_avg_per_100k", "deaths_avg_per_100k","FIPS_Code"]
covid_transformed = data[data_columns].copy()
covid_transformed.head()

Unnamed: 0,county,state,cases_avg_per_100k,deaths_avg_per_100k,FIPS_Code
0,Snohomish,Washington,0.02,0.0,53061
1,Snohomish,Washington,0.02,0.0,53061
2,Snohomish,Washington,0.02,0.0,53061
3,Snohomish,Washington,0.02,0.0,53061
4,Cook,Illinois,0.0,0.0,17031


In [8]:
#Rename columns
covid_transformed.rename(columns={"cases_avg_per_100k": "Cases/100K Ave", "deaths_avg_per_100k": "Deaths/100K Ave"})

Unnamed: 0,county,state,Cases/100K Ave,Deaths/100K Ave,FIPS_Code
0,Snohomish,Washington,0.02,0.0,53061
1,Snohomish,Washington,0.02,0.0,53061
2,Snohomish,Washington,0.02,0.0,53061
3,Snohomish,Washington,0.02,0.0,53061
4,Cook,Illinois,0.00,0.0,17031
...,...,...,...,...,...
888550,Tinian,Northern Mariana Islands,0.00,0.0,69120
888551,Saipan,Northern Mariana Islands,1.19,0.0,69110
888553,St. Thomas,Virgin Islands,7.19,0.0,78030
888554,St. John,Virgin Islands,27.41,0.0,78020


In [9]:
#Check FIPS_Code dtype 
covid_transformed.dtypes

county                  object
state                   object
cases_avg_per_100k     float64
deaths_avg_per_100k    float64
FIPS_Code               object
dtype: object

In [10]:
#Convert FIPS_Code to INT
covid_transformed['FIPS_Code'] = covid_transformed['FIPS_Code'].astype(str).astype(int)

In [11]:
#Check FIPS_Code dtype 
covid_transformed.dtypes

county                  object
state                   object
cases_avg_per_100k     float64
deaths_avg_per_100k    float64
FIPS_Code                int32
dtype: object

In [12]:
#export csv
covid_transformed.to_csv('Resources/covid_cleaned.csv', index=True)  

Education data cleaning

In [None]:
#Import file and view dataframe

file_string = "Resources/education.csv"

edf2 = pd.read_csv(file_string)
edf2

In [None]:
#Transform education data
   #Pull only desired columns
edf_filter = edf2[['FIPS Code', 'State',
                    'Percent of adults with less than a high school diploma, 2015-19', 
                   'Percent of adults with a high school diploma only, 2015-19',
                   "Percent of adults completing some college or associate's degree, 2015-19",
                  "Percent of adults with a bachelor's degree or higher, 2015-19"]].copy()
edf_filter

In [None]:
#export cleaned data to csv
edf_filter.csv('Resources/education_cl')

Unemployment data cleaning

In [None]:
#Import file and view dataframe
unemployment_file = "Resources/unemployment2019.csv"
unemployment_df = pd.read_csv(unemployment_file)
unemployment_df.head()

In [None]:
#Drop unwanted columns
unemployment_clean_df=unemployment_df.drop(['Rural_urban_continuum_code_2013', 'Urban_influence_code_2013', 'City/Suburb/Town/Rural', 'Metro_2013', 'Civilian_labor_force_2019', 'Employed_2019', 'Median_Household_Income_2019', 'Med_HH_Income_Percent_of_State_Total_2019' ], axis=1)
unemployment_clean_df.head()

In [None]:
#Rename columns
unemployment_clean_df.rename(columns={'FIPS_Code': 'FIPS Code', 'Area_name': 'Area Name', 'Unemployed_2019': 'Unemployed 2019', 'Unemployment_rate_2019':'Unemployment Rate 2019'}, inplace=True)
unemployment_clean_df.head()