In [1]:
import pandas as pd
import chardet

In [2]:
# Load the COVID CSV.
# First sniff the character encoding from the first 10,000 bytes of the file.
with open("Resources/covid.csv", mode="rb") as rawdata:
    result = chardet.detect(rawdata.read(10000))

# Show chardet's guess for the encoding
print(result)

# Read the CSV into a DataFrame.
# NOTE: the encoding is hard-coded here; it is not taken from `result`.
data = pd.read_csv(
    "Resources/covid.csv",
    encoding="UTF-8-SIG",
)

{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}


In [3]:
# Preview the first five rows of the loaded frame
data.head(n=5)

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k
0,1/21/2020,USA-53061,Snohomish,Washington,1,0.14,0.02,0,0.0,0.0
1,1/22/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
2,1/23/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
3,1/24/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0
4,1/24/2020,USA-17031,Cook,Illinois,1,0.14,0.0,0,0.0,0.0


In [4]:
# List the column names as a plain Python list
print(data.columns.to_list())

['date', 'geoid', 'county', 'state', 'cases', 'cases_avg', 'cases_avg_per_100k', 'deaths', 'deaths_avg', 'deaths_avg_per_100k']


In [5]:
# Drop every row that contains at least one null value (modifies `data` in place)
data.dropna(axis=0, how="any", inplace=True)

In [6]:
# Split each geoid (e.g. "USA-53061") once, at the first "-", into a
# two-column frame: column 0 is the prefix, column 1 is everything after it.
new = data["geoid"].str.split("-", n=1, expand=True)

# Keep only the part after the prefix, stored as the FIPS_Code column.
# The prefix (column 0) is not needed, so it is never added to `data`.
data["FIPS_Code"] = new[1]

# Display the frame with the new FIPS_Code column
data

Unnamed: 0,date,geoid,county,state,cases,cases_avg,cases_avg_per_100k,deaths,deaths_avg,deaths_avg_per_100k,FIPS_Code
0,1/21/2020,USA-53061,Snohomish,Washington,1,0.14,0.02,0,0.0,0.0,53061
1,1/22/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0,53061
2,1/23/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0,53061
3,1/24/2020,USA-53061,Snohomish,Washington,0,0.14,0.02,0,0.0,0.0,53061
4,1/24/2020,USA-17031,Cook,Illinois,1,0.14,0.00,0,0.0,0.0,17031
...,...,...,...,...,...,...,...,...,...,...,...
888550,12/31/2020,USA-69120,Tinian,Northern Mariana Islands,0,0.00,0.00,0,0.0,0.0,69120
888551,12/31/2020,USA-69110,Saipan,Northern Mariana Islands,0,0.57,1.19,0,0.0,0.0,69110
888553,12/31/2020,USA-78030,St. Thomas,Virgin Islands,8,3.71,7.19,0,0.0,0.0,78030
888554,12/31/2020,USA-78020,St. John,Virgin Islands,6,1.14,27.41,0,0.0,0.0,78020


In [10]:
# Transform the COVID dataframe: copy a subset of its columns into a new frame
data_columns = [
    "county",
    "state",
    "cases_avg_per_100k",
    "deaths_avg_per_100k",
    "FIPS_Code",
]
covid_transformed = data.loc[:, data_columns].copy()

# Preview the first five rows of the reduced frame
covid_transformed.head(n=5)

Unnamed: 0,county,state,cases_avg_per_100k,deaths_avg_per_100k,FIPS_Code
0,Snohomish,Washington,0.02,0.0,53061
1,Snohomish,Washington,0.02,0.0,53061
2,Snohomish,Washington,0.02,0.0,53061
3,Snohomish,Washington,0.02,0.0,53061
4,Cook,Illinois,0.0,0.0,17031


In [11]:
# Rename the per-100k average columns.
# DataFrame.rename returns a NEW frame, so the result must be assigned back;
# otherwise covid_transformed keeps the old column names.
covid_transformed = covid_transformed.rename(
    columns={
        "cases_avg_per_100k": "Cases/100K Ave",
        "deaths_avg_per_100k": "Deaths/100K Ave",
    }
)

# Display the renamed frame
covid_transformed

Unnamed: 0,county,state,Cases/100K Ave,Deaths/100K Ave,FIPS_Code
0,Snohomish,Washington,0.02,0.0,53061
1,Snohomish,Washington,0.02,0.0,53061
2,Snohomish,Washington,0.02,0.0,53061
3,Snohomish,Washington,0.02,0.0,53061
4,Cook,Illinois,0.00,0.0,17031
...,...,...,...,...,...
888550,Tinian,Northern Mariana Islands,0.00,0.0,69120
888551,Saipan,Northern Mariana Islands,1.19,0.0,69110
888553,St. Thomas,Virgin Islands,7.19,0.0,78030
888554,St. John,Virgin Islands,27.41,0.0,78020


In [14]:
# Write the transformed frame to CSV, including the DataFrame index as the first column
covid_transformed.to_csv("Resources/covid_cleaned.csv", index=True)