In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Load the raw scrape, then discard the three columns that are not used downstream
raw_df = pd.read_csv('coronavirus_states_raw.csv', index_col=False)
unused_columns = ["#", "Source", "Projections"]
clean_df = raw_df.drop(columns=unused_columns)
clean_df

Unnamed: 0,USAState,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/ 1M pop,Population
0,USA Total,30365111,+6231,552556.0,86.0,22524111.0,7288444.0,91737.0,1669.0,385732183.0,1165345.0,
1,California,3635598,,56824.0,,1914453.0,1664321.0,92012.0,1438.0,51601716.0,1305968.0,39512223.0
2,Texas,2749440,,47257.0,,2590824.0,111359.0,94822.0,1630.0,24507017.0,845190.0,28995881.0
3,Florida,1994117,,32613.0,,1329665.0,631839.0,92846.0,1518.0,24357867.0,1134098.0,21477737.0
4,New York,1811288,,49539.0,,932935.0,828814.0,93108.0,2547.0,42015555.0,2159787.0,19453561.0
...,...,...,...,...,...,...,...,...,...,...,...,...
60,Navajo Nation,29987,,1228.0,,16289.0,12470.0,,,250472.0,,
61,Grand Princess Ship,122,,7.0,,115.0,0.0,,,,,
62,Wuhan Repatriated,3,,,,3.0,0.0,,,3.0,,
63,Diamond Princess Ship,46,,,,46.0,0.0,,,46.0,,


In [3]:
# Drop rows which are totals or non-state/territories e.g. Grand Princess Ship,
# then rename the state and population columns to the names used downstream.
non_state_labels = [
    "Total:",
    "USA Total",
    "Federal Prisons",
    "Navajo Nation",
    "Grand Princess Ship",
    "Wuhan Repatriated",
    "Diamond Princess Ship",
    "Veteran Affairs",
    "US Military",
]
# isin() replaces the long chain of OR-ed equality checks; missing values never match.
non_state_rows = clean_df.index[clean_df["USAState"].isin(non_state_labels)]
clean_df = (
    clean_df
    .drop(index=non_state_rows)
    .rename(columns={"USAState": "state_name", "Population": "state_population"})
)

In [4]:
# Drop columns which could be calculated in the database: the per-million-population
# ratios ("Tot Cases/1M pop", "Deaths/1M pop", "Tests/ 1M pop").
# Selecting them by name instead of by position keeps this cell idempotent:
# re-running it is a no-op rather than silently dropping whatever now sits at
# positions 7, 8 and 10.
per_million_cols = [name for name in clean_df.columns if "1M pop" in str(name)]
clean_df = clean_df.drop(columns=per_million_cols)

In [5]:
# Convert the 'NewCases' strings (e.g. "+6231") to floats: keep the first
# whitespace-separated token, strip thousands separators, then cast.
# A leading '+' is accepted by the float conversion; missing values stay NaN.
# This is a single Series: the previous version rebuilt the same cleaned column
# once per DataFrame column and concatenated the copies into a wide frame of
# identically named columns, which cannot be assigned back to one column reliably.
clean_new_cases = (
    clean_df["NewCases"]
    .str.split()
    .str[0]
    .str.replace(',', '')
    .astype(float)
)

In [6]:
# Swap the raw 'NewCases' column for its cleaned, numeric counterpart
clean_df = clean_df.assign(NewCases=clean_new_cases)

In [7]:
# Stamp every row with the snapshot date: the covid data is as of 2021-03-20
snapshot_date = pd.Timestamp('2021-03-20')
clean_df['date'] = snapshot_date

In [8]:
# Check the types: display each column's dtype to confirm that the count columns
# are numeric (int/float) and that 'date' is a datetime column.
clean_df.dtypes

state_name                  object
TotalCases                   int64
NewCases                   float64
TotalDeaths                float64
NewDeaths                  float64
TotalRecovered             float64
ActiveCases                float64
TotalTests                 float64
state_population           float64
date                datetime64[ns]
dtype: object

In [10]:
# Merge the data frame with states_id_df to add the 'state_id' column.
# NOTE: merge() defaults to an inner join, so any row whose 'state_name' has no
# match in States.csv is dropped from the result.
states_id_df = pd.read_csv('States.csv', index_col=False)
clean_df = clean_df.merge(states_id_df, on='state_name')

# Add the 'covid_id' column: a 1-based sequential id in current row order.
# It is sized from the merged frame rather than hard-coded to 56 rows, so the
# cell keeps working if the number of matched states changes.
clean_df['covid_id'] = list(range(1, len(clean_df) + 1))

# Keep only the columns to export, in their final order
clean_df = clean_df[['covid_id','state_id','TotalCases','NewCases','TotalDeaths','NewDeaths','TotalRecovered','ActiveCases','TotalTests','state_population','date']]
clean_df

Unnamed: 0,covid_id,state_id,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,TotalTests,state_population,date
0,1,1,3635598,,56824.0,,1914453.0,1664321.0,51601716.0,39512223.0,2021-03-20
1,2,2,2749440,,47257.0,,2590824.0,111359.0,24507017.0,28995881.0,2021-03-20
2,3,3,1994117,,32613.0,,1329665.0,631839.0,24357867.0,21477737.0,2021-03-20
3,4,4,1811288,,49539.0,,932935.0,828814.0,42015555.0,19453561.0,2021-03-20
4,5,5,1216090,,23287.0,,1135756.0,57047.0,19389098.0,12671821.0,2021-03-20
5,6,6,1040817,,18420.0,,596487.0,425910.0,8058303.0,10617423.0,2021-03-20
6,7,7,995785,,17992.0,,946602.0,31191.0,10591209.0,11689100.0,2021-03-20
7,8,8,985066,2944.0,24886.0,32.0,898699.0,61481.0,11158837.0,12801989.0,2021-03-20
8,9,9,891314,,11783.0,,852732.0,26799.0,10900126.0,10488084.0,2021-03-20
9,10,10,853349,,24076.0,,627532.0,201741.0,11474078.0,8882190.0,2021-03-20


In [11]:
# Write the cleaned frame to disk as CSV, leaving out the DataFrame index
output_path = 'covid_data.csv'
clean_df.to_csv(output_path, index=False)