# Process Daily Confirmed Cases and Deaths from Johns Hopkins Data
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports

In [1]:
import pandas as pd
import numpy as np
import states 

#### Configurations

In [27]:
from datetime import date

# Single source of truth for the report being processed. The daily report
# filename (MM-DD-YYYY.csv) and the time-series column label (M/D/YY, no zero
# padding) are both derived from it so the two can never drift apart.
report_date = date(2020, 3, 28)

# Daily report file to ingest.
daily_datafile = './jh-daily-data/{}.csv'.format(report_date.strftime('%m-%d-%Y'))
# Label of the column that will hold this day's totals in the per-state files.
daily_date = '{}/{}/{}'.format(
    report_date.month, report_date.day, report_date.strftime('%y')
)

# Per-state time-series files: read below, extended for daily_date, and then
# written back to the same paths.
death_datafile = 'COVID-19-Deaths-USA-By-State.csv'
cases_datafile = 'COVID-19-Confirmed-Cases-USA-By-State.csv'

#### Load Johns Hopkins Daily COVID-19 File

In [28]:
# Read the daily report named by daily_datafile. index_col=False stops pandas
# from using the first column as the row index.
df = pd.read_csv(
    daily_datafile,
    index_col=False,
    encoding='utf-8',
)
df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-28 23:05:37,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-28 23:05:37,30.295065,-92.414197,9,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-28 23:05:37,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-28 23:05:37,43.452658,-116.241552,76,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-28 23:05:37,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"
5,21001.0,Adair,Kentucky,US,2020-03-28 23:05:37,37.104598,-85.281297,0,0,0,0,"Adair, Kentucky, US"
6,29001.0,Adair,Missouri,US,2020-03-28 23:05:37,40.190586,-92.600782,1,0,0,0,"Adair, Missouri, US"
7,40001.0,Adair,Oklahoma,US,2020-03-28 23:05:37,35.884942,-94.658593,3,0,0,0,"Adair, Oklahoma, US"
8,8001.0,Adams,Colorado,US,2020-03-28 23:05:37,39.874321,-104.336258,71,0,0,0,"Adams, Colorado, US"
9,16003.0,Adams,Idaho,US,2020-03-28 23:05:37,44.893336,-116.454525,0,0,0,0,"Adams, Idaho, US"


#### Select only US rows

In [29]:
# Keep only the rows whose Country_Region is exactly 'US'.
is_us_row = df['Country_Region'] == 'US'
df = df[is_us_row]
df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-28 23:05:37,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-28 23:05:37,30.295065,-92.414197,9,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-28 23:05:37,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-28 23:05:37,43.452658,-116.241552,76,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-28 23:05:37,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"
5,21001.0,Adair,Kentucky,US,2020-03-28 23:05:37,37.104598,-85.281297,0,0,0,0,"Adair, Kentucky, US"
6,29001.0,Adair,Missouri,US,2020-03-28 23:05:37,40.190586,-92.600782,1,0,0,0,"Adair, Missouri, US"
7,40001.0,Adair,Oklahoma,US,2020-03-28 23:05:37,35.884942,-94.658593,3,0,0,0,"Adair, Oklahoma, US"
8,8001.0,Adams,Colorado,US,2020-03-28 23:05:37,39.874321,-104.336258,71,0,0,0,"Adams, Colorado, US"
9,16003.0,Adams,Idaho,US,2020-03-28 23:05:37,44.893336,-116.454525,0,0,0,0,"Adams, Idaho, US"


#### Group By States, Sum by Confirmed Cases, Deaths, Recovered, Active

In [30]:
# Collapse the report to one row per Province_State, summing the Confirmed,
# Deaths and Recovered columns. NOTE: 'Active' is not aggregated here.
df_daily_sum = df.groupby('Province_State').agg(
    {'Confirmed': 'sum', 'Deaths': 'sum', 'Recovered': 'sum'}
)

# Drop the 'Wuhan Evacuee' and 'Recovered' rows if they are present.
# The labels live in df_daily_sum's index (the Province_State values), not in
# df's integer row index, so the drop must target df_daily_sum.
# errors='ignore' makes a label that is absent from this report a no-op.
df_daily_sum = df_daily_sum.drop(
    index=['Wuhan Evacuee', 'Recovered'], errors='ignore'
)

df_daily_sum

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,694,4,0
Alaska,85,2,0
American Samoa,0,0,0
Arizona,773,15,0
Arkansas,409,5,0
California,5095,110,0
Colorado,1740,31,0
Connecticut,1524,33,0
Delaware,214,5,0
Diamond Princess,49,0,0


#### Get daily confirmed cases by State

In [31]:
# Per-state confirmed totals as a single-column frame. Selecting by label
# rather than by position keeps this correct if the aggregated columns are
# ever reordered; later cells read this column by its 'Confirmed' label.
df_daily_cases = df_daily_sum[['Confirmed']]
df_daily_cases

Unnamed: 0_level_0,Confirmed
Province_State,Unnamed: 1_level_1
Alabama,694
Alaska,85
American Samoa,0
Arizona,773
Arkansas,409
California,5095
Colorado,1740
Connecticut,1524
Delaware,214
Diamond Princess,49


#### Get daily deaths by State

In [32]:
# Per-state death totals as a single-column frame. Selecting by label rather
# than by position keeps this correct if the aggregated columns are ever
# reordered; later cells read this column by its 'Deaths' label.
df_daily_deaths = df_daily_sum[['Deaths']]
df_daily_deaths

Unnamed: 0_level_0,Deaths
Province_State,Unnamed: 1_level_1
Alabama,4
Alaska,2
American Samoa,0
Arizona,15
Arkansas,5
California,110
Colorado,31
Connecticut,33
Delaware,5
Diamond Princess,0


#### Load Confirmed Cases by State File

In [33]:
# Load the per-state confirmed-cases table, indexed by its 'State' column.
df_cases = pd.read_csv(
    cases_datafile,
    index_col='State',
    encoding='utf-8',
)
df_cases

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,83,131,138,196,242,381,517,587,694,694
Alaska,0,0,0,0,0,0,0,0,0,0,...,12,15,21,30,34,41,56,58,85,85
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,1,1,1,1,1,1,...,78,118,152,235,326,401,508,665,773,773
Arkansas,0,0,0,0,0,0,0,0,0,0,...,96,122,165,192,219,280,335,381,409,409
California,0,0,0,0,2,2,2,2,2,3,...,1177,1364,1642,2108,2538,2998,3899,4657,5095,5095
Colorado,0,0,0,0,0,0,0,0,0,0,...,363,390,476,704,723,1021,1430,1433,1740,1740
Connecticut,0,0,0,0,0,0,0,0,0,0,...,194,194,223,415,618,875,1012,1291,1524,1524
Delaware,0,0,0,0,0,0,0,0,0,0,...,38,45,47,68,104,119,130,163,214,214
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,49,49,49,49,49,49,49,49,49,49


#### Insert Empty Column into df_cases for new date

In [34]:
# Add (or reset) the daily_date column with a zero for every state.
# Plain column assignment appends the column at the end when it is new and
# overwrites it when it already exists, so re-running this cell, or processing
# a date that is already in the file, does not raise the "already exists"
# ValueError that DataFrame.insert would.
df_cases[daily_date] = 0
df_cases

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,131,138,196,242,381,517,587,694,694,0
Alaska,0,0,0,0,0,0,0,0,0,0,...,15,21,30,34,41,56,58,85,85,0
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,1,1,1,1,1,1,...,118,152,235,326,401,508,665,773,773,0
Arkansas,0,0,0,0,0,0,0,0,0,0,...,122,165,192,219,280,335,381,409,409,0
California,0,0,0,0,2,2,2,2,2,3,...,1364,1642,2108,2538,2998,3899,4657,5095,5095,0
Colorado,0,0,0,0,0,0,0,0,0,0,...,390,476,704,723,1021,1430,1433,1740,1740,0
Connecticut,0,0,0,0,0,0,0,0,0,0,...,194,223,415,618,875,1012,1291,1524,1524,0
Delaware,0,0,0,0,0,0,0,0,0,0,...,45,47,68,104,119,130,163,214,214,0
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,49,49,49,49,49,49,49,49,49,0


#### Insert daily cases totals into df_cases

In [35]:
# Copy the day's confirmed totals into the daily_date column for every state
# present in both frames, in one index-aligned assignment instead of a
# row-by-row loop. States that appear only in the daily report are skipped;
# states that appear only in df_cases keep their current value.
states_in_both = df_daily_cases.index.intersection(df_cases.index)
df_cases.loc[states_in_both, daily_date] = df_daily_cases.loc[
    states_in_both, 'Confirmed'
]
df_cases

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,131,138,196,242,381,517,587,694,694,694
Alaska,0,0,0,0,0,0,0,0,0,0,...,15,21,30,34,41,56,58,85,85,85
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,1,1,1,1,1,1,...,118,152,235,326,401,508,665,773,773,773
Arkansas,0,0,0,0,0,0,0,0,0,0,...,122,165,192,219,280,335,381,409,409,409
California,0,0,0,0,2,2,2,2,2,3,...,1364,1642,2108,2538,2998,3899,4657,5095,5095,5095
Colorado,0,0,0,0,0,0,0,0,0,0,...,390,476,704,723,1021,1430,1433,1740,1740,1740
Connecticut,0,0,0,0,0,0,0,0,0,0,...,194,223,415,618,875,1012,1291,1524,1524,1524
Delaware,0,0,0,0,0,0,0,0,0,0,...,45,47,68,104,119,130,163,214,214,214
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,49,49,49,49,49,49,49,49,49,49


#### Load Deaths by State File

In [36]:
# Load the per-state deaths table, indexed by its 'State' column.
df_deaths = pd.read_csv(
    death_datafile,
    index_col='State',
    encoding='utf-8',
)
df_deaths

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,4,4,4
Alaska,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,2,2
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,0,0,0,0,0,0,...,0,1,2,2,5,6,8,13,15,15
Arkansas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,2,2,3,5,5
California,0,0,0,0,0,0,0,0,0,0,...,23,24,30,39,50,65,81,94,110,110
Colorado,0,0,0,0,0,0,0,0,0,0,...,4,4,6,7,8,16,19,27,31,31
Connecticut,0,0,0,0,0,0,0,0,0,0,...,3,4,8,10,12,19,21,27,33,33
Delaware,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,5,5
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Insert Empty Column into df_deaths for new date

In [37]:
# Add (or reset) the daily_date column with a zero for every state.
# Plain column assignment appends the column at the end when it is new and
# overwrites it when it already exists, so re-running this cell, or processing
# a date that is already in the file, does not raise the "already exists"
# ValueError that DataFrame.insert would.
df_deaths[daily_date] = 0
df_deaths

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,4,4,4,0
Alaska,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,2,2,0
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,0,0,0,0,0,0,...,1,2,2,5,6,8,13,15,15,0
Arkansas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,2,2,3,5,5,0
California,0,0,0,0,0,0,0,0,0,0,...,24,30,39,50,65,81,94,110,110,0
Colorado,0,0,0,0,0,0,0,0,0,0,...,4,6,7,8,16,19,27,31,31,0
Connecticut,0,0,0,0,0,0,0,0,0,0,...,4,8,10,12,19,21,27,33,33,0
Delaware,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2,5,5,0
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Insert daily death totals into df_deaths

In [38]:
# Copy the day's death totals into the daily_date column for every state
# present in both frames, in one index-aligned assignment instead of a
# row-by-row loop. States that appear only in the daily report are skipped;
# states that appear only in df_deaths keep their current value.
states_in_both = df_daily_deaths.index.intersection(df_deaths.index)
df_deaths.loc[states_in_both, daily_date] = df_daily_deaths.loc[
    states_in_both, 'Deaths'
]
df_deaths

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,4,4,4,4
Alaska,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,2,2,2
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,0,0,0,0,0,0,...,1,2,2,5,6,8,13,15,15,15
Arkansas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,2,2,3,5,5,5
California,0,0,0,0,0,0,0,0,0,0,...,24,30,39,50,65,81,94,110,110,110
Colorado,0,0,0,0,0,0,0,0,0,0,...,4,6,7,8,16,19,27,31,31,31
Connecticut,0,0,0,0,0,0,0,0,0,0,...,4,8,10,12,19,21,27,33,33,33
Delaware,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2,5,5,5
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Write Deaths and Cases DataFrames to CSV

In [39]:
# Write both updated tables back to the same paths they were loaded from,
# overwriting the previous files (deaths first, then cases).
for frame, target_path in (
    (df_deaths, death_datafile),
    (df_cases, cases_datafile),
):
    frame.to_csv(target_path, encoding='utf-8')

In [None]:
# Bare expression: evaluates the per-state daily deaths frame so that it is
# rendered as the cell's output. It has no effect on the files written above.
df_daily_deaths