# Process Daily Deaths from Johns Hopkins Data
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports

In [433]:
import pandas as pd
import states 

#### Configurations

In [434]:
from datetime import datetime

# Date of the daily report being processed. It is also used verbatim as the
# label of the column added to the cumulative deaths table.
daily_date = '3/25/20'

# Path of the daily report, derived from `daily_date` so the file that is read
# and the column label that is written can never disagree
# (e.g. '3/25/20' -> './jh-daily-data/03-25-2020.csv').
daily_datafile = './jh-daily-data/{}.csv'.format(
    datetime.strptime(daily_date, '%m/%d/%y').strftime('%m-%d-%Y')
)

# Cumulative deaths-by-state CSV; read in a later cell and overwritten at the end.
death_datafile = 'COVID-19-Deaths-USA-By-State.csv'

#### Load Johns Hopkins Daily Covid-19 File

In [435]:
# Read the daily report CSV named by `daily_datafile`. index_col=False tells
# pandas not to use the first column as the index.
df = pd.read_csv(
    daily_datafile,
    index_col=False,
    encoding='utf-8',
)
df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-25 23:33:19,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-25 23:33:19,30.295065,-92.414197,2,0,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-25 23:33:19,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-25 23:33:19,43.452658,-116.241552,24,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-25 23:33:19,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"
5,21001.0,Adair,Kentucky,US,2020-03-25 23:33:19,37.104598,-85.281297,0,0,0,0,"Adair, Kentucky, US"
6,29001.0,Adair,Missouri,US,2020-03-25 23:33:19,40.190586,-92.600782,1,0,0,0,"Adair, Missouri, US"
7,40001.0,Adair,Oklahoma,US,2020-03-25 23:33:19,35.884942,-94.658593,2,0,0,0,"Adair, Oklahoma, US"
8,8001.0,Adams,Colorado,US,2020-03-25 23:33:19,39.874321,-104.336258,27,0,0,0,"Adams, Colorado, US"
9,16003.0,Adams,Idaho,US,2020-03-25 23:33:19,44.893336,-116.454525,0,0,0,0,"Adams, Idaho, US"


#### Select only US rows

In [436]:
# Keep only the rows whose Country_Region is exactly 'US'.
df = df.loc[df['Country_Region'].eq('US')]
df

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-25 23:33:19,34.223334,-82.461707,3,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-25 23:33:19,30.295065,-92.414197,2,0,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-25 23:33:19,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-25 23:33:19,43.452658,-116.241552,24,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-25 23:33:19,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"
5,21001.0,Adair,Kentucky,US,2020-03-25 23:33:19,37.104598,-85.281297,0,0,0,0,"Adair, Kentucky, US"
6,29001.0,Adair,Missouri,US,2020-03-25 23:33:19,40.190586,-92.600782,1,0,0,0,"Adair, Missouri, US"
7,40001.0,Adair,Oklahoma,US,2020-03-25 23:33:19,35.884942,-94.658593,2,0,0,0,"Adair, Oklahoma, US"
8,8001.0,Adams,Colorado,US,2020-03-25 23:33:19,39.874321,-104.336258,27,0,0,0,"Adams, Colorado, US"
9,16003.0,Adams,Idaho,US,2020-03-25 23:33:19,44.893336,-116.454525,0,0,0,0,"Adams, Idaho, US"


#### Group By States, Sum by Confirmed Cases, Deaths, Recovered, Active

In [437]:
# Collapse the per-county rows into one row per Province_State by summing the
# three count columns.
df = df.groupby('Province_State')[['Confirmed', 'Deaths', 'Recovered']].sum()

# Remove the 'Wuhan Evacuee' and 'Recovered' pseudo-state rows when present;
# errors='ignore' makes a missing label a no-op.
df = df.drop(index=['Wuhan Evacuee', 'Recovered'], errors='ignore')

df

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Province_State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,381,1,0
Alaska,41,1,0
American Samoa,0,0,0
Arizona,401,6,0
Arkansas,280,2,0
California,2998,65,0
Colorado,1021,16,0
Connecticut,875,19,0
Delaware,119,0,0
Diamond Princess,49,0,0


#### Get daily deaths by State

In [438]:
# Keep only the per-state 'Deaths' totals as a one-column DataFrame.
# The column is selected by name rather than by position so the result does
# not depend on the column order produced by the aggregation step.
df_daily = df[['Deaths']]
df_daily

Unnamed: 0_level_0,Deaths
Province_State,Unnamed: 1_level_1
Alabama,1
Alaska,1
American Samoa,0
Arizona,6
Arkansas,2
California,65
Colorado,16
Connecticut,19
Delaware,0
Diamond Princess,0


#### Load Deaths by State File

In [439]:
# Read the cumulative deaths table, using its 'State' column as the row index.
df_deaths = pd.read_csv(
    death_datafile,
    index_col='State',
    encoding='utf-8',
)
df_deaths

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alaska,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,2,2,5
Arkansas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
California,0,0,0,0,0,0,0,0,0,0,...,6,7,12,13,18,23,24,30,39,50
Colorado,0,0,0,0,0,0,0,0,0,0,...,1,1,2,2,4,4,4,6,7,8
Connecticut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,3,4,8,10,12
Delaware,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Make sure the number of rows matches before inserting

In [440]:
# Print the number of state-level rows in the daily snapshot.
# No equality assertion against the cumulative table is enforced here; the
# later fill loop writes values by state label and skips labels it cannot find.
print(len(df_daily.index))

58


#### Insert Daily Deaths into Main File

In [441]:
import numpy as np

# Append a new column labelled `daily_date` at the right-hand end of the
# cumulative deaths table, filled with 0 for every state. The real values are
# written per state in a later cell.
# The position comes from df_deaths itself: the previous code referenced
# `df_confirmed` and `dft`, which are not defined in this notebook.
# DataFrame.insert raises ValueError if the column already exists, which
# guards against adding the same day twice.
df_deaths.insert(df_deaths.shape[1], daily_date, 0)

In [442]:
# Display the cumulative table so the newly inserted `daily_date` column can be inspected.
df_deaths

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Alaska,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2,2,5,0
Arkansas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
California,0,0,0,0,0,0,0,0,0,0,...,7,12,13,18,23,24,30,39,50,0
Colorado,0,0,0,0,0,0,0,0,0,0,...,1,2,2,4,4,4,6,7,8,0
Connecticut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,3,4,8,10,12,0
Delaware,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [443]:
# Copy each state's death count from the daily snapshot into the `daily_date`
# column of the cumulative table. Every state name is printed; states that
# have no row in df_deaths are skipped.
for state, deaths in df_daily['Deaths'].items():
    print(state)
    if state not in df_deaths.index:
        continue
    df_deaths.at[state, daily_date] = deaths

Alabama
Alaska
American Samoa
Arizona
Arkansas
California
Colorado
Connecticut
Delaware
Diamond Princess
District of Columbia
Florida
Georgia
Grand Princess
Guam
Hawaii
Idaho
Illinois
Indiana
Iowa
Kansas
Kentucky
Louisiana
Maine
Maryland
Massachusetts
Michigan
Minnesota
Mississippi
Missouri
Montana
Nebraska
Nevada
New Hampshire
New Jersey
New Mexico
New York
North Carolina
North Dakota
Northern Mariana Islands
Ohio
Oklahoma
Oregon
Pennsylvania
Puerto Rico
Rhode Island
South Carolina
South Dakota
Tennessee
Texas
Utah
Vermont
Virgin Islands
Virginia
Washington
West Virginia
Wisconsin
Wyoming


In [444]:
# Display the cumulative table again after the per-state values have been written.
df_deaths

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
Alaska,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
American Samoa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,2,2,5,6
Arkansas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,2
California,0,0,0,0,0,0,0,0,0,0,...,7,12,13,18,23,24,30,39,50,65
Colorado,0,0,0,0,0,0,0,0,0,0,...,1,2,2,4,4,4,6,7,8,16
Connecticut,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,3,4,8,10,12,19
Delaware,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Diamond Princess,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Save Deaths File to CSV

In [445]:
# Write the updated cumulative table back to the file it was loaded from,
# overwriting the previous contents.
df_deaths.to_csv(
    path_or_buf=death_datafile,
    encoding='utf-8',
)