In [98]:
import os
import glob
import pandas as pd
from git import Repo
import plotly.express as px
import plotly.graph_objects as go

In [82]:
# Lift every pandas display cap (rows, columns, total width, per-cell width)
# so frames are rendered in full instead of being elided.
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

In [4]:
# download data
# Repo.clone_from refuses to clone into an existing non-empty directory, so a
# second run of this cell (or Restart & Run All) would fail. Reuse the local
# checkout when one is already present; delete `repo_dir` (or pull manually)
# to refresh the data.
git_url = 'https://github.com/CSSEGISandData/COVID-19'
repo_dir = 'data'
if os.path.isdir(os.path.join(repo_dir, '.git')):
    repo = Repo(repo_dir)
else:
    repo = Repo.clone_from(git_url, repo_dir)
repo

<git.repo.base.Repo '/home/jovyan/data/.git'>

In [117]:
# read data into pandas
path = os.path.join(os.getcwd(),repo_dir,'csse_covid_19_data/csse_covid_19_daily_reports')


def _report_sort_key(csv_path):
    '''Sort key that orders daily report files chronologically.

    Assumes report files are named MM-DD-YYYY.csv -- TODO confirm against the
    checked-out repository. Files whose name does not parse as such a date are
    placed after all dated files, ordered by name.
    '''
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    parsed = pd.to_datetime(stem, format='%m-%d-%Y', errors='coerce')
    undated = pd.isnull(parsed)
    return (undated, pd.Timestamp.min if undated else parsed, stem)


# glob returns paths in arbitrary filesystem order. The per-location lists
# built later keep row order, so the files must be concatenated oldest-first
# for the resulting series to be chronological.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")), key=_report_sort_key)
if not all_files:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError('no daily report CSV files found in {}'.format(path))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [118]:
# Show 10 randomly chosen rows of the concatenated frame as a quick sanity
# check. No random_state is passed, so the rows differ on every run.
df.sample(10)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Active,Combined_Key
26275,,,,0.0,0.0,0.0,,,46079.0,Lake,South Dakota,US,2020-03-27 22:14:55,44.021931,-97.129264,0.0,"Lake, South Dakota, US"
133806,,,,25.0,1.0,0.0,,,13075.0,Cook,Georgia,US,2020-05-03 02:32:28,31.160685,-83.429472,24.0,"Cook, Georgia, US"
137382,,,,59.0,1.0,0.0,,,13127.0,Glynn,Georgia,US,2020-05-04 02:32:28,31.232793,-81.538114,58.0,"Glynn, Georgia, US"
110527,,,,522.0,6.0,0.0,,,90009.0,Unassigned,Connecticut,US,2020-04-25 06:30:53,,,516.0,"Unassigned, Connecticut, US"
158135,,,,131.0,1.0,0.0,,,51175.0,Southampton,Virginia,US,2020-05-10 02:32:30,36.722311,-77.106427,130.0,"Southampton, Virginia, US"
46827,,,,2.0,1.0,0.0,,,13149.0,Heard,Georgia,US,2020-04-03 22:46:37,33.296866,-85.128868,0.0,"Heard, Georgia, US"
111249,,,,31.0,0.0,0.0,,,37013.0,Beaufort,North Carolina,US,2020-04-26 02:30:51,35.485319,-76.843258,31.0,"Beaufort, North Carolina, US"
38442,,,,4.0,0.0,0.0,,,40001.0,Adair,Oklahoma,US,2020-03-31 23:43:56,35.884942,-94.658593,0.0,"Adair, Oklahoma, US"
9212,,,,0.0,0.0,0.0,,,18099.0,Marshall,Indiana,US,3/22/20 23:45,41.324674,-86.261851,0.0,"Marshall, Indiana, US"
53202,,,,4.0,0.0,0.0,,,36097.0,Schuyler,New York,US,2020-04-05 23:06:45,42.39184,-76.87733,0.0,"Schuyler, New York, US"


In [120]:
# fix location and time due to schema changes since start of data collection
def fix_location(state_1,state_2,region_1,region_2,county):
    '''Build one location string from the old- and new-schema location fields.

       For state and region the original-schema value wins when it is present;
       otherwise the recent-schema value is used. Missing (null) and empty
       parts are left out of the result.

       param: state_1: original field Province/State
       param: state_2: recent field Province_State
       param: region_1: original field Country/Region
       param: region_2: recent field Country_Region
       param: county: recent field Admin2 (US Counties)
       return: location: "county, state, region" joined with ', ', similar to
               Combined_Key but valid for all rows ('' when nothing is present)
    '''
    def _first_present(*candidates):
        # First non-null candidate in priority order, or None if all are null.
        for candidate in candidates:
            if pd.notnull(candidate):
                return candidate
        return None

    parts = [
        _first_present(county),
        _first_present(state_1, state_2),
        _first_present(region_1, region_2),
    ]
    return ', '.join(part for part in parts if part)

# fix time
def fix_time(update_1, update_2):
    '''Parse whichever update timestamp a row carries.

       The original-schema value is preferred; the newer one is used only when
       the original is null. Parsing goes through pandas (already imported by
       this notebook) rather than `parser`/`np`, which are never imported here
       and would raise NameError on a fresh kernel.

       param: update_1: original update timestamp (Last Update)
       param: update_2: newer update timestamp (Last_Update)
       return: time: python datetime object, or pd.NaT when both inputs are null
       raises: ValueError if the chosen value cannot be parsed as a date/time
    '''
    for raw in (update_1, update_2):
        if not pd.isnull(raw):
            return pd.to_datetime(raw).to_pydatetime()
    # Null marker that pandas treats as a missing datetime.
    return pd.NaT

# Derive the unified 'Location' and 'Time' columns row by row. Only the columns
# each helper needs are selected, in the helper's positional argument order,
# so a row can be unpacked straight into the call.
location_fields = ['Province/State', 'Province_State',
                   'Country/Region', 'Country_Region', 'Admin2']
df['Location'] = df[location_fields].apply(lambda row: fix_location(*row), axis=1)

time_fields = ['Last Update', 'Last_Update']
df['Time'] = df[time_fields].apply(lambda row: fix_time(*row), axis=1)

In [122]:
# Collapse the frame to one row per location; each of 'Confirmed', 'Deaths'
# and 'Time' becomes a list of that location's values in row order.
series_columns = ('Confirmed', 'Deaths', 'Time')
collect_as_list = {column: list for column in series_columns}
grouped_by_location = df.groupby('Location')
df_by_location = grouped_by_location.agg(collect_as_list).reset_index()

In [123]:
# truncate each location's series so it starts at the first entry whose
# confirmed count exceeds this minimum case count
MIN_CASE_COUNT = 100
def threshold(s, min_case_count):
    '''Drop the leading part of a location's series below the case threshold.

       param: s: row (or any mapping) holding the parallel lists 'Confirmed',
                 'Deaths' and 'Time'; it is modified in place
       param: min_case_count: the series is cut to start at the first entry
                 whose confirmed count is strictly greater than this value
       return: s: the same object, with all three lists truncated identically;
                 the lists are empty when no entry exceeds the threshold
                 (including when 'Confirmed' is empty to begin with)
    '''
    confirmed = s['Confirmed']
    # Index of the first count above the threshold; len(confirmed) when there
    # is none, which makes every slice below empty. This also covers an empty
    # list, where the original loop variable was never bound.
    start = next(
        (idx for idx, cnt in enumerate(confirmed) if cnt > min_case_count),
        len(confirmed),
    )
    for column in ('Confirmed', 'Deaths', 'Time'):
        s[column] = s[column][start:]
    return s

# Truncate every location row; MIN_CASE_COUNT is forwarded to threshold as its
# second positional argument.
df_by_location = df_by_location.apply(threshold, axis=1, args=(MIN_CASE_COUNT,))

In [158]:
# plotting utility
def add_traces(df, location, pop, col, fig):
    '''Add one location's series to a figure, scaled by `pop`.

       param: df: frame with a 'Location' column plus list-valued `col` and
                  'Time' columns (one row per location)
       param: location: value to match in the 'Location' column; the first
                  matching row is used (IndexError if there is none)
       param: pop: divisor applied to every value of the series
       param: col: name of the list-valued column to plot
       param: fig: figure the trace is added to
       return: fig: the same figure, with a lines+markers trace named after
                  the location and the row's 'Time' list as point text
    '''
    matching_rows = df[df['Location'] == location]
    raw_values = matching_rows[col].iloc[0]
    scaled_values = [value / pop for value in raw_values]
    timestamps = matching_rows['Time'].iloc[0]
    trace = go.Scatter(y=scaled_values,
                       mode='lines+markers',
                       name=location,
                       text=timestamps)
    fig.add_trace(trace)
    return fig

In [166]:
# plot cases / deaths
plot_col = 'Confirmed'
#plot_col = 'Deaths'

# Location key -> divisor passed to add_traces.
# NOTE(review): the values look like resident population in millions (e.g.
# Germany 83.02), which makes the plotted quantity "per million residents" --
# confirm the source and vintage of these figures.
POPULATION_MILLIONS = {
    'Alameda, California, US': 1.671,
    'San Francisco, California, US': 0.881549,
    'Santa Clara, California, US': 1.933,
    'San Mateo, California, US': 0.766573,
    'Contra Costa, California, US': 1.154,
    'Los Angeles, California, US': 10.039107,
    'District of Columbia, District of Columbia, US': 0.702455,
    'Miami-Dade, Florida, US': 2.716940,
    'Suffolk, Massachusetts, US': 0.803907,
    'King, Washington, US': 2.253,
    'Cook, Illinois, US': 5.15,
    'Wayne, Michigan, US': 1.749343,
    'Clark, Nevada, US': 2.267,
    'Cumberland, Maine, US': 0.295003,
    'New York City, New York, US': 8.336817,
    'Hubei, China': 58.5,
    'Germany': 83.02,
    'Italy': 60.36,
    'Spain': 46.94,
}

# One trace per location, added in the order listed above.
fig = go.Figure()
for location, population in POPULATION_MILLIONS.items():
    fig = add_traces(df_by_location, location, population, plot_col, fig)

# Derive the labels from plot_col so switching to 'Deaths' does not leave a
# "Confirmed Cases" title behind, and name the y axis for what is actually
# plotted (values divided by population, not raw counts).
series_label = {'Confirmed': 'Confirmed Cases', 'Deaths': 'Deaths'}.get(plot_col, plot_col)

fig.update_layout(
    title="COVID-19 {}".format(series_label),
    xaxis_title="Days since {} confirmed cases".format(MIN_CASE_COUNT),
    yaxis_title="{} per million residents".format(series_label),
    showlegend=False,
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
fig.show()