In [11]:
import os
import glob
import shutil
import pandas as pd
from git import Repo
from dateutil import parser
import plotly.graph_objects as go

In [6]:
# lift every pandas display limit so frames are rendered in full (rows, columns, line width, cell width)
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

In [7]:
# download data: replace any previous clone with a fresh copy of the repository
git_url = 'https://github.com/CSSEGISandData/COVID-19'
repo_dir = 'data'
# Only delete the old clone when it exists: on a fresh checkout there is no
# 'data' directory yet and an unconditional rmtree raises FileNotFoundError,
# which stops the notebook before anything has been downloaded.
if os.path.isdir(repo_dir):
    shutil.rmtree(repo_dir)
Repo.clone_from(git_url, repo_dir)

<git.repo.base.Repo '/home/jovyan/data/.git'>

In [8]:
# read data into pandas
path = os.path.join(os.getcwd(),repo_dir,'csse_covid_19_data/csse_covid_19_daily_reports')


def _report_date(csv_path):
    '''Return the report date encoded in a daily-report file name.

       param: csv_path: path to a daily report; the file name is expected to be MM-DD-YYYY.csv
       return: pandas Timestamp for that date (raises ValueError for any other file name)
    '''
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    return pd.to_datetime(stem, format='%m-%d-%Y')


# glob yields files in arbitrary directory order. The rows are later collected
# into per-location lists and plotted against their position, so the reports
# must be concatenated oldest-first. Sorting on the parsed date (not the raw
# name) keeps the order correct across year boundaries.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")), key=_report_date)
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [9]:
# eyeball ten random rows to see which of the old/new schema columns are populated
df.sample(n=10)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Active,Combined_Key
168343,,,,31.0,0.0,0.0,,,46135.0,Yankton,South Dakota,US,2020-05-13 03:32:26,43.009245,-97.394676,31.0,"Yankton, South Dakota, US"
68547,,,,89.0,0.0,0.0,,,54003.0,Berkeley,West Virginia,US,2020-04-11 22:45:33,39.467046,-78.024146,0.0,"Berkeley, West Virginia, US"
177301,,,,0.0,0.0,0.0,,,56031.0,Platte,Wyoming,US,2020-05-16 02:32:19,42.132991,-104.966331,0.0,"Platte, Wyoming, US"
48049,,,,1.0,0.0,0.0,,,12125.0,Union,Florida,US,2020-04-03 22:46:37,30.04413,-82.374974,0.0,"Union, Florida, US"
171676,,,,198.0,0.0,198.0,,,,,Shanxi,China,2020-05-14 03:32:28,37.5777,112.2922,0.0,"Shanxi, China"
115131,,,,13.0,0.0,0.0,,,18049.0,Fulton,Indiana,US,2020-04-27 02:30:52,41.046991,-86.262527,13.0,"Fulton, Indiana, US"
27719,,,,0.0,0.0,0.0,,,27165.0,Watonwan,Minnesota,US,2020-03-27 22:14:55,43.978239,-94.614138,0.0,"Watonwan, Minnesota, US"
66656,,,,2.0,0.0,0.0,,,27067.0,Kandiyohi,Minnesota,US,2020-04-10 22:54:07,45.152606,-95.005864,0.0,"Kandiyohi, Minnesota, US"
129287,,,,247.0,22.0,0.0,,,12111.0,St. Lucie,Florida,US,2020-05-01 02:32:28,27.377639,-80.471066,225.0,"St. Lucie, Florida, US"
130650,,,,53.0,4.0,0.0,,,26039.0,Crawford,Michigan,US,2020-05-02 02:32:27,44.682556,-84.610062,49.0,"Crawford, Michigan, US"


In [12]:
# fix location and time due to schema changes since start of data collection
def fix_location(state_1,state_2,region_1,region_2,county):
    '''Build one location string from whichever schema's columns are filled in.

       param: state_1: original field Province/State
       param: state_2: recent field Province_State
       param: region_1: original field Country/Region
       param: region_2: recent field Country_Region
       param: county: recent field Admin2 (US Counties)
       return: location: "county, state, region" with missing or empty parts left out
               (similar to Combined_Key but valid for all rows); '' when nothing is set
    '''
    def first_present(*candidates):
        # earliest candidate that is not null, or None when every one is missing
        return next((value for value in candidates if pd.notnull(value)), None)

    # the original-schema column wins over the recent one when both are present
    parts = [
        first_present(county),
        first_present(state_1, state_2),
        first_present(region_1, region_2),
    ]
    return ', '.join(part for part in parts if part)

# fix time
def fix_time(update_1, update_2):
    '''Return a row's update timestamp from whichever schema column is filled in.

       param: update_1: original update timestamp (Last Update)
       param: update_2: newer update timestamp (Last_Update)
       return: time: python datetime object, or pd.NaT when both timestamps are missing
    '''
    if not pd.isnull(update_1):
        time = parser.parse(update_1)
    elif not pd.isnull(update_2):
        time = parser.parse(update_2)
    else:
        # This branch used np.nan, but numpy is never imported in this notebook,
        # so a row without any timestamp raised NameError. pd.NaT is pandas'
        # missing-timestamp marker and is still recognised by pd.isnull.
        time = pd.NaT

    return time

def _row_location(row):
    '''Adapter: pull the old/new location columns out of one raw row for fix_location.'''
    return fix_location(row['Province/State'],
                        row['Province_State'],
                        row['Country/Region'],
                        row['Country_Region'],
                        row['Admin2'])


def _row_time(row):
    '''Adapter: pull the old/new timestamp columns out of one raw row for fix_time.'''
    return fix_time(row['Last Update'], row['Last_Update'])


# row-wise: every record gets a schema-independent Location and Time
df['Location'] = df.apply(_row_location, axis=1)
df['Time'] = df.apply(_row_time, axis=1)

In [13]:
# one row per location; each of 'Confirmed', 'Deaths' and 'Time' becomes the list
# of that location's values, in the order the rows appear in df
list_aggregations = {column: list for column in ('Confirmed', 'Deaths', 'Time')}
grouped_by_location = df.groupby('Location')
df_by_location = grouped_by_location.agg(list_aggregations).reset_index()

In [14]:
# truncate each location's series so it starts at the minimum case count
# a series begins at the first entry whose confirmed count is strictly greater than this value
MIN_CASE_COUNT = 100
def threshold(s, min_case_count):
    '''Drop the leading entries of a location's series that precede the case threshold.

       param: s: row (or mapping) holding equally long lists under 'Confirmed', 'Deaths' and 'Time'
       param: min_case_count: the series is cut to start at the first confirmed count
              strictly greater than this value
       return: s: the same object, with all three lists truncated from that position on;
               the lists end up empty when the threshold is never exceeded
    '''
    confirmed = s['Confirmed']
    # Position of the first count above the threshold. Falling back to
    # len(confirmed) yields empty slices when no count qualifies, and also covers
    # an empty input list, which previously failed on an unbound loop index.
    start = next((pos for pos, cnt in enumerate(confirmed) if cnt > min_case_count),
                 len(confirmed))
    s['Confirmed'] = confirmed[start:]
    s['Deaths'] = s['Deaths'][start:]
    s['Time'] = s['Time'][start:]
    return s

# cut every location's lists at MIN_CASE_COUNT, one row at a time
df_by_location = df_by_location.apply(threshold, axis=1, args=(MIN_CASE_COUNT,))

In [16]:
# plotting utility
def add_traces(df, location, pop, col, fig):
    '''Add one location's population-scaled series to a plotly figure.

       param: df: frame with a 'Location' column and list-valued col / 'Time' columns
       param: location: value of 'Location' to plot; the first matching row is used
       param: pop: divisor applied to every value of the series
       param: col: name of the list-valued column to plot
       param: fig: plotly figure that receives the new trace
       return: fig: the figure that was passed in, with the trace added
    '''
    matching_rows = df[df['Location']==location]
    series = matching_rows[col].iloc[0]
    scaled = []
    for value in series:
        scaled.append(value/pop)
    timestamps = matching_rows['Time'].iloc[0]
    # no x is given, so points are plotted against their position in the list;
    # the timestamps are attached as the trace's text
    trace = go.Scatter(y=scaled,
                       mode='lines+markers',
                       name=location,
                       text=timestamps)
    fig.add_trace(trace)
    return fig

In [21]:
# plot cases / deaths per million residents
# column to plot: 'Confirmed' or 'Deaths'
plot_col = 'Deaths'

# Population of each plotted location, in millions of residents, keyed by the
# Location string built earlier. Traces are added in this order.
# get population: https://api.census.gov/data/2019/pep/population?get=POP&for=county:037&in=state:06
POPULATION_MILLIONS = {
    'Alameda, California, US': 1.671,
    'San Francisco, California, US': 0.881549,
    'Santa Clara, California, US': 1.933,
    'San Mateo, California, US': 0.766573,
    'Contra Costa, California, US': 1.154,
    'Los Angeles, California, US': 10.039107,
    'District of Columbia, District of Columbia, US': 0.702455,
    'Miami-Dade, Florida, US': 2.716940,
    'Suffolk, Massachusetts, US': 0.803907,
    'King, Washington, US': 2.253,
    'Cook, Illinois, US': 5.15,
    'Wayne, Michigan, US': 1.749343,
    'Clark, Nevada, US': 2.267,
    'Cumberland, Maine, US': 0.295003,
    'New York City, New York, US': 8.336817,
    'Hubei, China': 58.5,
    'Germany': 83.02,
    'Italy': 60.36,
    'Spain': 46.94,
}

fig = go.Figure()
for location, population in POPULATION_MILLIONS.items():
    fig = add_traces(df_by_location, location, population, plot_col, fig)

# The plotted values are divided by population in millions and may be deaths,
# so the labels name the plotted column and the per-million scaling instead of
# the former fixed "Number of Cases".
fig.update_layout(
    title="COVID-19 {} per million residents".format(plot_col),
    xaxis_title="Days since {} confirmed cases".format(MIN_CASE_COUNT),
    yaxis_title="{} per million residents".format(plot_col),
    showlegend=False,
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
fig.show()