In [61]:
import glob
import json
import os
import requests
import shutil

import numpy as np
import pandas as pd
import plotly.graph_objects as go

from dateutil import parser
from git import Repo
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

In [62]:
# Remove every pandas display limit so frames and long cell values render in full.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None

In [63]:
# download data: take a fresh clone of the CSSEGISandData/COVID-19 repository
git_url = 'https://github.com/CSSEGISandData/COVID-19'
repo_dir = 'data'
# Only remove a previous clone when one exists: an unconditional rmtree raises
# FileNotFoundError on the very first run, when there is nothing to delete yet.
if os.path.isdir(repo_dir):
    shutil.rmtree(repo_dir)
# Kept as the cell's last expression so the notebook displays the resulting Repo.
Repo.clone_from(git_url, repo_dir)

<git.repo.base.Repo '/home/jovyan/data/.git'>

In [64]:
# read data into pandas
path = os.path.join(os.getcwd(),repo_dir,'csse_covid_19_data/csse_covid_19_daily_reports')

def _report_sort_key(csv_path):
    '''Sort key that orders daily-report files by the date in their file name.

       param: csv_path: path of one report CSV
       return: tuple that sorts dated reports chronologically and puts files
               whose name is not a date after them, ordered by path
    '''
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    # assumes reports are named MM-DD-YYYY.csv - TODO confirm against the repo layout
    report_date = pd.to_datetime(stem, format='%m-%d-%Y', errors='coerce')
    if pd.isnull(report_date):
        return (1, pd.Timestamp.max, csv_path)
    return (0, report_date, csv_path)

# glob yields files in arbitrary order, but the per-location lists built from df
# later are only meaningful as time series if rows are concatenated chronologically.
# A plain string sort of MM-DD-YYYY names would misorder reports across years,
# hence the date-based key.
all_files = sorted(glob.iglob(os.path.join(path, "*.csv")), key=_report_sort_key)
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [65]:
# fix location and time due to schema changes since start of data collection
def fix_location(state_1,state_2,region_1,region_2,county):
    '''Build one location label that works for both the old and the new schema.

       param: state_1: value of the original field Province/State
       param: state_2: value of the recent field Province_State
       param: region_1: value of the original field Country/Region
       param: region_2: value of the recent field Country_Region
       param: county: value of the recent field Admin2 (US Counties)
       return: location: county, state and region joined with ', ', leaving out
               every part that is null or empty (similar to Combined_Key but
               valid for all rows)
    '''
    def first_present(older, newer):
        # The original-schema value wins whenever it is non-null.
        for candidate in (older, newer):
            if not pd.isnull(candidate):
                return candidate
        return None

    parts = [
        None if pd.isnull(county) else county,
        first_present(state_1, state_2),
        first_present(region_1, region_2),
    ]
    # Falsy parts (None as well as '') are dropped before joining.
    return ', '.join(part for part in parts if part)

# fix time
def fix_time(update_1, update_2):
    '''Parse whichever update timestamp a row carries.

       param: update_1: original update timestamp (Last Update)
       param: update_2: newer update timestamp (Last_Update)
       return: time: python datetime object parsed from update_1 when it is
               non-null, otherwise from update_2; np.nan when both are null
    '''
    for raw_stamp in (update_1, update_2):
        if not pd.isnull(raw_stamp):
            return parser.parse(raw_stamp)
    return np.nan

# Collapse the old- and new-schema columns of every row into the unified
# 'Location' and 'Time' columns.
df['Location'] = df[['Province/State',
                     'Province_State',
                     'Country/Region',
                     'Country_Region',
                     'Admin2']].apply(lambda fields: fix_location(*fields), axis=1)
df['Time'] = df[['Last Update', 'Last_Update']].apply(lambda stamps: fix_time(*stamps), axis=1)

In [66]:
# One row per (Location, FIPS) pair: the pair's 'Confirmed' (cases), 'Deaths' and
# 'Time' values are each gathered into a list. Rows with a missing FIPS are left
# out by groupby's default handling of NaN keys.
df_by_location = (
    df.groupby(['Location', 'FIPS'])
      .agg(dict.fromkeys(['Confirmed', 'Deaths', 'Time'], list))
      .reset_index()
)

In [67]:
# add population data from US census API
# https://www.census.gov/data/developers/guidance/api-user-guide.html

# get population data for all US counties
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# JSON/DataFrame failure further down.
response = requests.get("https://api.census.gov/data/2019/pep/population?get=POP&for=county:*&in=state:*",
                        timeout=30)
response.raise_for_status()
# The first element of the payload is skipped (presumably a header row - confirm
# against the API guide); the rest become population / state code / county code.
pop_data = pd.DataFrame(response.json()[1:],columns=['population','state_code','county_code'])
# County FIPS string = state code followed by county code (vectorized concatenation).
pop_data['FIPS_str'] = pop_data['state_code'] + pop_data['county_code']
pop_data['population'] = pop_data['population'].astype(int)

def fix_FIPS(fips):
    '''
    convert a FIPS code to its zero-padded 5-character string form

    param: fips: FIPS code as a number (e.g. 1001.0) or numeric string
    return: string such as '01001'; np.nan when fips is missing (None/NaN)
            or otherwise falsy (0, '')
    '''
    # NaN is truthy, so it needs an explicit null check - int(nan) would raise.
    if pd.isnull(fips) or not fips:
        return np.nan
    # zfill restores however many leading zeros the numeric form lost,
    # not just a single one.
    return str(int(fips)).zfill(5)

# Attach each location's population via its FIPS string; the left join keeps
# locations that have no match in pop_data (their population is then NaN).
df_by_location['FIPS_str'] = df_by_location['FIPS'].apply(fix_FIPS)
df_by_location = df_by_location.merge(pop_data[['population','FIPS_str']],
                                      how='left',
                                      on='FIPS_str')


In [68]:
# fix population for NYC - the data set assigns all boroughs to the Manhattan
# FIPS (36061), so that row gets the summed population of the listed borough codes
nyc_fips = ['36005','36047','36061','36081','36085']
nyc_pop = pop_data.loc[pop_data['FIPS_str'].isin(nyc_fips), 'population'].sum()
df_by_location.loc[df_by_location['FIPS_str'].eq('36061'), 'population'] = nyc_pop

In [69]:
# truncate to minimum case count
MIN_CASE_COUNT = 100
def threshold(s, min_case_count):
    '''Drop the leading part of a location's series, before cases exceed a minimum.

       param: s: row-like mapping (pandas Series or dict) with parallel lists
                 under 'Confirmed', 'Deaths' and 'Time'
       param: min_case_count: the series start at the first entry whose
                 'Confirmed' value is strictly greater than this
       return: copy of s whose three lists are truncated to start at that entry;
               all three are empty if the count is never exceeded
    '''
    confirmed = s['Confirmed']
    # Default to len(confirmed) so the slices come out empty when no entry
    # qualifies. This also covers an empty list, which used to raise
    # UnboundLocalError because the loop variable was never bound.
    start = next((idx for idx, cnt in enumerate(confirmed) if cnt > min_case_count),
                 len(confirmed))
    # Work on a shallow copy: pandas does not support functions that mutate the
    # object handed to them by DataFrame.apply.
    s = s.copy()
    for col in ('Confirmed', 'Deaths', 'Time'):
        s[col] = s[col][start:]
    return s

# Truncate every location's series; apply forwards the keyword argument to threshold.
df_by_location = df_by_location.apply(threshold, axis=1, min_case_count=MIN_CASE_COUNT)

In [70]:
# plotting
def add_traces(row, col, fig, do_diff, pop_upper_thresh, pop_lower_thresh, visible_locations):
    '''Add one location's per-million series to fig as a lines+markers trace.

       param: row: row-like mapping with 'population', 'Time', 'Location' and the
                   list stored under col
       param: col: name of the list-valued field to plot (e.g. 'Confirmed')
       param: fig: figure the trace is added to (modified in place)
       param: do_diff: if True, plot the differences between consecutive entries
                   instead of the series itself
       param: pop_upper_thresh: exclusive upper bound on the population
       param: pop_lower_thresh: exclusive lower bound on the population
       param: visible_locations: locations shown initially; every other trace
                   starts as "legendonly"
       return: fig - unchanged when the population is outside the bounds or NaN
    '''
    pop = row['population']
    # The chained comparison is False for NaN as well, so locations without a
    # population are skipped before any per-capita arithmetic is attempted.
    if not (pop_lower_thresh < pop < pop_upper_thresh):
        return fig

    y = [1000000*cases/pop for cases in row[col]]
    time = row['Time']
    if do_diff:
        y = np.diff(y)
        # diff[i] = y[i+1] - y[i] has one element fewer than the input: drop the
        # first timestamp so each hover text still belongs to its own point.
        time = time[1:]

    location = row['Location']
    fig.add_trace(go.Scatter(y=y,
                    mode='lines+markers',
                    name=location,
                    text=time,
                    visible=True if location in visible_locations else "legendonly"))
    return fig

In [71]:
# plotting configuration
plot_col = 'Confirmed'  # set to 'Deaths' to plot deaths instead
# locations whose traces are shown initially
visible_locations = [
    'Cumberland, Maine, US',
    'Alameda, California, US',
    'Santa Clara, California, US',
    'San Mateo, California, US',
    'San Francisco, California, US',
    'Contra Costa, California, US',
    'Maricopa, Arizona, US',
    'Los Angeles, California, US',
]
population_threshold = 100000
title = dict(
    text="COVID-19 by US Counties - Add / Remove Counties by Clicking on the Legend",
    y=0.9,
    x=0.5,
    xanchor='center',
    yanchor='top',
)
font = {
    'family': "Courier New, monospace",
    'size': 10,
    'color': "#7f7f7f",
}

In [72]:
# Plot cases over time: one trace per location whose population exceeds
# population_threshold (no upper bound).
plot_new_cases = False
yaxis_label = "{}Cases per Million".format('' if not plot_new_cases else 'Daily ')

fig1 = go.Figure()
for _, location_row in df_by_location.iterrows():
    # add_traces adds to the figure it is given and hands the same figure back
    fig1 = add_traces(location_row, plot_col, fig1, plot_new_cases,
                      pop_upper_thresh=np.inf,
                      pop_lower_thresh=population_threshold,
                      visible_locations=visible_locations)

fig1 = fig1.update_layout(
    font=font,
    title=title,
    yaxis_title=yaxis_label,
    xaxis_title="Days since {} cases.".format(MIN_CASE_COUNT),
)

iplot(fig1)

In [73]:
# Plot new cases over time: same traces as the cumulative plot, but each series
# is differenced before plotting (do_diff=True).
plot_new_cases = True
yaxis_label = "{}Cases per Million".format('' if not plot_new_cases else 'Daily ')

fig2 = go.Figure()
for _, location_row in df_by_location.iterrows():
    # add_traces adds to the figure it is given and hands the same figure back
    fig2 = add_traces(location_row, plot_col, fig2, plot_new_cases,
                      pop_upper_thresh=np.inf,
                      pop_lower_thresh=population_threshold,
                      visible_locations=visible_locations)

fig2 = fig2.update_layout(
    font=font,
    title=title,
    yaxis_title=yaxis_label,
    xaxis_title="Days since {} cases.".format(MIN_CASE_COUNT),
)

iplot(fig2)