## Modeling the New York Times Covid-19 Data

Implement a simple Bayesian hierarchical model to forecast cases and deaths over the next few days.


In [271]:
import datetime
import json
import os

import dateutil.parser
import herepy
import numpy as np
import pandas as pd
import plotly.graph_objects as go

# HERE geocoding client used by the address/county lookup helpers below.
# The API key is read from the HERE_API_KEY environment variable when it is set.
# SECURITY: the literal fallback is a credential committed to source; it is kept
# only so the notebook still runs unchanged. Rotate it and drop the fallback.
geocoderApi = herepy.GeocoderApi(
    os.environ.get('HERE_API_KEY', 'VbY-MyI6ZT9U8h-Y5GP5W1YaOzQuvNnL4aSTulNEyEQ')
)

# New York Times cumulative case/death counts, per county and per state.
# `fips` is read as str so that codes are not coerced to numbers.
df_counties = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv", dtype={"fips": str})
df_states = pd.read_csv("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv", dtype={"fips": str})

# Local census table; later joined on ('county', 'state') to obtain 'population'.
df_census = pd.read_csv('data/county_2019_census.csv')


In [272]:

def compute_lat_lon(df, geocoder):
    """Geocode every county row of ``df`` and return its coordinates.

    Rows whose ``fips`` value is not a ``str`` (for example NaN) are not sent
    to the geocoder and keep the placeholder coordinate 0.0. Rows for which
    the geocoder returns no usable result also keep 0.0 and are reported on
    stdout instead of aborting the whole batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``county``, ``state``, ``fips``, ``cases``
        and ``deaths``.
    geocoder : object
        Anything exposing ``free_form(query)`` whose return value has an
        ``as_json_string()`` method (the notebook passes ``geocoderApi``).

    Returns
    -------
    tuple of (list, list)
        Latitudes and longitudes, positionally aligned with the rows of ``df``.
    """
    county = list(df['county'])
    state = list(df['state'])
    fips = list(df['fips'])
    cases = list(df['cases'])
    deaths = list(df['deaths'])

    n_rows = len(county)
    lat = [0.0] * n_rows
    lon = [0.0] * n_rows

    num_to_print = 20
    # Progress line about every 5% of the rows. Clamped to 1 so that frames
    # with fewer than 20 rows do not trigger a modulo-by-zero.
    print_every = max(1, n_rows // 20)

    for i in range(n_rows):
        if not isinstance(fips[i], str):
            continue

        response = geocoder.free_form('%s, %s' % (county[i], state[i]))
        # Parse with json.loads rather than eval(): the payload comes from a
        # remote service, and JSON literals such as null/true/false are not
        # valid Python expressions.
        res = json.loads(response.as_json_string())
        try:
            position = res['Response']['View'][0]['Result'][0]['Location']['DisplayPosition']
            clat, clon = position['Latitude'], position['Longitude']
        except (KeyError, IndexError):
            print("No geocoding result for %s, %s; leaving lat/lon at 0.0"
                  % (county[i], state[i]))
            continue

        lat[i] = clat
        lon[i] = clon

        if i <= num_to_print or i % print_every == 0:
            print("%s, %s: cases=%d, deaths=%d, lat=%f, lon=%f"
                  % (county[i], state[i], cases[i], deaths[i], lat[i], lon[i]))
            if i >= num_to_print:
                print("...")
    return (lat, lon)


def lat_lon_of_address(addr):
    """Return ``(latitude, longitude)`` of the first geocoder hit for ``addr``.

    Uses the module-level ``geocoderApi`` client. Raises ``KeyError`` or
    ``IndexError`` when the response does not contain a result at the
    expected path.
    """
    response = geocoderApi.free_form(addr)
    # json.loads instead of eval(): the string comes from a remote service and
    # may contain JSON literals (null/true/false) that eval() cannot handle.
    res = json.loads(response.as_json_string())
    position = res['Response']['View'][0]['Result'][0]['Location']['DisplayPosition']
    return (position['Latitude'], position['Longitude'])


def county_state_of_address(addr):
    """Return ``(county, state)`` of the first geocoder hit for ``addr``.

    Uses the module-level ``geocoderApi`` client. The values are taken from
    the ``AdditionalData`` list of the result's address. Entries tagged with
    the keys ``'StateName'`` / ``'CountyName'`` are preferred; if such a key
    is absent the function falls back to the positions 1 (state) and 2
    (county).

    Raises ``KeyError`` or ``IndexError`` when the response does not contain
    the expected structure.
    """
    response = geocoderApi.free_form(addr)
    # json.loads instead of eval(): the string comes from a remote service and
    # may contain JSON literals (null/true/false) that eval() cannot handle.
    res = json.loads(response.as_json_string())
    additional = res['Response']['View'][0]['Result'][0]['Location']['Address']['AdditionalData']

    # Look values up by their tag so a reordered list does not swap the fields.
    by_key = {
        entry['key']: entry['value']
        for entry in additional
        if 'key' in entry and 'value' in entry
    }
    state = by_key['StateName'] if 'StateName' in by_key else additional[1]['value']
    county = by_key['CountyName'] if 'CountyName' in by_key else additional[2]['value']
    return (county, state)


In [273]:
# Geocoding every county is slow and consumes API quota, so it is off by
# default and the cached result is loaded from GitHub instead.
recompute_geocodes = False
if recompute_geocodes:
    # NOTE(review): df_recent is only assigned in a later cell of this
    # notebook, so this branch fails on a fresh kernel unless that cell has
    # been run first.
    lat, lon = compute_lat_lon(df_recent, geocoderApi)
    # Build a new frame rather than aliasing df_recent, so df_recent itself is
    # left untouched. drop(columns=...) replaces the positional axis argument,
    # which pandas 2.0 no longer accepts.
    lat_lon = (
        df_recent
        .assign(lat=lat, lon=lon)
        .drop(columns=['cases', 'deaths', 'date'])
    )
    lat_lon.to_csv('geo-counties.csv', header=True, index=False)

# Cached county coordinates; `fips` is read as str so codes are not coerced to numbers.
df_geo = pd.read_csv("https://raw.githubusercontent.com/jdlafferty/covid-19/master/geo-counties.csv", dtype={"fips": str})


In [274]:
def process_most_recent_data():
    """Build the per-county snapshot for the latest date in ``df_counties``.

    Reads the module-level frames ``df_counties``, ``df_geo`` and
    ``df_census``. The rows of the most recent date are sorted by ``cases``
    (descending), merged with ``df_geo`` on their shared columns, left-joined
    with ``df_census`` on ``('county', 'state')``, and rows whose county is
    ``'Unknown'`` are removed. Prints the snapshot date.

    Returns
    -------
    pandas.DataFrame
        The snapshot with an integer ``population`` column and a
        ``cases_per_100k`` column rounded to one decimal.

    Raises
    ------
    ValueError
        From the integer conversion if a remaining county has no census match
        (its ``population`` is NaN after the left join).
    """
    # Vectorised parse of the date column; avoids a Python-level loop over
    # every row of the full history.
    last_date = pd.to_datetime(df_counties['date']).max()
    most_recent_date = last_date.strftime("%Y-%m-%d")
    most_recent_date_long = last_date.strftime("%A %B %d, %Y")
    print("Most recent data: %s" % most_recent_date_long)

    df_recent = (
        df_counties[df_counties['date'] == most_recent_date]
        .sort_values('cases', ascending=False)
        # drop=True discards the old index directly; the former
        # reset_index().drop('index', 1) is rejected by pandas 2.0.
        .reset_index(drop=True)
    )

    df_recent = pd.merge(df_recent, df_geo)
    df_recent = pd.merge(df_recent, df_census, how='left', on=['county', 'state'])
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    df_recent = df_recent[df_recent['county'] != 'Unknown'].copy()
    df_recent['population'] = np.array(df_recent['population'], dtype='int')

    cases = np.array(df_recent['cases'])
    population = np.array(df_recent['population'])
    df_recent['cases_per_100k'] = np.round(100000 * np.array(cases / population), 1)

    return df_recent

# Materialise the latest per-county snapshot as the module-level df_recent
# (read by the map-rendering functions), then show the first 50 New York rows
# as this cell's output.
df_recent = process_most_recent_data()
df_recent.loc[df_recent['state'] == 'New York'].head(50)

Most recent data: Wednesday April 01, 2020


Unnamed: 0,date,county,state,fips,cases,deaths,lat,lon,population,cases_per_100k
0,2020-04-01,New York City,New York,,47440,1374,40.71455,-74.00714,8900000,533.0
1,2020-04-01,Westchester,New York,36119.0,10683,25,41.11909,-73.7887,967506,1104.2
2,2020-04-01,Nassau,New York,36059.0,9555,76,42.51642,-73.61127,1356924,704.2
3,2020-04-01,Suffolk,New York,36103.0,7605,69,40.96009,-72.83434,1476601,515.0
9,2020-04-01,Rockland,New York,36087.0,3321,29,41.89899,-74.83049,325789,1019.4
17,2020-04-01,Orange,New York,36071.0,1756,25,42.30562,-77.04872,384940,456.2
52,2020-04-01,Erie,New York,36029.0,553,12,42.76823,-78.77651,918702,60.2
53,2020-04-01,Dutchess,New York,36027.0,547,5,41.75992,-73.72712,294218,185.9
84,2020-04-01,Monroe,New York,36055.0,349,9,41.32813,-74.18713,741770,47.0
112,2020-04-01,Onondaga,New York,36067.0,249,1,43.00487,-76.1836,460528,54.1


In [275]:
def render_map(show=True, min_cases=1, scale=3.0):
    """Draw a US bubble map with one marker per county in ``df_recent``.

    Side effect: a hover-label column ``'text'`` is written onto the
    module-level ``df_recent`` itself (no copy is made), and
    ``render_map_with_address`` reads that column afterwards.

    Parameters
    ----------
    show : bool
        When true, display the figure (scroll-zoom disabled) before returning.
    min_cases : int
        Counties with fewer cases than this are not drawn.
    scale : float
        Divisor applied to the case count to obtain the marker area.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    counties = df_recent  # same object on purpose, see the docstring

    # Hover label: "<county>, <state>" / counts / incidence, separated by <br>.
    counties['text'] = (
        counties['county'] + ', ' + counties['state'] + '<br>'
        + counties['cases'].astype(str) + ' cases, '
        + counties['deaths'].astype(str) + ' deaths<br>'
        + counties['cases_per_100k'].astype(str) + ' cases per 100k people'
    )

    # Keep counties at or above the threshold and skip the 'Unknown' bucket.
    keep = (counties['cases'] >= min_cases) & (counties['county'] != 'Unknown')
    shown = counties[keep]

    bubble_style = {
        'size': shown['cases'] / scale,
        'color': 'rgba(255, 0, 0, 0.2)',
        'line_color': 'black',
        'line_width': 0.5,
        'sizemode': 'area',
    }
    county_trace = go.Scattergeo(
        locationmode='USA-states',
        lon=shown['lon'],
        lat=shown['lat'],
        text=shown['text'],
        name='',
        marker=bubble_style,
    )

    layout_options = {
        'width': 1000,
        'height': 700,
        'margin': {"r": 0, "t": 0, "l": 0, "b": 0},
        'showlegend': False,
        'geo': {
            'scope': 'usa',
            'landcolor': 'rgb(230, 230, 230)',
        },
    }

    fig = go.Figure()
    fig.add_trace(county_trace)
    fig.update_layout(**layout_options)

    if show:
        fig.show(config={'scrollZoom': False})

    return fig
    
    
def render_map_with_address(addr=None, show=True, scale=3.0):
    """Render the county case map with an extra green marker at ``addr``.

    The address is geocoded through ``lat_lon_of_address`` and
    ``county_state_of_address``; the marker's hover text combines the address
    with the label of the matching county row in ``df_recent``.

    Parameters
    ----------
    addr : str
        Free-form address passed to the geocoder.
    show : bool
        When true, display the finished figure (scroll-zoom disabled).
    scale : float
        Marker-area divisor, applied both to the county bubbles and to the
        address marker (drawn with area ``100/scale``).

    Returns
    -------
    None
        The figure is displayed, not returned.
    """
    # Forward `scale` so the county bubbles and the address marker share one
    # area scale. render_map also sets the figure size/geo layout and writes
    # the 'text' column on df_recent that is read below.
    fig = render_map(show=False, scale=scale)

    this_lat, this_lon = lat_lon_of_address(addr)
    this_county, this_state = county_state_of_address(addr)
    county_record = df_recent[(df_recent['county'] == this_county) & (df_recent['state'] == this_state)]
    if len(county_record) > 0:
        county_text = county_record['text'].iloc[0]
    else:
        # The geocoder's county/state naming may not match any row of the
        # data; show a placeholder instead of failing with IndexError.
        county_text = '%s, %s<br>no county data available' % (this_county, this_state)
    this_text = '%s<br>County: %s' % (addr, county_text)

    fig.add_trace(go.Scattergeo(
        locationmode='USA-states',
        lon=[this_lon],
        lat=[this_lat],
        text=[this_text],
        name='',
        marker=dict(
            size=100 / scale,
            color='rgba(0,255,0,0.2)',
            line_color='black',
            line_width=0.5,
            sizemode='area'
        ),
    ))

    # Derive the snapshot date from the data being plotted. In the visible
    # cells `most_recent_date_long` exists only as a local variable of
    # process_most_recent_data, so referencing it here as a global raises
    # NameError on a fresh run.
    most_recent_date_long = pd.to_datetime(df_recent['date']).max().strftime("%A %B %d, %Y")

    fig.update_layout(
        title={
            'text': "Data from The New York Times<br>https://github.com/nytimes/covid-19-data<br>%s" % most_recent_date_long,
            'y': 0.05,
            'x': 0.85,
            'xanchor': 'left',
            'yanchor': 'bottom'},
        font=dict(
            family="Times New Roman",
            size=6,
            color="#7f7f7f")
    )

    if show:
        fig.show(config={'scrollZoom': False})
    



In [276]:
# Example: draw the county case map with a marker placed at Sun Valley, ID.
render_map_with_address(addr='Sun Valley, ID')