# Exploration of NY Times COVID-19 data - pick your county

Data from The New York Times, based on reports from state and local health agencies.
The Times publishes its own tracker built on these data at https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html.

**Remember to `git pull upstream master` every day.** The data include totals up to and including yesterday.

Start date: 2020-07-08

In [1]:
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np

import bokeh.plotting
import bokeh.models
import bokeh.io

import colorcet

import tqdm

bokeh.io.output_notebook()

In [2]:
%load_ext blackcellmagic

In [3]:
# Yesterday's date as a midnight timestamp: today's date with the time of day
# dropped, minus one day.
yesterday = pd.to_datetime(pd.to_datetime("today").date()) - pd.DateOffset(days=1)

In [58]:
# County-level counts from the NYT CSV. The count columns are renamed to
# "total ..." because the per-day values are derived from them later by
# differencing.
df_counties = pd.read_csv("us-counties.csv")
df_counties = df_counties.rename(
    columns={"cases": "total cases", "deaths": "total deaths"}
)
df_counties["date"] = pd.to_datetime(df_counties["date"])

# Population table: only spreadsheet columns A and M are read, and they end up
# named "geographic area" and "population".
# NOTE(review): the skipped rows/footer are presumably titles and footnotes in
# the Census workbook -- confirm against the file.
df_pop = pd.read_excel(
    "co-est2019-annres.xlsx",
    usecols="A,M",
    skiprows=[0, 1, 2, 4],
    skipfooter=6,
)
df_pop = df_pop.rename(
    columns={"Unnamed: 0": "geographic area", 2019: "population"}
)

# Split "<name> County, <state>" into its two halves, trimming spaces (and
# dots on the county side), then discard the combined column.
# NOTE(review): an area whose label does not contain "County," gets no state
# from this split and so cannot match in the merge below -- confirm intended.
df_pop = df_pop.assign(
    county=lambda d: d["geographic area"]
    .str.split("County,", expand=True)[0]
    .str.strip(" ."),
    state=lambda d: d["geographic area"]
    .str.split("County,", expand=True)[1]
    .str.strip(" "),
).drop(columns="geographic area")

# Default merge: joins on the columns the two frames have in common.
df_counties = df_counties.merge(df_pop)

In [77]:
def get_county_data(county, state):
    """
    Select the rows of the module-level `df_counties` table for one county.

    Arguments:
    `county`: county name, compared for exact equality with the 'county' column
    `state`: state name, compared for exact equality with the 'state' column

    Returns the matching rows of `df_counties` as a Pandas Data Frame.

    Raises `RuntimeError` when the county name or the state name does not
    occur anywhere in `df_counties`, or when both occur but never together.
    """
    # Check each name on its own first so the error says which one is wrong.
    known_counties = df_counties['county'].values
    if county not in known_counties:
        raise RuntimeError(f"{county} is not a valid county name.")

    known_states = df_counties['state'].values
    if state not in known_states:
        raise RuntimeError(f"{state} is not a valid state.")

    county_matches = df_counties['county'] == county
    state_matches = df_counties['state'] == state
    selection = df_counties.loc[county_matches & state_matches]

    # Both names exist, just not as a pair.
    if selection.empty:
        raise RuntimeError(f"{county} is not in {state}.")

    return selection

In [60]:
def _add_daily_change(df, total_col, new_col):
    """Add `new_col` (row-to-row change of `total_col`) and its 7-row rolling mean to `df` in place."""
    totals = df[total_col].to_numpy(dtype=float)

    # Prepending 0 makes the first entry equal to the first running total, so
    # the series starts from the first reported count. Unlike indexing
    # element 0 directly, this also works for an empty frame.
    df[new_col] = np.diff(totals, prepend=0.0)

    # A full 7-row window is required, so the first six rows are NaN.
    df[f"{new_col} (7 day average)"] = df[new_col].rolling(window=7).mean()


def add_new_per_day(df):
    """
    Adds columns to Dataframe for new cases/deaths per day and 7 day average.

    Argument `df` is Pandas Dataframe output from `get_county_data()`; it must
    have 'total cases' and 'total deaths' columns. Rows are differenced in the
    order they appear (presumably one row per day in chronological order --
    confirm against the data source).

    The columns 'new cases', 'new cases (7 day average)', 'new deaths' and
    'new deaths (7 day average)' are added to `df` itself, which is also
    returned. The first row of each 'new ...' column equals the first total.
    An empty `df` gets the four (empty) columns instead of raising.
    """
    _add_daily_change(df, "total cases", "new cases")
    _add_daily_change(df, "total deaths", "new deaths")
    return df

In [137]:
def covid_plots(county, state, measurement, per_100000=False, timespan="all", lab_re_opening_label=False):
    """
    Draws a line plot of one measurement over time for a single county and
    displays it with `bokeh.io.show()`.

    Arguments:
    `county`: county name
    `state`: state name
    `measurement`: column to plot; choose from 'total cases',
                'total deaths', 'new cases', 'new cases (7 day average)',
                'new deaths', 'new deaths (7 day average)'.
    `per_100000`: if true, the measurement is divided by the county's
                population and multiplied by 100,000. Defaults to `False`.
    `timespan`: 'all' (default) leaves the x-axis range alone; 'two weeks'
                limits it to the 14 days ending yesterday.
    `lab_re_opening_label`: if true, a dashed vertical line and a text label
                are drawn at 2020-06-08, the Caltech lab re-opening date.
                Defaults to `False`.

    Uses the `get_county_data()` and `add_new_per_day()` functions; a
    `RuntimeError` from `get_county_data()` propagates to the caller.
    """
    county_df = get_county_data(county, state)

    # End points of the optional two-week window (14 days, both ends included).
    window_end = pd.to_datetime(pd.to_datetime("today").date()) - pd.DateOffset(days=1)
    window_start = window_end - pd.DateOffset(days=13)

    # The per-day columns only exist once they have been derived.
    if 'new' in measurement:
        county_df = add_new_per_day(county_df)

    if per_100000:
        y_label = str(measurement)+" (per 100,000)"
    else:
        y_label = str(measurement)

    fig = bokeh.plotting.figure(
        frame_height=300,
        frame_width=600,
        title=f"{county} County, {state}",
        x_axis_type="datetime",
        x_axis_label="date",
        y_axis_label=y_label,
    )

    if per_100000:
        scaled = county_df[measurement] / county_df['population'] * 100000
        fig.line(x=county_df["date"], y=scaled, line_width=2)
    else:
        fig.line(source=county_df, x="date", y=measurement, line_width=2)

    # Plain numbers on the y-axis rather than scientific notation.
    fig.yaxis[0].formatter = bokeh.models.formatters.BasicTickFormatter(
        use_scientific=False
    )

    if lab_re_opening_label:
        re_opening_date = pd.to_datetime("2020-06-08")

        marker_line = bokeh.models.Span(
            location=re_opening_date,
            dimension="height",
            line_color="black",
            line_dash="dashed",
            line_width=2,
        )
        # The label text is anchored 35 days before the marker line.
        marker_text = bokeh.models.Label(
            x=re_opening_date - pd.DateOffset(days=35),
            y=25,
            y_units="screen",
            text="lab re-opening",
        )

        fig.add_layout(marker_line)
        fig.add_layout(marker_text)

    if timespan == "two weeks":
        fig.x_range.start = window_start
        fig.x_range.end = window_end

    bokeh.io.show(fig)

In [138]:
# Example: 7-day average of new deaths in Los Angeles County (raw counts, not
# per 100,000), with the lab re-opening marker drawn.
covid_plots(
    "Los Angeles",
    "California",
    "new deaths (7 day average)",
    per_100000=False,
    lab_re_opening_label=True,
)

In [126]:
def multi_covid_plots(county_tuple, measurement, per_100000=False, timespan="all", lab_re_opening_label=False):
    """
    Plots one measurement over time for several counties on a shared figure
    and displays it with `bokeh.io.show()`.

    Arguments:
    `county_tuple`: A tuple of (county, state) tuples; one line is drawn
                per entry, labelled "<county>, <state>" in the legend.
    `measurement`: Measurement to plot; choose from 'total cases',
                'total deaths', 'new cases', 'new cases (7 day average)',
                'new deaths', 'new deaths (7 day average)'.
    `per_100000`: Whether to plot measurement per 100,000 residents.
                Defaults to `False`.
    `timespan`: Timespan to plot. Defaults to 'all'.
                Other option is 'two weeks' (the 14 days ending yesterday).
    `lab_re_opening_label`: Whether to include an annotation for
                the date of Caltech lab re-openings. Defaults to `False`.

    Depends on `get_county_data()`, `add_new_per_day()` functions; a
    `RuntimeError` from `get_county_data()` propagates to the caller.
    """
    palette = colorcet.b_glasbey_category10

    # Computed once, before the loop, so the two-week window is defined even
    # when `county_tuple` is empty.
    yesterday = pd.to_datetime(pd.to_datetime("today").date()) - pd.DateOffset(days=1)
    two_weeks_ago = yesterday - pd.DateOffset(days=13)

    y_axis_label = str(measurement)
    if per_100000:
        y_axis_label += " (per 100,000)"

    p = bokeh.plotting.figure(
        frame_height=300,
        frame_width=600,
        x_axis_type="datetime",
        x_axis_label="date",
        y_axis_label=y_axis_label,
    )

    for i, (county, state) in enumerate(county_tuple):
        df = get_county_data(county, state)

        # Calculate new cases/deaths per day
        if 'new' in measurement:
            df = add_new_per_day(df)

        line_style = dict(
            line_width=2,
            # Wrap on the palette length so colours cycle instead of running
            # off the end when there are more counties than colours.
            color=palette[i % len(palette)],
            legend_label=f"{county}, {state}",
        )

        if per_100000:
            measurement_per = df[measurement] / df['population'] * 100000
            p.line(x=df["date"], y=measurement_per, **line_style)
        else:
            p.line(source=df, x="date", y=measurement, **line_style)

    if lab_re_opening_label:
        lab_re_opening = bokeh.models.Span(
            location=pd.to_datetime("2020-06-08"),
            dimension="height",
            line_color="black",
            line_dash="dashed",
            line_width=2,
        )

        # The label text is anchored 35 days before the marker line.
        re_opening_label = bokeh.models.Label(
            x=pd.to_datetime("2020-06-08") - pd.DateOffset(days=35),
            y=275,
            y_units="screen",
            text="lab re-opening",
        )

        p.add_layout(lab_re_opening)
        p.add_layout(re_opening_label)

    if timespan == "two weeks":
        p.x_range.start = two_weeks_ago
        p.x_range.end = yesterday

    # Plain numbers on the y-axis rather than scientific notation.
    p.yaxis[0].formatter = bokeh.models.formatters.BasicTickFormatter(
        use_scientific=False
    )

    p.legend.location = "top_left"

    bokeh.io.show(p)

In [136]:
# Counties to compare, as (county, state) pairs; reused by the cells below.
county_tuple = (("Los Angeles", "California"), ("Lee", "Florida"), ("Harford", "Maryland"), ("Suffolk", "Massachusetts"), ("Boulder", "Colorado"))

# 7-day average of new cases per 100,000 residents over the full date range.
multi_covid_plots(
    county_tuple,
    "new cases (7 day average)",
    per_100000=True,
    lab_re_opening_label=True,
)

In [130]:
# 7-day average of new cases per 100,000 residents, with the x-axis limited
# to the two-week window.
multi_covid_plots(
    county_tuple,
    "new cases (7 day average)",
    per_100000=True,
    lab_re_opening_label=True,
    timespan="two weeks"
)

In [132]:
# Total cases per 100,000 residents over the full date range.
multi_covid_plots(
    county_tuple,
    "total cases",
    per_100000=True,
    lab_re_opening_label=True,
)

# To Do
- ~overlay two(+?) counties~
- ~per capita (county populations from https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html)~
- autoscale y-axis on two week plots
- build dashboard with dropdowns for changing measurement, per 100,000, timespan