# Exploration of NY Times COVID-19 data - Dashboard

Data from The New York Times, based on reports from state and local health agencies.
The Times is reporting at https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html.

County population data from https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html.
New York City population data from https://www1.nyc.gov/assets/planning/download/pdf/planning-level/nyc-population/new-population/current-populatiion-estimattes.pdf?r=2019.

**Remember to `git pull upstream master` every day.** The data include totals through yesterday.

Start date: 2020-07-12

In [1]:
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import numpy as np

import panel as pn
pn.extension()

import bokeh.plotting
import bokeh.models
import bokeh.io

import colorcet

import tqdm

bokeh.io.output_notebook()

In [2]:
%load_ext blackcellmagic

# County dashboard

In [3]:
# Counties offered in the dashboard, as "County, State" strings (the format
# the plotting callback splits back into county and state).
county_strings_list = [
    f"{county}, {state}"
    for county, state in (
        ("Los Angeles", "California"),
        ("Orange", "California"),
        ("San Bernardino", "California"),
        ("New York City", "New York"),
        ("Lee", "Florida"),
        ("Harford", "Maryland"),
        ("Suffolk", "Massachusetts"),
        ("Boulder", "Colorado"),
    )
]

In [4]:
# County-level time series. The cumulative columns are renamed to
# "total ..." so they sit naturally next to the derived "new ..." columns.
df_counties = pd.read_csv(
    "us-counties.csv"
).rename(
    columns={"cases": "total cases", "deaths": "total deaths"}
)
df_counties['date'] = pd.to_datetime(df_counties['date'])

# County population estimates: spreadsheet column A is the area name and
# column M is the column whose header is 2019.
df_pop = pd.read_excel(
    "co-est2019-annres.xlsx",
    usecols="A,M",
    skiprows=[0, 1, 2, 4],
    skipfooter=6,
).rename(
    columns={"Unnamed: 0": "geographic area", 2019: "population"}
)

# Split "<county> County, <state>" into its two parts.
# NOTE(review): area names that do not contain "County," get no state here
# and therefore cannot match any row in the merge below -- confirm whether
# such areas need to be supported.
df_pop[['county', 'state']] = df_pop['geographic area'].str.split("County,", expand=True)[[0, 1]]
df_pop['county'] = df_pop['county'].str.strip(' .')
df_pop['state'] = df_pop['state'].str.strip(' ')
df_pop = df_pop.drop(columns='geographic area')

# "New York City" appears as a single county in `county_strings_list`, so
# its population is added by hand.
df_nyc_pop = pd.DataFrame(
    [[8336817, "New York City", "New York"]],
    columns=["population", "county", "state"],
)
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas >= 2.0.
df_pop = pd.concat([df_pop, df_nyc_pop])

# Inner merge: rows of `df_counties` without a population match are dropped.
df_counties = pd.merge(df_counties, df_pop, on=["county", "state"])

In [5]:
def get_county_data(county, state):
    """
    Select the rows of `df_counties` that belong to one county.

    Arguments:
    `county`: county name, compared exactly with the 'county' column
    `state`: state name, compared exactly with the 'state' column

    Returns the matching rows of `df_counties`.

    Raises RuntimeError if the county name or the state name does not occur
    in `df_counties` at all, or if no row combines that county and state.
    """
    known_counties = df_counties['county'].values
    if county not in known_counties:
        raise RuntimeError(f"{county} is not a valid county name.")

    known_states = df_counties['state'].values
    if state not in known_states:
        raise RuntimeError(f"{state} is not a valid state.")

    in_county = df_counties['county'] == county
    in_state = df_counties['state'] == state
    county_rows = df_counties.loc[in_county & in_state]

    # Both names exist, but not together (e.g. a county from another state).
    if len(county_rows) == 0:
        raise RuntimeError(f"{county} is not in {state}.")

    return county_rows

In [6]:
def add_new_per_day(df):
    """
    Adds columns to Dataframe for new cases/deaths per day and 7 day average.

    Argument `df` is a Pandas Dataframe with 'total cases' and 'total deaths'
    columns holding running totals in row order, e.g. the output of
    `get_county_data()` or `get_state_data()`.

    The columns 'new cases', 'new cases (7 day average)', 'new deaths' and
    'new deaths (7 day average)' are added to `df` in place, and `df` is
    returned. The first row's "new" value is the first running total itself;
    the 7 day average is NaN for the first six rows. An empty Dataframe is
    accepted and simply gains four empty columns.
    """
    for kind in ("cases", "deaths"):
        totals = np.asarray(df[f"total {kind}"], dtype=float)

        # Day-over-day difference. Prepending 0 makes the first entry equal
        # to the first running total and keeps the length unchanged.
        df[f"new {kind}"] = np.diff(totals, prepend=0.0)

        # Trailing 7-row mean of the daily values.
        df[f"new {kind} (7 day average)"] = (
            df[f"new {kind}"].rolling(window=7).mean()
        )

    return df

In [7]:
# One colour per county: the palette is repeated often enough to cover the
# whole list, and New York City is then overridden to gray.
colors = colorcet.b_glasbey_category10
n = 1 + len(county_strings_list) // len(colors)
color_dict = {
    county_string: color
    for county_string, color in zip(county_strings_list, colors * n)
}
color_dict["New York City, New York"] = 'gray'

# Widgets driving the county plot; every county starts out selected.
counties_selector = pn.widgets.CheckBoxGroup(
    name="Counties:", options=county_strings_list, value=county_strings_list
)

measurement_selector = pn.widgets.Select(
    name="Plot:",
    options=[
        'total cases',
        'total deaths',
        'new cases',
        'new cases (7 day average)',
        'new deaths',
        'new deaths (7 day average)',
    ],
    value='total cases',
)

normalization_selector = pn.widgets.Select(
    name="Normalization:", options=['none', 'per 100,000'], value='none'
)

timespan_selector = pn.widgets.Select(
    name="Timespan:",
    options=['all', 'two weeks', 'since lab re-opening'],
    value='all',
)

lab_reopening_indicator = pn.widgets.Toggle(
    name="show lab re-opening date", value=False
)

# Reference dates, all at midnight. "two weeks" spans 14 days ending
# yesterday, hence the 13-day offset.
lab_re_opening_date = pd.to_datetime("2020-06-08").normalize()
yesterday = pd.to_datetime("today").normalize() - pd.DateOffset(days=1)
two_weeks_ago = yesterday - pd.DateOffset(days=13)

@pn.depends(
    counties_selector.param.value,
    measurement_selector.param.value,
    normalization_selector.param.value,
    timespan_selector.param.value,
    lab_reopening_indicator.param.value,
)
def multi_covid_plots(counties, measurement, normalization, timespan, lab_re_opening_label):
    """
    Plots data for given counties, one line per county.

    Arguments:
    `counties`: list of "County, State" strings to plot
    `measurement`: name of the column to plot, e.g. 'total cases'
    `normalization`: 'per 100,000' to scale by county population; any other
        value plots the raw numbers
    `timespan`: 'two weeks' or 'since lab re-opening'; any other value plots
        everything from the earliest date in `df_counties`
    `lab_re_opening_label`: if true, mark the lab re-opening date with a
        dashed vertical line and a text label

    Returns the Bokeh figure.

    Depends on `get_county_data()`, `add_new_per_day()` functions and on the
    module-level `df_counties`, `color_dict`, `yesterday`, `two_weeks_ago`
    and `lab_re_opening_date`.
    """
    per_capita = normalization == 'per 100,000'
    # Column that actually gets drawn.
    y_column = "measurement_per" if per_capita else measurement

    p = bokeh.plotting.figure(
        frame_height=300,
        frame_width=600,
        x_axis_type="datetime",
        x_axis_label="date",
        y_axis_label=str(measurement) + (" (per 100,000)" if per_capita else ""),
    )

    if timespan == "two weeks":
        first_day = two_weeks_ago
    elif timespan == "since lab re-opening":
        first_day = lab_re_opening_date
    else:
        first_day = df_counties['date'].min()
    date_range = pd.date_range(start=first_day, end=yesterday)

    # Largest plotted value of each county, used for the y-range below.
    plotted_maxima = []

    for county_string in counties:
        county, _, state = county_string.partition(", ")
        df = get_county_data(county, state).reset_index()

        # Derive the per-day columns before filtering by date so that the
        # differences and 7 day averages are computed from the full history.
        if 'new' in measurement:
            df = add_new_per_day(df)

        # Get data for the specified date range
        df = df.loc[df['date'].isin(date_range), :]

        if per_capita:
            df = df.assign(
                measurement_per=df[measurement] / df['population'] * 100000
            )

        p.line(
            source=df,
            x="date",
            y=y_column,
            line_width=2,
            color=color_dict[county_string],
            legend_label=f"{county}, {state}",
        )

        peak = df[y_column].max()
        if pd.notna(peak):
            plotted_maxima.append(peak)

    if lab_re_opening_label:
        lab_re_opening = bokeh.models.Span(
            location=lab_re_opening_date,
            dimension="height",
            line_color="black",
            line_dash="dashed",
            line_width=2,
        )

        # The label is anchored 35 days before the line, at a fixed screen
        # height.
        re_opening_label = bokeh.models.Label(
            x=lab_re_opening_date - pd.DateOffset(days=35),
            y=275,
            y_units="screen",
            text="lab re-opening",
        )

        p.add_layout(lab_re_opening)
        p.add_layout(re_opening_label)

    p.yaxis[0].formatter = bokeh.models.formatters.BasicTickFormatter(
        use_scientific=False
    )

    p.legend.location = "top_left"
    p.legend.click_policy = 'hide'

    tooltips = bokeh.models.HoverTool(
        tooltips=[
            ('date', '@date{%F}'),
        ],
        formatters={'@date': 'datetime'},
    )
    p.add_tools(tooltips)

    # In the two-week view, start the y-axis slightly below zero. The margin
    # is based on the largest value drawn across all counties (in plotted
    # units); it is skipped when nothing was drawn.
    if timespan == "two weeks" and plotted_maxima:
        p.y_range.start = -0.05 * max(plotted_maxima)

    return p
    
# Left column: the reactive plot with the lab re-opening toggle below it.
plot = pn.Column(pn.Spacer(height=15), multi_covid_plots, lab_reopening_indicator)

# Right column: every selector preceded by a spacer of height 15. The
# re-opening toggle is deliberately shown under the plot instead of here.
widgets = pn.Column(
    *[
        item
        for selector in (
            counties_selector,
            measurement_selector,
            normalization_selector,
            timespan_selector,
        )
        for item in (pn.Spacer(height=15), selector)
    ]
)

pn.Row(plot, pn.Spacer(width=15), widgets)

# States dashboard

Note: the states dashboard does not currently support population normalization.

In [8]:
# State-level time series, with the cumulative columns renamed the same way
# as in the county table and the dates parsed to timestamps.
df_states = pd.read_csv("us-states.csv")
df_states = df_states.rename(
    columns={"cases": "total cases", "deaths": "total deaths"}
)
df_states['date'] = pd.to_datetime(df_states['date'])

#df_pop = pd.read_excel(
#    "co-est2019-annres.xlsx",
#    usecols="A,M",
#    skiprows=[0, 1, 2, 4],
#    skipfooter=6,
#).rename(
#    columns={"Unnamed: 0": "geographic area", 2019: "population"}
#)
#df_pop[['county', 'state']] = df_pop['geographic area'].str.split("County,", expand=True)[[0, 1]]
#df_pop['county']=df_pop['county'].str.strip(' .')
#df_pop['state']=df_pop['state'].str.strip(' ')
#df_pop = df_pop.drop(columns='geographic area')
#df_nyc_pop = pd.DataFrame([[8336817, "New York City", "New York"]], columns=["population", "county", "state"])
#df_pop = df_pop.append(df_nyc_pop)
#
#df_counties = pd.merge(df_counties, df_pop)

In [9]:
def get_state_data(state):
    """
    Select the rows of `df_states` that belong to one state.

    Arguments:
    `state`: state name, compared exactly with the 'state' column

    Returns the matching rows of `df_states`.

    Raises RuntimeError if the name does not occur in `df_states`.
    """
    state_column = df_states['state']
    if state not in state_column.values:
        raise RuntimeError(f"{state} is not a valid state name.")

    return df_states.loc[state_column == state]

In [10]:
states_list = ["California", "Florida", "Texas"]

# The state colours get their own dictionary. The county dashboard callback
# looks up `color_dict` each time it runs, so rebinding that name here would
# break it as soon as this cell has been executed.
colors = colorcet.b_glasbey_category10
n = len(states_list) // len(colors) + 1
state_color_dict = dict(zip(states_list, colors * n))
state_color_dict["New York"] = 'gray'

states_selector = pn.widgets.CheckBoxGroup(
    name="States:",
    options=states_list,
    value=states_list
)

measurement_selector = pn.widgets.Select(
    name="Plot:",
    options=['total cases', 'total deaths', 'new cases', 'new cases (7 day average)', 'new deaths', 'new deaths (7 day average)'],
    value='total cases',
)

# 'per 100,000' is not offered: population normalization is not supported
# for states yet.
normalization_selector = pn.widgets.Select(
    name="Normalization:",
    options=['none'],
    value='none',
)

timespan_selector = pn.widgets.Select(
    name="Timespan:",
    options=['all', 'two weeks', 'since lab re-opening'],
    value='all',
)

lab_reopening_indicator = pn.widgets.Toggle(
    name="show lab re-opening date",
    value=False,
)

# Reference dates, all at midnight. "two weeks" spans 14 days ending
# yesterday, hence the 13-day offset.
yesterday = pd.to_datetime(pd.to_datetime("today").date()) - pd.DateOffset(days=1)
two_weeks_ago = yesterday - pd.DateOffset(days=13)
lab_re_opening_date = pd.to_datetime(pd.to_datetime("2020-06-08").date())


@pn.depends(
    states_selector.param.value,
    measurement_selector.param.value,
    normalization_selector.param.value,
    timespan_selector.param.value,
    lab_reopening_indicator.param.value,
)
def multi_covid_plots(states_list, measurement, normalization, timespan, lab_re_opening_label):
    """
    Plots data for given states, one line per state.

    Arguments:
    `states_list`: list of state names to plot
    `measurement`: name of the column to plot, e.g. 'total cases'
    `normalization`: 'per 100,000' scales by a 'population' column, which
        the state data must then provide (the selector above does not offer
        this option); any other value plots the raw numbers
    `timespan`: 'two weeks' or 'since lab re-opening'; any other value plots
        everything from the earliest date in `df_states`
    `lab_re_opening_label`: if true, mark the lab re-opening date with a
        dashed vertical line and a text label

    Returns the Bokeh figure.

    Depends on `get_state_data()`, `add_new_per_day()` functions and on the
    module-level `df_states`, `state_color_dict`, `yesterday`,
    `two_weeks_ago` and `lab_re_opening_date`.
    """
    per_capita = normalization == 'per 100,000'
    # Column that actually gets drawn.
    y_column = "measurement_per" if per_capita else measurement

    p = bokeh.plotting.figure(
        frame_height=300,
        frame_width=600,
        x_axis_type="datetime",
        x_axis_label="date",
        y_axis_label=str(measurement) + (" (per 100,000)" if per_capita else ""),
    )

    if timespan == "two weeks":
        first_day = two_weeks_ago
    elif timespan == "since lab re-opening":
        first_day = lab_re_opening_date
    else:
        # Earliest date of the state table (not the county table).
        first_day = df_states['date'].min()
    date_range = pd.date_range(start=first_day, end=yesterday)

    # Largest plotted value of each state, used for the y-range below.
    plotted_maxima = []

    for state in states_list:
        df = get_state_data(state).reset_index()

        # Derive the per-day columns before filtering by date so that the
        # differences and 7 day averages are computed from the full history.
        if 'new' in measurement:
            df = add_new_per_day(df)

        # Get data for the specified date range
        df = df.loc[df['date'].isin(date_range), :]

        if per_capita:
            df = df.assign(
                measurement_per=df[measurement] / df['population'] * 100000
            )

        p.line(
            source=df,
            x="date",
            y=y_column,
            line_width=2,
            color=state_color_dict[state],
            legend_label=f"{state}",
        )

        peak = df[y_column].max()
        if pd.notna(peak):
            plotted_maxima.append(peak)

    if lab_re_opening_label:
        lab_re_opening = bokeh.models.Span(
            location=lab_re_opening_date,
            dimension="height",
            line_color="black",
            line_dash="dashed",
            line_width=2,
        )

        # The label is anchored 35 days before the line, at a fixed screen
        # height.
        re_opening_label = bokeh.models.Label(
            x=lab_re_opening_date - pd.DateOffset(days=35),
            y=275,
            y_units="screen",
            text="lab re-opening",
        )

        p.add_layout(lab_re_opening)
        p.add_layout(re_opening_label)

    p.yaxis[0].formatter = bokeh.models.formatters.BasicTickFormatter(
        use_scientific=False
    )

    p.legend.location = "top_left"
    p.legend.click_policy = 'hide'

    tooltips = bokeh.models.HoverTool(
        tooltips=[
            ('date', '@date{%F}'),
        ],
        formatters={'@date': 'datetime'},
    )
    p.add_tools(tooltips)

    # In the two-week view, start the y-axis slightly below zero. The margin
    # is based on the largest value drawn across all states; it is skipped
    # when nothing was drawn.
    if timespan == "two weeks" and plotted_maxima:
        p.y_range.start = -0.05 * max(plotted_maxima)

    return p
    
# Left column: the reactive plot with the lab re-opening toggle below it.
plot = pn.Column(pn.Spacer(height=15), multi_covid_plots, lab_reopening_indicator)

# Right column: every selector preceded by a spacer of height 15. The
# re-opening toggle is deliberately shown under the plot instead of here.
widgets = pn.Column(
    *[
        item
        for selector in (
            states_selector,
            measurement_selector,
            normalization_selector,
            timespan_selector,
        )
        for item in (pn.Spacer(height=15), selector)
    ]
)

pn.Row(plot, pn.Spacer(width=15), widgets)

In [11]:
%load_ext watermark
%watermark -v -p numpy,pandas,bokeh,panel,colorcet,jupyterlab

CPython 3.7.7
IPython 7.18.1

numpy 1.19.2
pandas 0.24.2
bokeh 2.2.3
panel 0.9.7
colorcet 2.0.2
jupyterlab 2.2.6
