In [None]:
# Set up our notebook tools
import pandas as pd
import plotly.express as px
import numpy

def deltas(dataframe, column, category):
    """Return the per-row increase of ``column`` within each ``category``.

    Rows are handled in the order they appear in ``dataframe``, so the frame
    should already be sorted the way the changes are meant to be read (for
    example by date). Each result value is the row's ``column`` value minus
    the previous value seen for the same ``category`` value; the first row of
    a category is compared against 0. Negative changes are clamped to 0.

    Args:
        dataframe: pandas DataFrame containing ``column`` and ``category``.
        column: Name of the numeric column to difference.
        category: Name of the column whose values define the groups.
            NOTE(review): assumes this column has no missing values.

    Returns:
        A pandas Series named ``column + "_deltas"`` that shares
        ``dataframe``'s index.
    """
    values = dataframe[column]
    # Previous value within the same category, in row order. A category's
    # first row has no predecessor, so it is compared against 0.
    previous = dataframe.groupby(category, sort=False)[column].shift(fill_value=0)
    # The subtraction is vectorised and positional within each group, so float
    # columns and repeated index labels are handled without writing values one
    # at a time into an integer-typed Series.
    changes = (values - previous).clip(lower=0)
    return changes.rename(column + "_deltas")

# Load the state-level CSV from the nytimes/covid-19-data GitHub repository
df = pd.read_csv(
    "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
)

# States to compare in this notebook
states = [
    'Virginia',
    'Ohio',
    'Georgia',
    'West Virginia',
    'Florida',
    'South Carolina',
    'California',
    'New York',
]
# Row count used for the df.tail() previews: one per selected state
tail_size = len(states)

In [None]:
# Cast our dates to bona fide datetimes, then sort the dataframe by date
df = (
    df
    .assign(date=pd.to_datetime(df['date']))
    .sort_values(by=['date'])
)

# Narrow the dataset to states we're interested in. isin() compares the names
# directly, so nothing has to be quoted into a query string and an empty
# `states` list simply selects no rows.
df = df[df['state'].isin(states)]

In [None]:
# Add each row's date as whole seconds since the Unix epoch. Iterating the
# 'date' column directly yields the same values as walking df.iterrows(),
# without building a full row object for every record.
df = df.assign(
    timestamps=pd.Series(
        name='timestamps',
        data=[int(moment.timestamp()) for moment in df['date']],
        index=df.index
    )
)

In [None]:
# Reported cases over time, one line per state
px.line(
    df,
    x='date',
    y='cases',
    color='state',
    title="COVID 19 Cases For Select US States vs Date",
)

In [None]:
# Per-state change between consecutive rows for cases and for deaths
cases_delta = deltas(df, "cases", "state")
deaths_delta = deltas(df, "deaths", "state")
df = df.assign(cases_delta=cases_delta, deaths_delta=deaths_delta)

# Preview the last rows with the new columns
df.tail(tail_size)

In [None]:
# Change in cases per row, with a LOWESS trendline per state
px.scatter(
    df,
    x='date',
    y='cases_delta',
    color='state',
    trendline='lowess',
    title='COVID 19 Change in Cases Per Day For Select US States vs Date',
)

In [None]:
# Source: the info box at the top of a Google search, i.e. the panel that
# tries to guess the thing you were searching for.
# NOTE(review): 'density' units are not recorded here (presumably people per
# square mile) -- confirm before using it.
facts_by_state = {
    'Virginia': {'population': 8518000, 'density': 202.6},
    'Ohio': {'population': 11690000, 'density': 282.3},
    'Georgia': {'population': 10520000, 'density': 149.0},
    'West Virginia': {'population': 1806000, 'density': 77.1},
    'Florida': {'population': 21300000, 'density': 353.4},
    'South Carolina': {'population': 5419000, 'density': 157},
    'California': {'population': 39510000, 'density': 251.3},
    'New York': {'population': 19450000, 'density': 421},
}

# Derive each state's population expressed in millions of people
for state, facts in facts_by_state.items():
    facts['population_per_million'] = facts['population'] / 1000000

# Population (in millions) of the state on each row. The lookup indexes
# facts_by_state directly, so a state without an entry raises KeyError
# instead of silently producing a missing value.
population_per_million = df['state'].map(
    lambda name: facts_by_state[name]['population_per_million']
)

# Cases scaled by population. The division is vectorised, so the float results
# are never written one at a time into an integer-typed Series.
cases_per_million = (df['cases'] / population_per_million).rename('cases_per_million')
df = df.assign(cases_per_million=cases_per_million)

# Per-state change in the scaled figure between consecutive rows
cases_delta = deltas(df, "cases_per_million", "state")
df = df.assign(cases_per_million_delta=cases_delta)
df.tail(tail_size)

In [None]:
# Cases per million people over time, one line per state
px.line(
    df,
    x='date',
    y='cases_per_million',
    color='state',
    title='COVID 19 Total Cases Per Million People For Select US States vs Date',
)

In [None]:
# Keep only rows dated after 2020-04-01. df is rebound here, so every later
# use of df sees the filtered rows.
df = df.query('date > "2020-04-01"')

# Change in cases per million against the epoch timestamp, both axes on a log
# scale, with a LOWESS trendline per state
px.scatter(
    df,
    x='timestamps',
    y='cases_per_million_delta',
    log_x=True,
    log_y=True,
    color='state',
    trendline='lowess',
    title='COVID 19 Change in Cases Per Million People Per Day For Select US States vs Date',
)

In [None]:
# Reported deaths over time, one line per state
px.line(
    df,
    x='date',
    y='deaths',
    color='state',
    title='COVID 19 related deaths in select US states vs Date',
)

In [None]:
# Preview twice as many trailing rows as there are selected states
df.tail(2 * tail_size)