In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt

In [None]:
# Cumulative US deaths time series, read from a local checkout of the data
# (presumably the JHU CSSE COVID-19 repository -- confirm against the path).
deaths_csv = "COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv"
ts = pd.read_csv(deaths_csv)

In [None]:
# Peek at the first five rows to see the column layout.
ts.head(n=5)

In [None]:
def makeMovingAverageDailyRate(fr, window=7):
    """Estimate a smoothed deaths-per-day series from cumulative counts.

    Every column after "Population" is treated as one day of cumulative
    counts.  For each day the function:

    1. takes the increase over the count reported exactly ``window`` days
       earlier (the day-to-day change swings too much with reporting
       timing to be useful on its own),
    2. averages that increase over the trailing ``window`` days, and
    3. divides by ``window`` to turn a per-window figure into a per-day one.

    Parameters
    ----------
    fr : pandas.DataFrame
        Frame with a "Population" column followed by the daily cumulative
        count columns.  It is not modified.
    window : int, default 7
        Length in days of both the look-back difference and the trailing
        average.

    Returns
    -------
    pandas.DataFrame
        Rows of ``fr`` with Population > 0 (zero and NaN populations are
        dropped), with the same columns.  The day columns hold the
        estimated daily rate as floats; the first ``2 * window - 1`` day
        columns are NaN because the estimate needs that much history.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive number of days")

    # The day columns start right after Population.  Locate them in the frame
    # we were given, not in any notebook-level global.
    st_col = fr.columns.get_loc("Population") + 1

    # strip out all the districts with a zero or NaN population
    new_fr = fr[fr['Population'] > 0]

    # Work on a plain float array so pandas never tries to align the
    # (deliberately shifted) date labels against each other.
    cumulative = new_fr.iloc[:, st_col:].to_numpy(dtype=float)
    n_days = cumulative.shape[1]
    daily = np.full(cumulative.shape, np.nan)

    if n_days >= 2 * window:
        # weekly[:, j] is the increase over the `window` days ending on day j + window
        weekly = cumulative[:, window:] - cumulative[:, :-window]
        # Trailing sum of `window` consecutive increases, built from shifted slices.
        n_windows = weekly.shape[1] - window + 1
        window_sums = sum(weekly[:, k:k + n_windows] for k in range(window))
        # One division by `window` averages the increases, the second converts
        # "per window" into "per day".
        daily[:, 2 * window - 1:] = window_sums / (window * window)

    rates = pd.DataFrame(daily, index=new_fr.index, columns=new_fr.columns[st_col:])
    # Re-attach the untouched metadata columns (everything up to and including Population).
    return pd.concat([new_fr.iloc[:, :st_col], rates], axis=1)

### Let's find the 7-day moving average death rate

But we should only look at reporters with more than 25 or so reported deaths by the end of the period.

Here's why:

In [None]:
# NOTE: plotDistrict is defined in a later cell of this notebook; on a fresh
# kernel that cell has to be run before this one.
# Raw cumulative counts, not normalized by population.
plotDistrict(ts, 'Buncombe', 'North Carolina',
             base_title="Total Deaths to Date",
             normalize_per_million=False)

In [None]:
# Keep only the rows whose last column (the most recent cumulative count)
# is above 25, then compute the moving-average rate for those rows.
above_threshold = ts.iloc[:, -1] > 25
delta_series = makeMovingAverageDailyRate(ts[above_threshold])
delta_series.shape

In [None]:
# First five rows of the moving-average frame.
delta_series.head(n=5)

In [None]:
def getDistrict(fr, district, state):
    """Return the rows of ``fr`` whose Admin2 equals ``district`` and whose
    Province_State equals ``state``."""
    district_matches = fr['Admin2'] == district
    state_matches = fr['Province_State'] == state
    return fr[district_matches & state_matches]

def plotDistrict(fr, district, state, base_title = 'Deaths-per-day per Million Population', normalize_per_million = True):
    """Plot one district's per-day columns against their dates on the current axes.

    Parameters
    ----------
    fr : pandas.DataFrame
        Frame with Admin2, Province_State and Population columns, followed
        by one column per day whose name parses as '%m/%d/%y'.
    district, state : str
        Values matched against Admin2 and Province_State (via getDistrict).
    base_title : str
        Leading part of the plot title; the district and state are appended.
    normalize_per_million : bool
        When True, divide the series by the matching population expressed
        in millions.

    Raises
    ------
    IndexError
        If no row of ``fr`` matches ``district`` / ``state``.
    """
    new_fr = getDistrict(fr, district, state)
    if new_fr.empty:
        # Fail before touching the figure, with a message that names the culprit.
        raise IndexError("no rows found for %s, %s" % (district, state))

    # first date is right after Population
    st_col = fr.columns.get_loc("Population") + 1
    # normalize?
    population = 1
    if normalize_per_million:
        population = 1e-6 * new_fr.Population.sum() # sum turns it into a scalar

    # Sum over every matching row so the plotted series covers the same rows
    # as the population total above (identical to the single row when only
    # one row matches).
    nums = new_fr.iloc[:, st_col:].to_numpy().sum(axis=0)

    # all the plot magic is from Stack Overflow and various other
    # magic sources / deep pits of iniquity
    xaxis_dates = list(new_fr)[st_col:]
    x = [dt.datetime.strptime(d,'%m/%d/%y').date() for d in xaxis_dates]
    ax = plt.gca()
    # Major ticks: one per month, labelled with the full date; minor ticks: every Sunday.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
    ax.xaxis.set_major_locator(mdates.MonthLocator())
    ax.xaxis.set_minor_locator(mdates.WeekdayLocator(byweekday=mdates.SU))
    ax.set_title("%s : %s, %s" % (base_title, district, state))
    plt.gcf().autofmt_xdate()
    ax.grid()

    ax.plot(x, nums / population)


In [None]:
# Scratch lookup: show the docstring of the WeekdayLocator class that
# plotDistrict uses for its minor x-axis ticks.
mdates.WeekdayLocator.__doc__

In [None]:
# Open a large figure first; plotDistrict draws on the current axes.
fig = plt.figure(figsize=(15, 10))
plotDistrict(delta_series, district='Baltimore', state='Maryland')

In [None]:
# Plot the Alameda series, then display the row that was plotted.
plotDistrict(delta_series, district='Alameda', state='California')
getDistrict(delta_series, district='Alameda', state='California')

In [None]:
# Raw cumulative counts for Alameda, not normalized by population.
plotDistrict(ts, 'Alameda', 'California',
             base_title='Total Reported Deaths',
             normalize_per_million=False)

## New York

The death rate in the city of New York has been staggering. However, the number of new deaths reported per day has been declining for several weeks.  At this point (May 12, 2020) the death rate is 200 per million per day. 

The third plot puts this in perspective, by not normalizing to the size of the population. 

In [None]:
# Raw cumulative counts for New York, not normalized by population.
plotDistrict(ts, 'New York', 'New York',
             base_title="Total Deaths Reported",
             normalize_per_million=False)

In [None]:
# Moving-average series for New York with the default title and
# per-million normalization.
plotDistrict(delta_series, district='New York', state='New York')

In [None]:
# Same moving-average series, but left in absolute numbers (no population scaling).
plotDistrict(delta_series, 'New York', 'New York',
             base_title="Deaths reported per day",
             normalize_per_million=False)

## New York -- Can that possibly be true? 

Let's look at the last two weeks of data from the "reported deaths" time series.

In [None]:
# Last 14 columns of every row whose Admin2 is 'New York'
# (note: no Province_State filter here, unlike getDistrict).
is_new_york = ts['Admin2'] == 'New York'
ny = ts.loc[is_new_york].iloc[:, -14:]
ny

In [None]:
# Sum of the last 7 columns minus the sum of the 7 columns before them, over 7.
# Pairing the columns up, this is the mean over the last 7 days of
# (count on day d) - (count on day d - 7), i.e. the average 7-day increase.
recent_total = ny.iloc[:, -7:].sum(axis=1)
previous_total = ny.iloc[:, -14:-7].sum(axis=1)
(recent_total - previous_total) / 7