<h1>ARIMA Residuals Model</h1>

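<p>This model first fits a standard erf (error function) curve to the cumulative deaths data for each county. Over the long term, the erf fit on its own tends to underpredict the cumulative deaths, especially in the larger counties. To account for this, we fit the erf curve only up to a cutoff that lies a varying number of days (7 to 15) before the last training date, compute the residuals of that curve on the held-out days, and then train an ARIMA forecasting model on those residuals. The forecast for each window is the erf extrapolation plus the forecast residuals, and the final prediction averages the forecasts over all windows.</p>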

In [55]:
from scipy.special import erf
from scipy.optimize import curve_fit
from scipy.stats import norm
import numpy as np
from numpy import array
import matplotlib.pyplot as plt

from statsmodels.tsa.arima_model import ARIMA

import pandas as pd
from util import util

from datetime import datetime

<h1>Global Dataframes and Variables</h1>

In [56]:
# We import the sample_submission.csv file as a way of determining
# the order of the rows in our output file. Its "id" column is also
# the source of the list of dates used throughout this notebook.
sample_submission = pd.read_csv("../sample_submission.csv")

# The fips_key.csv file contains standard information about each county
key = pd.read_csv("../data/us/processing_data/fips_key.csv", encoding='latin-1')

# Daily deaths contains the death count per day for each county.
# Cumulative deaths contains the total death count for each county
# by day.
daily_deaths = pd.read_csv("../data/us/covid/nyt_us_counties_daily.csv")
cumulative_deaths = pd.read_csv("../data/us/covid/deaths.csv")
county_land_areas = pd.read_csv("../data/us/demographics/county_land_areas.csv", encoding='latin1')
county_population = pd.read_csv("../data/us/demographics/county_populations.csv", encoding='latin1')
mobility_data = pd.read_csv("../data/us/mobility/DL-us-m50.csv", encoding='latin1')

# List of all counties
all_fips = key["FIPS"].tolist()

# NOTE(review): this rebinds the imported name `util` to the object returned
# by calling it, so re-running this cell without re-running the import cell
# will presumably fail (the returned object is unlikely to be callable).
util = util(daily_deaths, cumulative_deaths, county_land_areas, county_population, mobility_data, key)

# Thresholds a county must exceed before an erf curve is fitted to it
# (checked in get_erf_curve and in the main prediction loop).
MIN_TOTAL_DEATHS = 80
MIN_DAYS_SINCE_FIRST_DEATH = 10

# Relevant dates
# `today` is the label of the last column of the cumulative deaths table;
# presumably an mm/dd/yy string, since it is later passed through
# convert_date_to_yyyy_mm_dd -- TODO confirm against the CSV header.
today = cumulative_deaths.columns[-1]

<h1>Quantile Generating Functions</h1>

In [57]:
# Produce the nine deciles (10% ... 90%) of a normal distribution
# centred on `value` with standard deviation `err`.
def generate_quantiles(value, err):
    """Return a 9-element list of deciles for N(value, err).

    When `err` is exactly zero the distribution is degenerate, so the
    point estimate itself is returned for every decile.
    """
    decile_levels = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)

    if err == 0:
        return [value for _ in decile_levels]

    return [norm.ppf(level, loc=value, scale=err) for level in decile_levels]

# Apply generate_quantiles element-wise to parallel lists of values
# and standard errors.
def generate_list_quantiles(lst, err_lst):
    """Return one decile list per entry of `lst`.

    `err_lst[i]` is used as the standard error for `lst[i]`, so `err_lst`
    must be at least as long as `lst`.
    """
    return [
        generate_quantiles(value, err_lst[position])
        for position, value in enumerate(lst)
    ]

<h1>Helpful Date Functions</h1>

In [58]:
# Strip the trailing "-<fips>" component from a submission row id,
# leaving only the date portion.
def extract_date_from_id(row_id):
    date_part, _, _ = row_id.rpartition('-')
    return date_part

# Every distinct date appearing in the submission ids, in order of first
# appearance (dict.fromkeys de-duplicates while preserving order).
all_dates = list(dict.fromkeys(
    extract_date_from_id(row_id) for row_id in sample_submission["id"].values
))

# Assume date is in format mm/dd/yy, convert to yyyy-mm-dd
def convert_date_to_yyyy_mm_dd(date):
    """Convert an "m/d/yy" style date string to "yyyy-mm-dd".

    Month and day are left-padded to two digits. A two-digit year is read
    as 20yy and a four-digit year is used as-is. If the input carries no
    year component at all, 2020 is assumed (the previous behaviour, which
    ignored the year field entirely).
    """
    parts = date.split('/')

    month = parts[0].zfill(2)
    day = parts[1].zfill(2)

    if len(parts) > 2 and parts[2]:
        raw_year = parts[2]
        # Two-digit years are expanded into the 2000s.
        year = raw_year if len(raw_year) == 4 else "20" + raw_year.zfill(2)
    else:
        year = "2020"

    return year + "-" + month + "-" + day

# Assume date is in format yyyy-mm-dd, convert to mm/dd/yy
def convert_date_to_mm_dd_yy(date):
    """Convert a "yyyy-mm-dd" date string to "m/d/yy".

    A single leading zero is removed from the month and the day, and the
    year is reduced to its last two digits (previously "20" was emitted
    regardless of the input year).
    """
    year, month, day = date.split('-')[:3]

    # Remove a leading zero if present
    if month.startswith("0"):
        month = month[1:]

    if day.startswith("0"):
        day = day[1:]

    return month + "/" + day + "/" + year[-2:]

# Starting from a given date, take an input number of steps and
# return a window of dates taken from all_dates.
def get_dates_from_start(startDate, steps):
    """Return a window of consecutive dates around `startDate`.

    steps > 0: `startDate` followed by the next dates, `steps` entries in
               total (fewer if all_dates ends first).
    steps < 0: the |steps| dates immediately *before* `startDate`,
               excluding `startDate` itself (fewer if all_dates does not
               reach back that far).
    steps == 0: an empty list.

    Raises ValueError if `startDate` is not in all_dates.
    """
    start = all_dates.index(startDate)

    if steps > 0:
        return all_dates[start:start + steps]

    # Clamp at 0: a negative slice start would wrap around to the end of
    # the list and silently yield an empty (or unrelated) window.
    return all_dates[max(0, start + steps):start]

# Return the date that immediately follows startDate in all_dates.
def get_next_date(startDate):
    """Look up the entry right after `startDate`.

    Raises IndexError when `startDate` is the last known date.
    """
    two_day_window = get_dates_from_start(startDate, 2)
    return two_day_window[1]

<h1>Functions Used for Curve Fitting</h1>

In [59]:
def erf_curve(times, log_max, slope, center):
    """Scaled error-function curve used as the model for curve_fit.

    Evaluates 10**log_max * (1 + erf(slope * (times - center))), so the
    curve runs from 0 up to 2 * 10**log_max with its midpoint at `center`.
    """
    amplitude = 10 ** log_max
    return amplitude * (1 + erf(slope * (times - center)))

def eval_erf(times, coefs):
    """Evaluate the fitted erf curve at `times`.

    `coefs` is the (log_max, slope, center) triple, in the order used by
    erf_curve. `times` may be a scalar, a list or a numpy array; it is
    converted with np.asarray so that plain Python lists work even when
    `coefs` holds ordinary floats rather than numpy scalars.
    """
    times = np.asarray(times)
    max_val = 10 ** coefs[0]
    deaths = max_val * (1 + erf(coefs[1] * (times - coefs[2])))
    return deaths

def linear_curve(times, slope, intercept):
    """Evaluate slope * t + intercept for every t in `times`.

    Returns a numpy array. The computation is done on an array so that it
    works for plain Python numbers as well as numpy scalars (the earlier
    `list + intercept` form only worked when `intercept` was a numpy
    scalar, and raised TypeError for a plain float).
    """
    return np.asarray(times) * slope + intercept

def constant_curve(times, c):
    """Return each entry of `times` multiplied by `c`.

    NOTE(review): despite the name, the result scales with `times` rather
    than being constant -- confirm the intended behaviour before relying
    on this helper.
    """
    scaled = []
    for t in times:
        scaled.append(t * c)
    return scaled

In [60]:
# Turn a list of per-day death counts into the running total,
# one entry per input day.
def get_cumulative_from_daily(daily):
    """Return the list of running sums of `daily` (same length)."""
    running_totals = []
    for count in daily:
        previous = running_totals[-1] if running_totals else 0
        running_totals.append(previous + count)

    return running_totals

# Fit erf_curve to a county's cumulative deaths up to endDate and
# return the fitted coefficients.
def get_erf_curve(fips, endDate):
    """Return the curve_fit optimum (log_max, slope, center) for `fips`.

    The x values are the day indices 0..n-1 of the series returned by
    util.get_deaths_list; the y values are the cumulative deaths.

    Raises AssertionError if the county has fewer than
    MIN_DAYS_SINCE_FIRST_DEATH data points or its final cumulative count
    does not exceed MIN_TOTAL_DEATHS.
    """
    cumulative = get_cumulative_from_daily(
        util.get_deaths_list(fips, endDate=endDate)
    )
    day_indices = list(range(len(cumulative)))

    assert len(cumulative) >= MIN_DAYS_SINCE_FIRST_DEATH and cumulative[-1] > MIN_TOTAL_DEATHS
    fitted_coefs, _ = curve_fit(erf_curve, day_indices, cumulative, maxfev=10000)

    return fitted_coefs

def get_erf_residuals(fips, end_train_date, n_steps):
    """Fit an erf curve up to `end_train_date` and measure its error on
    the following `n_steps` days.

    Returns (residuals, erf_coefs), where residuals[k] is the observed
    cumulative death count minus the erf prediction on the k-th day after
    the training window, and erf_coefs are the fitted coefficients.

    Raises AssertionError if fewer than `n_steps` days of data exist after
    the training window (data is read up to the module-level `today`).
    """
    train_daily = util.get_deaths_list(fips, endDate=end_train_date)
    full_daily = util.get_deaths_list(fips, endDate=convert_date_to_yyyy_mm_dd(today))

    # Best-fit erf coefficients using only the training window
    erf_coefs = get_erf_curve(fips, end_train_date)

    # The held-out window must lie entirely inside the observed data
    first_held_out = len(train_daily)
    assert first_held_out + n_steps <= len(full_daily)

    held_out_days = list(range(first_held_out, first_held_out + n_steps))
    observed_cumulative = get_cumulative_from_daily(full_daily)

    # Residual = observed cumulative deaths - erf prediction, per held-out day
    residuals = [
        observed_cumulative[day] - eval_erf(day, erf_coefs)
        for day in held_out_days
    ]

    assert len(residuals) == n_steps

    return residuals, erf_coefs

In [61]:
def get_id_list():
    """Return the "id" column of the sample submission as an array, in
    file order."""
    id_column = sample_submission["id"]
    return id_column.values

def extract_date_from_id(row_id):
    """Return everything before the last '-' of `row_id` (the date part
    of a "<date>-<fips>" id); an id without '-' yields ''."""
    pieces = row_id.split('-')
    pieces.pop()
    return '-'.join(pieces)

def extract_fips_from_id(row_id):
    """Return the text after the last '-' of `row_id` (the FIPS part of a
    "<date>-<fips>" id), as a string."""
    _, _, fips_part = row_id.rpartition('-')
    return fips_part

def train_arima(trainData, order=(2, 1, 0)):
    """Fit an ARIMA model of the given (p, d, q) `order` to `trainData`
    and return the fitted results object. `disp=0` is passed to fit(),
    presumably to silence optimizer output."""
    return ARIMA(trainData, order=order).fit(disp=0)

def get_residuals_predictions(train_residuals, n_steps, order=(2, 1, 0)):
    """Forecast the next `n_steps` residuals from `train_residuals`.

    An ARIMA model of the given `order` is fitted via train_arima. If the
    fit fails for any ordinary reason, the forecast falls back to the mean
    of `train_residuals` repeated `n_steps` times.

    Returns a list of length `n_steps`.
    """
    try:
        model = train_arima(train_residuals, order=order)
    except Exception:
        # Best-effort fallback. Catch Exception rather than using a bare
        # `except:` so that KeyboardInterrupt / SystemExit still propagate
        # and a long run can be interrupted.
        average = np.mean(train_residuals)
        return [average] * n_steps

    # The first element of the forecast result is taken as the point
    # forecasts -- presumably followed by standard errors and confidence
    # intervals; TODO confirm against the installed statsmodels version.
    forecast = model.forecast(steps=n_steps)[0]
    return list(forecast)

In [62]:
# Number of days to forecast, starting the day after last_train_date.
n_pred_steps = 18

last_train_date = "2020-05-22"
dates_to_consider = get_dates_from_start(get_next_date(last_train_date), n_pred_steps)

# Spread of the quantiles: each prediction's normal scale is stderr * prediction.
stderr = 0.6

# Counties skipped when building predictions; their rows are exported as zeros.
# NOTE(review): the reason for excluding these five FIPS codes is not recorded
# in this notebook -- confirm before changing.
EXCLUDED_FIPS = {44001, 44003, 44005, 44007, 44009}

# Counties without an erf fit get a linear fit when their cumulative deaths
# reach this value, and all-zero predictions otherwise.
MIN_DEATHS_FOR_LINEAR_FIT = 20

# Row content written for any (county, date) without a prediction.
ZERO_ROW = ["%.2f" % 0.00] * 9

for n_erf_pred_steps in range(7, 16):
    print("Window Size: " + str(n_erf_pred_steps))

    # The erf curve is fitted on data ending n_erf_pred_steps days before
    # last_train_date; the residuals are then measured on those final
    # n_erf_pred_steps days. The cutoff is the same for every county, so it
    # is computed once per window.
    last_erf_train_date = get_dates_from_start(dates_to_consider[0], -(n_erf_pred_steps + 1))[0]

    # For each county passing the data thresholds: the residuals of its erf
    # fit up to last_train_date, and the fitted erf coefficients.
    residuals_map = {}
    erf_coefs_map = {}

    for fips in all_fips:
        erf_train_daily = util.get_deaths_list(fips, endDate=last_erf_train_date)
        erf_train_cumulative = get_cumulative_from_daily(erf_train_daily)

        if len(erf_train_cumulative) <= MIN_DAYS_SINCE_FIRST_DEATH or erf_train_cumulative[-1] <= MIN_TOTAL_DEATHS:
            continue

        residuals, erf_coefs = get_erf_residuals(fips, last_erf_train_date, n_erf_pred_steps)

        residuals_map[fips] = residuals
        erf_coefs_map[fips] = erf_coefs

    #############################################
    ##### Store predictions in a dictionary #####
    #############################################

    # data[fips][date] -> list of 9 quantiles of predicted daily deaths
    data = {}
    for fips in all_fips:
        if fips in EXCLUDED_FIPS:
            continue

        daily_deaths_list = util.get_deaths_list(fips, endDate=last_train_date)
        cumulative_deaths_list = get_cumulative_from_daily(daily_deaths_list)
        has_residuals = fips in residuals_map

        if (len(cumulative_deaths_list) == 0) or (not has_residuals and cumulative_deaths_list[-1] < MIN_DEATHS_FOR_LINEAR_FIT):
            # Too little data: predict zero deaths at every quantile
            quantiles = [[0] * 9 for _ in dates_to_consider]

        elif not has_residuals and cumulative_deaths_list[-1] >= MIN_DEATHS_FOR_LINEAR_FIT:
            # Fit a linear model to the last 20 points of daily data
            length = min(20, len(daily_deaths_list))
            x_input = list(range(length))

            popt, _ = curve_fit(linear_curve, x_input, daily_deaths_list[-length:], maxfev=10000)

            x_preds = [i + length for i in range(n_pred_steps)]
            output = linear_curve(x_preds, popt[0], popt[1])
            errors = [x * stderr for x in output]

            quantiles = generate_list_quantiles(output, errors)

        else:
            # erf extrapolation corrected by the forecast residuals. The
            # series already loaded above are reused here instead of being
            # re-fetched into names that shadow the module-level
            # `daily_deaths` / `cumulative_deaths` DataFrames.
            residuals_predictions = get_residuals_predictions(residuals_map[fips], n_pred_steps)
            x_in = [i + len(cumulative_deaths_list) for i in range(n_pred_steps)]
            erf_predictions = eval_erf(x_in, erf_coefs_map[fips])

            cumulative_predictions = np.asarray(residuals_predictions) + erf_predictions

            # Prepend the last observed cumulative count and difference the
            # series to turn cumulative predictions into daily predictions.
            final_predictions = np.diff(
                np.concatenate(([cumulative_deaths_list[-1]], cumulative_predictions))
            )

            errors = [x * stderr for x in final_predictions]

            quantiles = generate_list_quantiles(final_predictions, errors)

        data[fips] = {date: quantiles[i] for i, date in enumerate(dates_to_consider)}

    ###########################
    ##### Export to a CSV #####
    ###########################

    lists = []
    for row_id in get_id_list():
        date = extract_date_from_id(row_id)
        fips = int(extract_fips_from_id(row_id))

        # Counties or dates without a prediction are written as zeros
        if fips not in data or date not in data[fips]:
            lists.append([row_id] + ZERO_ROW)
            continue

        lst = [row_id]
        for q in data[fips][date]:
            # NaN quantiles (e.g. from a negative scale) and negative
            # death counts are both clipped to zero.
            if np.isnan(q) or q < 0:
                lst.append("%.2f" % 0.00)
            else:
                lst.append("%.2f" % q)

        lists.append(lst)

    df = pd.DataFrame(lists, columns=sample_submission.columns)
    df.to_csv("arima_residuals_predictions_" + str(n_erf_pred_steps) +  ".csv", index=False, sep=',')
              
              

Window Size: 7


  invarcoefs = 2*np.arctanh(params)


Window Size: 8




Window Size: 9




Window Size: 10




Window Size: 11




Window Size: 12




Window Size: 13
Window Size: 14
Window Size: 15


In [63]:
# Load the forecast written for each residual window size (7 to 15 days).
predictions1 = pd.read_csv("arima_residuals_predictions_7.csv")
predictions2 = pd.read_csv("arima_residuals_predictions_8.csv")
predictions3 = pd.read_csv("arima_residuals_predictions_9.csv")
predictions4 = pd.read_csv("arima_residuals_predictions_10.csv")
predictions5 = pd.read_csv("arima_residuals_predictions_11.csv")
predictions6 = pd.read_csv("arima_residuals_predictions_12.csv")
predictions7 = pd.read_csv("arima_residuals_predictions_13.csv")
predictions8 = pd.read_csv("arima_residuals_predictions_14.csv")
predictions9 = pd.read_csv("arima_residuals_predictions_15.csv")

window_predictions = [
    predictions1, predictions2, predictions3,
    predictions4, predictions5, predictions6,
    predictions7, predictions8, predictions9,
]
n_windows = len(window_predictions)

cols = ['id', '10', '20', '30', '40', '50', '60', '70', '80', '90']

averaged_predictions = pd.DataFrame(columns=cols)
averaged_predictions['id'] = predictions1['id']

# Each quantile column is the equally weighted mean over the windows,
# accumulated one window at a time in file order.
for col in cols[1:]:
    mean_col = window_predictions[0][col] / n_windows
    for frame in window_predictions[1:]:
        mean_col = mean_col + frame[col] / n_windows
    averaged_predictions[col] = mean_col

averaged_predictions.to_csv("averaged_arima_predictions.csv", index=False, sep=',')