In [59]:
import pandas as pd
import numpy as np
from timeit import default_timer as timer
pd.options.plotting.backend = 'plotly'

In [60]:
train = pd.read_csv('train.csv')
states = train['Province_State'].unique()
state_dfs_raw = {state: train[train['Province_State'] == state] for state in states}
state_means = {}
state_stds = {}
state_dfs = {}
for s, state_df_raw in state_dfs_raw.items():
    state_df = state_df_raw.drop(columns=['Province_State', 'ID']).iloc[:-14]
    state_df['Date'] = pd.to_datetime(state_df['Date'], format='%m-%d-%Y')
    state_df = state_df.set_index('Date')

    mean, std = state_df.mean(), state_df.std()
    state_df = (state_df - mean) / std

    state_means[s] = mean
    state_stds[s] = std
    state_dfs[s] = state_df

column_names = state_dfs['Alabama'].columns

In [61]:
from var import VAR

lag_order = 4
n_features = column_names.size

def model_factory():
    model = VAR(lag_order)
    return model

In [62]:
# import numpy.linalg as la

# lag_order = 2
# data = state_dfs['California'].values

# A_windows = (
#     np.expand_dims(np.arange(lag_order), 0) +
#     np.expand_dims(np.arange(data.shape[0] - lag_order), 0).T
# )
# b_indices = np.arange(lag_order, data.shape[0])

# A = np.nan_to_num(data[A_windows])
# A = A.reshape((A.shape[0], A.shape[1] * A.shape[2]))
# b = np.nan_to_num(data[b_indices])

# x, res, _, _ = la.lstsq(A, b)

In [63]:
# np.average(res)

In [66]:
# Normally takes 80s to train
models = {}
model_histories = {}
model_test_data = {}
for state in states:
    stime = timer()
    print(f'------------------------\nTraining model for {state}')
    state_data = np.diff(state_dfs[state].values, 1, axis=0)

    model = model_factory()
    res = model.fit(state_data, state_data)
    
    print(f'\tTraining loss for {state}: {np.average(res):.4f}')
    models[state] = model

    print(f'\tTraining took {timer() - stime:.4f}s')

------------------------
Training model for Alabama
	Training loss for Alabama: 0.7775
	Training took 0.0064s
------------------------
Training model for Alaska
	Training loss for Alaska: 0.3611
	Training took 0.0033s
------------------------
Training model for Arizona
	Training loss for Arizona: 0.4418
	Training took 0.0035s
------------------------
Training model for Arkansas
	Training loss for Arkansas: 2.0402
	Training took 0.0023s
------------------------
Training model for California
	Training loss for California: nan
	Training took 0.0017s
------------------------
Training model for Colorado
	Training loss for Colorado: 0.5000
	Training took 0.0034s
------------------------
Training model for Connecticut
	Training loss for Connecticut: nan
	Training took 0.0025s
------------------------
Training model for Delaware
	Training loss for Delaware: 0.3099
	Training took 0.0035s
------------------------
Training model for Florida
	Training loss for Florida: nan
	Training took 0.0033s
-

In [90]:
def forecast(model, data, steps, columns, index):
    output = []
    for _ in range(steps):
        prediction = model.predict(np.reshape(data, (1, lag_order, n_features)))[0]
        output.append(prediction)
        data = np.vstack((data[1:], prediction))

    return pd.DataFrame(output, columns=columns, index=index)

In [93]:
state_forecasts = {}
periods = 26
columns, index = column_names, pd.date_range(state_dfs['Alabama'].index[-1], periods=periods + 1)[1:]
for state in states:
    print(f'Forecasting {state} ({index[0]} to {index[-1]})')
    data_df = state_dfs[state].iloc[-lag_order - 1:]

    forecast_df = forecast(models[state], np.nan_to_num(data_df.values), periods, columns, index)
    state_forecasts[state] = forecast_df

Forecasting Alabama (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Alaska (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Arizona (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Arkansas (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting California (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Colorado (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Connecticut (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Delaware (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Florida (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Georgia (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Hawaii (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Idaho (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Illinois (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Indiana (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Iowa (2020-08-18 00:00:00 to 2020-09-12 00:00:00)
Forecasting Kansas (2

In [94]:
cleaned_dfs = {}
for state in states:
    cleaned_df = state_forecasts[state][['Confirmed', 'Deaths']]
    cleaned_df['Province_State'] = state
    cleaned_df['Date'] = cleaned_df.index

    cleaned_df['Confirmed'] *= state_stds[state]['Confirmed']
    cleaned_df['Deaths'] *= state_stds[state]['Deaths']
    cleaned_df['Confirmed'] += state_means[state]['Confirmed']
    cleaned_df['Deaths'] += state_means[state]['Deaths']


    cleaned_dfs[state] = cleaned_df[['Province_State', 'Date', 'Confirmed', 'Deaths']]

kaggle_res = []

for _ in range(periods):
    for state in states:
        kaggle_res.append(state_forecasts[state].iloc[_])

In [95]:
state_dfs['Alabama']

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-04-12,-1.052775,-1.471530,,-0.939513,-1.052544,-1.351007,-0.940580,-0.322742,-1.355781,0.638766
2020-04-13,-1.047588,-1.460031,,-0.930893,-1.047108,-1.319982,-0.935523,-0.273013,-1.323213,0.626093
2020-04-14,-1.040945,-1.431285,,-0.920234,-1.040146,-1.303916,-0.926421,0.008032,-1.306349,0.739237
2020-04-15,-1.037245,-1.423619,,-0.914070,-1.036268,-1.299997,-0.918330,0.022315,-1.302234,0.939550
2020-04-16,-1.029055,-1.394873,,-0.900747,-1.027685,-1.290549,-0.911250,0.222047,-1.292317,0.863605
...,...,...,...,...,...,...,...,...,...,...
2020-08-13,2.040889,1.972286,1.666994,2.125945,2.043060,1.837541,2.098405,-1.313228,1.840986,0.412781
2020-08-14,2.063699,1.978035,1.666994,2.165077,2.065921,1.929084,2.098405,-1.325123,1.932881,0.372187
2020-08-15,2.102250,1.983784,1.666994,2.231325,2.104558,1.978212,2.136585,-1.347175,1.982198,0.373127
2020-08-16,2.128123,1.987617,1.666994,2.275786,2.130489,2.016619,2.136585,-1.361700,2.020752,0.328295


In [97]:
state = 'Alabama'
pd.concat([state_dfs[state], state_forecasts[state]]).plot()

In [51]:
state_dfs['Alabama'].iloc[-lag_order:]

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-08-14,2.063699,1.978035,1.666994
2020-08-15,2.10225,1.983784,1.666994
2020-08-16,2.128123,1.987617,1.666994
2020-08-17,2.145443,2.03936,1.666994


In [52]:
state_forecasts['Alabama']

Unnamed: 0,Confirmed,Deaths,Recovered
2020-08-18,2.171903,2.062194,1.765508
2020-08-19,2.19447,2.070517,1.751835
2020-08-20,2.215957,2.077248,1.681364
2020-08-21,2.235687,2.083775,1.696538
2020-08-22,2.254012,2.091205,1.7134
2020-08-23,2.271066,2.098898,1.741256
2020-08-24,2.286822,2.105648,1.777979
2020-08-25,2.301322,2.111137,1.810095
2020-08-26,2.314494,2.11534,1.839426
2020-08-27,2.326324,2.118286,1.865941


In [53]:
state_dfs_raw['Alabama']['Confirmed'].plot()

In [54]:
cleaned_dfs['Alabama']['Confirmed'].plot()

In [55]:
state_forecasts['Alabama']['Deaths'].plot()

In [56]:
state_dfs_raw['Alabama']['Confirmed']

0         3563
50        3734
100       3953
150       4075
200       4345
         ...  
6850    121023
6900    122185
6950    123889
7000    125235
7050    126058
Name: Confirmed, Length: 142, dtype: int64

In [57]:
cleaned_dfs['Alabama']['Confirmed'].plot() # state_stds['Alabama']['Confirmed']