In [142]:
import pandas as pd
import numpy as np
# Route DataFrame.plot() calls in this notebook through the plotly backend.
pd.options.plotting.backend = 'plotly'

In [143]:
train = pd.read_csv('train.csv')
states = train['Province_State'].unique()

# One raw sub-frame per state, keyed by the state's name.
state_dfs_raw = {
    state: train.loc[train['Province_State'] == state]
    for state in states
}

# Per-state normalization statistics and the resulting z-scored frames.
state_means, state_stds, state_dfs = {}, {}, {}
for s, state_df_raw in state_dfs_raw.items():
    # Drop the identifier columns, parse the date strings, and index by date.
    state_df = (
        state_df_raw
        .drop(columns=['Province_State', 'ID'])
        .assign(Date=lambda frame: pd.to_datetime(frame['Date'], format='%m-%d-%Y'))
        .set_index('Date')
    )

    # Standardize every column with this state's own mean and std.
    mean = state_df.mean()
    std = state_df.std()
    state_df = state_df.sub(mean).div(std)

    # Keep mean/std so the scaling can be inverted later.
    state_means[s], state_stds[s], state_dfs[s] = mean, std, state_df

In [166]:
# Remember the feature column labels (reused when labelling predictions),
# then display the normalized Alabama frame.
alabama_normalized = state_dfs['Alabama']
column_names = alabama_normalized.columns
alabama_normalized

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-04-12,-1.081612,-1.451163,,-0.977881,-1.081171,-1.337257,-0.972641,-0.180803,-1.340628,0.630798
2020-04-13,-1.077266,-1.441323,,-0.970797,-1.076618,-1.311068,-0.968219,-0.132734,-1.313159,0.617707
2020-04-14,-1.071698,-1.416721,,-0.962039,-1.070786,-1.297506,-0.960259,0.138927,-1.298934,0.734577
2020-04-15,-1.068597,-1.410161,,-0.956972,-1.067537,-1.294197,-0.953184,0.152733,-1.295464,0.941486
2020-04-16,-1.061734,-1.385559,,-0.946025,-1.060347,-1.286222,-0.946994,0.345796,-1.287100,0.863040
...,...,...,...,...,...,...,...,...,...,...
2020-08-27,1.904284,1.801151,1.689791,1.917901,1.905858,1.885961,2.027223,-1.225969,1.888886,0.282799
2020-08-28,1.933822,1.851994,1.689791,1.966458,1.935449,1.913718,,-1.215389,1.916728,
2020-08-29,1.977139,1.925798,1.689791,2.037684,1.978842,1.951474,,-1.200666,1.954597,
2020-08-30,2.011355,1.942199,1.689791,2.095042,2.013118,1.969548,,-1.213146,1.972725,


In [145]:
# Plot every normalized Alabama column against the Date index.
state_dfs['Alabama'].plot()

In [155]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

# input is (n_samples, n_timesteps, n_features)
# output is (n_samples, n_features)

# Length of the look-back window fed to the LSTM, and the number of
# feature columns predicted for the following day.
timesteps = 14
features = state_dfs['Alabama'].columns.size

# Single LSTM layer followed by a dense layer that emits one value per feature.
model = Sequential()
# Fix: the input shape previously referenced an undefined name `steps`,
# which raises NameError on a fresh kernel; the window length is `timesteps`.
model.add(LSTM(20, activation='relu', input_shape=(timesteps, features)))
model.add(Dense(features))
model.compile(optimizer='adam', loss='mse')

In [156]:
alabama_data = state_dfs['Alabama'].values
n_rows = alabama_data.shape[0]

# Build a (n_rows - timesteps, timesteps) grid of row indices by broadcasting:
# row i holds i, i+1, ..., i+timesteps-1, i.e. one look-back window.
window_starts = np.arange(n_rows - timesteps)[:, np.newaxis]
window_offsets = np.arange(timesteps)[np.newaxis, :]
X_windows = window_offsets + window_starts

# The target for window i is the row immediately after it (index i + timesteps).
y_indices = np.arange(timesteps, n_rows)

# Fancy-index the data and replace NaNs with 0.
X = np.nan_to_num(alabama_data[X_windows])
y = np.nan_to_num(alabama_data[y_indices])

# Chronological split: the first 80% of windows train, the remainder tests.
split = int(X.shape[0] * 0.8)

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [157]:
# Train on the earlier windows and report loss on the held-out later windows
# after every epoch.
# NOTE(review): in the saved log the training loss (~8e-4) sits far below
# val_loss (~0.1), which suggests overfitting — consider fewer epochs or
# early stopping.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=3000,
    verbose=2,
)

 0s - loss: 7.9702e-04 - val_loss: 0.1008
Epoch 2679/3000
4/4 - 0s - loss: 8.2028e-04 - val_loss: 0.0988
Epoch 2680/3000
4/4 - 0s - loss: 8.5988e-04 - val_loss: 0.0979
Epoch 2681/3000
4/4 - 0s - loss: 7.9222e-04 - val_loss: 0.0978
Epoch 2682/3000
4/4 - 0s - loss: 7.8932e-04 - val_loss: 0.1001
Epoch 2683/3000
4/4 - 0s - loss: 8.2536e-04 - val_loss: 0.0989
Epoch 2684/3000
4/4 - 0s - loss: 7.9412e-04 - val_loss: 0.1001
Epoch 2685/3000
4/4 - 0s - loss: 7.5104e-04 - val_loss: 0.0974
Epoch 2686/3000
4/4 - 0s - loss: 7.3253e-04 - val_loss: 0.1036
Epoch 2687/3000
4/4 - 0s - loss: 7.3315e-04 - val_loss: 0.0981
Epoch 2688/3000
4/4 - 0s - loss: 7.9068e-04 - val_loss: 0.0996
Epoch 2689/3000
4/4 - 0s - loss: 7.4518e-04 - val_loss: 0.0980
Epoch 2690/3000
4/4 - 0s - loss: 7.7430e-04 - val_loss: 0.0971
Epoch 2691/3000
4/4 - 0s - loss: 7.7754e-04 - val_loss: 0.1002
Epoch 2692/3000
4/4 - 0s - loss: 7.8674e-04 - val_loss: 0.1024
Epoch 2693/3000
4/4 - 0s - loss: 7.8206e-04 - val_loss: 0.1028
Epoch 2694/30

<tensorflow.python.keras.callbacks.History at 0x7f4530226160>

In [168]:
# Predict the held-out windows; suffix the predicted columns with '_hat' so
# they can be told apart from the true columns in one plot.
test_predictions = model.predict(X_test)
df_pred = pd.DataFrame(test_predictions, columns=column_names + '_hat')
df_true = pd.DataFrame(y_test, columns=column_names)

# Side-by-side frame of predictions and ground truth, drawn as one figure.
pred_vs_true = pd.concat([df_pred, df_true], axis=1)
pred_vs_true.plot()

In [165]:
# Compare prediction and ground truth for the first feature column only.
df = pd.DataFrame({
    'yhat': model.predict(X_test)[:, 0],
    'y': y_test[:, 0],
})

df.plot()

In [150]:
# Inspect the training targets.
# NOTE(review): the saved output for this cell looks one-dimensional, while
# y_train as built from alabama_data[y_indices] is two-dimensional — the output
# is probably stale from an earlier run; re-run it, and prefer y_train[:5] or
# y_train.shape over dumping the whole array.
y_train

array([-1.0089606 , -1.00596098, -1.00059724, -0.99647912, -0.9920051 ,
       -0.98676847, -0.97871016, -0.97166867, -0.96597447, -0.9577128 ,
       -0.95125598, -0.94223169, -0.93361413, -0.92642012, -0.92080218,
       -0.91381153, -0.90618537, -0.90018612, -0.88999249, -0.8830781 ,
       -0.87542652, -0.87296073, -0.86495326, -0.85758131, -0.84039702,
       -0.83439778, -0.82468713, -0.81332415, -0.80414734, -0.79123371,
       -0.77435447, -0.76464382, -0.7519844 , -0.73924871, -0.73091077,
       -0.71583639, -0.69860127, -0.69514408, -0.69298333, -0.68736539,
       -0.67935793, -0.66268205, -0.65106487, -0.64026114, -0.62762713,
       -0.61321369, -0.59145371, -0.56946495, -0.54681525, -0.52103883,
       -0.50433753, -0.48806839, -0.47790017, -0.45517422, -0.43493947,
       -0.42103444, -0.40903594, -0.39802885, -0.38168345, -0.35710179,
       -0.32807153, -0.30323567, -0.28035719, -0.27125664, -0.22717743,
       -0.20506156, -0.18175093, -0.15254273, -0.10795511, -0.08