In [1]:
import pandas as pd
import numpy as np
from timeit import default_timer as timer
pd.options.plotting.backend = 'plotly'

In [2]:
train = pd.read_csv('train.csv')
states = train['Province_State'].unique()
state_dfs_raw = {state: train[train['Province_State'] == state] for state in states}
state_means = {}
state_stds = {}
state_dfs = {}
for s, state_df_raw in state_dfs_raw.items():
    state_df = state_df_raw.drop(columns=['Province_State', 'ID'])
    state_df['Date'] = pd.to_datetime(state_df['Date'], format='%m-%d-%Y')
    state_df = state_df.set_index('Date')

    mean, std = state_df.mean(), state_df.std()
    state_df = (state_df - mean) / std

    state_means[s] = mean
    state_stds[s] = std
    state_dfs[s] = state_df

column_names = state_dfs['Alabama'].columns

In [20]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense, Flatten, Dropout

from tensorflow.keras.losses import MSE
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# variable seq2single - seq2seq provided by rolling predictions as inputs

# input is (variable_samples, lag_order, n_features)
# output is (variable_samples, n_features)

n_features = column_names.size

def model_factory():
    model = Sequential()
    model.add(LSTM(50, input_shape=(None, n_features)))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    print(model.summary())

    return model

tmp_path = 'tmp/tmp_model.h5'
checkpoint = ModelCheckpoint(
        tmp_path,
        save_weights_only=True,
        monitor='val_loss',
        mode='min',
        verbose=0,
        save_best_only=True,
    )

In [21]:
model = model_factory()
model.predict(np.random.rand(1, 10, n_features)).shape

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_5 (Dense)              (None, 10)                510       
Total params: 12,710
Trainable params: 12,710
Non-trainable params: 0
_________________________________________________________________
None


(1, 10)

In [9]:
from tensorflow.keras.utils import Sequence

class MyBatchGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, X, y, batch_size=1, shuffle=True):
        'Initialization'
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.y)/self.batch_size))

    def __getitem__(self, index):
        return self.__data_generation(index)

    def on_epoch_end(self):
        'Shuffles indexes after each epoch'
        self.indexes = np.arange(len(self.y))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, index):
        Xb = np.empty((self.batch_size, *X[index].shape))
        yb = np.empty((self.batch_size, *y[index].shape))
        # naively use the same sample over and over again
        for s in range(0, self.batch_size):
            Xb[s] = X[index]
            yb[s] = y[index]
        return Xb, yb

In [36]:
# Normally takes 80s to train
models = {}
model_histories = {}
model_test_data = {}
for state in states:
    stime = timer()
    print(f'------------------------\nTraining model for {state}')
    state_data = state_dfs[state].values

    X_windows = (
        np.expand_dims(np.arange(lag_order), 0) +
        np.expand_dims(np.arange(state_data.shape[0] - lag_order), 0).T
    )
    y_indices = np.arange(lag_order, state_data.shape[0])

    X = np.nan_to_num(state_data[X_windows])
    y = np.nan_to_num(state_data[y_indices])

    model = model_factory()
    model_histories[state] = model.fit(
        X,
        y,
        validation_split=0.2,
        epochs=2000,
        verbose=1,
        callbacks=[checkpoint]
    )

    # model.load_weights(tmp_path)
    
    y_pred = model.predict(X)
    print(f'\tTraining loss for {state}: {np.average(MSE(y_pred, y)):.4f}')
    models[state] = model
    break

    print(f'\tTraining took {timer() - stime:.4f}s')

/2000
Epoch 1806/2000
Epoch 1807/2000
Epoch 1808/2000
Epoch 1809/2000
Epoch 1810/2000
Epoch 1811/2000
Epoch 1812/2000
Epoch 1813/2000
Epoch 1814/2000
Epoch 1815/2000
Epoch 1816/2000
Epoch 1817/2000
Epoch 1818/2000
Epoch 1819/2000
Epoch 1820/2000
Epoch 1821/2000
Epoch 1822/2000
Epoch 1823/2000
Epoch 1824/2000
Epoch 1825/2000
Epoch 1826/2000
Epoch 1827/2000
Epoch 1828/2000
Epoch 1829/2000
Epoch 1830/2000
Epoch 1831/2000
Epoch 1832/2000
Epoch 1833/2000
Epoch 1834/2000
Epoch 1835/2000
Epoch 1836/2000
Epoch 1837/2000
Epoch 1838/2000
Epoch 1839/2000
Epoch 1840/2000
Epoch 1841/2000
Epoch 1842/2000
Epoch 1843/2000
Epoch 1844/2000
Epoch 1845/2000
Epoch 1846/2000
Epoch 1847/2000
Epoch 1848/2000
Epoch 1849/2000
Epoch 1850/2000
Epoch 1851/2000
Epoch 1852/2000
Epoch 1853/2000
Epoch 1854/2000
Epoch 1855/2000
Epoch 1856/2000
Epoch 1857/2000
Epoch 1858/2000
Epoch 1859/2000
Epoch 1860/2000
Epoch 1861/2000
Epoch 1862/2000
Epoch 1863/2000
Epoch 1864/2000
Epoch 1865/2000
Epoch 1866/2000
Epoch 1867/2000
Ep

In [37]:
def save_models():
    for state, model in models.items():
        model.save(f'models/{state}')

def load_models():
    global models
    models = {}
    for state in states:
        print(f'Loading {state} model...')
        models[state] = load_model(f'models/{state}')

In [38]:
def forecast(model, data, steps, columns, index):
    output = []
    for _ in range(steps):
        prediction = model.predict(np.reshape(data, (1, lag_order, n_features)))[0]
        output.append(prediction)
        data = np.vstack((data[1:], prediction))
    return pd.DataFrame(output, columns=columns, index=index)

In [39]:
lag_amount = lag_order + 2
data = np.nan_to_num(state_dfs['Alabama'].iloc[-lag_amount:].values)
models['Alabama'].predict(np.reshape(data, (1, lag_amount, n_features)))

ValueError: in user code:

    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:1454 predict_function  *
        return step_function(self, iterator)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:1444 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:1268 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:2734 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\distribute\distribute_lib.py:3355 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:1437 run_step  **
        outputs = model.predict_step(data)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\training.py:1410 predict_step
        return self(x, training=False)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\base_layer.py:1003 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\sequential.py:375 call
        return super(Sequential, self).call(inputs, training=training, mask=mask)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\functional.py:425 call
        inputs, training=training, mask=mask)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\functional.py:560 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\base_layer.py:989 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\likev\AppData\Roaming\Python\Python37\site-packages\tensorflow\python\keras\engine\input_spec.py:259 assert_input_compatibility
        ' but received input with shape ' + display_shape(x.shape))

    ValueError: Input 0 of layer dense_6 is incompatible with the layer: expected axis -1 of input shape to have value 70 but received input with shape (None, 90)


In [14]:
state_forecasts = {}
periods = 26
columns, index = column_names, pd.date_range(state_dfs['Alabama'].index[-1], periods=periods + 1)[1:]
for state in states:
    print(f'Forecasting {state} ({index[0]} to {index[-1]})')
    data_df = state_dfs[state].iloc[-lag_order:]

    forecast_df = forecast(models[state], np.nan_to_num(data_df.values), periods, columns, index)
    state_forecasts[state] = forecast_df
    break

Forecasting Alabama (2020-08-18 00:00:00 to 2020-09-12 00:00:00)


In [15]:
cleaned_dfs = {}
for state in states:
    cleaned_df = state_forecasts[state][['Confirmed', 'Deaths']]
    cleaned_df['Province_State'] = state
    cleaned_df['Date'] = cleaned_df.index

    cleaned_df['Confirmed'] *= state_stds[state]['Confirmed']
    cleaned_df['Deaths'] *= state_stds[state]['Deaths']
    cleaned_df['Confirmed'] += state_means[state]['Confirmed']
    cleaned_df['Deaths'] += state_means[state]['Deaths']


    cleaned_dfs[state] = cleaned_df[['Province_State', 'Date', 'Confirmed', 'Deaths']]

    break

kaggle_res = []

for _ in range(periods):
    for state in states:
        kaggle_res.append(state_forecasts[state].iloc[_])
        break

In [16]:
state_dfs['Alabama']

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-04-12,-1.052775,-1.471530,,-0.939513,-1.052544,-1.351007,-0.940580,-0.322742,-1.355781,0.638766
2020-04-13,-1.047588,-1.460031,,-0.930893,-1.047108,-1.319982,-0.935523,-0.273013,-1.323213,0.626093
2020-04-14,-1.040945,-1.431285,,-0.920234,-1.040146,-1.303916,-0.926421,0.008032,-1.306349,0.739237
2020-04-15,-1.037245,-1.423619,,-0.914070,-1.036268,-1.299997,-0.918330,0.022315,-1.302234,0.939550
2020-04-16,-1.029055,-1.394873,,-0.900747,-1.027685,-1.290549,-0.911250,0.222047,-1.292317,0.863605
...,...,...,...,...,...,...,...,...,...,...
2020-08-13,2.040889,1.972286,1.666994,2.125945,2.043060,1.837541,2.098405,-1.313228,1.840986,0.412781
2020-08-14,2.063699,1.978035,1.666994,2.165077,2.065921,1.929084,2.098405,-1.325123,1.932881,0.372187
2020-08-15,2.102250,1.983784,1.666994,2.231325,2.104558,1.978212,2.136585,-1.347175,1.982198,0.373127
2020-08-16,2.128123,1.987617,1.666994,2.275786,2.130489,2.016619,2.136585,-1.361700,2.020752,0.328295


In [17]:
state_forecasts['Alabama']

Unnamed: 0,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
2020-08-18,0.904821,0.874941,0.777975,0.785305,0.902831,0.9466,0.938818,1.6e-05,0.94522,0.077989
2020-08-19,0.904819,0.87494,0.777972,0.785301,0.902829,0.946599,0.938816,1.6e-05,0.945219,0.077989
2020-08-20,0.904816,0.874937,0.777966,0.785295,0.902826,0.946598,0.938813,1.6e-05,0.945217,0.077988
2020-08-21,0.904813,0.874934,0.777961,0.785289,0.902824,0.946596,0.938811,1.6e-05,0.945216,0.077988
2020-08-22,0.904803,0.874925,0.777943,0.78527,0.902814,0.946592,0.938803,1.6e-05,0.945211,0.077987
2020-08-23,0.904786,0.874909,0.777912,0.785238,0.902798,0.946584,0.938789,1.6e-05,0.945203,0.077984
2020-08-24,0.904767,0.874891,0.777876,0.785199,0.902778,0.946575,0.938772,1.6e-05,0.945193,0.077982
2020-08-25,0.904691,0.87482,0.777738,0.785052,0.902705,0.94654,0.938709,1.6e-05,0.945157,0.077972
2020-08-26,0.904691,0.87482,0.777738,0.785052,0.902705,0.94654,0.938709,1.6e-05,0.945157,0.077972
2020-08-27,0.904691,0.87482,0.777738,0.785052,0.902705,0.94654,0.938709,1.6e-05,0.945157,0.077972


In [43]:
data

array([[ 1.98914303,  1.88987935,  1.36017189,  2.22714522,  1.991199  ,
         1.76048704,  2.00080512, -1.32771935,  1.7636355 ,  0.3262941 ],
       [ 2.01750328,  1.95695422,  1.66699391,  2.08608209,  2.01962242,
         1.80981491,  2.05693746, -1.30653362,  1.81315306,  0.3788915 ],
       [ 2.04088911,  1.97228562,  1.66699391,  2.12594548,  2.04306034,
         1.83754091,  2.0984046 , -1.31322779,  1.84098568,  0.41278058],
       [ 2.06369864,  1.97803489,  1.66699391,  2.16507744,  2.06592067,
         1.92908449,  2.0984046 , -1.32512287,  1.93288128,  0.3721867 ],
       [ 2.10225039,  1.98378417,  1.66699391,  2.23132487,  2.10455828,
         1.97821231,  2.1365847 , -1.34717508,  1.98219802,  0.3731268 ],
       [ 2.12812343,  1.98761702,  1.66699391,  2.27578588,  2.13048895,
         2.0166188 ,  2.1365847 , -1.36169989,  2.0207522 ,  0.32829471],
       [ 2.1454429 ,  2.03936049,  1.66699391,  2.30420748,  2.147847  ,
         2.04315264,  2.22533449, -1.34284791

In [45]:
model.predict(np.zeros((1, lag_order, n_features)))

array([[7.8575581e-02, 2.1269998e-01, 1.9072860e-02, 3.7107766e-03,
        7.7775538e-02, 2.8666687e-01, 6.7725182e-03, 1.6045570e-04,
        2.6910794e-01, 2.1868646e-03]], dtype=float32)

In [47]:
data = state_dfs['Alabama'].iloc[-lag_order:]

model.predict(np.reshape(np.nan_to_num(data.values), (1, lag_order, n_features)))

array([[8.7328923e-01, 8.3507824e-01, 7.6038605e-01, 8.1728578e-01,
        8.7057137e-01, 9.1755855e-01, 9.5172745e-01, 4.5900210e-06,
        9.3304586e-01, 8.1912458e-01]], dtype=float32)

In [18]:
pd.concat([state_dfs['Alabama'], state_forecasts['Alabama']]).plot()

In [12]:
state_dfs['Alabama'].iloc[-lag_order:]

Unnamed: 0_level_0,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-08-11,1.989143,1.889879,1.360172,2.227145,1.991199,1.760487,2.000805,-1.327719,1.763635,0.326294
2020-08-12,2.017503,1.956954,1.666994,2.086082,2.019622,1.809815,2.056937,-1.306534,1.813153,0.378891
2020-08-13,2.040889,1.972286,1.666994,2.125945,2.04306,1.837541,2.098405,-1.313228,1.840986,0.412781
2020-08-14,2.063699,1.978035,1.666994,2.165077,2.065921,1.929084,2.098405,-1.325123,1.932881,0.372187
2020-08-15,2.10225,1.983784,1.666994,2.231325,2.104558,1.978212,2.136585,-1.347175,1.982198,0.373127
2020-08-16,2.128123,1.987617,1.666994,2.275786,2.130489,2.016619,2.136585,-1.3617,2.020752,0.328295
2020-08-17,2.145443,2.03936,1.666994,2.304207,2.147847,2.043153,2.225334,-1.342848,2.047388,0.455274


In [13]:
state_forecasts['Alabama']

Unnamed: 0,Confirmed,Deaths,Recovered,Active,Incident_Rate,People_Tested,People_Hospitalized,Mortality_Rate,Testing_Rate,Hospitalization_Rate
2020-08-18,0.750578,0.723176,0.758303,0.752485,0.970006,0.993595,1.001136,-0.919508,0.55901,-0.277527
2020-08-19,0.714971,0.701951,0.675647,0.69122,0.921879,0.931179,0.925978,-0.87125,0.542146,-0.352744
2020-08-20,0.708087,0.698648,0.654526,0.676092,0.910894,0.917751,0.906106,-0.864117,0.540283,-0.379681
2020-08-21,0.705996,0.698028,0.647236,0.670996,0.9073,0.913597,0.89926,-0.862408,0.540232,-0.39016
2020-08-22,0.70497,0.697727,0.644329,0.668888,0.905656,0.91188,0.896393,-0.861653,0.540195,-0.394326
2020-08-23,0.704239,0.697442,0.643008,0.667823,0.904639,0.910939,0.894931,-0.861089,0.540056,-0.396006
2020-08-24,0.703604,0.697147,0.642296,0.667154,0.90385,0.910264,0.894001,-0.860555,0.539855,-0.396689
2020-08-25,0.702931,0.696816,0.641808,0.666606,0.903081,0.909614,0.893225,-0.859935,0.53959,-0.396939
2020-08-26,0.702806,0.696738,0.641598,0.66643,0.902911,0.909455,0.892995,-0.859826,0.539537,-0.397145
2020-08-27,0.702777,0.696723,0.64152,0.666372,0.902865,0.909409,0.892918,-0.859805,0.539531,-0.397248


In [14]:
state_dfs_raw['Alabama']['Confirmed'].plot()

In [15]:
cleaned_dfs['Alabama']['Confirmed'].plot()

In [16]:
state_forecasts['Alabama']['Deaths'].plot()

In [17]:
state_dfs_raw['Alabama']['Confirmed']

0         3563
50        3734
100       3953
150       4075
200       4345
         ...  
6850    121023
6900    122185
6950    123889
7000    125235
7050    126058
Name: Confirmed, Length: 142, dtype: int64

In [18]:
cleaned_dfs['Alabama']['Confirmed'].plot() # state_stds['Alabama']['Confirmed']