In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

In [None]:
# Notebook-wide configuration.
SEED = 42             # seed applied below to the NumPy and TensorFlow RNGs
KERAS_VERBOSITY = 0   # passed as `verbose=` to Keras fit/predict calls (0 = silent)

# Actually apply the seed: without this, layer weight initialisation differs
# between runs and the reported losses are not reproducible.
# NOTE(review): some GPU kernels may remain non-deterministic even when seeded.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Show the compute devices TensorFlow can see: CPUs first, then GPUs.
for device_type in ('CPU', 'GPU'):
    display(tf.config.list_physical_devices(device_type))

# HELPER FUNCTIONS

In [None]:
def eval_model_np(model, data_test):
    """Predict every observation of every group and collect truths and predictions.

    Previously only the first observation of each group was scored, so groups
    holding several rows (e.g. the multi-year training groups) were evaluated
    on a single row, and an empty group raised ``IndexError``. Groups with
    exactly one row give the same result as before.

    Parameters
    ----------
    model : object with ``predict(X, verbose=...)``
        Must return a 2-D array-like with one row per row of ``X``; the first
        column is taken as the prediction.
    data_test : sequence of ``[X, y]`` pairs
        One pair per group; ``X`` is the model input and ``y`` the matching
        1-D array of true targets.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(est_trues, est_preds)``: flat float arrays of equal length holding
        all non-empty groups concatenated in input order.
    """
    trues_per_group = []
    preds_per_group = []
    for zip_data in data_test:
        y_true = np.asarray(zip_data[1], dtype=float).ravel()
        if y_true.size == 0:
            # Nothing to score for this group; skip it instead of calling
            # predict on an empty batch.
            continue
        y_preds = model.predict(zip_data[0], verbose=KERAS_VERBOSITY)
        trues_per_group.append(y_true)
        # Column 0 is the (single) regression output.
        preds_per_group.append(np.asarray(y_preds, dtype=float)[:, 0])

    if not trues_per_group:
        return np.zeros(0), np.zeros(0)

    return np.concatenate(trues_per_group), np.concatenate(preds_per_group)

def eval_model_df(model, data_test):
    """Run ``eval_model_np`` and wrap its two arrays in a DataFrame.

    Returns a DataFrame with columns ``est_trues`` and ``est_preds``, in that
    order, holding the true targets and the model predictions respectively.
    """
    column_names = ('est_trues', 'est_preds')
    arrays = eval_model_np(model, data_test)
    return pd.DataFrame(dict(zip(column_names, arrays)))

# DATA INGESTION

In [None]:
# Read the prepared CSV into a DataFrame and preview its first rows.
file_path = '../../../src/data/temp/lagged_zbp_totals_with_features.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

# DROP NON-NUMERICAL

In [None]:
# Keep every column except the three `*_nf` columns (non-numerical, per the
# section header). `errors='ignore'` makes the cell idempotent: re-running it
# after the columns are already gone no longer raises KeyError.
included_feats = data.columns.drop(['emp_nf', 'qp1_nf', 'ap_nf'], errors='ignore')
data = data[included_feats]

# TRAIN TEST SPLIT

In [None]:
class CustomTimeSeriesSplit:
    """Expanding-window splitter keyed on the ``year`` column of ``X``.

    For each distinct year after the earliest one, a fold is produced whose
    test set is that year and whose training set is every strictly earlier
    year. Indices are *positional* (suitable for ``.iloc`` and for
    scikit-learn), so the splitter also works when ``X`` does not carry a
    default ``RangeIndex``.
    """

    def __init__(self, n_splits=None):
        # Placeholder only: the actual number of folds is derived from the
        # data and overwritten once ``split`` is iterated.
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train, test)`` arrays of positional row indices.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``year`` column.
        y, groups : ignored
            Accepted for scikit-learn API compatibility.

        Yields
        ------
        tuple of (numpy.ndarray, numpy.ndarray)
            One pair per distinct year except the earliest, in ascending
            order of test year.
        """
        years = X['year'].to_numpy()
        year_range = np.sort(X['year'].unique())

        self.n_splits = max(len(year_range) - 1, 0)

        # Row positions, not index labels: callers index with `.iloc`.
        positions = np.arange(len(years))
        for test_year in year_range[1:]:
            # Vectorised comparison instead of a per-row Python membership test.
            train = positions[years < test_year]
            test = positions[years == test_year]

            yield train, test

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of folds ``split`` yields for ``X``.

        This is the number of distinct years minus one, and never negative.
        """
        year_range = np.sort(X['year'].unique())

        return max(len(year_range) - 1, 0)

In [None]:
# Materialise every fold, then keep only the final one (latest year as the
# test set, all earlier years as the training set).
tscv = CustomTimeSeriesSplit()
split = [fold for fold in tscv.split(data, None)]

train_indicies, test_indicies = split[len(split) - 1]

# Slice both partitions the same way and give each a fresh 0..n-1 index.
data_train, data_test = (
    data.iloc[fold_rows].reset_index(drop=True)
    for fold_rows in (train_indicies, test_indicies)
)

# STANDARDIZING

In [None]:
# Column-wise statistics from the training partition only, so no information
# from the test year leaks into the scaling.
train_mean = data_train.mean()
train_mean['zip'] = 0   # leave `zip` untouched: it is an identifier, not a feature

train_std = data_train.std()
# A constant column has std 0 (and a single-row partition gives NaN); dividing
# by that would turn the whole column into NaN/inf. Use 1 so such columns are
# only centred.
train_std[~(train_std > 0)] = 1
train_std['zip'] = 1

In [None]:
# Standardise both partitions with the training statistics.
# NOTE: this overwrites data_train/data_test, so the cell must run only once.
data_train, data_test = (
    (frame - train_mean) / train_std
    for frame in (data_train, data_test)
)

# DATA PROCESSING (OHE)

In [None]:
# One-hot encode `zip` and pass every other column through unchanged.
# sparse_threshold=0 forces a dense ndarray: with many zip codes the stacked
# output is mostly zeros and ColumnTransformer would otherwise return a scipy
# sparse matrix, which cannot be wrapped by pd.DataFrame(..., columns=...).
preproc = ColumnTransformer([('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip'])],
                            remainder='passthrough',
                            sparse_threshold=0)

# Fit on the training partition only.
data_ohe_train = preproc.fit_transform(data_train)

# Strip the transformer-name prefixes so columns read `zip_<code>` / `<feature>`.
feature_names = preproc.get_feature_names_out()
feature_names = np.char.replace(feature_names.astype('str'), 'onehots__','')
feature_names = np.char.replace(feature_names, 'remainder__','')

data_ohe_train = pd.DataFrame(data_ohe_train, columns=feature_names)

# Zip codes unseen during fit encode as all-zero indicator rows
# (handle_unknown='ignore').
data_ohe_test = preproc.transform(data_test)
data_ohe_test = pd.DataFrame(data_ohe_test, columns=feature_names)

data_ohe_train.head()

# RESHAPE INPUTS

In [None]:
def split_by_zip(data):
    """Split a one-hot-encoded frame into per-zip ``[X, y]`` pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column ``est`` and one indicator column per
        zip code named ``zip_<code>`` (value 1.0 marks membership).

    Returns
    -------
    list of [numpy.ndarray, numpy.ndarray]
        One ``[X, y]`` pair per zip code that has at least one row, in column
        order. ``X`` has shape ``(n_rows, 1, n_features)`` and holds every
        column except ``est`` (the zip indicators included); ``y`` holds the
        matching ``est`` values.
    """
    # Match on the `zip_` prefix rather than the substring 'zip', so an
    # ordinary feature whose name merely contains "zip" is not mistaken for
    # an indicator column.
    zip_codes = [col for col in data.columns if str(col).startswith('zip_')]
    included_feats = data.columns.drop(['est'])

    groups = []
    for curr_zip in zip_codes:

        curr_data = data[data[curr_zip] == 1.0]
        if curr_data.empty:
            # A zip with no rows in this partition would yield zero-length
            # arrays that break fitting and evaluation downstream.
            continue

        X = curr_data[included_feats].values
        # Insert a length-1 middle axis: (rows, features) -> (rows, 1, features).
        X = X.reshape((X.shape[0], 1, X.shape[1]))
        y = curr_data['est'].values

        groups.append([X, y])

    return groups

In [None]:
# Replace each encoded frame by its list of per-zip [X, y] pairs.
# NOTE: data_train/data_test change from DataFrames to lists here.
data_train, data_test = map(split_by_zip, (data_ohe_train, data_ohe_test))

# MODEL

In [None]:
# LSTM(200) followed by a single-unit dense output, trained with MSE / Adam.
# The input shape is taken from the first training group's X, dropping the
# leading row axis.
n_timesteps, n_features = data_train[0][0].shape[1:]

model = Sequential([
    LSTM(200, input_shape=(n_timesteps, n_features)),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# TRAIN

In [None]:
%%time

TOTAL_EPOCHS = 75

losses = []
val_losses = []
for _ in tqdm(np.arange(TOTAL_EPOCHS)):
    
    loss_curr_epoch = 0
    i = 0
    for zip_data in data_train:
        history = model.fit(zip_data[0], zip_data[1], epochs=1, batch_size=1, verbose=KERAS_VERBOSITY, shuffle=False)
        loss_curr_epoch += history.history['loss'][0]
        i += 1
    # loss calculation only holds if all zip codes have same # of observations
    losses += [loss_curr_epoch/i]
    est_trues, est_preds = eval_model_np(model, data_test)
    val_losses += [mean_squared_error(est_trues, est_preds)]

In [None]:
# Learning curves: training vs. validation loss per epoch. Axis labels and a
# title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(losses) + 1), losses, label='train')
ax.plot(np.arange(1, len(val_losses) + 1), val_losses, label='validation')
ax.set_xlabel('epoch')
ax.set_ylabel('MSE (standardized est)')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()

# EVALUATE

In [None]:
# Naive baseline: predict `est` with `est_lag_1`.
# The root is taken explicitly because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6.
# NOTE(review): this baseline covers every row of `data`, whereas the model's
# test RMSE covers only the held-out year; confirm that is the intended comparison.
baseline_rmse = mean_squared_error(data['est'], data['est_lag_1']) ** 0.5
print(f'baseline rmse: {baseline_rmse}')

In [None]:
# Training-set evaluation. Undo the standardisation of `est` so the RMSE is in
# original units.
eval_df = (eval_model_df(model, data_train)*train_std['est'])+train_mean['est']
# The root is taken explicitly because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6. The result is named for what it measures.
train_rmse = mean_squared_error(eval_df['est_trues'], eval_df['est_preds']) ** 0.5
display(eval_df)
print('Final Train RMSE: %.3f' % train_rmse)

In [None]:
# Held-out evaluation. Undo the standardisation of `est` so the RMSE is in
# original units.
eval_df = (eval_model_df(model, data_test)*train_std['est'])+train_mean['est']
# The root is taken explicitly because `squared=False` is deprecated in
# scikit-learn 1.4 and removed in 1.6.
test_rmse = mean_squared_error(eval_df['est_trues'], eval_df['est_preds']) ** 0.5
display(eval_df)
print('Final Test RMSE: %.3f' % test_rmse)