In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

In [None]:
# Presumably intended as the global random seed for this notebook.
# NOTE(review): SEED is not referenced by any other cell visible here, so the
# model initialisation/training is presumably unseeded -- confirm and wire it in.
SEED = 42

# DATA INGESTION

In [None]:
# Read the CSV at `file_path`, then keep only the rows whose `zip` value is
# not one of the five excluded ZIP codes listed in the mask below.
file_path = '../../../src/data/temp/lagged_zbp_totals_with_features.csv'
data = (
    pd.read_csv(file_path)
    .loc[lambda frame: ~frame['zip'].isin([91934, 91905, 91931, 91917, 92145])]
)
data.head()

# DATA PROCESSING (OHE)

In [None]:
# Keep every column except the three `*_nf` columns.
included_feats = data.columns.drop(labels=['emp_nf', 'qp1_nf', 'ap_nf'])
data = data.loc[:, included_feats]

In [None]:
# One-hot encode `zip` and pass every other column through unchanged.
# sparse_threshold=0 forces a dense ndarray: with the default (0.3) the
# transformer returns a scipy sparse matrix whenever the stacked output is
# mostly zeros, and pd.DataFrame(..., columns=feature_names) below cannot
# consume that.
preproc = ColumnTransformer(
    [('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip'])],
    remainder='passthrough',
    sparse_threshold=0,
)

ohe_data = preproc.fit_transform(data)

# Strip the transformer-name prefixes so columns read `zip_<code>` for the
# encoded columns and keep their original names for the passthrough columns.
feature_names = preproc.get_feature_names_out()
feature_names = np.char.replace(feature_names.astype('str'), 'onehots__', '')
feature_names = np.char.replace(feature_names, 'remainder__', '')

ohe_data = pd.DataFrame(ohe_data, columns=feature_names)
ohe_data.head()

# TRAIN TEST SPLIT

In [None]:
class CustomTimeSeriesSplit:
    """Expanding-window cross-validation splitter keyed on the `year` column.

    For every distinct year after the earliest one, a fold is produced whose
    test set is the rows of that year and whose training set is every row
    from a strictly earlier year.

    The yielded arrays are *positional* row indices (suitable for `.iloc`
    and for scikit-learn's CV utilities), so the splitter gives the same
    folds regardless of how the frame's index is labelled.
    """

    def __init__(self, n_splits=None):
        # The value passed here is only a placeholder: split() overwrites it
        # with the number of folds implied by the data.
        self.n_splits = n_splits

    def split(self, X, y=None, groups=None):
        """Yield `(train, test)` positional index arrays, one pair per fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a `year` column.
        y, groups : ignored
            Accepted for compatibility with the scikit-learn splitter API.

        Yields
        ------
        train, test : numpy.ndarray of int
            Row positions in `X`. Nothing is yielded when `X` has fewer than
            two distinct years.
        """
        years = X['year'].to_numpy()
        year_range = np.sort(X['year'].unique())

        self.n_splits = max(len(year_range) - 1, 0)

        for test_year in year_range[1:]:
            # Vectorised masks; flatnonzero turns them into row positions.
            train = np.flatnonzero(years < test_year)
            test = np.flatnonzero(years == test_year)

            yield train, test

    def get_n_splits(self, X, y=None, groups=None):
        """Return the number of folds: distinct years in `X` minus one (min 0)."""
        year_range = np.sort(X['year'].unique())

        return max(len(year_range) - 1, 0)

In [None]:
# Materialise every fold, then keep only the final one: its test set is the
# latest year and its training set is the rows from the earlier years.
tscv = CustomTimeSeriesSplit()
split = [fold for fold in tscv.split(ohe_data, None)]

train_indicies, test_indicies = split[-1]

# Select each partition by position and renumber its rows from zero.
data_train, data_test = (
    ohe_data.iloc[rows].reset_index(drop=True)
    for rows in (train_indicies, test_indicies)
)

# RESHAPE INPUTS

In [None]:
def split_by_zip(data):
    """Break a one-hot-encoded frame into per-ZIP `[X, y]` pairs.

    A column is treated as a ZIP indicator when its name contains the
    substring 'zip'. For each such column, the rows where the indicator
    equals 1.0 are selected and turned into:

    * X -- every column except 'est' (the indicator columns included),
      reshaped to (n_rows, 1, n_features);
    * y -- the 'est' values of those rows.

    Returns a list of two-element lists `[X, y]`, one per indicator column,
    in column order.
    """
    indicator_cols = [name for name in data.columns if 'zip' in name]
    feature_cols = data.columns.drop(['est'])

    def _pair_for(indicator):
        # Rows that belong to this ZIP code.
        rows = data[data[indicator] == 1.0]
        features = rows[feature_cols].values
        n_rows, n_feats = features.shape
        # Insert a length-1 middle axis between rows and features.
        return [features.reshape((n_rows, 1, n_feats)), rows['est'].values]

    return [_pair_for(indicator) for indicator in indicator_cols]

In [None]:
# Replace each partition frame with its list of per-ZIP [X, y] pairs.
# The names are rebound from DataFrame to list, so this cell cannot be
# re-run without first re-running the train/test split cell.
data_train, data_test = map(split_by_zip, (data_train, data_test))

# MODEL

In [None]:
# Single LSTM layer (50 units) feeding one linear output unit, trained on MSE.
# The input shape is read off the first training group's X array, dropping
# its leading (row) axis.
model = Sequential([
    LSTM(50, input_shape=data_train[0][0].shape[1:]),
    Dense(1),
])
model.compile(optimizer='adam', loss='mse')

# TRAIN

In [None]:
TOTAL_EPOCHS = 10

# Each outer pass runs one fit epoch on every per-ZIP dataset, in list order,
# and records the sum of the per-ZIP losses reported by Keras.
losses = []
for _ in tqdm(range(TOTAL_EPOCHS)):

    loss_curr_epoch = 0
    for X_zip, y_zip in data_train:
        history = model.fit(X_zip, y_zip, epochs=1, batch_size=1, verbose=0, shuffle=False)
        loss_curr_epoch = loss_curr_epoch + history.history['loss'][0]
    losses.append(loss_curr_epoch)

In [None]:
# Training curve. Each point is the SUM of the per-ZIP losses for that pass
# (not a mean), so the axes are labelled to make the scale unambiguous.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(losses) + 1), losses, label='train')
ax.set_xlabel('epoch')
ax.set_ylabel('MSE loss summed over ZIP groups')
ax.set_title('Training loss per epoch')
ax.legend()
plt.show()

# EVALUATE

In [None]:
# Predict `est` for every per-ZIP test group and collect true/predicted pairs.
temp = []
for i, zip_data in enumerate(data_test):
    X_zip, y_zip = zip_data

    if len(y_zip) == 0:
        # This ZIP has no rows in the test partition: report its position
        # and move on instead of calling predict on an empty array.
        print(i)
        continue

    try:
        y_preds = model.predict(X_zip, verbose=0)
    except Exception as err:
        # Report the failing group and skip it, so a stale `y_preds` from a
        # previous iteration is never paired with this group's targets.
        print(i, err)
        continue

    # Flatten the predictions so every test row gets its own prediction,
    # not just the first row of the output.
    res = pd.DataFrame({'est_trues': y_zip,
                        'est_preds': np.ravel(y_preds)})
    temp.append(res)

temp = pd.concat(temp)
temp

In [None]:
# calculate RMSE: take the square root of the MSE explicitly. The previous
# `squared=True` returned the MSE itself, and the `squared` keyword is not
# available in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(temp['est_trues'], temp['est_preds']))
print('Test RMSE: %.3f' % rmse)