In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from tqdm import tqdm

In [None]:
# Global configuration shared by the rest of the notebook.
SEED = 42            # seed applied to the NumPy and TensorFlow generators below
KERAS_VERBOSITY = 0  # forwarded as `verbose=` to `model.fit`

# SEED was previously defined but never applied. Seed both libraries up front so
# random draws that go through their global generators start from a known state
# on every "Restart & Run All".
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Show which CPU and GPU devices TensorFlow can see (CPU list first, then GPU).
display(
    tf.config.list_physical_devices('CPU'),
    tf.config.list_physical_devices('GPU'),
)

# DATA INGESTION

In [None]:
# Load the CSV named by `file_path` into a DataFrame and preview its first rows.
# The path is relative to the notebook's working directory.
file_path = '../../../src/data/temp/lagged_zbp_totals_with_features.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

# DROP NON-NUMERICAL COLUMNS

In [None]:
# Keep every column except the three `*_nf` columns (the section heading
# describes them as non-numerical).
# `errors='ignore'` keeps this cell re-runnable: `data` is overwritten below, so
# on a second execution the columns are already gone and a plain `drop` would
# raise KeyError.
included_feats = data.columns.drop(['emp_nf', 'qp1_nf', 'ap_nf'], errors='ignore')
data = data[included_feats]

# TRAIN TEST SPLIT

In [None]:
# Chronological split: rows up to and including `split_year` are used for
# training, rows from later years are held out.
split_year = 2018

data_train = data.loc[data['year'] <= split_year]
data_test = data.loc[data['year'] > split_year]

# STANDARDIZING

In [None]:
# Per-column statistics taken from the training split only; the same values are
# later applied to the held-out split.
train_mean = data_train.mean()
train_std = data_train.std()

# `zip` is an identifier rather than a quantity: mean 0 and std 1 leave it
# unchanged under `(x - mean) / std`, so it can still be one-hot encoded later.
train_mean['zip'] = 0
train_std['zip'] = 1

# A column that is constant in the training split has std == 0, and dividing by
# it would fill that column with NaN/inf. Use 1 instead so such columns are
# only centred.
train_std = train_std.replace(0, 1)

In [None]:
# Standardise both splits with the training statistics (column-wise alignment).
# NOTE: `data_train` / `data_test` are overwritten in place, so running this cell
# twice standardises the data twice; re-run from the train/test split cell first.
data_train = data_train.sub(train_mean).div(train_std)
data_test = data_test.sub(train_mean).div(train_std)

In [None]:
def invert_est_standardization(val):
    """Map a standardised `est` value back onto the original `est` scale.

    Applies the inverse of `(x - mean) / std` using the `est` entries of the
    module-level `train_std` and `train_mean`.

    Args:
        val: Standardised value(s); anything that supports `*` and `+` with a
            scalar.

    Returns:
        `val * train_std['est'] + train_mean['est']`.
    """
    est_std = train_std['est']
    est_mean = train_mean['est']
    return (val * est_std) + est_mean

# DATA PROCESSING (ONE-HOT ENCODING)

In [None]:
# One-hot encode `zip` and pass every other column through unchanged.
# `handle_unknown='ignore'` encodes a zip code that was not seen during `fit`
# as an all-zero row instead of raising.
# `sparse_threshold=0` forces a dense result: with the default threshold (0.3)
# the stacked output is returned as a SciPy sparse matrix whenever its density
# is lower, which does not build the one-column-per-feature DataFrame below.
preproc = ColumnTransformer(
    [('onehots', OneHotEncoder(handle_unknown='ignore'), ['zip'])],
    remainder='passthrough',
    sparse_threshold=0,
)

data_ohe_train = preproc.fit_transform(data_train)

# Strip the `<transformer>__` prefixes that the ColumnTransformer adds, so the
# passthrough columns keep their original names (e.g. `est`).
feature_names = preproc.get_feature_names_out()
feature_names = np.char.replace(feature_names.astype('str'), 'onehots__', '')
feature_names = np.char.replace(feature_names, 'remainder__', '')

data_ohe_train = pd.DataFrame(data_ohe_train, columns=feature_names)

# The encoder is only fitted on the training split; the test split reuses it so
# both frames share exactly the same columns.
data_ohe_test = preproc.transform(data_test)
data_ohe_test = pd.DataFrame(data_ohe_test, columns=feature_names)

In [None]:
# Preview the first encoded training row to check the resulting column layout.
data_ohe_train.head(1)

In [None]:
# Preview the first encoded test row; its columns should match the training frame.
data_ohe_test.head(1)

# WINDOWING

In [None]:
class WindowGenerator():
    """Describes how a time series is cut into (inputs, labels) windows.

    A window spans `input_width + shift` consecutive rows. The first
    `input_width` rows are the inputs; the last `label_width` rows are the
    labels.

    Args:
        input_width: Number of time steps in the input part of a window.
        label_width: Number of time steps in the label part of a window.
        shift: Number of steps by which the end of the window lies past the end
            of the inputs.
        train_df: Training DataFrame. Defaults to the notebook-level
            `data_ohe_train` as it exists when the instance is created.
        test_df: Test DataFrame. Defaults to the notebook-level `data_ohe_test`
            as it exists when the instance is created.
        label_columns: Optional list of column names to use as labels; `None`
            keeps every column.
        batch_size: Batch size stored on the instance for dataset creation.
    """

    def __init__(self, input_width, label_width, shift,
                 train_df=None, test_df=None,
                 label_columns=None, batch_size=1):

        # Resolve the default frames at call time. Defaults written directly in
        # the signature are bound once, when the class cell runs, and would keep
        # pointing at stale frames after the preprocessing cells are re-run.
        if train_df is None:
            train_df = data_ohe_train
        if test_df is None:
            test_df = data_ohe_test

        self.batch_size = batch_size

        # Store the raw data.
        self.train_df = train_df
        self.test_df = test_df

        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            # Position of each label column within the label tensor.
            self.label_columns_indices = {name: i for i, name in
                                          enumerate(label_columns)}
        # Position of each column within the full feature axis.
        self.column_indices = {name: i for i, name in
                               enumerate(train_df.columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # Labels occupy the last `label_width` positions of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        """Return a multi-line summary of the window layout."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

In [None]:
def split_window(self, features):
    """Split a batch of full windows into `(inputs, labels)`.

    Args:
        features: Rank-3 tensor indexed as (batch, time, features) whose time
            axis covers a whole window.

    Returns:
        A tuple `(inputs, labels)`. `inputs` holds the time steps selected by
        `self.input_slice`; `labels` holds those selected by
        `self.labels_slice`, restricted to `self.label_columns` when set.
    """
    inputs = features[:, self.input_slice, :]
    labels = features[:, self.labels_slice, :]

    if self.label_columns is not None:
        # Pick the requested columns one by one and re-stack them on the last axis.
        selected_columns = []
        for name in self.label_columns:
            selected_columns.append(labels[:, :, self.column_indices[name]])
        labels = tf.stack(selected_columns, axis=-1)

    # Slicing drops the static shape information; restore the known time
    # dimension so the resulting `tf.data.Dataset`s are easier to inspect.
    inputs.set_shape([None, self.input_width, None])
    labels.set_shape([None, self.label_width, None])

    return inputs, labels

WindowGenerator.split_window = split_window

In [None]:
def make_dataset(self, data, shuffle=True, seed=None):
    """Turn a table of time-ordered rows into a dataset of `(inputs, labels)`.

    Args:
        data: Array-like of rows (e.g. a DataFrame); converted to a float32
            array. Consecutive rows are treated as consecutive time steps.
        shuffle: Whether the windows are emitted in shuffled order. Defaults to
            True (the previous hard-coded behaviour). Pass False when the order
            matters, e.g. when taking the first window for a plot.
        seed: Optional seed forwarded to the shuffling.

    Returns:
        A `tf.data.Dataset` of batches of `self.batch_size` windows, each mapped
        through `self.split_window`.
    """
    data = np.array(data, dtype=np.float32)
    ds = tf.keras.utils.timeseries_dataset_from_array(
        data=data,
        # No separate targets: the labels are cut out of the window itself.
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=shuffle,
        seed=seed,
        batch_size=self.batch_size,)

    ds = ds.map(self.split_window)

    return ds

WindowGenerator.make_dataset = make_dataset

In [None]:
@property
def train(self):
    """Windowed dataset built from `self.train_df` (rebuilt on every access)."""
    return self.make_dataset(self.train_df)

@property
def example(self):
    """Return one `(inputs, labels)` batch from `.train`, cached after first use."""
    cached = getattr(self, '_example', None)
    if cached is not None:
        return cached
    # Nothing cached yet: take one batch from the training dataset and keep it
    # for subsequent calls.
    self._example = next(iter(self.train))
    return self._example

WindowGenerator.train = train
WindowGenerator.example = example

In [None]:
# Single-step setup: each window has one input time step followed by one label
# time step.
IN_STEPS = 1
OUT_STEPS = 1

# NOTE(review): with `label_columns=['est']` the labels carry a single column,
# while `FeedBack` emits one value per column of `data_ohe_train` - confirm the
# loss is meant to compare tensors of these two widths.
multi_window = WindowGenerator(
    input_width=IN_STEPS,
    label_width=OUT_STEPS,
    shift=OUT_STEPS,
    label_columns=['est'],
    batch_size=1,
)
multi_window

# SPLITTING DATA BY ZIP CODE

In [None]:
# One windowed training dataset per one-hot zip column, keyed by column name.
# Rows are selected where that zip indicator equals 1, so windows never mix
# rows from different zip codes.
data_train_by_zc_tf = {
    zip_code: multi_window.make_dataset(
        data_ohe_train.loc[data_ohe_train[zip_code] == 1]
    )
    for zip_code in data_ohe_train.filter(like='zip').columns
}

In [None]:
# Same per-zip split for the held-out data: one windowed dataset per one-hot
# zip column, keyed by column name.
data_test_by_zc_tf = {
    zip_code: multi_window.make_dataset(
        data_ohe_test.loc[data_ohe_test[zip_code] == 1]
    )
    for zip_code in data_ohe_test.filter(like='zip').columns
}

# MODEL

In [None]:
class FeedBack(tf.keras.Model):
    """Autoregressive LSTM that feeds each prediction back in as the next input.

    Args:
        units: Number of units in the LSTM cell.
        out_steps: Number of time steps produced by `call`.
        num_features: Width of each prediction (units of the output Dense
            layer). Defaults to the number of columns of the notebook-level
            `data_ohe_train`, so predictions can be fed back as inputs.
    """

    def __init__(self, units, out_steps, num_features=None):
        super().__init__()
        self.out_steps = out_steps
        self.units = units
        self.lstm_cell = tf.keras.layers.LSTMCell(units)
        # Also wrap the LSTMCell in an RNN to simplify the `warmup` method.
        self.lstm_rnn = tf.keras.layers.RNN(self.lstm_cell, return_state=True)
        if num_features is None:
            # Fall back to the width of the encoded training frame.
            num_features = data_ohe_train.shape[1]
        self.dense = tf.keras.layers.Dense(num_features)

    def warmup(self, inputs):
        """Run the LSTM over `inputs` and produce the first prediction.

        Args:
            inputs: Tensor of shape (batch, time, features).

        Returns:
            `(prediction, state)`: the first prediction of shape
            (batch, features) and the LSTM state after the last input step.
        """
        # inputs.shape => (batch, time, features)
        # x.shape => (batch, lstm_units)
        x, *state = self.lstm_rnn(inputs)

        # predictions.shape => (batch, features)
        prediction = self.dense(x)
        return prediction, state

    def call(self, inputs, training=None):
        """Predict `self.out_steps` steps autoregressively.

        Args:
            inputs: Tensor of shape (batch, time, features).
            training: Forwarded to the LSTM cell for the feedback steps.

        Returns:
            Tensor of shape (batch, out_steps, features).
        """
        # Collect the per-step predictions in a Python list; they are stacked
        # into a single tensor at the end.
        predictions = []
        # Initialize the LSTM state.
        prediction, state = self.warmup(inputs)

        # Insert the first prediction.
        predictions.append(prediction)

        # Run the rest of the prediction steps.
        for n in range(1, self.out_steps):
            # Use the last prediction as input.
            x = prediction
            # Execute one lstm step.
            x, state = self.lstm_cell(x, states=state,
                                      training=training)
            # Convert the lstm output to a prediction.
            prediction = self.dense(x)
            # Add the prediction to the output.
            predictions.append(prediction)

        # predictions.shape => (time, batch, features)
        predictions = tf.stack(predictions)
        # predictions.shape => (batch, time, features)
        predictions = tf.transpose(predictions, [1, 0, 2])
        return predictions

In [None]:
MAX_EPOCHS = 100
# KERAS_VERBOSITY is defined once in the configuration cell at the top of the
# notebook; re-assigning it here would silently undo a change made there.


def compile_and_fit(model, data, patience=2):
    """Compile `model` and train it for `MAX_EPOCHS` passes over every zip code.

    In each epoch the model is fitted for one Keras epoch on every per-zip
    training dataset in turn, validating against the dataset stored under the
    same key in the notebook-level `data_test_by_zc_tf`.

    Args:
        model: Keras model to compile and train.
        data: Mapping from zip-code column name to its training
            `tf.data.Dataset` (e.g. `data_train_by_zc_tf`).
        patience: Accepted for interface compatibility; early stopping is not
            implemented, so the value is currently unused.

    Returns:
        `(losses, val_losses)`: two lists with one entry per epoch, each the
        mean MSE over all zip codes expressed in squared `est` units.

    Raises:
        ValueError: If `data` is empty.
        KeyError: If a key of `data` has no validation dataset in
            `data_test_by_zc_tf`.
    """
    if not data:
        raise ValueError('`data` must contain at least one per-zip dataset.')

    model.compile(loss=tf.keras.losses.MeanSquaredError(),
                  optimizer=tf.keras.optimizers.Adam(),
                  metrics=[tf.keras.metrics.MeanAbsoluteError()])

    # Keras reports the MSE in standardised units. A squared error scales with
    # the square of the standard deviation and is unaffected by the mean, so the
    # value-level inverse (`x * std + mean`) must not be applied to it.
    mse_scale = train_std['est'] ** 2

    losses = []
    val_losses = []

    for _ in tqdm(range(MAX_EPOCHS)):

        epoch_losses = []
        epoch_val_losses = []

        # Pair training and validation datasets by zip code rather than by list
        # position, so the two can never get out of step.
        for zip_code, train_ds in data.items():
            history = model.fit(train_ds, epochs=1,
                                validation_data=data_test_by_zc_tf[zip_code],
                                verbose=KERAS_VERBOSITY)
            epoch_losses.append(history.history['loss'][0])
            epoch_val_losses.append(history.history['val_loss'][0])

        losses.append(np.mean(epoch_losses) * mse_scale)
        val_losses.append(np.mean(epoch_val_losses) * mse_scale)

    return losses, val_losses

In [None]:
# Autoregressive model with a 200-unit LSTM cell that predicts OUT_STEPS steps.
feedback_model = FeedBack(
    units=200,
    out_steps=OUT_STEPS,
)

In [None]:
# Train on every zip code's dataset and keep the per-epoch loss curves.
losses, val_losses = compile_and_fit(
    model=feedback_model,
    data=data_train_by_zc_tf,
)

In [None]:
# Training vs. validation loss, one point per epoch (epochs numbered from 1).
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(losses) + 1), losses, label='train')
ax.plot(np.arange(1, len(val_losses) + 1), val_losses, label='validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (averaged over zip codes)')
ax.set_title('Training and validation loss per epoch')
ax.legend()
plt.show()

# MAKE LONG-TERM PREDICTIONS

In [None]:
# Visualisation window: only a run of past observations is needed to warm up the
# model state, so the window carries inputs and no labels.
VIS_IN_STEPS = 2018 - 2012  # number of input time steps
VIS_OUT_STEPS = 0           # label_width = shift = 0, so the window is inputs only

vis_window = WindowGenerator(
    input_width=VIS_IN_STEPS,
    label_width=VIS_OUT_STEPS,
    shift=VIS_OUT_STEPS,
    label_columns=['est'],
    batch_size=1,
)
vis_window

In [None]:
# Training rows of zip code 91901, cut into visualisation windows.
rows_91901 = data_ohe_train.loc[data_ohe_train['zip_91901.0'] == 1]
data_91901 = vis_window.make_dataset(rows_91901)

In [None]:
AUTOREGRESSIVE_OUT_STEPS = 10

plot_col_index = vis_window.column_indices['est']

# Take one batch of input windows from the dataset.
# NOTE(review): `make_dataset` shuffles the windows, so if this zip code yields
# more than one window the batch picked here is not guaranteed to start at the
# first year - confirm the x-axis offset of 2012 used below.
inputs = next(iter(data_91901))[0]

# Warm up the LSTM state on the observed inputs, then roll the model forward by
# feeding each prediction back in as the next input.
predictions = []
prediction, state = feedback_model.warmup(inputs)

predictions.append(prediction)

for step in range(1, AUTOREGRESSIVE_OUT_STEPS):
    x = prediction
    x, state = feedback_model.lstm_cell(x, states=state, training=None)
    prediction = feedback_model.dense(x)
    predictions.append(prediction)

# (time, batch, features) -> (batch, time, features)
predictions = tf.stack(predictions)
predictions = tf.transpose(predictions, [1, 0, 2])

input_years = vis_window.input_indices + 2012
prediction_years = np.arange(VIS_IN_STEPS, VIS_IN_STEPS + AUTOREGRESSIVE_OUT_STEPS) + 2012

# Plot `est` on its original scale. The second series holds model predictions,
# so it is labelled accordingly.
fig, ax = plt.subplots()
ax.plot(input_years, invert_est_standardization(inputs[0, :, plot_col_index]),
        label='Inputs', marker='.', zorder=-10)
ax.plot(prediction_years, invert_est_standardization(predictions[0, :, plot_col_index]),
        label='Predictions', marker='.', zorder=-10, color='g')
ax.set_xlabel('Year')
ax.set_ylabel('est')
ax.set_title('Autoregressive forecast of est for zip code 91901')
ax.legend()
plt.show()