In [1]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error
from math import sqrt

import tensorflow as tf
from tensorflow.data import Dataset

In [2]:
# Import data
# Root folder of the project data. This is a relative path, so it is resolved
# against the working directory the notebook kernel was started in.

DATA_PATH = 'Reservoir_Project/Data'

In [3]:
# Load the train / validation / test splits from the Custom folder.
# index_col=0 turns the first spreadsheet column into the row index.
# NOTE(review): the previewed values look already scaled — confirm where the
# normalization is done before interpreting model errors.
basin_inflow_train = pd.read_excel(f'{DATA_PATH}/Custom/basin_inflow_train.xlsx', index_col=0)
basin_inflow_validation = pd.read_excel(f'{DATA_PATH}/Custom/basin_inflow_validation.xlsx', index_col=0)
basin_inflow_test = pd.read_excel(f'{DATA_PATH}/Custom/basin_inflow_test.xlsx', index_col=0)

In [4]:
# Preview the first rows of the training split (INFLOW plus the station features).
basin_inflow_train.head()

Unnamed: 0,INFLOW,ADR_PRECIP_ACC,ADR_PRECIP_INCR,ADR_TEMP_AVG,ADR_TEMP_MAX,ADR_TEMP_MIN,HYS_PRECIP_ACC,HYS_PRECIP_INCR,HYS_SNOW_DEPTH,HYS_SNOW_WATER_CONTENT,...,FRN_SNOW_DEPTH,FRN_SNOW_WATER_CONTENT,FRN_TEMP_AVG,FRN_TEMP_MAX,FRN_TEMP_MIN,PFH_PRECIP_ACC,PFH_PRECIP_INCR,PFH_TEMP_AVG,PFH_TEMP_MAX,PFH_TEMP_MIN
0,0.009407,-0.120687,0.184068,0.009505,-0.391558,0.331857,-0.064454,0.309236,-0.160654,-0.115369,...,-0.094061,-0.026854,-0.467328,-0.565277,-0.419978,-0.011883,0.153432,-0.125129,-0.568938,0.021703
1,0.009825,-0.115892,0.215971,-0.132046,-0.389896,0.0925,-0.059993,0.338807,-0.144326,-0.108889,...,-0.090099,-0.024826,-0.342022,-0.407445,-0.301333,-0.007179,0.22359,-0.117018,-0.685109,0.061987
2,0.003807,-0.113764,0.098568,-0.151271,-0.247423,0.010084,-0.055076,0.360112,-0.1314,-0.10013,...,-0.086392,-0.022915,-0.183639,-0.217192,-0.148991,-0.002509,0.221885,-0.174368,-0.534766,0.017436
3,-0.011969,-0.122304,-0.393446,-0.182306,-0.177232,-0.113069,-0.062791,-0.574891,-0.130901,-0.093803,...,-0.087422,-0.022983,-0.118318,-0.109488,-0.107866,-0.011679,-0.461838,-0.184934,-0.356396,-0.077629
4,-0.047851,-0.130268,-0.460592,-0.102898,-0.067602,-0.09175,-0.068891,-0.615032,-0.146738,-0.104076,...,-0.078444,-0.022474,-0.0699,-0.023878,-0.079156,-0.019216,-0.496717,-0.080716,-0.110395,-0.10784


### Data windowing

In [18]:
# TensorFlow utility class for producing data windows from time series data

class WindowGenerator():
  """Window layout for slicing a time series into (inputs, labels) pairs.

  A window spans ``input_width + shift`` consecutive rows. The first
  ``input_width`` rows are the inputs; the last ``label_width`` rows are the
  labels.

  Args:
    input_width: number of rows at the start of the window used as inputs.
    label_width: number of rows at the end of the window used as labels.
    shift: number of rows between the end of the inputs and the end of the
      window.
    train_df: training split; its columns define ``column_indices``.
    val_df: validation split.
    test_df: test split.
    label_columns: optional list of column names kept as labels. When it is
      ``None`` no ``label_columns_indices`` attribute is created.

  Raises:
    ValueError: if ``label_width`` does not fit inside the window, or if
      ``val_df`` / ``test_df`` expose a column layout different from
      ``train_df``.
    KeyError: if a name in ``label_columns`` is not a column of ``train_df``.
  """
  def __init__(self, input_width, label_width, shift,
               train_df, val_df, test_df,
               label_columns=None):
    total_window_size = input_width + shift

    # A label width outside [1, total_window_size] would make label_start
    # negative or produce an empty label slice, which only surfaces much
    # later as a confusing shape error. Fail fast instead.
    if label_width < 1 or label_width > total_window_size:
      raise ValueError(
          f'label_width must be between 1 and input_width + shift '
          f'({total_window_size}), got {label_width}')

    # Store the raw data.
    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.column_indices = {name: i for i, name in
                           enumerate(train_df.columns)}

    # column_indices is derived from train_df only, yet it is used to pick
    # label columns out of windows built from every split. A split with a
    # different column layout would silently select the wrong column.
    train_columns = list(train_df.columns)
    for split_name, split_df in (('val_df', val_df), ('test_df', test_df)):
      split_columns = getattr(split_df, 'columns', None)
      if split_columns is not None and list(split_columns) != train_columns:
        raise ValueError(
            f'{split_name} columns do not match train_df columns')

    # Work out the label column indices.
    self.label_columns = label_columns
    if label_columns is not None:
      missing = [name for name in label_columns
                 if name not in self.column_indices]
      if missing:
        raise KeyError(f'label_columns not found in train_df: {missing}')
      self.label_columns_indices = {name: i for i, name in
                                    enumerate(label_columns)}

    # Work out the window parameters.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift

    self.total_window_size = total_window_size

    # Inputs are the leading input_width positions of the window.
    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    # Labels are the trailing label_width positions of the window.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

In [19]:
"""
Window
- Given 60 days of history predict 30 days into the future. Why? A season is about 90 days in a CA WY
- Window size: 90
"""

# input_width=60 and shift=30 give a 90-row window. With label_width=1 each
# window yields a single INFLOW label taken from the last row of the window
# (position 89), i.e. one value 30 rows after the end of the inputs.
window = WindowGenerator(input_width=60, label_width=1, shift=30,
                     train_df=basin_inflow_train, val_df=basin_inflow_validation, 
                     test_df=basin_inflow_test, label_columns=['INFLOW'])

window

<__main__.WindowGenerator at 0x29a9a7d10>

In [20]:
# create a window of inputs and labels

def split_window(self, features):
  """Cut a batch of full windows into an (inputs, labels) pair.

  `features` is indexed as [batch, time, feature]. The time axis is sliced
  with `self.input_slice` for the inputs and `self.labels_slice` for the
  labels. When `self.label_columns` is set, only those columns are kept in
  the labels, in the order they are listed.
  """
  input_window = features[:, self.input_slice, :]
  label_window = features[:, self.labels_slice, :]

  wanted_columns = self.label_columns
  if wanted_columns is not None:
    # Pull each requested column out by position, then restore the feature axis.
    per_column = []
    for column_name in wanted_columns:
      per_column.append(label_window[:, :, self.column_indices[column_name]])
    label_window = tf.stack(per_column, axis=-1)

  # Slicing drops static shape information, so put the known time widths back.
  input_window.set_shape([None, self.input_width, None])
  label_window.set_shape([None, self.label_width, None])

  return input_window, label_window

WindowGenerator.split_window = split_window

In [21]:
# create a dataset of sliding windows over a time series dataframe

def make_dataset(self, data, batch_size=32, shuffle=True):
  """Build a batched dataset of (inputs, labels) windows from tabular data.

  Args:
    data: array-like (e.g. a DataFrame); it is converted to a float32 array.
    batch_size: number of windows per batch. Defaults to 32.
    shuffle: forwarded to `timeseries_dataset_from_array`. Defaults to True;
      pass False to keep the windows in their original order, e.g. for
      evaluation.

  Returns:
    The dataset of full windows of length `self.total_window_size`, mapped
    through `self.split_window`.
  """
  data = np.array(data, dtype=np.float32)
  ds = tf.keras.utils.timeseries_dataset_from_array(
      data=data,
      targets=None,  # labels are cut out of each window by split_window
      sequence_length=self.total_window_size,
      sequence_stride=1,
      shuffle=shuffle,
      batch_size=batch_size,)

  # (input_window, label_window) pairs
  ds = ds.map(self.split_window)

  return ds

WindowGenerator.make_dataset = make_dataset

In [22]:
def _dataset_property(frame_attr):
  """Return a read-only property that windows the dataframe stored under `frame_attr`."""
  def _build(self):
    # A dataset is built on every access; nothing is cached on the instance.
    return self.make_dataset(getattr(self, frame_attr))
  return property(_build)

# One property per split, each backed by the matching *_df attribute.
train = _dataset_property('train_df')
val = _dataset_property('val_df')
test = _dataset_property('test_df')

WindowGenerator.train = train
WindowGenerator.val = val
WindowGenerator.test = test

In [23]:
# each element is an (inputs, label) pair
# `window.train` is a property that calls make_dataset, so every access
# builds a new dataset object.
window.train.element_spec

(TensorSpec(shape=(None, 60, 36), dtype=tf.float32, name=None),
 TensorSpec(shape=(None, 1, 1), dtype=tf.float32, name=None))

In [24]:
# example batch
# Take a single batch to check the window shapes; the leading dimension is
# the batch size configured in make_dataset.
for inputs, labels in window.train.take(1):
  print(f'Inputs shape (batch, time, features): {inputs.shape}')
  print(f'Labels shape (batch, time, features): {labels.shape}')

Inputs shape (batch, time, features): (32, 60, 36)
Labels shape (batch, time, features): (32, 1, 1)


### Modeling

#### Create baseline

In [25]:
# tensorflow baseline utility class for data windowing

class Baseline(tf.keras.Model):
  """Parameter-free model that echoes its inputs back as predictions.

  With `label_index=None` the inputs are returned untouched. Otherwise only
  the feature at `label_index` on the last axis is returned, with a trailing
  axis of size 1 added back so the result stays three-dimensional.
  """

  def __init__(self, label_index=None):
    super().__init__()
    self.label_index = label_index

  def call(self, inputs):
    if self.label_index is not None:
      selected = inputs[:, :, self.label_index]
      # Indexing removed the feature axis; re-add it.
      return selected[:, :, tf.newaxis]
    return inputs

In [None]:
# Baseline "model": echoes the INFLOW column of the input window as its prediction.
baseline = Baseline(label_index=window.column_indices['INFLOW'])

baseline.compile(loss=tf.keras.losses.MeanAbsoluteError(),
                 metrics=[tf.keras.metrics.MeanAbsoluteError()])

# NOTE(review): Baseline.call keeps every input time step, so with this window
# (input_width=60, label_width=1) the predictions are (batch, 60, 1) while the
# labels are (batch, 1, 1). Presumably the MAE is then computed by broadcasting
# the single label against all 60 steps — confirm this is the intended
# baseline rather than "last observed value vs. label".
# These dicts are also filled by the LSTM evaluation cell, so this cell must
# run first.
val_performance = {}
test_performance = {}
val_performance['Baseline'] = baseline.evaluate(window.val)
test_performance['Baseline'] = baseline.evaluate(window.test, verbose=0)

#### LSTM

In [28]:
MAX_EPOCHS = 20

def compile_and_fit(model, window, patience=2, max_epochs=None):
  """Compile `model` with an MAE loss and train it on `window`.

  Args:
    model: model exposing `compile` and `fit`.
    window: object exposing `train` and `val` datasets.
    patience: number of epochs without `val_loss` improvement tolerated by
      early stopping.
    max_epochs: upper bound on training epochs. `None` (the default) uses the
      module-level MAX_EPOCHS, looked up at call time.

  Returns:
    Whatever `model.fit` returns (the training history).
  """
  if max_epochs is None:
    max_epochs = MAX_EPOCHS

  # Stop once the validation loss has not improved for `patience` epochs.
  early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    patience=patience,
                                                    mode='min')

  model.compile(loss=tf.keras.losses.MeanAbsoluteError(),
                optimizer=tf.keras.optimizers.legacy.Adam(),
                metrics=[tf.keras.metrics.MeanAbsoluteError()])

  history = model.fit(window.train, epochs=max_epochs,
                      validation_data=window.val,
                      callbacks=[early_stopping])
  return history

In [29]:
"""
inputs.shape gives you input_shape (see above)
Inputs shape (batch, time, features): (32, 60, 36) - 32 batch size, 60 time steps, 36 features
Note: need to verify that 36 features is correct (35 features and 1 label). See window class above 
"""

# Every column of the window is fed to the model (INFLOW included), so the
# feature count is read from the window instead of being hard-coded to 36.
num_features = len(window.column_indices)
num_labels = len(window.label_columns)

# The window has label_width=1, so the model must emit one prediction per
# window. Returning the full sequence would give (batch, 60, 1) predictions
# that get broadcast against (batch, 1, 1) labels when the loss is computed.
lstm_model = tf.keras.models.Sequential([
    # Shape [batch, time, features] => [batch, lstm_units]
    # Only the state after the last input step is kept.
    tf.keras.layers.LSTM(32, return_sequences=False,
                         input_shape=[None, num_features]),
    # Shape => [batch, label_width * num_labels]
    tf.keras.layers.Dense(units=window.label_width * num_labels),
    # Shape => [batch, label_width, num_labels], matching the label tensors
    tf.keras.layers.Reshape([window.label_width, num_labels])
])

In [None]:
# Train the LSTM on the window; `history` is the object returned by model.fit.
history = compile_and_fit(lstm_model, window)

In [None]:
# Evaluate the trained LSTM on the validation and test windows.
# Requires the baseline cell to have run first: it creates the
# val_performance / test_performance dicts that are filled in here.
val_performance['LSTM'] = lstm_model.evaluate(window.val)
test_performance['LSTM'] = lstm_model.evaluate(window.test, verbose=0)

print('Validation performance: ', val_performance['LSTM'])
print('Test performance: ', test_performance['LSTM'])