In [1]:
import tensorflow as tf
import pandas as pd
import os
import datetime as dt
import numpy as np
from random import seed
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras import backend as K

In [2]:
# Seed every random number generator the notebook relies on. Seeding Python's
# `random` module alone leaves NumPy and TensorFlow (weight initialisation,
# dataset shuffling) unseeded, so results would still differ between runs.
seed(36)
np.random.seed(36)
tf.random.set_seed(36)

## Paths to Files

In [3]:
# Root folder of the project. The original absolute path stays as the default so
# existing runs behave the same; set the DSCI_THESIS_DIR environment variable to
# run the notebook on another machine without editing this cell.
BASE_DIR_PATH = os.environ.get('DSCI_THESIS_DIR', '/Users/neilb/Documents/dsci_thesis/')
DATA_PATH = 'Scraping_and_Cleaning'
DATASET_FILE = os.path.join(BASE_DIR_PATH, DATA_PATH, 'data_2022.csv')

## Loading Dataset

In [4]:
# Read the dataset; the CSV stores timestamps as text, so the 'datetime'
# column is converted to real pandas datetimes in the same step.
df = pd.read_csv(DATASET_FILE).assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

In [5]:
df

Unnamed: 0,datetime,water_level,station,1hr,3hr,6hr,12hr,24hr
0,2022-01-01,11.64,Airport (PAGASA),0.0,0.0,0.0,0.0,0.0
1,2022-01-01,11.64,Antipolo,0.0,0.0,0.0,0.0,0.0
2,2022-01-01,11.64,Bagong Nayon,0.0,0.0,0.0,0.0,0.0
3,2022-01-01,11.64,Boso Boso,0.0,0.0,0.0,0.0,0.0
4,2022-01-01,11.64,Calawis,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
227781,2023-01-01,12.08,San Pedro 9 Chapel,0.0,0.0,0.0,0.0,0.0
227782,2023-01-01,12.08,Science Garden,0.0,0.0,0.0,0.0,0.0
227783,2023-01-01,12.08,Sitio Wawa,0.0,0.0,0.0,0.0,0.0
227784,2023-01-01,12.08,Taytay,0.0,0.0,0.0,0.0,0.0


In [6]:
# Restrict the data to seven stations: the five most commonly used in previous
# research plus the two found to correlate most strongly with water level.
# Only the hourly ('1hr') rainfall is kept, which gives the most data points to train on.
stations = ['Boso Boso', 'Mt. Aries', 'Mt. Campana', 'Mt. Oro', 'Nangka', 'Pintong Bukawe', 'Sitio Wawa']

focused_df = (
    df.loc[df['station'].isin(stations), ['datetime', 'water_level', 'station', '1hr']]
    .sort_values(by=['datetime', 'station'])
)
focused_df.head(14)

Unnamed: 0,datetime,water_level,station,1hr
3,2022-01-01 00:00:00,11.64,Boso Boso,0.0
11,2022-01-01 00:00:00,11.64,Mt. Aries,0.0
12,2022-01-01 00:00:00,11.64,Mt. Campana,0.0
13,2022-01-01 00:00:00,11.64,Mt. Oro,0.0
14,2022-01-01 00:00:00,11.64,Nangka,0.0
18,2022-01-01 00:00:00,11.64,Pintong Bukawe,0.0
23,2022-01-01 00:00:00,11.64,Sitio Wawa,0.0
29,2022-01-01 01:00:00,11.61,Boso Boso,0.0
37,2022-01-01 01:00:00,11.61,Mt. Aries,0.0
38,2022-01-01 01:00:00,11.61,Mt. Campana,0.0


## We then create a dataframe that is formatted in a way for easier input into neural networks

In [7]:
# Every group key is a (datetime, water_level) tuple; the keys are collected in a
# list so they can be iterated over when building the reshaped table.
keys_list = list(focused_df.groupby(['datetime', 'water_level']).groups)
keys_list[0]

(Timestamp('2022-01-01 00:00:00'), 11.64)

In [8]:
# Maps each (datetime, water_level) key to the index labels of its rows in focused_df.
groups = focused_df.groupby(['datetime', 'water_level']).groups
groups

{(2022-01-01 00:00:00, 11.64): [3, 11, 12, 13, 14, 18, 23], (2022-01-01 01:00:00, 11.61): [29, 37, 38, 39, 40, 44, 49], (2022-01-01 02:00:00, 11.58): [55, 63, 64, 65, 66, 70, 75], (2022-01-01 03:00:00, 11.55): [81, 89, 90, 91, 92, 96, 101], (2022-01-01 04:00:00, 11.49): [107, 115, 116, 117, 118, 122, 127], (2022-01-01 05:00:00, 11.49): [133, 141, 142, 143, 144, 148, 153], (2022-01-01 06:00:00, 11.48): [159, 167, 168, 169, 170, 174, 179], (2022-01-01 07:00:00, 11.48): [185, 193, 194, 195, 196, 200, 205], (2022-01-01 08:00:00, 11.49): [211, 219, 220, 221, 222, 226, 231], (2022-01-01 09:00:00, 11.49): [237, 245, 246, 247, 248, 252, 257], (2022-01-01 10:00:00, 11.5): [263, 271, 272, 273, 274, 278, 283], (2022-01-01 11:00:00, 11.52): [289, 297, 298, 299, 300, 304, 309], (2022-01-01 12:00:00, 11.54): [315, 323, 324, 325, 326, 330, 335], (2022-01-01 13:00:00, 11.55): [341, 349, 350, 351, 352, 356, 361], (2022-01-01 14:00:00, 11.55): [367, 375, 376, 377, 378, 382, 387], (2022-01-01 15:00:00, 1

In [9]:
# Build one record per (datetime, water_level) key holding the hourly rainfall
# of every focused station.
list_of_dict = []

for key in keys_list:
    # `groups` stores index *labels* of focused_df, so the rows are fetched with
    # .loc on focused_df rather than positionally (.iloc) on the unfiltered df,
    # which is only correct while df keeps its default 0..n-1 index.
    group_rows = focused_df.loc[groups[key]]
    # Look rainfall up by station name instead of by position inside the group, so
    # a missing or reordered station cannot silently shift values into the wrong column.
    rainfall_by_station = dict(zip(group_rows['station'], group_rows['1hr']))

    row_dict = {'datetime': key[0], 'water_level': key[1]}
    for station in stations:
        # A KeyError here means this hour has no reading for `station`.
        row_dict[station] = rainfall_by_station[station]

    list_of_dict.append(row_dict)

list_of_dict[0]

{'datetime': Timestamp('2022-01-01 00:00:00'),
 'water_level': 11.64,
 'Boso Boso': 0.0,
 'Mt. Aries': 0.0,
 'Mt. Campana': 0.0,
 'Mt. Oro': 0.0,
 'Nangka': 0.0,
 'Pintong Bukawe': 0.0,
 'Sitio Wawa': 0.0}

In [10]:
# One row per record in list_of_dict: datetime, water level, then one rainfall column per station.
final_df = pd.DataFrame(list_of_dict)
final_df.head(10)

Unnamed: 0,datetime,water_level,Boso Boso,Mt. Aries,Mt. Campana,Mt. Oro,Nangka,Pintong Bukawe,Sitio Wawa
0,2022-01-01 00:00:00,11.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-01-01 01:00:00,11.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-01-01 02:00:00,11.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2022-01-01 03:00:00,11.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2022-01-01 04:00:00,11.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2022-01-01 05:00:00,11.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2022-01-01 06:00:00,11.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2022-01-01 07:00:00,11.48,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2022-01-01 08:00:00,11.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2022-01-01 09:00:00,11.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Keep only the model features (water level plus per-station rainfall); the
# datetime column is removed by leaving it out of the selection.
final_df = final_df.loc[:, ['water_level', 'Boso Boso', 'Mt. Aries', 'Mt. Campana', 'Mt. Oro', 'Nangka', 'Pintong Bukawe', 'Sitio Wawa']]
final_df

Unnamed: 0,water_level,Boso Boso,Mt. Aries,Mt. Campana,Mt. Oro,Nangka,Pintong Bukawe,Sitio Wawa
0,11.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
8756,12.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8757,12.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8758,12.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8759,12.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Splitting Dataset into Train-Val-Test

In [12]:
# Split for time series into 70-20-10 by row position, without shuffling:
# the first 70% of rows train, the next 20% validate, the last 10% test.
n = len(final_df)
train_df = final_df.iloc[:int(n * 0.7)]
val_df = final_df.iloc[int(n * 0.7):int(n * 0.9)]
test_df = final_df.iloc[int(n * 0.9):]

In [13]:
train_df.tail()

Unnamed: 0,water_level,Boso Boso,Mt. Aries,Mt. Campana,Mt. Oro,Nangka,Pintong Bukawe,Sitio Wawa
6127,12.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6128,12.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6129,12.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6130,12.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6131,12.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
val_df.head()

Unnamed: 0,water_level,Boso Boso,Mt. Aries,Mt. Campana,Mt. Oro,Nangka,Pintong Bukawe,Sitio Wawa
6132,12.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6133,12.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6134,12.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6135,12.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6136,12.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
val_df.tail()

Unnamed: 0,water_level,Boso Boso,Mt. Aries,Mt. Campana,Mt. Oro,Nangka,Pintong Bukawe,Sitio Wawa
7879,12.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7880,12.55,0.0,0.0,3.0,0.0,0.0,0.0,0.0
7881,12.54,1.0,0.0,4.0,0.0,0.0,0.0,0.5
7882,12.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7883,12.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
test_df.head()

Unnamed: 0,water_level,Boso Boso,Mt. Aries,Mt. Campana,Mt. Oro,Nangka,Pintong Bukawe,Sitio Wawa
7884,12.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7885,12.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7886,12.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7887,12.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7888,12.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Creating a WindowGenerator Class
This is used for predicting using consecutive inputs, which is useful for time series data.
The code is obtained from the tensorflow tutorials: https://www.tensorflow.org/tutorials/structured_data/time_series.

In [17]:
class WindowGenerator():
    """Slice consecutive rows of the train/val/test frames into (inputs, labels) windows.

    Adapted from the TensorFlow time-series tutorial. A window spans
    ``input_width + shift`` consecutive rows: the first ``input_width`` rows are
    the model inputs and the last ``label_width`` rows are the labels.

    Args:
        input_width: Number of consecutive rows given to the model as input.
        label_width: Number of consecutive rows (at the end of the window) used as labels.
        shift: Number of rows between the end of the inputs and the end of the window.
        train_df, val_df, test_df: Frames to window. The defaults are the
            module-level frames as they are bound when this class is defined.
        label_columns: Names of the columns kept as labels; ``None`` keeps every column.
    """

    def __init__(self, input_width, label_width, shift, train_df=train_df, val_df=val_df, test_df=test_df, label_columns=None):
        # Store the raw data.
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df

        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}
        # Column name -> position in the feature axis, used by split_window.
        self.column_indices = {name: i for i, name in enumerate(train_df.columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        # Inputs are the first `input_width` positions of the window.
        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # Labels are the last `label_width` positions of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        """Return a multi-line summary of the window size, indices and label columns."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

    def split_window(self, features):
        """Split a batch of full windows into an (inputs, labels) pair.

        Args:
            features: Batch of windows indexed as (batch, position in window, column).

        Returns:
            Tuple ``(inputs, labels)``; labels are restricted to
            ``label_columns`` when those were given.
        """
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack([labels[:, :, self.column_indices[name]] for name in self.label_columns], axis=-1)

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels
    
    # Creating tf datasets for more convenient use and integration into model in the future
    def make_dataset(self, data, shuffle=True, batch_size=32):
        """Turn a table of rows into a batched dataset of (inputs, labels) windows.

        Args:
            data: Table of rows (e.g. one of the stored frames); converted to float32.
            shuffle: Whether the windows are shuffled. Defaults to True, the
                value that was previously hard-coded.
            batch_size: Number of windows per batch. Defaults to 32, the value
                that was previously hard-coded.

        Returns:
            The dataset produced by `timeseries_dataset_from_array`, mapped
            through `split_window`.
        """
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=1,
            shuffle=shuffle,
            batch_size=batch_size,)

        ds = ds.map(self.split_window)

        return ds
    
    # properties to access them as tf datasets
    # (each access builds a new dataset from the stored frame)
    @property
    def train(self):
        """Windowed dataset built from `train_df`."""
        return self.make_dataset(self.train_df)

    @property
    def val(self):
        """Windowed dataset built from `val_df`."""
        return self.make_dataset(self.val_df)

    @property
    def test(self):
        """Windowed dataset built from `test_df`."""
        return self.make_dataset(self.test_df)

    @property
    def example(self):
        """Get and cache an example batch of `inputs, labels` for plotting."""
        result = getattr(self, '_example', None)
        if result is None:
            # No example batch was found, so get one from the `.train` dataset
            result = next(iter(self.train))
            # And cache it for next time
            self._example = result
        return result

In [18]:
# The wide window uses independent hours of data as input to predict the water level of the next hour
# Here, the prediction is done on 6 hours: with input_width=6, label_width=6 and shift=1
# the labels are the water levels of the six input hours shifted forward by one hour
# This is used for Dense and Recurrent Neural Networks
wide_window = WindowGenerator(
        input_width=6, label_width=6, shift=1,
        label_columns=['water_level']
    )

wide_window

Total window size: 7
Input indices: [0 1 2 3 4 5]
Label indices: [1 2 3 4 5 6]
Label column name(s): ['water_level']

In [19]:
# The conv window is used for the Convolutional Neural Network
# 6 consecutive hours of data are used together to make a single prediction one hour into the future
# CONV_WIDTH is also reused as the Conv1D kernel size when the model is built
CONV_WIDTH = 6
conv_window = WindowGenerator(
        input_width=CONV_WIDTH,
        label_width=1,
        shift=1,
        label_columns=['water_level']
    )

conv_window

Total window size: 7
Input indices: [0 1 2 3 4 5]
Label indices: [6]
Label column name(s): ['water_level']

## Define Loss Function
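We define the evaluation metrics used in the Mahesh paper: the squared correlation coefficient ($R^2$) and the Nash-Sutcliffe Efficiency (NSE). They are passed to Keras as metrics; the training loss itself is the mean squared error.
The code is from their GitHub repository: https://github.com/RaginiBalMahesh/Physics-Informed-Neural-Network-for-Flood-Forecasting/tree/main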

In [20]:
def r_square(y_true, y_pred):
    '''
    Squared correlation between targets and predictions.

    Both tensors are centred with their mean along axis 0; the result is the
    squared sum of the products of the centred values divided by the product
    of their sums of squares (plus K.epsilon() to avoid division by zero).
    '''
    true_centered = y_true - K.mean(y_true, axis=0)
    pred_centered = y_pred - K.mean(y_pred, axis=0)

    numerator = K.square(K.sum(true_centered * pred_centered))
    true_ss = K.sum(true_centered * true_centered)
    pred_ss = K.sum(pred_centered * pred_centered)

    return numerator / ((true_ss * pred_ss) + K.epsilon())

In [21]:
def NSE(y_true, y_pred):
    '''
    Nash-Sutcliffe Efficiency coefficient.

    Computed on the flattened tensors as one minus the ratio of the residual
    sum of squares to the total sum of squares of the targets around their
    mean (K.epsilon() is added to the denominator to avoid division by zero).
    '''
    observed = K.flatten(y_true)
    simulated = K.flatten(y_pred)

    residual_ss = K.sum(K.square(observed - simulated))
    total_ss = K.sum(K.square(observed - K.mean(observed)))

    return 1 - residual_ss / (total_ss + K.epsilon())

## Building Models

In [22]:
# For easy compiling and fitting of different models
MAX_EPOCHS = 20

def compile_and_fit(model, window, patience=2, max_epochs=None):
    """Compile `model` with MSE loss and the Adam optimizer, then train it on `window`.

    Args:
        model: Keras model to compile and fit.
        window: Object exposing `train` and `val` datasets (e.g. a WindowGenerator).
        patience: Number of epochs without `val_loss` improvement tolerated
            before early stopping ends training.
        max_epochs: Upper bound on the number of training epochs. `None`
            (the default) uses the module-level MAX_EPOCHS.

    Returns:
        The value returned by `model.fit` (the training history).
    """
    if max_epochs is None:
        max_epochs = MAX_EPOCHS

    # NOTE: restore_best_weights is left at its default, so the model keeps the
    # weights of the last epoch that ran rather than those of the best epoch.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, mode='min')

    # Metric order here fixes the order of the values returned by model.evaluate:
    # [loss, mean squared error, NSE, r_square].
    model.compile(
        loss=tf.keras.losses.MeanSquaredError(), 
        optimizer=tf.keras.optimizers.Adam(), 
        metrics=[tf.keras.metrics.MeanSquaredError(), NSE, r_square]
    )

    history = model.fit(
        window.train, 
        epochs=max_epochs,
        validation_data=window.val,
        callbacks=[early_stopping]
    )

    return history

In [23]:
# Dense Neural Network: three 64-unit ReLU layers followed by a single linear output unit
dense = models.Sequential([
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

# Convolution Neural Network: one Conv1D layer whose kernel spans CONV_WIDTH time steps,
# followed by the same dense head
conv_model = models.Sequential([
    layers.Conv1D(64, (CONV_WIDTH,), activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

# LSTM: returns its output at every time step, followed by the same dense head
lstm_model = models.Sequential([
    layers.LSTM(units=64, return_sequences=True),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

## Compiling and Fitting Models

In [24]:
# Train the dense network on the wide window (6 input hours, 6 label hours).
dense_history = compile_and_fit(dense, wide_window)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [25]:
# Train the convolutional network on the conv window (6 input hours, 1 label hour).
conv_history = compile_and_fit(conv_model, conv_window)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [26]:
# Train the LSTM on the wide window (6 input hours, 6 label hours).
lstm_history = compile_and_fit(lstm_model, wide_window)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20


## Evaluate Model

In [27]:
# Evaluation results keyed by model name: `val_performance` for the validation
# split, `performance` for the test split. Each value is the list returned by
# `model.evaluate`, ordered as compiled: [loss, mean squared error, NSE, r_square].
val_performance = {}
performance = {}

In [28]:
# Dense model on the validation windows.
val_performance['Dense'] = dense.evaluate(wide_window.val)



In [29]:
# Dense model on the test windows (verbose=0 silences the progress bar).
performance['Dense'] = dense.evaluate(wide_window.test, verbose=0)

In [30]:
# Convolutional model on the validation windows.
val_performance['Conv'] = conv_model.evaluate(conv_window.val)



In [31]:
# Convolutional model on the test windows (verbose=0 silences the progress bar).
performance['Conv'] = conv_model.evaluate(conv_window.test, verbose=0)

In [32]:
# LSTM model on the validation windows.
val_performance['LSTM'] = lstm_model.evaluate(wide_window.val)



In [33]:
# LSTM model on the test windows (verbose=0 silences the progress bar).
performance['LSTM'] = lstm_model.evaluate(wide_window.test, verbose=0)

In [34]:
val_performance

{'Dense': [0.06997368484735489,
  0.06997368484735489,
  0.8562962412834167,
  0.87933349609375],
 'Conv': [0.05173927918076515,
  0.05173927918076515,
  0.8729154467582703,
  0.9188089966773987],
 'LSTM': [0.09347333759069443,
  0.09347332268953323,
  0.872291088104248,
  0.8866416215896606]}

In [35]:
performance

{'Dense': [0.00888330303132534,
  0.008883302100002766,
  0.4536984860897064,
  0.8027930855751038],
 'Conv': [0.02208961360156536,
  0.02208961360156536,
  -0.5538405179977417,
  0.7979046106338501],
 'LSTM': [0.0077938511967659,
  0.007793852128088474,
  0.5589085221290588,
  0.7979263663291931]}

# Time Series Forecasting Sample

In [None]:
# Normalizing data
# Every column is standardised with the mean and standard deviation of the
# training split only, and the same statistics are applied to val and test.
# NOTE(review): this cell rebinds train_df / val_df / test_df, so running it a
# second time normalises already-normalised data. It sits after the windows and
# models were built, and WindowGenerator's default frames are bound when the
# class is defined, so the models above were trained on un-normalised data.
# Confirm that is intended.
train_mean = train_df.mean()
train_std = train_df.std()

train_df = (train_df - train_mean) / train_std
val_df = (val_df - train_mean) / train_std
test_df = (test_df - train_mean) / train_std

## Tests

In [49]:
# One hand-written window of six consecutive rows, nested as (1 window, 6 rows, 8 columns).
# Column order follows final_df: water_level, Boso Boso, Mt. Aries, Mt. Campana,
# Mt. Oro, Nangka, Pintong Bukawe, Sitio Wawa.
# NOTE(review): rows are presumably hourly, starting at the timestamp noted on the first row; confirm.
input_vals = [[
    [12.17, 8.0, 0.0, 2.0, 4.0, 0.0, 0.0, 3.5], #7/26/23 1:00
    [12.63, 17.0, 9.0, 8.0, 0.0, 1.0, 0.0, 0.5],
    [12.76, 8.0, 23.0, 7.0, 11.0, 19.0, 0.0, 15.5],
    [13.27, 0.0, 2.0, 4.0, 4.0, 8.0, 0.0, 9.5],
    [13.35, 2.0, 2.0, 1.0, 4.0, 3.0, 0.0, 5.0],
    [13.33, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
]]

In [51]:
# The dense model outputs one water-level prediction per input row (six in total).
dense.predict(input_vals)
# correct values (the water level one hour after each input row) are 12.63, 12.76, 13.27, 13.35, 13.33, 13.25



array([[[12.334484],
        [12.498712],
        [12.42679 ],
        [13.139084],
        [13.524072],
        [13.279024]]], dtype=float32)

In [52]:
# The convolutional model consumes all six rows at once and outputs a single prediction.
conv_model.predict(input_vals)
# correct value (the water level one hour after the last input row) is 13.25



array([[[12.927531]]], dtype=float32)

In [55]:
# The LSTM returns sequences, so it outputs one water-level prediction per input row (six in total).
lstm_model.predict(input_vals)
# correct values (the water level one hour after each input row) are 12.63, 12.76, 13.27, 13.35, 13.33, 13.25



array([[[12.065204],
        [12.606915],
        [13.211148],
        [13.33691 ],
        [13.429615],
        [13.208246]]], dtype=float32)