In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
from datetime import datetime, timedelta
%matplotlib qt

# WITH TWITTER

In [None]:
# Load the traffic table; the 'recorded' column is read as text of the form
# '<date> <time>' (it is split on the first space below).
raw_data = pd.read_csv('dataset/traffic_label_acceptable_traffic_data_new.csv')

# Day-of-week lookup (Monday=0 ... Sunday=6) keyed by calendar day.
# NOTE(review): the two bounds are written day-first ('21/02') and ambiguously
# ('08/07'); confirm the end bound is parsed as intended.
day_of_week = pd.date_range('21/02/2021', '08/07/2021', freq='D').to_series().dt.dayofweek
raw_data['weekday'] = 0

datetimes = raw_data['recorded'].to_numpy()
# np.char is the public home of the string helpers; np.core.* is a private,
# deprecated path.
days = np.char.split(datetimes.astype(str), sep=' ')

# Distinct date strings in first-seen order (a dict is used as an ordered set).
unique_dates = {day[0]: True for day in days}

valid_days = list(unique_dates.keys())

# Stamp every row with the weekday number of the day it was recorded on.
for day in unique_dates:
    raw_data.loc[raw_data['recorded'].str.contains(day), 'weekday'] = day_of_week[day]
    

# Columns rescaled with MinMaxScaler; the transformed values overwrite the originals.
scaled_columns = [
    'ds1', 'mf1', 'rf1',
    'ds2', 'mf2', 'rf2',
    'ds3', 'mf3', 'rf3',
    'num_neg', 'num_pos',
]

scaler = MinMaxScaler()
raw_data[scaled_columns] = scaler.fit_transform(raw_data[scaled_columns])

# Three 0/1 flags describing the day AFTER each recorded day.
for flag_column in ('nextday_weekday', 'nextday_weekend', 'nextday_holiday'):
    raw_data[flag_column] = 0

# Holidays are listed as month/day/year and normalised to 'YYYY-MM-DD' so they
# can be compared with the formatted next-day date below.
old_holiday = ('4/2/2021', '4/3/2021', '4/4/2021', '4/5/2021', '4/26/2021', '4/25/2021', '3/30/2021', '3/31/2021', '4/1/2021')
holiday = {
    pd.to_datetime(day, format='%m/%d/%Y').strftime('%Y-%m-%d')
    for day in old_holiday
}

one_day = dt.timedelta(days=1)

for day in unique_dates.keys():
    rows_of_day = raw_data['recorded'].str.contains(day)
    # Weekday number of the following day (0=Monday ... 6=Sunday).
    next_dow = (day_of_week[day] + 1) % 7

    if next_dow in (5, 6):
        raw_data.loc[rows_of_day, 'nextday_weekend'] = 1
    else:
        raw_data.loc[rows_of_day, 'nextday_weekday'] = 1

    if (pd.to_datetime(day) + one_day).strftime('%Y-%m-%d') in holiday:
        raw_data.loc[rows_of_day, 'nextday_holiday'] = 1
        
# Sorted 'YYYY-MM-DD' labels of every recorded day except 2021-06-11.
# NOTE(review): this rebinds `datetime`, hiding the class imported by
# `from datetime import datetime`; the name is kept because later cells may read it.
datetime = (
    pd.to_datetime(np.array(list(unique_dates.keys())))
    .sort_values()
    .strftime('%Y-%m-%d')
    .drop('2021-06-11')
)

def _date_blocks(ranges):
    """Expand inclusive (first_day, last_day) pairs into blocks of 'YYYY-MM-DD' strings.

    Each pair becomes a Python list holding every calendar day from first_day
    to last_day. The blocks are wrapped in an object-dtype array; because the
    blocks used below have different lengths, that array is 1-D with one list
    per block.
    """
    blocks = [
        pd.date_range(first_day, last_day, freq='D').strftime('%Y-%m-%d').tolist()
        for first_day, last_day in ranges
    ]
    return np.array(blocks, dtype=object)


# Each split is a set of runs of consecutive days, given as (first, last) inclusive.
training_datetime = _date_blocks([
    ('2021-02-21', '2021-03-03'),
    ('2021-03-12', '2021-03-23'),
    ('2021-03-26', '2021-03-29'),
    ('2021-04-04', '2021-04-09'),
    ('2021-04-13', '2021-04-18'),
    ('2021-04-23', '2021-05-01'),
    ('2021-05-04', '2021-05-07'),
])

validation_datetime = _date_blocks([
    ('2021-05-08', '2021-05-14'),
    ('2021-05-21', '2021-05-27'),
    ('2021-06-01', '2021-06-03'),
])

test_datetime = _date_blocks([
    ('2021-06-04', '2021-06-07'),
    ('2021-06-15', '2021-06-17'),
    ('2021-03-30', '2021-04-03'),
])


def _rows_for_dates(dates):
    """Return the rows of ``raw_data`` whose 'recorded' text contains any of ``dates``.

    Rows are gathered one date at a time, in the order of ``dates``, and keep
    their original index labels. An empty ``dates`` yields an empty DataFrame.
    """
    frames = [
        # regex=False: the date is matched as a literal substring.
        raw_data.loc[raw_data['recorded'].str.contains(day, regex=False)]
        for day in dates
    ]
    # pd.concat rejects an empty list, so fall back to an empty frame.
    return pd.concat(frames) if frames else pd.DataFrame()


# One DataFrame per block of consecutive days, keyed by the block's position.
# Each split is built from its OWN date blocks (the validation split previously
# re-read the last training block).
training_data = {
    block_id: _rows_for_dates(block)
    for block_id, block in enumerate(training_datetime)
}

validation_data = {
    block_id: _rows_for_dates(block)
    for block_id, block in enumerate(validation_datetime)
}

testing_data = {
    block_id: _rows_for_dates(block)
    for block_id, block in enumerate(test_datetime)
}

### Data Windowing

#### Data Windowing Function

In [None]:
class WindowGenerator():
    """Cut consecutive rows of a feature table into (inputs, labels) windows.

    A window is ``input_width + shift`` rows long. Its first ``input_width``
    rows become the model inputs and its last ``label_width`` rows supply the
    labels.
    """

    def __init__(self, input_width, label_width, shift, label_columns=None,
                 *, columns=None, batch_size=1, input_feature_start=0):
        """Store the window geometry and the column-name lookups.

        Args:
            input_width: number of rows per window used as model inputs.
            label_width: number of rows, counted back from the end of the
                window, used as labels.
            shift: added to ``input_width`` to get the window length; also used
                as the stride between consecutive windows.
            label_columns: names of the columns kept as labels, or None to keep
                every column.
            columns: feature column names, in the order of the frames passed to
                ``train``/``val``/``test`` once their first column is dropped.
                Defaults to ``raw_data.iloc[:, 1:].columns`` (module global).
            batch_size: number of windows per dataset element.
            input_feature_start: index of the first feature column kept in the
                inputs; 0 keeps every column.
        """
        # Work out the label column indices.
        self.label_columns = label_columns
        self.label_columns_indices = {}
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in
                                          enumerate(label_columns)}
        if columns is None:
            columns = raw_data.iloc[:, 1:].columns
        self.column_indices = {name: i for i, name in enumerate(columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.batch_size = batch_size
        self.input_feature_start = input_feature_start

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        """Summarise the window length, input/label row indices and label columns."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

    def split_window(self, features):
        """Split a batch of windows (batch, row, feature) into (inputs, labels).

        Inputs are the first ``input_width`` rows, restricted to the feature
        columns from ``input_feature_start`` onward. Labels are the last
        ``label_width`` rows, restricted to ``label_columns`` when given.
        """
        inputs = features[:, self.input_slice, self.input_feature_start:]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:, :, self.column_indices[name]] for name in self.label_columns],
                axis=-1)

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def make_dataset(self, data):
        """Turn one contiguous table into a dataset of (inputs, labels) batches.

        ``data`` is converted to a float32 array; windows of
        ``total_window_size`` rows are taken every ``shift`` rows, in order,
        and grouped ``batch_size`` at a time.
        """
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=self.shift,
            shuffle=False,
            batch_size=self.batch_size,)

        return ds.map(self.split_window)

    def create_tf_dataset(self, data):
        """Window every frame in ``data`` separately and chain the results.

        The first column of each frame is dropped before windowing. Because
        frames are windowed one by one, no window spans two frames.
        Returns None when ``data`` is empty.
        """
        ds = None
        for d in data:
            part = self.make_dataset(data=d.iloc[:, 1:])
            ds = part if ds is None else ds.concatenate(part)
        return ds

    def train(self, training_data):
        """Build the windowed dataset for the training frames."""
        return self.create_tf_dataset(training_data)

    def val(self, validation_data):
        """Build the windowed dataset for the validation frames."""
        return self.create_tf_dataset(validation_data)

    def test(self, testing_data):
        """Build the windowed dataset for the testing frames."""
        return self.create_tf_dataset(testing_data)

#### Windowing the Training, Validation, and Testing Datasets

In [None]:
# Two input rows predict the single following row; only ds1/ds2/ds3 are labels.
window = WindowGenerator(
    input_width=2,
    label_width=1,
    shift=1,
    label_columns=['ds1', 'ds2', 'ds3'],
)

# Each split is a dict of per-block DataFrames; pass the frames as a list.
training_data_tf = window.train([*training_data.values()])
validation_data_tf = window.val([*validation_data.values()])
testing_data_tf = window.test([*testing_data.values()])

In [None]:
# Flatten each training element into one 1-D input vector and one 1-D target vector.
flat_train = [
    (tf.reshape(window_inputs, -1).numpy(), tf.reshape(window_labels, -1).numpy())
    for window_inputs, window_labels in training_data_tf
]

inputs = np.array([flat_inputs for flat_inputs, _ in flat_train])
targets = np.array([flat_labels for _, flat_labels in flat_train])

In [None]:
# Same flattening as for the training split, applied to the validation windows.
flat_val = [
    (tf.reshape(window_inputs, -1).numpy(), tf.reshape(window_labels, -1).numpy())
    for window_inputs, window_labels in validation_data_tf
]

inputs_val = np.array([pair[0] for pair in flat_val])
targets_val = np.array([pair[1] for pair in flat_val])

In [None]:
# Flattened test windows. The plain lists stay in `inputs_test` / `targets_test`;
# the stacked arrays get the `_mlp` suffix.
inputs_test = [tf.reshape(window_inputs, -1).numpy() for window_inputs, _ in testing_data_tf]
targets_test = [tf.reshape(window_labels, -1).numpy() for _, window_labels in testing_data_tf]

inputs_test_mlp = np.array(inputs_test)
targets_test_mlp = np.array(targets_test)

## MODEL TRAINING AND DEVELOPMENT

### Model Architecture

#### CNN

In [None]:
def _windows_to_arrays(dataset, input_shape):
    """Materialise a dataset of (inputs, labels) elements as two NumPy arrays.

    Every inputs tensor is reshaped to ``input_shape`` and every labels tensor
    is flattened to 1-D, so each dataset element must hold exactly as many
    values as ``input_shape`` describes.

    Returns:
        (inputs, targets): the per-element arrays stacked along a new first axis.
    """
    stacked_inputs, stacked_targets = [], []
    for window_inputs, window_labels in dataset:
        stacked_inputs.append(tf.reshape(window_inputs, input_shape).numpy())
        stacked_targets.append(tf.reshape(window_labels, -1).numpy())
    return np.array(stacked_inputs), np.array(stacked_targets)


# Shape of one CNN sample: 2 input rows x 15 feature columns.
CNN_INPUT_SHAPE = [2, 15]

inputs, targets = _windows_to_arrays(training_data_tf, CNN_INPUT_SHAPE)
inputs_val, targets_val = _windows_to_arrays(validation_data_tf, CNN_INPUT_SHAPE)
inputs_test, targets_test = _windows_to_arrays(testing_data_tf, CNN_INPUT_SHAPE)

In [None]:
# Test-set loss (MSE) of each independent training run.
results = []

with tf.device('/GPU:0'):
    for i in range(50):
        # A fresh model is built on every run; release the Keras global state
        # left by the previous one so memory does not grow across the 50 runs.
        tf.keras.backend.clear_session()

        cnn_model = tf.keras.models.Sequential()

        # CONVOLUTION + POOLING over the 2 input rows
        cnn_model.add(tf.keras.layers.Conv1D(filters=60, kernel_size=1, activation='relu', input_shape=(2,15)))
        cnn_model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
        cnn_model.add(tf.keras.layers.Flatten())

        # DENSE HEAD: one output per label column
        cnn_model.add(tf.keras.layers.Dense(100, activation='relu'))
        cnn_model.add(tf.keras.layers.Dense(3))
        # Kept from the original architecture; reshapes the 3 outputs to (3,).
        cnn_model.add(layers.Reshape((3,)))

        # COMPILE MODEL
        cnn_model.compile(optimizer=tf.optimizers.Adam(), loss=tf.losses.MeanSquaredError(), metrics=[tf.keras.metrics.MeanAbsoluteError()])

        history = cnn_model.fit(inputs, targets, epochs=50, shuffle=False, validation_data=(inputs_val, targets_val))

        # evaluate() returns [loss, mean absolute error]; only the loss is recorded.
        result = cnn_model.evaluate(inputs_test, targets_test)
        print(f"Run: {i} times")
        print("-------------------------------")
        results.append(result[0])

In [None]:
# Write the per-run test losses, then save the model left over from the last run.
# NOTE(review): the results file says "without" while the model directory says
# "with_text" — confirm which naming is intended.
np.savetxt(fname='cnn_without_result_mse.txt', X=results)

cnn_model.save(filepath="cnn_model_with_text")

In [None]:
def _rows_for_dates(dates):
    """Return the rows of ``raw_data`` whose 'recorded' text contains any of ``dates``.

    Rows are gathered one date at a time, in the order of ``dates``, and keep
    their original index labels. An empty ``dates`` yields an empty DataFrame.
    """
    frames = [
        # regex=False: the date is matched as a literal substring.
        raw_data.loc[raw_data['recorded'].str.contains(day, regex=False)]
        for day in dates
    ]
    # pd.concat rejects an empty list, so fall back to an empty frame.
    return pd.concat(frames) if frames else pd.DataFrame()


# Refill the existing split dicts, one DataFrame per block of consecutive days.

# CREATING TRAINING DATA
for j, block in enumerate(training_datetime):
    training_data[j] = _rows_for_dates(block)

# CREATE VALIDATION DATA
# Reads the validation block itself (previously the last training block was re-read).
for v, block in enumerate(validation_datetime):
    validation_data[v] = _rows_for_dates(block)

# CREATE TEST DATA
for t, block in enumerate(test_datetime):
    testing_data[t] = _rows_for_dates(block)

In [None]:
class WindowGenerator():
    """Cut consecutive rows of a feature table into (inputs, labels) windows.

    A window is ``input_width + shift`` rows long. Its first ``input_width``
    rows become the model inputs and its last ``label_width`` rows supply the
    labels. By default the inputs omit the first two feature columns and
    windows are grouped 32 per dataset element.
    """

    def __init__(self, input_width, label_width, shift, label_columns=None,
                 *, columns=None, batch_size=32, input_feature_start=2):
        """Store the window geometry and the column-name lookups.

        Args:
            input_width: number of rows per window used as model inputs.
            label_width: number of rows, counted back from the end of the
                window, used as labels.
            shift: added to ``input_width`` to get the window length; also used
                as the stride between consecutive windows.
            label_columns: names of the columns kept as labels, or None to keep
                every column.
            columns: feature column names, in the order of the frames passed to
                ``train``/``val``/``test`` once their first column is dropped.
                Defaults to ``raw_data.iloc[:, 1:].columns`` (module global).
            batch_size: number of windows per dataset element.
            input_feature_start: index of the first feature column kept in the
                inputs. NOTE(review): which two columns the default of 2 drops
                depends on the CSV column order — confirm.
        """
        # Work out the label column indices.
        self.label_columns = label_columns
        self.label_columns_indices = {}
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in
                                          enumerate(label_columns)}
        if columns is None:
            columns = raw_data.iloc[:, 1:].columns
        self.column_indices = {name: i for i, name in enumerate(columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift
        self.batch_size = batch_size
        self.input_feature_start = input_feature_start

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        """Summarise the window length, input/label row indices and label columns."""
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

    def split_window(self, features):
        """Split a batch of windows (batch, row, feature) into (inputs, labels).

        Inputs are the first ``input_width`` rows, restricted to the feature
        columns from ``input_feature_start`` onward. Labels are the last
        ``label_width`` rows taken from the full feature set, restricted to
        ``label_columns`` when given.
        """
        inputs = features[:, self.input_slice, self.input_feature_start:]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:, :, self.column_indices[name]] for name in self.label_columns],
                axis=-1)

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def make_dataset(self, data):
        """Turn one contiguous table into a dataset of (inputs, labels) batches.

        ``data`` is converted to a float32 array; windows of
        ``total_window_size`` rows are taken every ``shift`` rows, in order,
        and grouped ``batch_size`` at a time.
        """
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.preprocessing.timeseries_dataset_from_array(
            data=data,
            targets=None,
            sequence_length=self.total_window_size,
            sequence_stride=self.shift,
            shuffle=False,
            batch_size=self.batch_size,)

        return ds.map(self.split_window)

    def create_tf_dataset(self, data):
        """Window every frame in ``data`` separately and chain the results.

        The first column of each frame is dropped before windowing. Because
        frames are windowed one by one, no window spans two frames.
        Returns None when ``data`` is empty.
        """
        ds = None
        for d in data:
            part = self.make_dataset(data=d.iloc[:, 1:])
            ds = part if ds is None else ds.concatenate(part)
        return ds

    def train(self, training_data):
        """Build the windowed dataset for the training frames."""
        return self.create_tf_dataset(training_data)

    def val(self, validation_data):
        """Build the windowed dataset for the validation frames."""
        return self.create_tf_dataset(validation_data)

    def test(self, testing_data):
        """Build the windowed dataset for the testing frames."""
        return self.create_tf_dataset(testing_data)

In [None]:
# Rebuild the windowed datasets with the WindowGenerator defined in the previous cell.
window_options = dict(input_width=2, label_width=1, shift=1, label_columns=['ds1', 'ds2', 'ds3'])
window = WindowGenerator(**window_options)

training_data_tf = window.train([*training_data.values()])
validation_data_tf = window.val([*validation_data.values()])
testing_data_tf = window.test([*testing_data.values()])

#### VANILLA RNN

In [None]:
# Test-set loss (MSE) of each independent training run.
results = []

with tf.device('/GPU:0'):
    for i in range(50):
        # A fresh model is built on every run; release the Keras global state
        # left by the previous one so memory does not grow across the 50 runs.
        tf.keras.backend.clear_session()

        rnn_model = tf.keras.models.Sequential()

        # FIRST LAYER
        rnn_model.add(layers.SimpleRNN(units=64, return_sequences = True))
        rnn_model.add(layers.Dropout(0.5))

        # SECOND LAYER
        rnn_model.add(layers.SimpleRNN(units=32, return_sequences = True))
        rnn_model.add(layers.Dropout(0.5))

        # THIRD LAYER
        rnn_model.add(layers.SimpleRNN(units=32, return_sequences = True))
        rnn_model.add(layers.Dropout(0.5))

        # FOURTH LAYER (returns only its last output, not the whole sequence)
        rnn_model.add(layers.SimpleRNN(units=32))
        rnn_model.add(layers.Dropout(0.5))

        # OUTPUT LAYER: 3 values reshaped to (1, 3)
        rnn_model.add(layers.Dense(units=3))
        rnn_model.add(layers.Reshape((1, 3)))

        # COMPILE MODEL
        rnn_model.compile(optimizer=tf.optimizers.Adam(), loss=tf.losses.MeanSquaredError(), metrics=[tf.keras.metrics.MeanAbsoluteError()])

        history = rnn_model.fit(training_data_tf, epochs=50, shuffle=False, validation_data=validation_data_tf)

        # evaluate() returns [loss, mean absolute error]; only the loss is recorded.
        result = rnn_model.evaluate(testing_data_tf)
        print(f"Run: {i} times")
        print("-------------------------------")
        results.append(result[0])

In [None]:
# Write the per-run test losses, then save the model left over from the last run.
# NOTE(review): the results file says "without" while the model directory says
# "with_text" — confirm which naming is intended.
np.savetxt(fname='rnn_without_result_mse.txt', X=results)

rnn_model.save(filepath="rnn_model_with_text")

#### LSTM MODEL

In [None]:
with tf.device('/GPU:0'):
    lstm_model = tf.keras.models.Sequential()

    # FIRST LAYER
    lstm_model.add(layers.LSTM(units=64, bias_initializer=tf.keras.initializers.HeNormal(), return_sequences = True))
    lstm_model.add(layers.Dropout(0.5))

    # SECOND LAYER
    lstm_model.add(layers.LSTM(units=32, bias_initializer=tf.keras.initializers.HeNormal(), return_sequences = True))
    lstm_model.add(layers.Dropout(0.5))

    # THIRD LAYER
    lstm_model.add(layers.LSTM(units=32, bias_initializer=tf.keras.initializers.HeNormal(), return_sequences = True))
    lstm_model.add(layers.Dropout(0.5))

    # FOURTH LAYER (returns only its last output, not the whole sequence)
    lstm_model.add(layers.LSTM(units=32, bias_initializer=tf.keras.initializers.HeNormal()))
    lstm_model.add(layers.Dropout(0.5))

    # OUTPUT LAYER: 3 values reshaped to (1, 3)
    lstm_model.add(layers.Dense(units=3))
    lstm_model.add(layers.Reshape((1, 3)))

    # COMPILE MODEL
    lstm_model.compile(optimizer=tf.optimizers.Adam(), loss=tf.losses.MeanSquaredError(), metrics=[tf.keras.metrics.MeanAbsoluteError()])

    history = lstm_model.fit(training_data_tf, epochs=100, shuffle=False, validation_data=validation_data_tf)
    # evaluate() returns [loss, mean absolute error].
    ds_predicted_evaluation = lstm_model.evaluate(testing_data_tf)

# This cell trains a single model. Rebind `results` to its test loss so the
# list filled by the RNN runs is not mistaken for LSTM results later on.
results = [ds_predicted_evaluation[0]]

In [None]:
# Hard-coded error values written to text files.
# NOTE(review): nothing in this notebook computes these numbers — confirm their provenance.
mae = np.array([0.0352,0.0464,0.0673,0.1275,0.1624,0.1958,0.1802,0.1894])
mse = np.array([0.0022,0.0042,0.0089,0.0274,0.0407,0.0582,0.0464,0.0501])

np.savetxt('maes.txt', mae, delimiter=',')
np.savetxt('mses.txt', mse, delimiter=',')

# Save the LSTM's own test loss (first entry of its evaluate() output).
# `results` is not used here because the LSTM cell does not fill it; it still
# holds the losses of the RNN runs.
np.savetxt('lstm_without_result_mse.txt', [ds_predicted_evaluation[0]])

lstm_model.save("lstm_model_with_text")