In [114]:
import keras
import math
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
from torch.utils.data import Dataset, DataLoader
import pickle
pd.set_option('display.max_rows', 500)
import os
import tensorflow as tf
import torch
import torch.nn as nn
from math import sqrt
# import rmse from sklearn
from sklearn.metrics import mean_squared_error


# Seed the PyTorch, NumPy and TensorFlow random number generators with a fixed
# value to make runs more repeatable.
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)
# Silence every Python warning raised in this notebook.
# NOTE(review): a blanket 'ignore' filter also hides warnings that may matter
# (deprecations, numerical issues); consider narrowing it.
import warnings
warnings.filterwarnings('ignore')

# OWRI FRAMEWORK

In [230]:
# Location of the pickle file that the experiment loop writes its `errors`
# dictionary to: <base_result_path>/<exp_name>, relative to the notebook's
# working directory.
base_result_path = '../results/hauge/LSTM'
exp_name = 'univariate_AE_real_time_results.pkl'
results_save_path = os.path.join(base_result_path, exp_name)

In [231]:
# Load the pre-processed data. Downstream code indexes it as
# results[trajectory][direction]['raw'][intersection_name], where each entry is
# a frame with 'timestamp' and 'cars' columns.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open('../data/hauge/processed/featured_fpds_raw.pickle', 'rb') as f:
    results = pickle.load(f)

In [232]:
# Load the autoencoder (AE) outlier scores. Downstream code indexes them as
# correlated_results[trajectory][direction] and calls `.columns` / `.corr()` on
# each entry, so each entry is expected to be DataFrame-like with one column
# per intersection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles from a trusted source.
with open('../results/hauge/outlier_scores/AE/correlated_results.pickle', 'rb') as f:
    correlated_results = pickle.load(f)

In [233]:
# Intersection to forecast for each trajectory and direction. The experiment
# loop looks this up as target_intersections[trajectory][direction] and uses
# the value as a column name in both the raw data and the AE score frame.
target_intersections={"T1":{"North":"K504", "South":"K561"},
                      "T2":{"North":"K703", "South":"K206"}}

In [234]:
# Fractions of the AE-score columns to keep as model inputs, taken in order of
# decreasing correlation with the target. The experiment loop maps 0 to
# "target intersection only" (one column).
thresholds = [0,0.25,0.5,0.75,1]

In [235]:
def merge_trejectory_data(results, trajectory, direction):
    """Combine the raw per-intersection frames of one trajectory and direction.

    Every frame found under ``results[trajectory][direction]['raw']`` has its
    ``cars`` column renamed to the intersection name and is re-indexed by a
    ``DatetimeIndex`` built from its ``timestamp`` column (which is then
    dropped). The frames are outer-joined on that index one after another, and
    rows that contain any NaN are removed from the final result.

    Returns the merged DataFrame.
    """
    raw_frames = results[trajectory][direction]['raw']
    merged = pd.DataFrame()
    for name in raw_frames:
        renamed = raw_frames[name].rename(columns={"cars": name})
        stamps = pd.DatetimeIndex(renamed['timestamp'])
        indexed = renamed.set_index(stamps).drop(columns=['timestamp'])
        # outer join keeps every timestamp seen so far; incomplete rows are
        # discarded once, after all frames are merged
        merged = merged.merge(indexed, how='outer', left_index=True, right_index=True)
    return merged.dropna()

In [236]:
def preprocess_df(df,n_obs, n_features, sequence_length, train_portion=0.8):
    """Split a supervised frame chronologically, scale the inputs and reshape them.

    Parameters
    ----------
    df : DataFrame whose first ``n_obs`` columns are the model inputs.
    n_obs : number of input columns (``sequence_length * n_features``).
    n_features : number of variables per time step. The label is read from the
        column at position ``-n_features``, i.e. the first of the trailing
        ``n_features`` columns.
    sequence_length : number of time steps per sample.
    train_portion : fraction of rows (taken from the start, no shuffling) used
        for training; the remainder is the test set. Defaults to 0.8.

    Returns
    -------
    train_X, train_y, test_X, test_y, scl
        ``train_X`` / ``test_X`` are standardised and reshaped to
        ``(samples, sequence_length, n_features)``; the labels are left
        unscaled; ``scl`` is the fitted ``StandardScaler``.

    Raises
    ------
    ValueError
        If ``train_portion`` is not strictly between 0 and 1.
    """
    if not 0 < train_portion < 1:
        raise ValueError("train_portion must be strictly between 0 and 1, got {}".format(train_portion))

    # chronological split: the first rows train, the last rows test
    split_at = math.ceil(len(df) * train_portion)
    values = df.values
    df_train, df_test = values[:split_at], values[split_at:]

    train_X, train_y = df_train[:, :n_obs], df_train[:, -n_features]
    test_X, test_y = df_test[:, :n_obs], df_test[:, -n_features]

    # fit the scaler on the training inputs only so no test statistics leak in
    scl = StandardScaler().fit(train_X)
    train_X = scl.transform(train_X)
    test_X = scl.transform(test_X)

    train_X = train_X.reshape((train_X.shape[0], sequence_length, n_features))
    test_X = test_X.reshape((test_X.shape[0], sequence_length, n_features))
    return train_X, train_y, test_X, test_y, scl

In [242]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table of shifted columns.

    Parameters
    ----------
    data : anything ``pd.DataFrame`` accepts (DataFrame, 2-D array, list, list
        of rows, Series); one column per variable, one row per time step.
    n_in : number of lagged steps (t-n_in ... t-1) placed first in the result.
    n_out : number of forecast steps (t ... t+n_out-1) placed after the lags.
    dropnan : drop the rows that contain NaN introduced by shifting.

    Returns
    -------
    DataFrame with ``(n_in + n_out) * n_vars`` columns named ``var<j>(t-<i>)``,
    ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    df = pd.DataFrame(data)
    # Count the variables on the constructed frame rather than on the raw
    # input: a list of rows can hold several variables, and 1-D inputs have no
    # shape[1].
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg = agg.dropna()
    return agg

In [243]:
class LSTM_uni(nn.Module):
    """LSTM followed by a linear layer applied to the last time step's output.

    Parameters
    ----------
    input_dim : number of features per time step.
    hidden_dim : number of hidden units in the LSTM state.
    output_dim : size of the linear layer's output.
    device : stored on the instance for reference only. The forward pass
        creates its initial states on the device of the input tensor, so the
        module does not depend on any module-level ``device`` variable.
    layer_dim : number of stacked LSTM layers.
    dropout_prob : dropout applied between stacked LSTM layers. It is only
        passed to ``nn.LSTM`` when ``layer_dim > 1``; with a single layer
        ``nn.LSTM`` applies no dropout and only emits a warning.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, device = 'mps',layer_dim=1, dropout_prob = 0.2):
        super(LSTM_uni, self).__init__()
        self.hidden_dim = hidden_dim # number of hidden units in hidden state
        self.layer_dim = layer_dim # number of stacked lstm layers
        self.device = device
        lstm_dropout = dropout_prob if layer_dim > 1 else 0.0
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_dim, output_dim) # fully connected layer

    def forward(self, x, future=False):
        """Return the linear layer's output for the last time step of ``x``.

        ``x`` is expected to be of shape (batch_dim, seq_dim, feature_dim).
        ``future`` is accepted for interface compatibility and is not used.
        """
        # Zero initial hidden and cell states of shape
        # (layer_dim, batch_size, hidden_dim), created on the input's device
        # and dtype. They are fresh tensors, so no gradient flows into them.
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim, device=x.device, dtype=x.dtype)

        # out is of shape (batch_dim, seq_dim, hidden_dim); h_n and c_n are of
        # shape (layer_dim, batch_dim, hidden_dim) and are not needed here
        out, _ = self.lstm(x, (h0, c0))
        out = out[:, -1, :] # only take the last output of the sequence
        out = self.fc(out) # fully connected layer

        return out

In [244]:
def train_model(model, train_X,train_y, loss_fn, optimiser, device, epochs=100):
    """Train ``model`` on (train_X, train_y) and return the loss history.

    The inputs and labels are iterated in their given order (no shuffling) in
    batches of 64 through two parallel DataLoaders, so row i of ``train_X``
    always meets label i of ``train_y``.

    Parameters
    ----------
    model : torch module called as ``model(X)``.
    train_X, train_y : indexable inputs and labels of equal length.
    loss_fn : callable ``loss_fn(prediction, target)`` returning a scalar tensor;
        both arguments are flattened to 1-D before the call.
    optimiser : torch optimiser bound to the model's parameters.
    device : device the batches are moved to.
    epochs : number of passes over the data.

    Returns
    -------
    dict
        ``history['train_loss']`` holds one value per epoch, the mean of that
        epoch's batch losses (NaN if there were no batches);
        ``history[epoch]`` holds the list of individual batch losses.
    """
    history = {'train_loss': []}

    train_X_loader = DataLoader(train_X, batch_size=64, shuffle=False)
    train_y_loader = DataLoader(train_y, batch_size=64, shuffle=False)

    # make sure training-only layers are active even if the model was
    # previously switched to eval mode
    model.train()
    for epoch in range(epochs):
        batch_losses = []
        for X, y in zip(train_X_loader, train_y_loader):
            X = X.to(device)
            y = y.to(device)
            optimiser.zero_grad()
            loss = loss_fn(model(X).reshape(-1), y.reshape(-1))
            loss.backward()
            optimiser.step()
            batch_losses.append(loss.item())
        history[epoch] = batch_losses
        # average over batches: each entry is already a per-batch loss value,
        # so dividing their sum by the sample count would under-report it
        if batch_losses:
            history['train_loss'].append(sum(batch_losses) / len(batch_losses))
        else:
            history['train_loss'].append(float('nan'))
    return history
    

In [245]:
def model_evaluation( model, test_X, device):
    """Run ``model`` over ``test_X`` and return its predictions as a flat array.

    The model is switched to evaluation mode and the data is processed in order
    in batches of 64 with gradient tracking disabled. The per-batch outputs are
    moved to the CPU, stacked row-wise and flattened to one dimension.
    """
    model = model.eval()
    loader = DataLoader(test_X, batch_size=64, shuffle=False)
    with torch.no_grad():
        batch_outputs = [model(batch.to(device)).cpu().numpy() for batch in loader]
    return np.vstack(batch_outputs).reshape(-1)

In [246]:
# Experiment sweep: for every trajectory, direction and threshold, train an
# LSTM on the target intersection plus its most AE-correlated intersections,
# evaluate it on the held-out tail of the series and collect the metrics in
# `errors[trajectory][direction][threshold]`. The whole dict is pickled to
# `results_save_path` at the end.
errors={}
dfs={}
intersection_arrays = []

# Create the output directory up front: training takes a long time, and a
# missing directory would otherwise only surface when saving at the very end.
os.makedirs(os.path.dirname(results_save_path) or '.', exist_ok=True)

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
sequence_length = 12 # number of time steps to look back
output_pred = 1 # number of time steps to predict

for trajectory in results.keys():
    errors[trajectory]={}
    print("\n \n Starting trajectory: {}".format(trajectory))
    for direction in results[trajectory]:
        target = target_intersections[trajectory][direction]
        errors[trajectory][direction]={}
        print("Starting direction: {}".format(direction))
        # These do not depend on the threshold, so compute them once per
        # trajectory/direction instead of once per threshold.
        data = merge_trejectory_data(results, trajectory, direction) # raw data of the current trajectory and direction
        ae_score = correlated_results[trajectory][direction] # AE scores of the current trajectory and direction
        target_corr = ae_score.corr()[target].sort_values(ascending=False) # correlation of every intersection with the target
        for threshold in thresholds:
            errors[trajectory][direction][threshold]={}
            print("Starting threshold: {}".format(threshold))
            # ------------------------------------ data processing ---------------------------------------- #
            number_of_cols = math.ceil(len(ae_score.columns)*threshold) # number of outlier weighted intersections
            if number_of_cols==0: # if threshold is 0, then use the target intersection only
                number_of_cols=1
            top_corr_df = target_corr[:number_of_cols] # the top correlated intersections
            isct_inc = top_corr_df.index.tolist()
            df = data[isct_inc].copy(deep=True)
            df = df[ [target] + [ col for col in df.columns if col != target ] ]  # move target var to front of DF
            df = df.mul(top_corr_df, axis=1) # weight every column by its correlation with the target
            df = df.astype('float32')
            n_features = len(isct_inc) # number of features (correlated intersections)
            n_obs = sequence_length * n_features # number of columns in the input
            reframed = series_to_supervised(df, sequence_length, output_pred)
            train_X, train_y, test_X, test_y, scl = preprocess_df(reframed, n_obs, n_features, sequence_length)

            # ------------------------------------ modelling ---------------------------------------------- #
            # define model, loss function and optimizer
            model = LSTM_uni(input_dim = n_features, hidden_dim = 32, layer_dim = 1, output_dim = 1, dropout_prob= 0.2, device = device)
            model = model.to(device)
            loss_fn = torch.nn.MSELoss()
            optimiser = torch.optim.Adam(model.parameters(), lr=0.01)
            start = time.time()
            history = train_model(model, train_X,train_y, loss_fn, optimiser, device, epochs=100)
            end = time.time()
            print("Training time: {}".format(end-start))

            # ------------------------------------ evaluation ---------------------------------------------- #
            yhat = model_evaluation( model, test_X , device)
            # sklearn metrics take (y_true, y_pred); both metrics are symmetric,
            # so the values are the same as with the arguments swapped
            errors[trajectory][direction][threshold]['RMSE'] = sqrt(mean_squared_error(test_y, yhat))
            errors[trajectory][direction][threshold]['MAE'] = mean_absolute_error(test_y, yhat)
            errors[trajectory][direction][threshold]['history'] = history
            errors[trajectory][direction][threshold]['df'] = pd.DataFrame({"Real":test_y,"Predicted":yhat})
            errors[trajectory][direction][threshold]['train_time'] = end-start


# save errors in save path as pickle file
with open(results_save_path, 'wb') as handle:
    pickle.dump(errors, handle)


 
 Starting trajectory: T1
Starting direction: North
Starting threshold: 0
Training time: 1102.97265791893
Starting threshold: 0.25
Training time: 2710.679126024246
Starting threshold: 0.5
Training time: 1065.304591178894
Starting threshold: 0.75
Training time: 10699.2421438694
Starting threshold: 1
Training time: 1081.2799367904663
Starting direction: South
Starting threshold: 0
Training time: 8442.917735815048
Starting threshold: 0.25
Training time: 1052.9938189983368
Starting threshold: 0.5
Training time: 1058.919678926468
Starting threshold: 0.75
Training time: 4353.747722864151
Starting threshold: 1
Training time: 6885.659650087357

 
 Starting trajectory: T2
Starting direction: South
Starting threshold: 0
Training time: 810.9871940612793
Starting threshold: 0.25
Training time: 825.5231857299805
Starting threshold: 0.5
Training time: 820.1537919044495
Starting threshold: 0.75
Training time: 804.725732088089
Starting threshold: 1
Training time: 1917.9892659187317
Starting directio

In [249]:
# Flatten the nested `errors` dict into {"<trajectory>_<direction>_<threshold>": RMSE}
AE_results = {}
for trajectory, per_direction in errors.items():
    for direction, per_threshold in per_direction.items():
        for threshold, metrics in per_threshold.items():
            result_key = '_'.join((trajectory, direction, str(threshold)))
            AE_results[result_key] = metrics['RMSE']

In [250]:
# Display the RMSE of every trajectory/direction/threshold run.
AE_results

{'T1_North_0': 10.415910332242516,
 'T1_North_0.25': 9.995510619470389,
 'T1_North_0.5': 9.812833938261788,
 'T1_North_0.75': 9.832457250157352,
 'T1_North_1': 9.775909038139249,
 'T1_South_0': 18.325964446663935,
 'T1_South_0.25': 17.623300456899116,
 'T1_South_0.5': 17.001525810432796,
 'T1_South_0.75': 16.935522435579472,
 'T1_South_1': 16.698712071991697,
 'T2_South_0': 15.415971949965055,
 'T2_South_0.25': 14.555171841780158,
 'T2_South_0.5': 14.608267839063346,
 'T2_South_0.75': 14.727751941355264,
 'T2_South_1': 14.539517452413042,
 'T2_North_0': 15.372495369704996,
 'T2_North_0.25': 15.104448969164991,
 'T2_North_0.5': 14.985238504837852,
 'T2_North_0.75': 14.975554828841588,
 'T2_North_1': 15.060420102444333}