In [None]:
import keras
import math
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
import pickle
pd.set_option('display.max_rows', 500)
import os
import tensorflow as tf
import torch
from math import sqrt


# define random seeds for Neural Networks
# torch, numpy and tensorflow are each seeded with the same fixed value (0).
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)
# ignore warnings jupyter notebook
# NOTE(review): filterwarnings('ignore') silences every warning category for the
# rest of the kernel session, including deprecation and convergence warnings.
import warnings
warnings.filterwarnings('ignore')

# OWRI FRAMEWORK

In [None]:
# results save path
# results_save_path (base_result_path joined with exp_name) is the file the
# experiment loop pickles its `errors` dictionary to.
base_result_path = '../results/hauge/LSTM'
exp_name = 'univariate_AE_real_time_results.pkl'
results_save_path = os.path.join(base_result_path, exp_name)

In [None]:
# Load the raw input data. Later cells index it as
# results[trajectory][direction]['raw'][intersection_name].
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open('../data/hauge/processed/featured_fpds_raw.pickle', 'rb') as f:
    results = pickle.load(f)

In [None]:
# load data of correlated results from pickle file
# Indexed later as correlated_results[trajectory][direction]; `.columns` and
# `.corr()` are called on each entry, so presumably each one is a DataFrame with
# one column per intersection -- TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../results/hauge/outlier_scores/AE/correlated_results.pickle', 'rb') as f:
    correlated_results = pickle.load(f)

In [None]:
# get target intersections for each trajectory and direction
# Lookup: trajectory -> direction -> name of the intersection whose series is the
# forecasting target (the column that is moved to the front of the model input).
target_intersections={"T1":{"North":"K504", "South":"K561"},
                      "T2":{"North":"K703", "South":"K206"}}

In [None]:
# Fraction of the score columns to use as model inputs: the experiment loop keeps
# the ceil(n_columns * threshold) columns most correlated with the target, and
# falls back to a single column when that count is 0 (threshold 0).
thresholds = [0,0.25,0.5,0.75,1]
# thresholds = [0.5]

In [None]:
def merge_trejectory_data(results, trajectory, direction):
    """Combine the raw per-intersection frames of one trajectory/direction.

    Every frame under ``results[trajectory][direction]['raw']`` gets its
    ``cars`` column renamed to the intersection name, is indexed by a
    ``DatetimeIndex`` built from its ``timestamp`` column (which is then
    dropped), and is outer-merged on that index into the running result.

    Returns:
        pandas.DataFrame: the merged frame with every row that contains a
        missing value removed.
    """
    raw_frames = results[trajectory][direction]['raw']
    merged = pd.DataFrame()
    for name in raw_frames:
        frame = raw_frames[name].rename(columns={"cars": name})
        stamps = pd.DatetimeIndex(frame['timestamp'])
        frame = frame.set_index(stamps).drop(columns=['timestamp'])
        # outer merge keeps timestamps seen by only some intersections ...
        merged = merged.merge(frame, left_index=True, right_index=True, how='outer')
    # ... and those incomplete rows are discarded here
    return merged.dropna()

In [None]:
def preprocess_df(df, n_obs, n_features, sequence_length, train_portion=0.7, val_portion=0.1):
    """Split a supervised frame chronologically, scale the inputs and shape them for an LSTM.

    The rows of ``df`` are cut, in order and without shuffling, into train /
    validation / test parts. For every part the first ``n_obs`` columns are the
    inputs and the column at position ``-n_features`` is the target. A
    ``StandardScaler`` is fitted on the training inputs only and applied to all
    three input parts; the targets are left unscaled.

    Args:
        df: DataFrame with the input columns first (e.g. the output of
            ``series_to_supervised``).
        n_obs: number of leading columns used as model input; must equal
            ``sequence_length * n_features`` for the reshape to succeed.
        n_features: number of variables per time step.
        sequence_length: number of time steps per sample.
        train_portion: fraction of rows used for training (default 0.7).
        val_portion: fraction of rows used for validation (default 0.1); the
            remaining rows form the test part.

    Returns:
        tuple: ``(train_X, train_y, val_X, val_y, test_X, test_y, scl)`` where
        each ``*_X`` has shape ``(rows, sequence_length, n_features)`` and
        ``scl`` is the fitted scaler.

    Raises:
        ValueError: if the portions do not leave a non-empty share for
            training and for testing.
    """
    if not 0 < train_portion < 1 or val_portion < 0 or train_portion + val_portion >= 1:
        raise ValueError(
            "train_portion must be in (0, 1), val_portion must be >= 0 and "
            "their sum must be < 1"
        )

    # chronological split points (same rounding as before: ceil of the row share)
    train_end = math.ceil(len(df)*train_portion)
    val_end = math.ceil(len(df)*(train_portion+val_portion))
    df_train = df[:train_end].values
    df_val = df[train_end:val_end].values
    df_test = df[val_end:].values

    def _split_xy(block):
        # inputs are the leading n_obs columns, target is the column at -n_features
        return block[:, :n_obs], block[:, -n_features]

    train_X, train_y = _split_xy(df_train)
    val_X, val_y = _split_xy(df_val)
    test_X, test_y = _split_xy(df_test)

    scl = StandardScaler().fit(train_X) # fit only on training data

    def _to_sequences(flat_X):
        # scale, then unfold each row into (sequence_length, n_features)
        return scl.transform(flat_X).reshape((flat_X.shape[0], sequence_length, n_features))

    return (_to_sequences(train_X), train_y,
            _to_sequences(val_X), val_y,
            _to_sequences(test_X), test_y,
            scl)

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table of lagged columns.

    Args:
        data: a list (treated as one variable) or a 2-D object exposing
            ``shape`` (one column per variable).
        n_in: number of lag steps used as input, giving columns ``(t-n_in) .. (t-1)``.
        n_out: number of steps used as output, giving columns ``(t) .. (t+n_out-1)``.
        dropnan: when true, drop every row that contains a NaN produced by the shifts.

    Returns:
        pandas.DataFrame: columns named ``var<j>(t-<i>)``, ``var<j>(t)`` and
        ``var<j>(t+<i>)``, ordered from the oldest lag to the furthest forecast step.
    """
    n_vars = data.shape[1] if type(data) is not list else 1
    frame = pd.DataFrame(data)
    var_ids = range(1, n_vars + 1)

    # input sequence, oldest lag first: (t-n_in, ..., t-1)
    lag_steps = range(n_in, 0, -1)
    shifted = [frame.shift(lag) for lag in lag_steps]
    labels = ['var%d(t-%d)' % (var_id, lag) for lag in lag_steps for var_id in var_ids]

    # forecast sequence: (t, t+1, ..., t+n_out-1)
    for step in range(n_out):
        shifted.append(frame.shift(-step))
        suffix = '(t)' if step == 0 else '(t+%d)' % step
        labels.extend('var%d%s' % (var_id, suffix) for var_id in var_ids)

    # put it all together
    agg = pd.concat(shifted, axis=1)
    agg.columns = labels
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
def init_model(train_X):
    """Build and compile the forecasting network.

    The network is a single LSTM layer with 50 units followed by a one-unit
    Dense output. The LSTM input shape is taken from axes 1 and 2 of
    ``train_X``. The model is compiled with the Adam optimizer and a mean
    squared logarithmic error loss.

    Returns:
        The compiled ``keras`` Sequential model (not yet fitted).
    """
    timesteps, n_inputs = train_X.shape[1], train_X.shape[2]
    layer_stack = [
        keras.layers.LSTM(50, input_shape=(timesteps, n_inputs)),
        keras.layers.Dense(1),
    ]
    model = keras.models.Sequential(layer_stack)
    model.compile(optimizer='adam', loss=keras.losses.MeanSquaredLogarithmicError())
    return model

In [None]:
def train_model(train_X,train_y,val_X,val_y,model):
    """Fit ``model`` on the training data with early stopping on the validation loss.

    Training runs for at most 250 epochs with batch size 64 and without
    shuffling; it stops once ``val_loss`` has not improved for 20 epochs.

    Returns:
        The object returned by ``model.fit`` (the training history).
    """
    early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
    fit_options = dict(
        epochs=250,
        batch_size=64,
        validation_data=(val_X, val_y),
        verbose=0,
        callbacks=[early_stop],
        shuffle=False,  # keep the chronological order of the samples
    )
    return model.fit(train_X, train_y, **fit_options)

In [None]:
# Experiment loop: fit and evaluate one LSTM per (trajectory, direction, threshold).
# errors[trajectory][direction][threshold] collects RMSE, MAE, the training
# history, a Real-vs-Predicted frame and the training time of each fitted model.
errors={}
# NOTE(review): dfs and intersection_arrays are not used in the visible part of
# the notebook; kept in case other cells read them.
dfs={}
intersection_arrays = []

# Create the output folder before any (slow) training starts, so the final pickle
# dump cannot fail on a missing directory after every model has been trained.
save_dir = os.path.dirname(results_save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

sequence_length = 12 # number of time steps to look back
output_pred = 1 # number of time steps to predict

for trajectory in results.keys():
    errors[trajectory]={}
    print("\n \n Starting trajectory: {}".format(trajectory))
    for direction in results[trajectory]:
        target = target_intersections[trajectory][direction]
        errors[trajectory][direction]={}
        print("Starting direction: {}".format(direction))
        # The merged raw data and the correlation ranking do not depend on the
        # threshold, so they are computed once per direction, not once per threshold.
        data = merge_trejectory_data(results, trajectory, direction) # raw data of the current trajectory and direction
        ae_score = correlated_results[trajectory][direction] # AE scores of the current trajectory and direction
        target_corr = ae_score.corr()[target].sort_values(ascending=False) # columns ranked by correlation with the target
        for threshold in thresholds:
            errors[trajectory][direction][threshold]={}
            # ------------------------------------ data processing ---------------------------------------- #
            number_of_cols = math.ceil(len(ae_score.columns)*threshold) # number of outlier weighted intersections
            if number_of_cols==0: # if threshold is 0, keep only the top-ranked column
                number_of_cols=1
            top_corr_df = target_corr[:number_of_cols] # get the top correlated intersections
            isct_inc = top_corr_df.index.tolist()
            df = data[isct_inc].copy(deep=True)
            # Move the target to the front; this raises KeyError if the target is not
            # among the selected columns (it is expected to rank first).
            df = df[ [target] + [ col for col in df.columns if col != target ] ]
            n_features = len(isct_inc) # number of features (correlated intersections)
            n_obs = sequence_length * n_features # number of columns in the input
            reframed = series_to_supervised(df, sequence_length, output_pred)
            train_X, train_y, val_X, val_y, test_X, test_y, scl = preprocess_df(reframed, n_obs, n_features, sequence_length)

            # ------------------------------------ modelling ---------------------------------------------- #
            print('training model for threshold: {}'.format(threshold))
            model=init_model(train_X)
            start = time.time()
            history = train_model(train_X,train_y,val_X,val_y,model)
            end = time.time()
            print("Training time: {}".format(end-start))

            # ------------------------------------ evaluation ---------------------------------------------- #
            print('evaluating model for threshold: {}'.format(threshold))
            yhat = model.predict(test_X)
            # sklearn metrics take (y_true, y_pred); both metrics are symmetric, so
            # the values are the same as with the arguments swapped.
            errors[trajectory][direction][threshold]['RMSE'] = sqrt(mean_squared_error(test_y,yhat))
            errors[trajectory][direction][threshold]['MAE'] = mean_absolute_error(test_y,yhat)
            # NOTE(review): this stores the object returned by model.fit, so it is
            # pickled with the results; presumably `history.history` would be enough
            # -- confirm against whatever reads the saved file before changing it.
            errors[trajectory][direction][threshold]['history'] = history
            errors[trajectory][direction][threshold]['df'] = pd.DataFrame({"Real":test_y,"Predicted":yhat.reshape(-1)})
            errors[trajectory][direction][threshold]['train_time'] = end-start


# save errors in save path as pickle file
with open(results_save_path, 'wb') as handle:
    pickle.dump(errors, handle)

In [None]:
# Display the nested results (trajectory -> direction -> threshold -> metrics).
# NOTE(review): this prints every stored DataFrame and history object as well.
errors

In [None]:
# Flatten the nested errors dict into {"<trajectory>_<direction>_<threshold>": RMSE}.
AE_results = {}
for trajectory in errors:
    for direction in errors[trajectory]:
        for threshold in errors[trajectory][direction]:
            flat_key = '_'.join((trajectory, direction, str(threshold)))
            AE_results[flat_key] = errors[trajectory][direction][threshold]['RMSE']

In [None]:
# Display the flat "<trajectory>_<direction>_<threshold>" -> RMSE mapping.
AE_results

In [None]:
# Rebuild the flat "<trajectory>_<direction>_<threshold>" -> RMSE lookup from errors.
# NOTE(review): an earlier cell in this notebook computes exactly the same mapping;
# one of the two cells is presumably redundant.
AE_results = {}
for trajectory in errors:
    for direction in errors[trajectory]:
        per_threshold = errors[trajectory][direction]
        for threshold in per_threshold:
            AE_results[trajectory + '_' + direction + '_' + str(threshold)] = per_threshold[threshold]['RMSE']

In [None]:
# Display the flat RMSE mapping built in the previous cell.
AE_results

In [None]:
# Mean squared error of the most recent predictions against their test targets.
# `mse` is never defined or imported in this notebook, so the imported sklearn
# metric is used instead, with arguments in its (y_true, y_pred) order.
mean_squared_error(test_y.reshape((test_y.shape[0], 1)), yhat)

In [None]:
# Shape of the current test inputs; preprocess_df reshapes them to
# (rows, sequence_length, n_features).
test_X.shape

In [None]:
# df_train only exists as a local inside preprocess_df, so rebuild the same
# leading 70% training slice from the current `reframed` frame before reshaping
# its lag columns into (rows, sequence_length, n_features).
df_train = reframed[:math.ceil(len(reframed)*0.7)].values
df_train[:,:n_obs].reshape(-1,sequence_length,n_features)

In [None]:
# Only the first n_obs columns are lagged inputs; the trailing time-t columns are
# excluded so every row unfolds cleanly into (sequence_length, n_features).
reframed.values[:, :n_obs].reshape(-1, sequence_length, n_features).shape

In [None]:
# Shape of the input part (first n_obs columns) of the supervised frame.
# `.shape` is a plain tuple and cannot be indexed with [:, :n_obs]; slice the
# values first and then ask for the shape.
reframed.values[:, :n_obs].shape

In [None]:
# df_train / df_val / df_test are locals of preprocess_df and do not exist at
# notebook level, so recreate the same 70/10/20 chronological split of the
# current `reframed` frame here before inspecting the shapes.
_train_end = math.ceil(len(reframed)*0.7)
_val_end = math.ceil(len(reframed)*(0.7+0.1))
df_train = reframed[:_train_end].values
df_val = reframed[_train_end:_val_end].values
df_test = reframed[_val_end:].values
df_train.shape, df_val.shape, df_test.shape

In [None]:
# Scratch: re-read the score frame for whatever `trajectory` / `direction` were
# left behind by the loops above (their last iteration).
ae_score = correlated_results[trajectory][direction]

In [None]:
# Name of the single column most correlated with `target` (top-1 of the ranking).
ae_score.corr()[target].sort_values(ascending=False)[:1].index.tolist()

In [None]:
# Columns selected in the most recent loop iteration.
isct_inc

In [None]:
# Scratch re-run of the column selection, using the `data`, `isct_inc` and
# `target` globals left over from the experiment loop.
df = data[isct_inc].copy(deep=True)
#move target var to front of DF
df = df[ [target] + [ col for col in df.columns if col != target ] ]

In [None]:
# specify the lag sequence
# NOTE: this overwrites the sequence_length / n_features / reframed globals.
sequence_length = 12
n_features = len(isct_inc)
# frame as supervised learning
# 12 lag steps as inputs, 1 output step (time t)
reframed = series_to_supervised(df, sequence_length, 1)

In [None]:
# Chronological train/test split of the supervised table (no validation part here).
values = reframed.values
# NOTE(review): the split point is 80% of len(df), not of len(values); `reframed`
# has fewer rows than `df` once the NaN rows from shifting are dropped, so the
# train share of `values` is slightly above 80% -- confirm this is intended.
n_train_hours = math.ceil(len(df)*0.8)
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]

In [None]:
# (rows, columns) of the training slice of the supervised table.
train.shape

In [None]:
# split into input and outputs
# Inputs are the first n_obs (lagged) columns; the target is the column at
# position -n_features, i.e. the first variable at time t.
# NOTE: this overwrites the train_X / train_y / test_X / test_y globals set by the
# experiment loop, and no scaler is applied to the inputs here.
n_obs = sequence_length * n_features
train_X, train_y = train[:, :n_obs], train[:, -n_features]
test_X, test_y = test[:, :n_obs], test[:, -n_features]

In [None]:
# Unfold each flat input row into (sequence_length, n_features) for the LSTM.
# Re-running this cell on already reshaped arrays leaves them unchanged.
train_X = train_X.reshape((train_X.shape[0], sequence_length, n_features))
test_X = test_X.reshape((test_X.shape[0], sequence_length, n_features))

In [None]:
# Final shapes: inputs are 3-D (rows, sequence_length, n_features), targets are 1-D.
train_X.shape, train_y.shape, test_X.shape, test_y.shape