In [196]:
import keras
import math
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
import pickle
pd.set_option('display.max_rows', 500)
import os
import tensorflow as tf
import torch
from math import sqrt


# define random seeds for Neural Networks
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)
# ignore warnings jupyter notebook
import warnings
warnings.filterwarnings('ignore')

# OWRI FRAMEWORK

In [355]:
# results save path
base_result_path = '../results/hauge/LSTM'
exp_name = 'univariate_AE_real_time_results.pkl'
results_save_path = os.path.join(base_result_path, exp_name)

In [356]:
with open('../data/hauge/processed/featured_fpds_raw.pickle', 'rb') as f:
    results = pickle.load(f)

In [357]:
# load data of correlated results from pickle file
with open('../results/hauge/outlier_scores/AE/correlated_results.pickle', 'rb') as f:
    correlated_results = pickle.load(f)

In [358]:
# get target intersections for each trajectory and direction
target_intersections={"T1":{"North":"K504", "South":"K561"},
                      "T2":{"North":"K703", "South":"K206"}}

In [359]:
thresholds = [0,0.25,0.5,0.75,1]
# thresholds = [0.5]

In [360]:
def merge_trejectory_data(results, trajectory, direction):
    data = pd.DataFrame()
    for intersection_name in results[trajectory][direction]['raw']:
        intersection = results[trajectory][direction]['raw'][intersection_name]
        intersection = intersection.rename(columns={"cars": intersection_name})
        intersection = intersection.set_index(pd.DatetimeIndex(intersection['timestamp']))
        intersection = intersection.drop(columns=['timestamp'])
        data = pd.merge(data, intersection, left_index=True, right_index=True, how='outer')
    data.dropna(inplace=True)
    return data

In [361]:
def preprocess_df(df,n_obs, n_features, sequence_length):
    #do scaling:
    scaler = StandardScaler()
    train_portion = 0.7
    val_portion = 0.1
    test_portion = 0.2
    df_train = df[:math.ceil(len(df)*train_portion)].values
    df_val = df[math.ceil(len(df)*train_portion):math.ceil(len(df)*(train_portion+val_portion))].values
    df_test = df[math.ceil(len(df)*(train_portion+val_portion)):].values
    train_X, train_y = df_train[:, :n_obs], df_train[:, -n_features]
    val_X, val_y = df_val[:, :n_obs], df_val[:, -n_features]
    test_X, test_y = df_test[:, :n_obs], df_test[:, -n_features]
    scl = scaler.fit(train_X) # fit only on training data
    train_X = scl.transform(train_X)
    val_X = scl.transform(val_X)
    test_X = scl.transform(test_X)
    train_X = train_X.reshape((train_X.shape[0], sequence_length, n_features))
    val_X = val_X.reshape((val_X.shape[0], sequence_length, n_features))
    test_X = test_X.reshape((test_X.shape[0], sequence_length, n_features))
    return train_X, train_y, val_X, val_y, test_X, test_y, scl

In [362]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
	n_vars = 1 if type(data) is list else data.shape[1]
	df = pd.DataFrame(data)
	cols, names = list(), list()
	# input sequence (t-n, ... t-1)
	for i in range(n_in, 0, -1):
		cols.append(df.shift(i))
		names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
	# forecast sequence (t, t+1, ... t+n)
	for i in range(0, n_out):
		cols.append(df.shift(-i))
		if i == 0:
			names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
		else:
			names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
	# put it all together
	agg = pd.concat(cols, axis=1)
	agg.columns = names
	# drop rows with NaN values
	if dropnan:
		agg.dropna(inplace=True)
	return agg

In [363]:
def init_model(train_X):
    model = keras.models.Sequential()
    model.add(keras.layers.LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(keras.layers.Dense(1))
    model.compile(loss=keras.losses.MeanSquaredLogarithmicError(), optimizer='adam')
    return model

In [364]:
def train_model(train_X,train_y,val_X,val_y,model):
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)
    history = model.fit(train_X, train_y, epochs=250, batch_size=64, validation_data=(val_X, val_y), 
                        verbose=0,callbacks=[es], shuffle=False)
    return history

In [365]:
errors={}
dfs={}
intersection_arrays = []
for trajectory in results.keys():
    errors[trajectory]={}
    print("\n \n Starting trajectory: {}".format(trajectory))
    for direction in results[trajectory]:
        target = target_intersections[trajectory][direction]
        errors[trajectory][direction]={}
        print("Starting direction: {}".format(direction))
        for threshold in thresholds:
            errors[trajectory][direction][threshold]={}
            # ------------------------------------ data processing ---------------------------------------- #
            data = merge_trejectory_data(results, trajectory, direction)# get raw data of the current trajectory and direction
            ae_score = correlated_results[trajectory][direction] # AE scores of the current trajectory and direction
            number_of_cols = math.ceil(len(ae_score.columns)*threshold) # number of outlier weighted intersections
            if number_of_cols==0: # if threshold is 0, then use the target intersection only
                number_of_cols=1
            top_corr_df = ae_score.corr()[target].sort_values(ascending=False)[:number_of_cols] # get the top correlated intersections
            isct_inc = top_corr_df.index.tolist()
            df = data[isct_inc].copy(deep=True)
            df = df[ [target] + [ col for col in df.columns if col != target ] ]  #move target var to front of DF
            # df = df.mul(top_corr_df, axis=1)
            sequence_length = 12 # number of time steps to look back
            n_features = len(isct_inc) # number of features (correlated intersections)
            output_pred = 1 # number of time steps to predict
            n_obs = sequence_length * n_features # number of columns in the input
            reframed = series_to_supervised(df, sequence_length, output_pred)
            train_X, train_y, val_X, val_y, test_X, test_y, scl = preprocess_df(reframed, n_obs, n_features, sequence_length)
    #         break
    #     break
    # break
            # ------------------------------------ modelling ---------------------------------------------- #
            print('training model for threshold: {}'.format(threshold))
            model=init_model(train_X)
            start = time.time()
            history = train_model(train_X,train_y,val_X,val_y,model)
            end = time.time()
            print("Training time: {}".format(end-start))

            # ------------------------------------ evaluation ---------------------------------------------- #
            print('evaluating model for threshold: {}'.format(threshold))
            yhat = model.predict(test_X)
            errors[trajectory][direction][threshold]['RMSE'] = sqrt(mean_squared_error(yhat,test_y))
            errors[trajectory][direction][threshold]['MAE'] = mean_absolute_error(yhat,test_y)
            errors[trajectory][direction][threshold]['history'] = history
            errors[trajectory][direction][threshold]['df'] = pd.DataFrame({"Real":test_y,"Predicted":yhat.reshape(-1)})
            errors[trajectory][direction][threshold]['train_time'] = end-start


# save errors in save path as pickle file
with open(results_save_path, 'wb') as handle:
    pickle.dump(errors, handle)


 
 Starting trajectory: T1
Starting direction: North
training model for threshold: 0
Epoch 83: early stopping
Training time: 328.64207005500793
evaluating model for threshold: 0
training model for threshold: 0.25
Epoch 45: early stopping
Training time: 178.0862741470337
evaluating model for threshold: 0.25
training model for threshold: 0.5
Epoch 54: early stopping
Training time: 209.17061805725098
evaluating model for threshold: 0.5
training model for threshold: 0.75
Epoch 45: early stopping
Training time: 178.4835720062256
evaluating model for threshold: 0.75
training model for threshold: 1
Epoch 44: early stopping
Training time: 173.7394118309021
evaluating model for threshold: 1
Starting direction: South
training model for threshold: 0
Epoch 53: early stopping
Training time: 203.4845860004425
evaluating model for threshold: 0
training model for threshold: 0.25
Epoch 50: early stopping
Training time: 200.75539994239807
evaluating model for threshold: 0.25
training model for threshol

In [341]:
errors

{'T1': {'North': {0: {'RMSE': 10.535132347551992,
    'MAE': 7.485085318512258,
    'history': <keras.callbacks.History at 0x38ea08eb0>,
    'df':        Real  Predicted
    0       6.0   3.959224
    1      10.0   4.406097
    2       6.0   6.557203
    3       7.0   7.302107
    4       9.0   8.405619
    ...     ...        ...
    26750   3.0   5.019311
    26751   6.0   4.002414
    26752   7.0   4.185501
    26753   7.0   4.278893
    26754   4.0   4.084305
    
    [26755 rows x 2 columns],
    'train_time': 275.1172299385071},
   0.25: {'RMSE': 10.264870137806325,
    'MAE': 7.280481796528865,
    'history': <keras.callbacks.History at 0x3480dba00>,
    'df':        Real  Predicted
    0       6.0   3.871505
    1      10.0   5.117630
    2       6.0   7.459230
    3       7.0   7.674866
    4       9.0   7.194969
    ...     ...        ...
    26750   3.0   7.050316
    26751   6.0   2.982599
    26752   7.0   3.312296
    26753   7.0   3.360334
    26754   4.0   3.263627
    


In [342]:
AE_results={}
for trajectory in errors.keys():
    for direction in errors[trajectory].keys():
        for threshold in errors[trajectory][direction].keys():
            AE_results[trajectory+'_'+direction+'_'+str(threshold)] = errors[trajectory][direction][threshold]['RMSE']

In [343]:
AE_results

{'T1_North_0': 10.535132347551992,
 'T1_North_0.25': 10.264870137806325,
 'T1_North_0.5': 9.981852190981062,
 'T1_North_0.75': 9.936356655526351,
 'T1_North_1': 9.816147274365582,
 'T1_South_0': 18.468399215410425,
 'T1_South_0.25': 17.93657610996099,
 'T1_South_0.5': 17.836112271458045,
 'T1_South_0.75': 17.81936074855347,
 'T1_South_1': 17.60942828580208,
 'T2_South_0': 15.418871755438703,
 'T2_South_0.25': 14.892053154611016,
 'T2_South_0.5': 14.808390425994887,
 'T2_South_0.75': 15.275508976145279,
 'T2_South_1': 14.818099250328288,
 'T2_North_0': 15.45425024358104,
 'T2_North_0.25': 14.965960989755688,
 'T2_North_0.5': 14.932952037085133,
 'T2_North_0.75': 15.160100533590441,
 'T2_North_1': 15.18109814438314}

In [305]:
AE_results={}
for trajectory in errors.keys():
    for direction in errors[trajectory].keys():
        for threshold in errors[trajectory][direction].keys():
            AE_results[trajectory+'_'+direction+'_'+str(threshold)] = errors[trajectory][direction][threshold]['RMSE']

In [306]:
AE_results

{'T1_North_0': 10.59151633354107,
 'T1_North_0.25': 10.196899412467983,
 'T1_North_0.5': 9.955075986889941,
 'T1_North_0.75': 10.05299665551894,
 'T1_North_1': 9.830371060706703,
 'T1_South_0': 18.641919656471167,
 'T1_South_0.25': 17.971930531215893,
 'T1_South_0.5': 17.78031504964693,
 'T1_South_0.75': 17.8761927808514,
 'T1_South_1': 17.471410650736853,
 'T2_South_0': 15.783835842420112,
 'T2_South_0.25': 14.955686164952139,
 'T2_South_0.5': 14.87429694295857,
 'T2_South_0.75': 14.916430099779108,
 'T2_South_1': 14.930474792897037,
 'T2_North_0': 15.473913778461938,
 'T2_North_0.25': 14.940598792684167,
 'T2_North_0.5': 15.136461083171628,
 'T2_North_0.75': 15.01907330839573,
 'T2_North_1': 15.349926838139215}

In [190]:
mse(yhat,test_y.reshape((test_y.shape[0], 1)))

0.2325527460714449

In [171]:
test_X.shape

(26755, 12, 9)

In [165]:
df_train[:,:n_obs].reshape(-1,sequence_length,n_features)

array([[[-0.90355742, -0.9533622 , -0.79573109, ..., -0.7424034 ,
         -0.95437715, -0.89496412],
        [-0.75557919, -0.38073286, -1.11754366, ..., -0.69638785,
         -1.01557695, -1.17619786],
        [-0.79255873, -0.46661   , -0.99375869, ..., -0.51236917,
         -1.09718131, -0.68401791],
        ...,
        [-0.23762361, -0.66690799, -0.52332379, ..., -0.74229107,
         -0.68901399, -0.64877125],
        [-0.90342489, -0.38060682, -0.77085419, ..., -0.46627684,
         -0.54618666, -0.54330012],
        [-0.97739437, -0.52373474, -0.22625324, ..., -0.60426475,
         -0.40335927, -0.47297916]],

       [[-0.75558877, -0.3807381 , -1.11755911, ..., -0.69639982,
         -1.01558801, -1.17621218],
        [-0.79257114, -0.46662612, -0.9937646 , ..., -0.51237487,
         -1.09719095, -0.68401723],
        [-1.27345058, -0.52387158, -0.89473577, ..., -0.74238459,
         -0.81153376, -0.68401791],
        ...,
        [-0.90344436, -0.38061838, -0.77086864, ..., -

In [144]:
reframed.values.reshape(-1, sequence_length, n_features).shape

(144924, 12, 9)

In [148]:
reframed.shape[:,:n_obs]

TypeError: tuple indices must be integers or slices, not tuple

In [133]:
df_train.shape, df_val.shape, df_test.shape

((93644, 117), (13377, 117), (26755, 117))

In [89]:
ae_score = correlated_results[trajectory][direction]

In [124]:
ae_score.corr()[target].sort_values(ascending=False)[:1].index.tolist()

['K703']

In [91]:
isct_inc

['K703', 'K704', 'K183', 'K206', 'K182']

In [92]:
df = data[isct_inc].copy(deep=True)
#move target var to front of DF
df = df[ [target] + [ col for col in df.columns if col != target ] ]

In [49]:
# specify the lag sequence
sequence_length = 12
n_features = len(isct_inc)
# frame as supervised learning
reframed = series_to_supervised(df, sequence_length, 1)

In [66]:
values = reframed.values
n_train_hours = math.ceil(len(df)*0.8)
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]

In [68]:
train.shape

(29892, 65)

In [54]:
# split into input and outputs
n_obs = sequence_length * n_features
train_X, train_y = train[:, :n_obs], train[:, -n_features]
test_X, test_y = test[:, :n_obs], test[:, -n_features]

In [64]:
train_X = train_X.reshape((train_X.shape[0], sequence_length, n_features))
test_X = test_X.reshape((test_X.shape[0], sequence_length, n_features))

In [65]:
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((29892, 12, 5), (29892,), (7460, 12, 5), (7460,))