In [25]:
# import libraries
import keras
import math
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
from torch.utils.data import Dataset, DataLoader
import pickle
pd.set_option('display.max_rows', 500)
import os
import tensorflow as tf
import plotly.express as px
import torch
import torch.nn as nn
from math import sqrt
# import rmse from sklearn
from sklearn.metrics import mean_squared_error
import plotly.io as pio


# Seed every random number generator used in this notebook (PyTorch, NumPy,
# TensorFlow) with the same fixed value.
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)
# Silence all Python warnings for the rest of the session. Note that this also
# hides deprecation warnings emitted by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# ---------- Functions ----------

In [26]:
# function to convert hague dataset to same key value pair as the results
def mod_dict(results):
    """Flatten a two-direction result mapping into single-level keys.

    Every value of ``results`` must provide a ``'North'`` and a ``'South'``
    entry. The returned dict holds, for each original ``key``, the entries
    ``key + '_North'`` and ``key + '_South'`` (in that order).

    Parameters
    ----------
    results : dict
        Mapping ``key -> {'North': ..., 'South': ...}``.

    Returns
    -------
    dict
        New mapping ``'<key>_<direction>' -> value``; the input is not modified.
    """
    flattened = {}
    for name, per_direction in results.items():
        for direction in ('North', 'South'):
            flat_key = name + '_' + direction
            flattened[flat_key] = per_direction[direction]
    return flattened

In [27]:
# function to extract results from the results dictionary
def extract_results(results,data_name):
    """Average RMSE, MAE and train time over all entries, per threshold.

    Parameters
    ----------
    results : dict
        Mapping ``entry -> {threshold -> {'RMSE': ..., 'MAE': ..., 'train_time': ...}}``.
        Every threshold used must belong to the fixed threshold list of the
        dataset, otherwise a ``KeyError`` is raised.
    data_name : str
        ``'PEMS-BAY'`` selects the threshold list that additionally contains
        ``0.01``; any other value selects the default list.

    Returns
    -------
    dict
        ``{threshold: {'RMSE': mean, 'MAE': mean, 'train_time': mean}}`` for
        every threshold of the list. A threshold that never occurs in
        ``results`` keeps empty lists as its three values.
    """
    if data_name == 'PEMS-BAY':
        thresholds = [0,0.01,0.05,0.1,0.25,0.5, 0.75, 1]
    else:
        thresholds = [0,0.05,0.1,0.25,0.5, 0.75, 1]
    metrics = ('RMSE', 'MAE', 'train_time')

    # one list per (threshold, metric) that collects the per-entry values
    result_dict = {thr: {metric: [] for metric in metrics} for thr in thresholds}
    for per_threshold in results.values():
        for threshold, scores in per_threshold.items():
            for metric in metrics:
                result_dict[threshold][metric].append(scores[metric])

    # Average each collected list exactly once. Lists that stayed empty are
    # left untouched so unseen thresholds do not turn into NaN.
    for bucket in result_dict.values():
        for metric in metrics:
            if bucket[metric]:
                bucket[metric] = np.mean(bucket[metric])

    return result_dict

In [28]:
# function to extract results from pickle files
def get_results(base_result_path, data_name, exp_name,out_mod):
    """Load one pickled experiment and summarise it as a per-threshold table.

    The pickle at ``base_result_path/exp_name`` is loaded, flattened with
    ``mod_dict`` when ``data_name == 'hague'``, and aggregated with
    ``extract_results``.

    Parameters
    ----------
    base_result_path : str
        Directory that contains the result pickle.
    data_name : str
        Dataset name; forwarded to ``extract_results``.
    exp_name : str
        File name of the pickle inside ``base_result_path``.
    out_mod : str
        Value written to the ``model_name`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the aggregated metric columns plus
        ``model_name`` and ``threshold`` (as string), on a fresh 0..n-1 index.
    """
    pickle_path = os.path.join(base_result_path, exp_name)
    # pickle.load can run arbitrary code: only point this at trusted result files
    with open(pickle_path, 'rb') as handle:
        raw_results = pickle.load(handle)

    per_entry = mod_dict(raw_results) if data_name == 'hague' else raw_results

    # thresholds become the rows after the transpose
    summary = pd.DataFrame(extract_results(per_entry, data_name)).T
    summary['model_name'] = out_mod
    summary['threshold'] = summary.index
    return summary.astype({'threshold': str}).reset_index(drop=True)

In [29]:
# function to load results
def load_real_time_results(load_path, data_name):
    """Load a pickled result object from ``load_path``.

    When ``data_name == 'hague'`` the loaded mapping is passed through
    ``mod_dict`` before being returned; otherwise it is returned unchanged.
    """
    # pickle.load can run arbitrary code: only point this at trusted result files
    with open(load_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return mod_dict(loaded) if data_name == 'hague' else loaded

In [30]:
# function to extract real-time results
def extract_time_key_results(results_time_dict):
    """Compute one pooled RMSE and the summed train time over all entries.

    Parameters
    ----------
    results_time_dict : dict
        Mapping whose values provide ``'df'`` (a frame with ``'Real'`` and
        ``'Predicted'`` columns) and ``'train_time'``.

    Returns
    -------
    tuple
        ``(RMSE, total_train_time)`` where the RMSE is taken over the
        concatenation of all ``Real``/``Predicted`` values.
    """
    observed, forecast, durations = [], [], []
    for entry in results_time_dict.values():
        observed += entry['df']['Real'].to_list()
        forecast += entry['df']['Predicted'].to_list()
        durations.append(entry['train_time'])

    pooled_mse = mean_squared_error(observed, forecast)
    return sqrt(pooled_mse), np.sum(durations)

In [31]:
# supporting function to extract real-time results from the dictionary 
def extract_df_intc_results(results_time_dict):
    """Stack the per-key result frames and collect the accompanying metadata.

    Parameters
    ----------
    results_time_dict : dict
        Mapping ``key -> {'df': DataFrame, 'intersedctions': ...}``.

    Returns
    -------
    tuple
        ``(incremental_df, intersection_lists, time_list)``: the row-wise
        concatenation of all ``'df'`` frames (original indices kept; an empty
        frame when the input is empty), the list of ``'intersedctions'``
        values, and the list of keys, all in iteration order.
    """
    frames = []
    intersection_lists = []
    time_list  = []
    for key,val in results_time_dict.items():
        time_list.append(key)
        frames.append(val['df'])
        # NOTE(review): 'intersedctions' looks like a misspelling, but it must
        # match the key stored in the result files; confirm before renaming.
        intersection_lists.append(val['intersedctions'])

    # Concatenate once instead of growing a frame row-block by row-block:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    incremental_df = pd.concat(frames) if frames else pd.DataFrame()
    return incremental_df, intersection_lists,time_list

# RQ - Does incorporating the Outlier Weighted Autoencoder Model (OWAM) in traffic flow prediction improve its performance?


- Can OWAM effectively reduce the dimensionality of the LSTM-based traffic prediction model while preserving essential features?
- To what extent does changing the loss function in Autoencoders to the Earth Mover's Distance impact the overall model performance?
- How does OWAM perform in terms of time and accuracy compared to the LSTM baseline and state-of-the-art traffic prediction approaches?

In [32]:
# Global names shared by the result-extraction cells below.
# Label given to the non-weighted ILOF baseline rows (the variable name is
# misspelled, but it is referenced under this spelling by later cells).
previous_oultier_model_name = 'OBIS'
# Outlier models whose `univariate_<model>_outlier_weighted.pkl` results are loaded.
outlier_model_name = ["AE", "DAE", "PW-AE","HST", "ILOF","Kit-Net"]
# Model whose `univariate_<model>_outlier_weighted_RMSE.pkl` result is loaded.
RMSE_mode_name = 'PW-AE'

## Extract Hague Results (Vehicle Numbers Data)

In [33]:
# Hague experiment: result directory and pickle file names
data_name = 'hague'
base_result_path = f'../results/{data_name}/LSTM'
# plain literal: there is nothing to interpolate, so no f-prefix
exp_non_weighted = 'univariate_ILOF_outlier_non_weighted.pkl'
# NOTE(review): despite the variable name, this file name ends in `_RMSE.pkl`
# and is later loaded as the RMSE result of `RMSE_mode_name`; confirm the naming.
exp_earth_mover = f'univariate_{RMSE_mode_name}_outlier_weighted_RMSE.pkl'

In [34]:
# Load the weighted results of every outlier model for the current dataset.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
weighted_frames = []
for out_mod in outlier_model_name:
    exp_name_weighted = f'univariate_{out_mod}_outlier_weighted.pkl'
    df_weighted = get_results(base_result_path, data_name, exp_name_weighted,out_mod)
    weighted_frames.append(df_weighted)
weighted_result_df = pd.concat(weighted_frames)

# display names: the autoencoder variants get an '-EMD' suffix, the rest keep their name
model_map = {'PW-AE':'PW-AE-EMD', 'AE':'AE-EMD','DAE':'DAE-EMD','HST':'HST', 'ILOF':'ILOF', 'Kit-Net':'Kit-Net'}
weighted_result_df['model_name'] = weighted_result_df['model_name'].map(model_map)

In [35]:
# Baseline: the non-weighted ILOF run, relabelled with `previous_oultier_model_name`
non_weighted_result_df = get_results(base_result_path, data_name, exp_non_weighted, 'ILOF')
non_weighted_result_df = non_weighted_result_df.replace('ILOF', previous_oultier_model_name)
# Result stored in the `*_outlier_weighted_RMSE.pkl` file of `RMSE_mode_name`
RMSE_result_df = get_results(base_result_path, data_name, exp_earth_mover, RMSE_mode_name)

In [36]:
# GNN results come from the traffic benchmark paper (DGCRN model) and are kept
# separately in text files. Re-run that code to regenerate them; the numbers
# may vary slightly with the random seed.
# The same score is repeated for every threshold so it lines up with the other models.
GNN_dict = [
    {'RMSE': 13.11, 'MAE': 7.5016, 'train_time': 5043.312,
     'model_name': 'DGCRN (SOTA)', 'threshold': str(thr)}
    for thr in (0.0, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0)
]
GNN_df = pd.DataFrame(GNN_dict)

In [37]:
# Merge the weighted, DGCRN, baseline and RMSE results of this dataset.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
weighted_result_df = pd.concat([weighted_result_df, GNN_df, non_weighted_result_df, RMSE_result_df])
weighted_result_df['log_train_time'] = np.log(weighted_result_df['train_time']) # log train time
hague_df = weighted_result_df.copy()
hague_df['data_name'] = 'Hague'
hague_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold,log_train_time,data_name
0,14.223739,9.974554,285.320609,AE-EMD,0.0,5.653613,Hague
1,14.18864,9.942463,313.574481,AE-EMD,0.05,5.748037,Hague
2,14.106318,9.886723,280.069239,AE-EMD,0.1,5.635037,Hague
3,13.885396,9.726711,315.691193,AE-EMD,0.25,5.754764,Hague
4,13.567951,9.5143,318.464729,AE-EMD,0.5,5.763512,Hague


## Extract METR-LA Results (Speed Data)

In [38]:
# METR-LA experiment: result directory and pickle file names
data_name = 'METR-LA'
base_result_path = f'../results/{data_name}/LSTM'
# plain literal: there is nothing to interpolate, so no f-prefix
exp_non_weighted = 'univariate_ILOF_outlier_non_weighted.pkl'
# NOTE(review): despite the variable name, this file name ends in `_RMSE.pkl`
# and is later loaded as the RMSE result of `RMSE_mode_name`; confirm the naming.
exp_earth_mover = f'univariate_{RMSE_mode_name}_outlier_weighted_RMSE.pkl'

In [39]:
# Load the weighted results of every outlier model for the current dataset.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
weighted_frames = []
for out_mod in outlier_model_name:
    exp_name_weighted = f'univariate_{out_mod}_outlier_weighted.pkl'
    df_weighted = get_results(base_result_path, data_name, exp_name_weighted,out_mod)
    weighted_frames.append(df_weighted)
weighted_result_df = pd.concat(weighted_frames)

# display names: the autoencoder variants get an '-EMD' suffix, the rest keep their name
model_map = {'PW-AE':'PW-AE-EMD', 'AE':'AE-EMD','DAE':'DAE-EMD','HST':'HST', 'ILOF':'ILOF', 'Kit-Net':'Kit-Net'}
weighted_result_df['model_name'] = weighted_result_df['model_name'].map(model_map)

In [40]:
# Baseline: the non-weighted ILOF run, relabelled with `previous_oultier_model_name`
non_weighted_result_df = get_results(base_result_path, data_name, exp_non_weighted, 'ILOF')
non_weighted_result_df = non_weighted_result_df.replace('ILOF', previous_oultier_model_name)
# Result stored in the `*_outlier_weighted_RMSE.pkl` file of `RMSE_mode_name`
RMSE_result_df = get_results(base_result_path, data_name, exp_earth_mover, RMSE_mode_name)

In [41]:
# DGCRN (GNN) reference scores for this dataset; the same score is repeated
# for every threshold so it lines up with the other models.
GNN_dict = [
    {'RMSE': 4.1613, 'MAE': 2.4379, 'train_time': 11263.234,
     'model_name': 'DGCRN (SOTA)', 'threshold': str(thr)}
    for thr in (0.0, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0)
]
GNN_df = pd.DataFrame(GNN_dict)

In [42]:
# Merge the weighted, DGCRN, baseline and RMSE results of this dataset.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
weighted_result_df = pd.concat([weighted_result_df, GNN_df, non_weighted_result_df, RMSE_result_df])
weighted_result_df['log_train_time'] = np.log(weighted_result_df['train_time']) # log train time
metr_df = weighted_result_df.copy()
metr_df['data_name'] = 'METR-LA'
metr_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold,log_train_time,data_name
0,6.230096,3.050992,42.094377,AE-EMD,0.0,3.739914,METR-LA
1,5.905414,2.8151,56.113401,AE-EMD,0.05,4.027375,METR-LA
2,5.923784,2.870068,48.046008,AE-EMD,0.1,3.872159,METR-LA
3,5.956229,2.957772,54.877143,AE-EMD,0.25,4.005097,METR-LA
4,7.087019,3.457735,44.110559,AE-EMD,0.5,3.786699,METR-LA


## Extract PEMS-BAY Results (Speed Data)

In [43]:
# PEMS-BAY experiment: result directory and pickle file names
data_name = 'PEMS-BAY'
base_result_path = f'../results/{data_name}/LSTM'
# plain literal: there is nothing to interpolate, so no f-prefix
exp_non_weighted = 'univariate_ILOF_outlier_non_weighted.pkl'
# NOTE(review): despite the variable name, this file name ends in `_RMSE.pkl`
# and is later loaded as the RMSE result of `RMSE_mode_name`; confirm the naming.
exp_earth_mover = f'univariate_{RMSE_mode_name}_outlier_weighted_RMSE.pkl'

In [44]:
# Load the weighted results of every outlier model for the current dataset.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
weighted_frames = []
for out_mod in outlier_model_name:
    exp_name_weighted = f'univariate_{out_mod}_outlier_weighted.pkl'
    df_weighted = get_results(base_result_path, data_name, exp_name_weighted,out_mod)
    weighted_frames.append(df_weighted)
weighted_result_df = pd.concat(weighted_frames)

# display names: the autoencoder variants get an '-EMD' suffix, the rest keep their name
model_map = {'PW-AE':'PW-AE-EMD', 'AE':'AE-EMD','DAE':'DAE-EMD','HST':'HST', 'ILOF':'ILOF', 'Kit-Net':'Kit-Net'}
weighted_result_df['model_name'] = weighted_result_df['model_name'].map(model_map)

In [45]:
# Baseline: the non-weighted ILOF run, relabelled with `previous_oultier_model_name`
non_weighted_result_df = get_results(base_result_path, data_name, exp_non_weighted, 'ILOF')
non_weighted_result_df = non_weighted_result_df.replace('ILOF', previous_oultier_model_name)
# Result stored in the `*_outlier_weighted_RMSE.pkl` file of `RMSE_mode_name`
RMSE_result_df = get_results(base_result_path, data_name, exp_earth_mover, RMSE_mode_name)

In [46]:
# DGCRN (GNN) reference scores for this dataset; the same score is repeated
# for every threshold so it lines up with the other models.
GNN_dict = [
    {'RMSE': 1.5912, 'MAE': 0.8734, 'train_time': 25900.6657,
     'model_name': 'DGCRN (SOTA)', 'threshold': str(thr)}
    for thr in (0.0, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0)
]
GNN_df = pd.DataFrame(GNN_dict)

In [47]:
# Merge the weighted, DGCRN, baseline and RMSE results of this dataset.
# pd.concat replaces the chained DataFrame.append calls (removed in pandas 2.0).
weighted_result_df = pd.concat([weighted_result_df, GNN_df, non_weighted_result_df, RMSE_result_df])
weighted_result_df['log_train_time'] = np.log(weighted_result_df['train_time']) # log train time
pems_df = weighted_result_df.copy()
pems_df['data_name'] = 'PEMS-BAY'
pems_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold,log_train_time,data_name
0,5.648171,4.191589,238.936651,AE-EMD,0.0,5.476198,PEMS-BAY
1,5.139868,4.70102,267.900101,AE-EMD,0.01,5.590614,PEMS-BAY
2,4.418648,3.947434,180.289714,AE-EMD,0.05,5.194565,PEMS-BAY
3,4.530839,4.041856,206.450383,AE-EMD,0.1,5.33006,PEMS-BAY
4,3.985867,3.218648,668.765258,AE-EMD,0.25,6.505433,PEMS-BAY


## Plots & Analysis

In [48]:
# Stack the three per-dataset tables (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0).
weighted_result_df = pd.concat([hague_df, metr_df, pems_df])
# Drop the '0.01' threshold rows; .copy() gives an independent frame so the
# column assignment below does not write into a slice.
weighted_result_df = weighted_result_df[weighted_result_df['threshold'] != '0.01'].copy()
# Shorten display names.
# NOTE(review): the baseline rows are labelled with `previous_oultier_model_name`
# in the cells above, so the 'previous_outlier_based_model' mapping appears to
# match nothing; it is kept so the behaviour stays unchanged.
weighted_result_df['model_name'] = weighted_result_df['model_name'].replace(
    {'DGCRN (SOTA)': 'DGCRN', 'previous_outlier_based_model': 'FPD-LOF'}
)
# fresh 0..n-1 index, then only keep 2 decimal places in all float columns
weighted_result_df = weighted_result_df.reset_index(drop=True).round(2)
weighted_result_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold,log_train_time,data_name
0,14.22,9.97,285.32,AE-EMD,0.0,5.65,Hague
1,14.19,9.94,313.57,AE-EMD,0.05,5.75,Hague
2,14.11,9.89,280.07,AE-EMD,0.1,5.64,Hague
3,13.89,9.73,315.69,AE-EMD,0.25,5.75,Hague
4,13.57,9.51,318.46,AE-EMD,0.5,5.76,Hague


### a. Optimal neighborhood sensors selection

In [51]:
# RMSE of PW-AE-EMD on the Hague data, one bar per threshold
sample_df = weighted_result_df.loc[
    weighted_result_df['model_name'].eq('PW-AE-EMD')
    & weighted_result_df['data_name'].eq('Hague')
]
fig = px.histogram(
    sample_df, x='threshold', y='RMSE', color='model_name',
    barmode='group', text_auto=True, width=640, height=480,
)
fig.update_xaxes(title='Threshold')
fig.update_yaxes(title='RMSE Score (Lower is better)', range=[12, 15])
fig.update_layout(showlegend=False)
fig.show()
pio.write_image(fig, 'plots/optimal_num_hague.png', width=640, height=480, scale=4)

In [52]:
# RMSE of PW-AE-EMD on the METR-LA data, one bar per threshold
sample_df = weighted_result_df.loc[
    weighted_result_df['model_name'].eq('PW-AE-EMD')
    & weighted_result_df['data_name'].eq('METR-LA')
]
fig = px.histogram(
    sample_df, x='threshold', y='RMSE', color='model_name',
    barmode='group', text_auto=True, width=700, height=480,
)
fig.update_xaxes(title='Threshold')
fig.update_yaxes(title='RMSE Score (Lower is better)', range=[2, 12])
fig.update_layout(showlegend=False)
fig.show()
pio.write_image(fig, 'plots/optimal_num_metr.png', width=640, height=480, scale=4)

In [53]:
# RMSE of PW-AE-EMD on the PEMS-BAY data, one bar per threshold
sample_df = weighted_result_df.loc[
    weighted_result_df['model_name'].eq('PW-AE-EMD')
    & weighted_result_df['data_name'].eq('PEMS-BAY')
]
fig = px.histogram(
    sample_df, x='threshold', y='RMSE', color='model_name',
    barmode='group', text_auto=True, width=700, height=400,
)
fig.update_xaxes(title='Threshold')
fig.update_yaxes(title='RMSE Score (Lower is better)', range=[2, 8])
fig.update_layout(showlegend=False)
fig.show()
pio.write_image(fig, 'plots/pems_optim_no.png', width=640, height=480, scale=4)

In [54]:
# Log training time of PW-AE-EMD per dataset, one bar per threshold.
# The columns are renamed so the axis and legend titles read 'Dataset' / 'Threshold'.
sample_df = (
    weighted_result_df
    .loc[weighted_result_df['model_name'].eq('PW-AE-EMD')]
    .rename(columns={'data_name': 'Dataset', 'threshold': 'Threshold'})
)
fig = px.histogram(
    sample_df, x='Dataset', y='log_train_time', color='Threshold',
    barmode='group', text_auto=True,
)
fig.update_yaxes(title='Log Training Time (s) (Convergence)', range=[2, 8])
# horizontal legend anchored in the top-left corner of the plot
fig.update_layout(
    legend={'orientation': 'h', 'yanchor': 'top', 'y': 0.99, 'xanchor': 'left', 'x': 0.01},
    legend_font={'size': 12},
)
fig.show()
pio.write_image(fig, 'plots/training_time_optim.png', width=1000, height=500, scale=4)

### b. Earth Mover Distance Effects

In [55]:
# RMSE score comparison of the EMD-loss model (PW-AE-EMD) vs the RMSE-loss model (PW-AE).
# .copy() gives an independent frame, so the relabelling below does not assign
# into a slice of weighted_result_df (chained assignment).
sample_df = weighted_result_df[weighted_result_df['model_name'].isin(['PW-AE-EMD', 'PW-AE'])].copy()
sample_df['model_name'] = sample_df['model_name'].replace({'PW-AE-EMD': 'OWAM-EMD', 'PW-AE': 'OWAM-RMSE'})
# replace column model_name with Model (used as the legend title)
sample_df = sample_df.rename(columns={'model_name': 'Model'})
# keep one threshold per dataset: 0.5 for METR-LA, 1.0 for Hague, 0.1 for PEMS-BAY
sample_df = sample_df[((sample_df['threshold'] == '0.5') &  (sample_df['data_name'] == 'METR-LA')) | ((sample_df['threshold'] == '1.0') & (sample_df['data_name'] == 'Hague'))  |((sample_df['threshold'] == '0.1') & (sample_df['data_name'] == 'PEMS-BAY'))]
# Accuracy comparison of EMD vs PW-AE
fig = px.histogram(sample_df, x="data_name", y="RMSE",
             color='Model', barmode='group', text_auto=True, height=450, width=800)
fig.update_yaxes(title='RMSE Score (Lower is better)', range=[2, 18])
fig.update_xaxes(title='Dataset')
# legend anchored in the top-right corner of the plot
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="right",
    x=0.99,
), legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/emd_rmse.png',height=450, width = 800, scale=4)

In [56]:
# Training-time comparison of the EMD-loss model (PW-AE-EMD) vs the RMSE-loss model (PW-AE).
# .copy() gives an independent frame, so the relabelling below does not assign
# into a slice of weighted_result_df (chained assignment).
sample_df = weighted_result_df[weighted_result_df['model_name'].isin(['PW-AE-EMD', 'PW-AE'])].copy()
# replace PW-AE-EMD with OWAM-EMD and PW-AE with OWAM-RMSE
sample_df['model_name'] = sample_df['model_name'].replace({'PW-AE-EMD': 'OWAM-EMD', 'PW-AE': 'OWAM-RMSE'})
sample_df = sample_df.rename(columns={'model_name': 'Model'})
# keep one threshold per dataset: 0.5 for METR-LA, 1.0 for Hague, 0.1 for PEMS-BAY
sample_df = sample_df[((sample_df['threshold'] == '0.5') &  (sample_df['data_name'] == 'METR-LA')) | ((sample_df['threshold'] == '1.0') & (sample_df['data_name'] == 'Hague'))  |((sample_df['threshold'] == '0.1') & (sample_df['data_name'] == 'PEMS-BAY'))]
# Log training time comparison of EMD vs PW-AE
fig = px.histogram(sample_df, x="data_name", y="log_train_time",
             color='Model', barmode='group', text_auto=True, height=450, width=800)
fig.update_yaxes(title='Log Training Time (s) (Convergence)', range=[2, 10])
fig.update_xaxes(title='Dataset')
# legend anchored in the top-right corner of the plot
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="right",
    x=0.99,
), legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/emd_time.png',height=450, width = 800, scale=4)

### C. Compare OWAM with other models

In [62]:
# Hague Results
# Extract evaluation time from the outlier results and per instance prediction time saved using the terminal
# everything is constant, only variable is outlier time
data_name = 'hague'
LSTM_thresh_pred_time = 0.00063 # per instance prediction time
LSTM_Baseline_pred_time = 0.00063     # per instance prediction time
total_fpds = 19728
base_result_path = f'../results/{data_name}/real_time_modeling'
base_outlier_result_path = f'../results/{data_name}/outlier_scores'
out_time_dict = {}
outlier_model_name_list = ['AE','DAE','PW-AE','HST','Kit-Net','ILOF']
# The loop variable is deliberately NOT called `outlier_model_name`: that name
# is the module-level list of models used by the result-loading cells, and
# rebinding it to a string here breaks those cells when they are re-run.
for outlier_name in outlier_model_name_list:
    out_result_path = os.path.join(base_outlier_result_path, outlier_name, 'instance_train_time_seconds.pkl')
    # read pickle file (pickle.load can run arbitrary code: trusted files only)
    with open(out_result_path, 'rb') as f:
        scores = pickle.load(f)
    # NOTE(review): the mean is halved here; the reason is not visible in this notebook, confirm
    target_time_taken = np.mean(scores)/2
    # presumably seconds -> milliseconds per instance (file name says seconds), plus the LSTM prediction time
    out_time_dict[outlier_name] = (target_time_taken/total_fpds)*1000 + LSTM_thresh_pred_time*1000
out_time_dict

{'AE': 0.6735531221956587,
 'DAE': 0.7648710124675213,
 'PW-AE': 1.0057053116554504,
 'HST': 2.139927231445808,
 'Kit-Net': 0.7046862082378337,
 'ILOF': 1.0834773329923961}

In [63]:
# prediction time for each model for the hague dataset
instant_df_hague = pd.DataFrame.from_dict(out_time_dict, orient='index', columns=['Instance Pred Time (ms)'])
instant_df_hague['model_name'] = instant_df_hague.index
instant_df_hague = instant_df_hague.reset_index(drop=True)
# Add the rows for the LSTM baseline, OBIS (same timing as ILOF) and DGCRN in
# one pd.concat; DataFrame.append was removed in pandas 2.0.
# NOTE(review): 47347 is presumably the number of evaluated instances and 280
# the DGCRN evaluation time in seconds; confirm.
instant_df_hague = pd.concat([
    instant_df_hague,
    pd.DataFrame([
        {'Instance Pred Time (ms)': LSTM_Baseline_pred_time*1000, 'model_name': 'LSTM Baseline'},
        {'Instance Pred Time (ms)': out_time_dict['ILOF'], 'model_name': 'OBIS'},
        {'Instance Pred Time (ms)': (280/47347)*1000, 'model_name': 'DGCRN'},
    ]),
], ignore_index=True)
instant_df_hague['Eval Time'] = instant_df_hague['Instance Pred Time (ms)']*47347/1000
instant_df_hague['data_name'] = 'Hague'
# use the same display names as the result tables
instant_df_hague['model_name'] = instant_df_hague['model_name'].replace(
    {'AE': 'AE-EMD', 'DAE': 'DAE-EMD', 'PW-AE': 'PW-AE-EMD'}
)
instant_df_hague

Unnamed: 0,Instance Pred Time (ms),model_name,Eval Time,data_name
0,0.673553,AE-EMD,31.89072,Hague
1,0.764871,DAE-EMD,36.214348,Hague
2,1.005705,PW-AE-EMD,47.617129,Hague
3,2.139927,HST,101.319135,Hague
4,0.704686,Kit-Net,33.364778,Hague
5,1.083477,ILOF,51.299401,Hague
6,0.63,LSTM Baseline,29.82861,Hague
7,1.083477,OBIS,51.299401,Hague
8,5.913785,DGCRN,280.0,Hague


In [64]:
# METR-LA results
# Extract evaluation time from the outlier results and per instance prediction time saved using the terminal
# everything is constant, only variable is outlier time
data_name = 'METR-LA'
LSTM_thresh_pred_time = 0.0011
LSTM_Baseline_pred_time = 0.00162
total_fpds = 2856
base_result_path = f'../results/{data_name}/real_time_modeling'
base_outlier_result_path = f'../results/{data_name}/outlier_scores'
out_time_dict = {}
outlier_model_name_list = ['AE','DAE','PW-AE','HST','Kit-Net','ILOF']
# The loop variable is deliberately NOT called `outlier_model_name`: that name
# is the module-level list of models used by the result-loading cells, and
# rebinding it to a string here breaks those cells when they are re-run.
for outlier_name in outlier_model_name_list:
    out_result_path = os.path.join(base_outlier_result_path, outlier_name, 'instance_train_time_seconds.pkl')
    # read pickle file (pickle.load can run arbitrary code: trusted files only)
    with open(out_result_path, 'rb') as f:
        scores = pickle.load(f)
    # NOTE(review): the mean is halved here; the reason is not visible in this notebook, confirm
    target_time_taken = np.mean(scores)/2
    # presumably seconds -> milliseconds per instance (file name says seconds), plus the LSTM prediction time
    out_time_dict[outlier_name] = (target_time_taken/total_fpds)*1000 + LSTM_thresh_pred_time*1000
out_time_dict

{'AE': 1.2689849334953038,
 'DAE': 1.4708697171199563,
 'PW-AE': 1.180431825714693,
 'HST': 3.9927487086700646,
 'Kit-Net': 1.1639800960526345,
 'ILOF': 1.5728045876880246}

In [66]:
# prediction time for each model for the METR-LA dataset
instant_df_metr = pd.DataFrame.from_dict(out_time_dict, orient='index', columns=['Instance Pred Time (ms)'])
instant_df_metr['model_name'] = instant_df_metr.index
instant_df_metr = instant_df_metr.reset_index(drop=True)
# Add the rows for the LSTM baseline, OBIS (same timing as ILOF) and DGCRN in
# one pd.concat; DataFrame.append was removed in pandas 2.0.
# NOTE(review): 6855 is presumably the number of evaluated instances and
# 310.0430 the DGCRN evaluation time in seconds; confirm.
instant_df_metr = pd.concat([
    instant_df_metr,
    pd.DataFrame([
        {'Instance Pred Time (ms)': LSTM_Baseline_pred_time*1000, 'model_name': 'LSTM Baseline'},
        {'Instance Pred Time (ms)': out_time_dict['ILOF'], 'model_name': 'OBIS'},
        {'Instance Pred Time (ms)': (310.0430/6855)*1000, 'model_name': 'DGCRN'},
    ]),
], ignore_index=True)
instant_df_metr['Eval Time'] = instant_df_metr['Instance Pred Time (ms)']*6855/1000
instant_df_metr['data_name'] = data_name
# use the same display names as the result tables
instant_df_metr['model_name'] = instant_df_metr['model_name'].replace(
    {'AE': 'AE-EMD', 'DAE': 'DAE-EMD', 'PW-AE': 'PW-AE-EMD'}
)
instant_df_metr

Unnamed: 0,Instance Pred Time (ms),model_name,Eval Time,data_name
0,1.268985,AE-EMD,8.698892,METR-LA
1,1.47087,DAE-EMD,10.082812,METR-LA
2,1.180432,PW-AE-EMD,8.09186,METR-LA
3,3.992749,HST,27.370292,METR-LA
4,1.16398,Kit-Net,7.979084,METR-LA
5,1.572805,ILOF,10.781575,METR-LA
6,1.62,LSTM Baseline,11.1051,METR-LA
7,1.572805,OBIS,10.781575,METR-LA
8,45.228738,DGCRN,310.043,METR-LA


In [67]:
# PEMS-BAY Results
# Extract evaluation time from the outlier results and per instance prediction time saved using the terminal
# everything is constant, only variable is outlier time
data_name = 'PEMS-BAY'
LSTM_thresh_pred_time = 0.014
LSTM_Baseline_pred_time = 0.0163
total_fpds = 4343
base_result_path = f'../results/{data_name}/real_time_modeling'
base_outlier_result_path = f'../results/{data_name}/outlier_scores'
out_time_dict = {}
outlier_model_name_list = ['AE','DAE','PW-AE','HST','Kit-Net','ILOF']
# The loop variable is deliberately NOT called `outlier_model_name`: that name
# is the module-level list of models used by the result-loading cells, and
# rebinding it to a string here breaks those cells when they are re-run.
for outlier_name in outlier_model_name_list:
    out_result_path = os.path.join(base_outlier_result_path, outlier_name, 'instance_train_time_seconds.pkl')
    # read pickle file (pickle.load can run arbitrary code: trusted files only)
    with open(out_result_path, 'rb') as f:
        scores = pickle.load(f)
    # NOTE(review): the mean is halved here; the reason is not visible in this notebook, confirm
    target_time_taken = np.mean(scores)/2
    # presumably seconds -> milliseconds per instance (file name says seconds), plus the LSTM prediction time
    out_time_dict[outlier_name] = (target_time_taken/total_fpds)*1000 + LSTM_thresh_pred_time*1000
out_time_dict

{'AE': 14.102198195678314,
 'DAE': 14.378615172531486,
 'PW-AE': 14.227371001146889,
 'HST': 16.15370989983464,
 'Kit-Net': 14.07740835460319,
 'ILOF': 14.480932375562078}

In [68]:
# prediction time for each model for the PEMS-BAY dataset
instant_df_pems = pd.DataFrame.from_dict(out_time_dict, orient='index', columns=['Instance Pred Time (ms)'])
instant_df_pems['model_name'] = instant_df_pems.index
instant_df_pems = instant_df_pems.reset_index(drop=True)
# Add the rows for the LSTM baseline, OBIS (same timing as ILOF) and DGCRN in
# one pd.concat; DataFrame.append was removed in pandas 2.0.
# NOTE(review): 6855 is the same instance count as in the METR-LA cell; check
# that it is also the right count for PEMS-BAY and not a copy-paste leftover.
instant_df_pems = pd.concat([
    instant_df_pems,
    pd.DataFrame([
        {'Instance Pred Time (ms)': LSTM_Baseline_pred_time*1000, 'model_name': 'LSTM Baseline'},
        {'Instance Pred Time (ms)': out_time_dict['ILOF'], 'model_name': 'OBIS'},
        {'Instance Pred Time (ms)': (780.0430/6855)*1000, 'model_name': 'DGCRN'},
    ]),
], ignore_index=True)
instant_df_pems['Eval Time'] = instant_df_pems['Instance Pred Time (ms)']*6855/1000
instant_df_pems['data_name'] = data_name
# use the same display names as the result tables
instant_df_pems['model_name'] = instant_df_pems['model_name'].replace(
    {'AE': 'AE-EMD', 'DAE': 'DAE-EMD', 'PW-AE': 'PW-AE-EMD'}
)
instant_df_pems

Unnamed: 0,Instance Pred Time (ms),model_name,Eval Time,data_name
0,14.102198,AE-EMD,96.670569,PEMS-BAY
1,14.378615,DAE-EMD,98.565407,PEMS-BAY
2,14.227371,PW-AE-EMD,97.528628,PEMS-BAY
3,16.15371,HST,110.733681,PEMS-BAY
4,14.077408,Kit-Net,96.500634,PEMS-BAY
5,14.480932,ILOF,99.266791,PEMS-BAY
6,16.3,LSTM Baseline,111.7365,PEMS-BAY
7,14.480932,OBIS,99.266791,PEMS-BAY
8,113.791831,DGCRN,780.043,PEMS-BAY


In [74]:
# Stack the per-dataset timing tables (Hague, METR-LA, PEMS-BAY) row-wise
instant_df_all = pd.concat((instant_df_hague, instant_df_metr, instant_df_pems), axis=0)
instant_df_all.head()

Unnamed: 0,Instance Pred Time (ms),model_name,Eval Time,data_name
0,0.673553,AE-EMD,31.89072,Hague
1,0.764871,DAE-EMD,36.214348,Hague
2,1.005705,PW-AE-EMD,47.617129,Hague
3,2.139927,HST,101.319135,Hague
4,0.704686,Kit-Net,33.364778,Hague


In [75]:
# extract data for plotting
# LSTM baseline rows: the previous outlier model at threshold 1.0, relabelled
sample_df = weighted_result_df
sample_df_obis = sample_df[(sample_df['model_name'] == previous_oultier_model_name) & (sample_df['threshold'] == '1.0')]
sample_df_copy = sample_df_obis.copy()
# Assign the relabelled column back instead of calling replace(inplace=True)
# on a column selection, which is not guaranteed to modify the frame. The
# label is taken from previous_oultier_model_name so it matches the filter above.
sample_df_copy['model_name'] = sample_df_copy['model_name'].replace(previous_oultier_model_name, 'LSTM Baseline')
# models shown in the comparison, at one threshold per dataset
sample_df = weighted_result_df[weighted_result_df['model_name'].isin(['PW-AE-EMD', previous_oultier_model_name, 'DGCRN'])]
sample_df = sample_df[((sample_df['threshold'] == '0.5') &  (sample_df['data_name'] == 'METR-LA')) | ((sample_df['threshold'] == '1.0') & (sample_df['data_name'] == 'Hague'))  |((sample_df['threshold'] == '0.1') & (sample_df['data_name'] == 'PEMS-BAY'))]
# pd.concat replaces DataFrame.append (removed in pandas 2.0)
sample_df = pd.concat([sample_df, sample_df_copy])
# replace PW-AE-EMD with OWAM and change column name model_name to Model
sample_df = sample_df.replace('PW-AE-EMD','OWAM').rename(columns={'model_name':'Model'})
sample_df.head()

Unnamed: 0,RMSE,MAE,train_time,Model,threshold,log_train_time,data_name
20,13.48,9.41,677.74,OWAM,1.0,6.52,Hague
48,13.11,7.5,5043.31,DGCRN,1.0,8.53,Hague
55,13.55,9.47,682.81,OBIS,1.0,6.53,Hague
81,7.52,3.49,37.9,OWAM,0.5,3.64,METR-LA
109,4.16,2.44,11263.23,DGCRN,0.5,9.33,METR-LA


In [76]:
# Grouped bars: RMSE per dataset for each compared model
fig = px.histogram(
    sample_df,
    x='data_name',
    y='RMSE',
    color='Model',
    barmode='group',
    text_auto=True,
    width=1080,
    height=500,
)
fig.update_xaxes(title='Dataset')
fig.update_yaxes(range=[0, 15], title='RMSE Score (Lower is better)')
# horizontal legend anchored in the top-right corner of the plot
fig.update_layout(
    legend={'orientation': 'h', 'yanchor': 'top', 'y': 0.99, 'xanchor': 'right', 'x': 0.99},
    legend_font={'size': 12},
)
fig.show()
pio.write_image(fig, 'plots/performance_RMSE.png', width=1080, height=500, scale=4)


In [78]:
# merge all results into one dataframe
# models shown in the comparison, at one threshold per dataset
sample_df = weighted_result_df[weighted_result_df['model_name'].isin(['PW-AE-EMD', previous_oultier_model_name, 'DGCRN'])]
sample_df = sample_df[((sample_df['threshold'] == '0.5') &  (sample_df['data_name'] == 'METR-LA')) | ((sample_df['threshold'] == '1.0') & (sample_df['data_name'] == 'Hague'))  |((sample_df['threshold'] == '0.1') & (sample_df['data_name'] == 'PEMS-BAY'))]
# add the LSTM baseline rows (pd.concat replaces DataFrame.append, removed in pandas 2.0)
sample_df = pd.concat([sample_df, sample_df_copy])
# attach the timing columns; this must happen before 'PW-AE-EMD' is renamed,
# because the timing tables use that model name as the join key
sample_df = sample_df.merge(instant_df_all, on=['model_name','data_name'], how='left')
sample_df['log_eval_time'] = np.log(sample_df['Eval Time'])
sample_df['log Instance Pred Time (ms)'] = np.log(sample_df['Instance Pred Time (ms)'])
sample_df = sample_df.round(2)
# replace PW-AE-EMD with OWAM and change column name model_name to Model
sample_df = sample_df.replace('PW-AE-EMD','OWAM').rename(columns={'model_name':'Model'})
sample_df.head()

Unnamed: 0,RMSE,MAE,train_time,Model,threshold,log_train_time,data_name,Instance Pred Time (ms),Eval Time,log_eval_time,log Instance Pred Time (ms)
0,13.48,9.41,677.74,OWAM,1.0,6.52,Hague,1.01,47.62,3.86,0.01
1,13.11,7.5,5043.31,DGCRN,1.0,8.53,Hague,5.91,280.0,5.63,1.78
2,13.55,9.47,682.81,OBIS,1.0,6.53,Hague,1.08,51.3,3.94,0.08
3,7.52,3.49,37.9,OWAM,0.5,3.64,METR-LA,1.18,8.09,2.09,0.17
4,4.16,2.44,11263.23,DGCRN,0.5,9.33,METR-LA,45.23,310.04,5.74,3.81


In [79]:
# Grouped bars: log training time per dataset for each compared model
fig = px.histogram(
    sample_df,
    x='data_name',
    y='log_train_time',
    color='Model',
    barmode='group',
    text_auto=True,
    width=900,
    height=500,
)
fig.update_xaxes(title='Dataset')
fig.update_yaxes(title='Log Training Time (s) (Convergence)')
# horizontal legend anchored in the top-left corner of the plot
fig.update_layout(
    legend={'orientation': 'h', 'yanchor': 'top', 'y': 0.99, 'xanchor': 'left', 'x': 0.01},
    legend_font={'size': 12},
)
fig.show()
pio.write_image(fig, 'plots/performance_time.png', width=900, height=500, scale=4)

In [80]:
# evaluation time plots: bars of log evaluation time, grouped by dataset and coloured by model
legend_layout = dict(yanchor="top", y=0.99, xanchor="left", x=0.01, orientation="h")
fig = px.histogram(
    sample_df,
    x="data_name",
    y="log_eval_time",
    color='Model',
    barmode='group',
    text_auto=True,
    height=500,
    width=900,
)
fig.update_xaxes(title='Dataset')
fig.update_yaxes(title='Log Test Time (s)')
fig.update_layout(legend=legend_layout, legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/performance_time_eval.png', height=500, width=900, scale=4)

In [82]:
# all results for the Hague dataset
# keep only the columns needed from the baseline results (the LSTM baseline model)
sample_df_copy = sample_df_copy[['data_name', 'model_name', 'RMSE', 'MAE', 'train_time']]
# extract results for Hague
sample_df_copy_copy = sample_df_copy[sample_df_copy['data_name'] == 'Hague']
hague_mask = (
    (weighted_result_df['data_name'] == 'Hague')
    & (weighted_result_df['threshold'] == '1.0')
    & (weighted_result_df['model_name'] != 'PW-AE')
    & (weighted_result_df['model_name'] != 'ILOF')
)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
sample_df = pd.concat([weighted_result_df[hague_mask], sample_df_copy_copy]).reset_index(drop=True)
sample_df = sample_df[['model_name', 'RMSE', 'MAE', 'train_time']]
sample_df['log_train_time'] = np.log(sample_df['train_time'])
sample_df = sample_df.merge(instant_df_hague, on='model_name', how='left').round(2)
sample_df

Unnamed: 0,model_name,RMSE,MAE,train_time,log_train_time,Instance Pred Time (ms),Eval Time,data_name
0,AE-EMD,13.54,9.46,275.27,5.62,0.67,31.89,Hague
1,DAE-EMD,13.48,9.43,1821.92,7.51,0.76,36.21,Hague
2,PW-AE-EMD,13.48,9.41,677.74,6.52,1.01,47.62,Hague
3,HST,13.63,9.52,244.31,5.5,2.14,101.32,Hague
4,Kit-Net,13.54,9.47,305.37,5.72,0.7,33.36,Hague
5,DGCRN,13.11,7.5,5043.31,8.53,5.91,280.0,Hague
6,OBIS,13.55,9.47,682.81,6.53,1.08,51.3,Hague
7,LSTM Baseline,13.55,9.47,682.81,6.53,0.63,29.83,Hague


In [83]:
# all results for the METR-LA dataset
sample_df_copy_copy = sample_df_copy[sample_df_copy['data_name'] == 'METR-LA']
metr_mask = (
    (weighted_result_df['data_name'] == 'METR-LA')
    & (weighted_result_df['threshold'] == '0.1')
    & (weighted_result_df['model_name'] != 'PW-AE')
    & (weighted_result_df['model_name'] != 'ILOF')
)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
sample_df = pd.concat([weighted_result_df[metr_mask], sample_df_copy_copy]).reset_index(drop=True)
sample_df = sample_df[['model_name', 'RMSE', 'MAE', 'train_time']]
sample_df['log_train_time'] = np.log(sample_df['train_time'])
sample_df = sample_df.merge(instant_df_metr, on='model_name', how='left').round(2)
sample_df

Unnamed: 0,model_name,RMSE,MAE,train_time,log_train_time,Instance Pred Time (ms),Eval Time,data_name
0,AE-EMD,5.92,2.87,48.05,3.87,1.27,8.7,METR-LA
1,DAE-EMD,5.93,2.85,53.89,3.99,1.47,10.08,METR-LA
2,PW-AE-EMD,5.88,2.76,46.18,3.83,1.18,8.09,METR-LA
3,HST,6.82,3.44,47.53,3.86,3.99,27.37,METR-LA
4,Kit-Net,5.88,2.81,55.44,4.02,1.16,7.98,METR-LA
5,DGCRN,4.16,2.44,11263.23,9.33,45.23,310.04,METR-LA
6,OBIS,7.42,3.58,91.37,4.51,1.57,10.78,METR-LA
7,LSTM Baseline,8.83,4.56,106.06,4.66,1.62,11.11,METR-LA


In [84]:
# all results for the PEMS-BAY dataset
sample_df_copy_copy = sample_df_copy[sample_df_copy['data_name'] == 'PEMS-BAY']
pems_mask = (
    (weighted_result_df['data_name'] == 'PEMS-BAY')
    & (weighted_result_df['threshold'] == '0.5')
    & (weighted_result_df['model_name'] != 'PW-AE')
    & (weighted_result_df['model_name'] != 'ILOF')
)
# pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0
sample_df = pd.concat([weighted_result_df[pems_mask], sample_df_copy_copy]).reset_index(drop=True)
sample_df = sample_df[['model_name', 'RMSE', 'MAE', 'train_time']]
sample_df['log_train_time'] = np.log(sample_df['train_time'])
sample_df = sample_df.merge(instant_df_pems, on='model_name', how='left').round(2)
sample_df

Unnamed: 0,model_name,RMSE,MAE,train_time,log_train_time,Instance Pred Time (ms),Eval Time,data_name
0,AE-EMD,4.25,3.03,819.62,6.71,14.1,96.67,PEMS-BAY
1,DAE-EMD,4.04,3.17,310.78,5.74,14.38,98.57,PEMS-BAY
2,PW-AE-EMD,4.04,2.91,303.94,5.72,14.23,97.53,PEMS-BAY
3,HST,4.82,3.97,372.86,5.92,16.15,110.73,PEMS-BAY
4,Kit-Net,4.31,3.25,356.96,5.88,14.08,96.5,PEMS-BAY
5,DGCRN,1.59,0.87,25900.67,10.16,113.79,780.04,PEMS-BAY
6,OBIS,4.38,3.65,377.49,5.93,14.48,99.27,PEMS-BAY
7,LSTM Baseline,5.43,4.38,405.05,6.0,16.3,111.74,PEMS-BAY


# RQ - Does incorporating the Outlier Weighted Autoencoder Model (OWAM) in traffic flow prediction improve its performance?

- Does updating the real-time traffic model using OWAM lead to improved performance in traffic predictions as compared to static settings?

In [85]:
# declare variables for result analysis
outlier_model_name_list = ['PW-AE']
# update windows in hours
time_window_list = [1, 3, 6, 12, 24, 24 * 7, 24 * 30]
# the same windows as string keys, as used to index the stored results
time_list = [str(window) for window in time_window_list]
# RMSE is additionally reported for the model that is never updated
RMSE_list = ['No_update'] + time_list

In [86]:
# function to extract data from the results
def extract_data(data_name, exp_name, threshold):
    """Average RMSE and training time per update window from the real-time results.

    Relies on notebook-level names: ``outlier_model_name_list``, ``time_window_list``,
    ``RMSE_list``, ``time_list``, ``base_result_path``, ``load_real_time_results`` and
    ``extract_time_key_results``.

    Parameters
    ----------
    data_name : passed through to ``load_real_time_results``.
    exp_name : ``'out'`` loads ``univariate_real_time_<model>_threshold_<threshold>.pkl``;
        any other value loads the ``..._no_out_update.pkl`` variant.
    threshold : used in the file name and as the key into each target's results.

    Returns
    -------
    (global_rmse_df, global_time_df) : DataFrames with columns
        ``['time_window', 'RMSE', 'model_name']`` and
        ``['time_window', 'train_time', 'model_name']``; values are means over all targets.
    """
    # Decide the file variant once. Overwriting `exp_name` inside the loop would send
    # every model after the first into the "no_out_update" branch.
    file_suffix = '' if exp_name == 'out' else '_no_out_update'

    result_dict = {}
    for outlier_model_name in outlier_model_name_list:
        result_file = f'univariate_real_time_{outlier_model_name}_threshold_{threshold}{file_suffix}.pkl'
        real_time_load_path = os.path.join(base_result_path, result_file)
        real_time_results = load_real_time_results(real_time_load_path, data_name)

        model_results = {}
        for target, value in real_time_results.items():
            target_results = {'No_update': value[threshold]['No_update']['RMSE']}
            for time_window in time_window_list:
                time_key = str(time_window)
                time_key_RMSE, time_key_train_time = extract_time_key_results(
                    value[threshold][time_key]['incremental_weighted_update']
                )
                target_results[time_key] = {
                    'RMSE': time_key_RMSE,
                    'train_time': time_key_train_time,
                }
            model_results[target] = target_results
        result_dict[outlier_model_name] = model_results

    # extract the results for the global RMSE and time
    rmse_frames = []
    time_frames = []
    for out_name, target_results in result_dict.items():
        rmse_dict = {rmse_name: [] for rmse_name in RMSE_list}
        time_dict = {time_name: [] for time_name in time_list}

        for time_results in target_results.values():
            for rmse_name in RMSE_list:
                if rmse_name == 'No_update':
                    # the never-updated model only has an RMSE, no training time
                    rmse_dict[rmse_name].append(time_results[rmse_name])
                else:
                    rmse_dict[rmse_name].append(time_results[rmse_name]['RMSE'])
                    time_dict[rmse_name].append(time_results[rmse_name]['train_time'])

        avg_rmse_dict = {rmse_name: np.mean(rmse_dict[rmse_name]) for rmse_name in RMSE_list}
        avg_time_dict = {time_name: np.mean(time_dict[time_name]) for time_name in time_list}

        rmse_df = pd.DataFrame(list(avg_rmse_dict.items()), columns=['time_window', 'RMSE'])
        rmse_df['model_name'] = out_name
        rmse_frames.append(rmse_df)
        time_df = pd.DataFrame(list(avg_time_dict.items()), columns=['time_window', 'train_time'])
        time_df['model_name'] = out_name
        time_frames.append(time_df)

    # Concatenate once (DataFrame.append no longer exists in pandas >= 2.0); fall back to an
    # empty frame with the expected columns when there is nothing to concatenate.
    if rmse_frames:
        global_rmse_df = pd.concat(rmse_frames)
    else:
        global_rmse_df = pd.DataFrame(columns=['time_window', 'RMSE', 'model_name'])
    if time_frames:
        global_time_df = pd.concat(time_frames)
    else:
        global_time_df = pd.DataFrame(columns=['time_window', 'train_time', 'model_name'])

    time_window_name = {'No_update':'No_update', '1':'1 hours','3':'3 hours','6':'6 hours','12':'12 hours','24':'24 hours','168':'1 week','720':'1 month', '2160':'3 months', '4320':'6 months'}
    global_rmse_df['time_window'] = global_rmse_df['time_window'].map(time_window_name)
    global_time_df['time_window'] = global_time_df['time_window'].map(time_window_name)

    return global_rmse_df, global_time_df

## Hague Process & Plots

In [89]:
# experiment settings for the Hague real-time analysis
data_name, exp_name, threshold = 'hague', 'out', 1
results_root = f'../results/{data_name}'
base_result_path = f'{results_root}/real_time_modeling'
base_outlier_result_path = f'{results_root}/outlier_scores'

In [90]:
# results loaded with exp_name='out' are labelled 'Dynamic Update'
global_rmse_df_out, global_time_df_out = extract_data(data_name, exp_name='out', threshold=threshold)
global_rmse_df_out['Update Type'] = 'Dynamic Update'
global_time_df_out['Update Type'] = 'Dynamic Update'
# tag the rows with the dataset under a fixed 'data_name' column; indexing with the
# variable `data_name` would create a column named after the dataset itself
global_time_df_out['data_name'] = data_name
# results loaded with exp_name='no_out' are labelled 'Static Update'
global_rmse_df_no_out, global_time_df_no_out = extract_data(data_name, exp_name='no_out', threshold=threshold)
global_rmse_df_no_out['Update Type'] = 'Static Update'
global_time_df_no_out['Update Type'] = 'Static Update'
global_time_df_no_out['data_name'] = data_name

In [92]:
# append static and dynamic results
# (pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0)
global_rmse_df_hague = pd.concat([global_rmse_df_out, global_rmse_df_no_out])
# NOTE: the variable name keeps its original spelling ("gloabl")
gloabl_time_df_hague = pd.concat([global_time_df_out, global_time_df_no_out])
# round to 2 decimal places -- the combined time frame, not the dynamic-only one
global_rmse_df_hague = global_rmse_df_hague.round(2)
gloabl_time_df_hague = gloabl_time_df_hague.round(2)

In [95]:
# plot the results of the real-time experiments: RMSE per update window, one line per update type
sample_df = global_rmse_df_hague
legend_layout = dict(yanchor="top", y=0.99, xanchor="right", x=0.99, orientation="h")
fig = px.line(
    sample_df,
    x="time_window",
    y="RMSE",
    color='Update Type',
    markers=True,
    height=500,
    width=800,
)
fig.update_xaxes(title='Time Window')
fig.update_yaxes(title='RMSE Score (Lower is better)')
fig.update_layout(legend=legend_layout, legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/real-time-hague-rmse.png', height=500, width=800, scale=4)



## METR-LA Process & Plots

In [96]:
# experiment settings for the METR-LA real-time analysis
data_name, exp_name, threshold = 'METR-LA', 'out', 0.1
results_root = f'../results/{data_name}'
base_result_path = f'{results_root}/real_time_modeling'
base_outlier_result_path = f'{results_root}/outlier_scores'

In [97]:
# results loaded with exp_name='out' are labelled 'Dynamic Update'
global_rmse_df_out, global_time_df_out = extract_data(data_name, exp_name='out', threshold=threshold)
global_rmse_df_out['Update Type'] = 'Dynamic Update'
global_time_df_out['Update Type'] = 'Dynamic Update'
# tag the rows with the dataset under a fixed 'data_name' column; indexing with the
# variable `data_name` would create a column named after the dataset itself
global_time_df_out['data_name'] = data_name
# results loaded with exp_name='no_out' are labelled 'Static Update'
global_rmse_df_no_out, global_time_df_no_out = extract_data(data_name, exp_name='no_out', threshold=threshold)
global_rmse_df_no_out['Update Type'] = 'Static Update'
global_time_df_no_out['Update Type'] = 'Static Update'
global_time_df_no_out['data_name'] = data_name

In [98]:
# append static and dynamic results and round to 2 decimal places
# (pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0)
global_rmse_df_metr = pd.concat([global_rmse_df_out, global_rmse_df_no_out]).round(2)
# NOTE: the variable name keeps its original spelling ("gloabl")
gloabl_time_df_metr = pd.concat([global_time_df_out, global_time_df_no_out]).round(2)

In [99]:
# plot the results: RMSE per update window, one line per update type
sample_df = global_rmse_df_metr
legend_layout = dict(yanchor="bottom", y=0.01, xanchor="right", x=0.99)
fig = px.line(
    sample_df,
    x="time_window",
    y="RMSE",
    color='Update Type',
    markers=True,
    height=500,
    width=800,
)
fig.update_xaxes(title='Time Window')
fig.update_yaxes(title='RMSE Score (Lower is better)')
fig.update_layout(legend=legend_layout, legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/real-time-metr-rmse.png', height=500, width=800, scale=4)


## PEMS-BAY Process & Plots

In [100]:
# experiment settings for the PEMS-BAY real-time analysis
data_name, exp_name, threshold = 'PEMS-BAY', 'out', 0.5
results_root = f'../results/{data_name}'
base_result_path = f'{results_root}/real_time_modeling'
base_outlier_result_path = f'{results_root}/outlier_scores'

In [101]:
# results loaded with exp_name='out' are labelled 'Dynamic Update'
global_rmse_df_out, global_time_df_out = extract_data(data_name, exp_name='out', threshold=threshold)
global_rmse_df_out['Update Type'] = 'Dynamic Update'
global_time_df_out['Update Type'] = 'Dynamic Update'
# tag the rows with the dataset under a fixed 'data_name' column; indexing with the
# variable `data_name` would create a column named after the dataset itself
global_time_df_out['data_name'] = data_name
# results loaded with exp_name='no_out' are labelled 'Static Update'
global_rmse_df_no_out, global_time_df_no_out = extract_data(data_name, exp_name='no_out', threshold=threshold)
global_rmse_df_no_out['Update Type'] = 'Static Update'
global_time_df_no_out['Update Type'] = 'Static Update'
global_time_df_no_out['data_name'] = data_name

In [102]:
# merge static and dynamic results and round to 2 decimal places
# (pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0)
global_rmse_df_pems = pd.concat([global_rmse_df_out, global_rmse_df_no_out]).round(2)
# NOTE: the variable name keeps its original spelling ("gloabl")
gloabl_time_df_pems = pd.concat([global_time_df_out, global_time_df_no_out]).round(2)

In [103]:
# plot the results: RMSE per update window, one line per update type
sample_df = global_rmse_df_pems
legend_layout = dict(yanchor="bottom", y=0.01, xanchor="right", x=0.99)
fig = px.line(
    sample_df,
    x="time_window",
    y="RMSE",
    color='Update Type',
    markers=True,
    height=500,
    width=800,
)
fig.update_xaxes(title='Time Window')
fig.update_yaxes(title='RMSE Score (Lower is better)')
fig.update_layout(legend=legend_layout, legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/real-time-pems-rmse.png', height=500, width=800, scale=4)

## Exploratory Data Analysis 

### Analysing Hague results

In [106]:
# settings for loading the no-update, 1-hour, 3-hour and 1-week update results
outlier_model_name_list = ['PW-AE']
data_name = 'hague'
threshold = 1
# update windows in hours, plus their string form used as result keys
time_window_list = [1, 3, 24 * 7]
time_list = [str(window) for window in time_window_list]
RMSE_list = ['No_update'] + time_list
results_root = f'../results/{data_name}'
base_result_path = f'{results_root}/real_time_modeling'
base_outlier_result_path = f'{results_root}/outlier_scores'

In [107]:
# extract results with dataframes and intersections
# result_dict[model][target] -> {'No_update': {...}, '<window>': {...}}
result_dict = {}
for outlier_model_name in outlier_model_name_list:
    exp_name = f'univariate_real_time_{outlier_model_name}_threshold_{threshold}.pkl'
    real_time_load_path = os.path.join(base_result_path, exp_name)
    real_time_results = load_real_time_results(real_time_load_path, data_name)
    result_dict[outlier_model_name] = {}
    for target, value in real_time_results.items():
        threshold_results = value[threshold]
        target_entry = {
            'No_update': {
                'RMSE': threshold_results['No_update']['RMSE'],
                'df': threshold_results['No_update']['df'],
            }
        }
        result_dict[outlier_model_name][target] = target_entry
        for time_window in time_window_list:
            time_key = str(time_window)
            print(f"Processing {outlier_model_name} {target} {time_key}")
            incremental_results = threshold_results[time_key]['incremental_weighted_update']
            time_key_RMSE, time_key_train_time = extract_time_key_results(incremental_results)
            df, intersections, time_keys = extract_df_intc_results(incremental_results)
            target_entry[time_key] = {
                'RMSE': time_key_RMSE,
                'train_time': time_key_train_time,
                'df': df,
                'intersections': intersections,
                'time_keys': time_keys,
            }

Processing PW-AE T1_North 1
Processing PW-AE T1_North 3
Processing PW-AE T1_North 168
Processing PW-AE T1_South 1
Processing PW-AE T1_South 3
Processing PW-AE T1_South 168
Processing PW-AE T2_North 1
Processing PW-AE T2_North 3
Processing PW-AE T2_North 168
Processing PW-AE T2_South 1
Processing PW-AE T2_South 3
Processing PW-AE T2_South 168


In [113]:
# load the raw data
def _read_hague_frame(path, suffix):
    """Read one processed Hague HDF file and append `suffix` to every column name."""
    frame = pd.read_hdf(path)
    frame.columns = [str(col) + suffix for col in frame.columns]
    return frame


# read hague processed data: North trajectories get "_N", South trajectories get "_S"
load_path1 = '../data/hague/processed/GNN_raw_data_North_T1.h5'
df1 = _read_hague_frame(load_path1, '_N')

load_path2 = '../data/hague/processed/GNN_raw_data_North_T2.h5'
df2 = _read_hague_frame(load_path2, '_N')

load_path3 = '../data/hague/processed/GNN_raw_data_South_T1.h5'
df3 = _read_hague_frame(load_path3, '_S')

load_path4 = '../data/hague/processed/GNN_raw_data_South_T2.h5'
df4 = _read_hague_frame(load_path4, '_S')

# merge the dataframes on index
df = pd.concat([df1, df2, df3, df4], axis=1)

In [114]:
# get the time for the first update of the 168-hour window
weekly_time_keys = result_dict['PW-AE']['T1_North']['168']['time_keys']
time_key_start = weekly_time_keys[0]
# raw-data timestamps from the first update onwards
timestamps = df.index[df.index >= time_key_start]

In [115]:
# get results for the updated windows
H1_update_df = result_dict['PW-AE']['T1_North']['1']['df']
H3_update_df = result_dict['PW-AE']['T1_North']['3']['df']
# NOTE: despite its name, `daily_update_df` holds the 168-hour (weekly) window results here
daily_update_df = result_dict['PW-AE']['T1_North']['168']['df']
daily_update_df = daily_update_df.rename(columns={'Predicted':'weekly_update'}) # rename predicted to weekly update
# number of rows that have both a raw-data timestamp and a prediction
row_size = min(len(timestamps), daily_update_df.shape[0])
# take an explicit copy so columns added afterwards never write into a slice of another frame
daily_update_df = daily_update_df.iloc[:row_size].copy()

In [116]:
# merge results for different windows into `daily_update_df`, one prediction column per window
# NOTE(review): 'no_update' is assigned as a Series (index-aligned), while the other
# prediction columns are assigned as lists (positional). The last line replaces the index
# of `daily_update_df`, so re-running this cell without rebuilding `daily_update_df` first
# will presumably no longer align 'no_update' -- confirm before re-executing.
No_update_preds = result_dict['PW-AE']['T1_North']['No_update']['df'].iloc[:row_size,:]
No_update_preds = No_update_preds.rename(columns={'Predicted':'no_update'})
H3_update_df = result_dict['PW-AE']['T1_North']['3']['df']
H3_update_df = H3_update_df.rename(columns={'Predicted':'3_hour_update'})
H1_update_df = result_dict['PW-AE']['T1_North']['1']['df']
H1_update_df = H1_update_df.rename(columns={'Predicted':'1_hour_update'})
daily_update_df['no_update'] = No_update_preds['no_update']
daily_update_df['3_hour_update'] = list(H3_update_df['3_hour_update'].iloc[:row_size])
daily_update_df['1_hour_update'] = list(H1_update_df['1_hour_update'].iloc[:row_size])
# attach the raw-data timestamps and use them as the index
daily_update_df['timestamp'] = timestamps[:row_size]
daily_update_df.index = daily_update_df['timestamp']

In [118]:
# plot the results: real values against the no-update, 3-hour and weekly update predictions
sample_df = daily_update_df.head(12*24*14)
plotted_columns = ['Real', 'no_update', '3_hour_update', 'weekly_update']
line_colors = ['#636EFA', '#990099', '#2CA02C', '#d62728']
legend_layout = dict(yanchor="top", y=0.99, xanchor="right", x=0.99, orientation="h")
fig = px.line(
    sample_df,
    x=sample_df.timestamp,
    y=plotted_columns,
    height=500,
    width=900,
    color_discrete_sequence=line_colors,
)
fig.update_xaxes(title_text='Time')
fig.update_yaxes(title_text='Number of vehicles at K504')
fig.update_layout(legend=legend_layout, legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/hague_realtime_weekcompare.png', height=400, width=850, scale=4)


In [119]:
# plot no update vs real time
sample_df = daily_update_df.head(12*24*14)
legend_layout = dict(yanchor="top", y=0.99, xanchor="right", x=0.99, orientation="h")
fig = px.line(
    sample_df,
    x=sample_df.timestamp,
    y=['Real', 'no_update'],
    height=400,
    width=700,
    color_discrete_sequence=['#636EFA', '#990099'],
)
fig.update_xaxes(title_text='Time')
fig.update_yaxes(title_text='Number of vehicles at K504')
fig.update_layout(legend=legend_layout, legend_font=dict(size=12))
fig.show()
pio.write_image(fig, 'plots/hague_realtime_weekcompare_noupdate.png', height=400, width=700, scale=4)

In [120]:
# plot 3 hour update vs real data
sample_df = daily_update_df.head(12*24*14)
legend_layout = dict(yanchor="top", y=0.99, xanchor="right", x=0.99, orientation="h")
fig = px.line(
    sample_df,
    x=sample_df.timestamp,
    y=['Real', '3_hour_update'],
    height=400,
    width=700,
    color_discrete_sequence=['#636EFA', '#2CA02C'],
)
fig.update_xaxes(title_text='Time')
fig.update_yaxes(title_text='Number of vehicles at K504')
fig.update_layout(legend=legend_layout)
pio.write_image(fig, 'plots/hague_3hour_update.png', height=400, width=700, scale=4)
fig.show()

In [123]:
# plot week vs realtime, with dashed vertical lines at the first two weekly update times
sample_df = daily_update_df.head(12*24*14)
weekly_time_keys = result_dict[outlier_model_name]['T1_North']['168']['time_keys']
legend_layout = dict(yanchor="top", y=0.99, xanchor="right", x=0.99, orientation="h")
fig = px.line(
    sample_df,
    x=sample_df.timestamp,
    y=['Real', 'weekly_update'],
    height=400,
    width=700,
    color_discrete_sequence=['#636EFA', '#d62728'],
)
for update_time in (weekly_time_keys[0], weekly_time_keys[1]):
    fig.add_vline(x=update_time, line_width=2, line_dash="dash", line_color="green")
fig.update_layout(legend=legend_layout)
fig.update_yaxes(title_text='Number of vehicles at K504')
fig.update_xaxes(title_text='Time')
pio.write_image(fig, 'plots/hague_realtime_weeky_week.png', height=400, width=700, scale=4)
fig.show()

In [124]:
# check the weekly changes of the intersections: first two stored entries, side by side
weekly_intersections = result_dict[outlier_model_name]['T1_North']['168']['intersections']
weekly_intersections[0], weekly_intersections[1]

(K504    1.000000
 K198    0.130556
 K561    0.111364
 K556    0.098646
 K557    0.082957
 K263    0.048260
 K502    0.030212
 K559   -0.050497
 K503   -0.075756
 Name: K504, dtype: float64,
 K504    1.000000
 K557    0.217992
 K556    0.166993
 K559    0.046335
 K198    0.033929
 K561    0.001044
 K503   -0.026163
 K263   -0.038274
 K502   -0.095533
 Name: K504, dtype: float64)

### Analysing the METR-LA dataset

In [125]:
# define the parameters for the result analysis
outlier_model_name_list = ['PW-AE']
data_name = 'METR-LA'
threshold = 0.1
# update windows in hours, plus their string form used as result keys
time_window_list = [1, 24]
time_list = [str(window) for window in time_window_list]
RMSE_list = ['No_update'] + time_list
results_root = f'../results/{data_name}'
base_result_path = f'{results_root}/real_time_modeling'
base_outlier_result_path = f'{results_root}/outlier_scores'

In [126]:
# extract results from real time results
# result_dict[model][target] -> {'No_update': {...}, '<window>': {...}}
result_dict = {}
for outlier_model_name in outlier_model_name_list:
    exp_name = f'univariate_real_time_{outlier_model_name}_threshold_{threshold}.pkl'
    real_time_load_path = os.path.join(base_result_path, exp_name)
    real_time_results = load_real_time_results(real_time_load_path, data_name)
    result_dict[outlier_model_name] = {}
    for target, value in real_time_results.items():
        threshold_results = value[threshold]
        target_entry = {
            'No_update': {
                'RMSE': threshold_results['No_update']['RMSE'],
                'df': threshold_results['No_update']['df'],
            }
        }
        result_dict[outlier_model_name][target] = target_entry
        for time_window in time_window_list:
            time_key = str(time_window)
            print(f"Processing {outlier_model_name} {target} {time_key}")
            incremental_results = threshold_results[time_key]['incremental_weighted_update']
            time_key_RMSE, time_key_train_time = extract_time_key_results(incremental_results)
            df, intersections, time_keys = extract_df_intc_results(incremental_results)
            target_entry[time_key] = {
                'RMSE': time_key_RMSE,
                'train_time': time_key_train_time,
                'df': df,
                'intersections': intersections,
                'time_keys': time_keys,
            }

Processing PW-AE 716339 1
Processing PW-AE 716339 24
Processing PW-AE 765164 1
Processing PW-AE 765164 24
Processing PW-AE 716328 1
Processing PW-AE 716328 24
Processing PW-AE 717445 1
Processing PW-AE 717445 24
Processing PW-AE 717463 1
Processing PW-AE 717463 24


In [138]:
# read data of METR-LA
load_path = '../data/METR-LA/metr-la.h5'
save_path = '../data/METR-LA/processed/OWRI_df_format.pickle'
df = pd.read_hdf(load_path, 'df')
# rebuild the index from its components, encoded as one YYYYMMDDHHMMSS integer per row
raw_index = df.index
encoded_stamp = (
    raw_index.year * 10_000_000_000
    + raw_index.month * 100_000_000
    + raw_index.day * 1_000_000
    + raw_index.hour * 10_000
    + raw_index.minute * 100
    + raw_index.second
)
df.index = pd.to_datetime(encoded_stamp, format='%Y%m%d%H%M%S')
df.head()

Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
# get 1st update time of the 24-hour window
daily_time_keys = result_dict['PW-AE']['716339']['24']['time_keys']
time_key_start = daily_time_keys[0]
# raw-data timestamps from the first update onwards
timestamps = df.index[df.index >= time_key_start]

In [140]:
# NOTE: key '1' is the 1-hour window; the `H3_` variable name is kept because later cells use it
H3_update_df = result_dict['PW-AE']['716339']['1']['df']
daily_update_df = result_dict['PW-AE']['716339']['24']['df']
daily_update_df = daily_update_df.rename(columns={'Predicted':'daily_update'}) # rename predicted to daily update
# number of rows that have both a raw-data timestamp and a prediction
row_size = min(len(timestamps), daily_update_df.shape[0])
# take an explicit copy so columns added afterwards never write into a slice of another frame
daily_update_df = daily_update_df.iloc[:row_size].copy()

In [141]:
# merge the no-update and update-window predictions into `daily_update_df`
# NOTE(review): the column labelled '3_hour_update' below is filled from the '1' window
# results (this section only loads windows 1 and 24), so the label is misleading -- confirm
# and rename together with the plotting cells that reference it.
# NOTE(review): 'no_update' is assigned as a Series (index-aligned) while '3_hour_update'
# is assigned as a list (positional); the last line replaces the index, so re-running this
# cell without rebuilding `daily_update_df` will presumably misalign 'no_update'.
No_update_preds = result_dict['PW-AE']['716339']['No_update']['df'].iloc[:row_size,:]
No_update_preds = No_update_preds.rename(columns={'Predicted':'no_update'})
H3_update_df = result_dict['PW-AE']['716339']['1']['df']
H3_update_df = H3_update_df.rename(columns={'Predicted':'3_hour_update'})
daily_update_df['no_update'] = No_update_preds['no_update']
daily_update_df['3_hour_update'] = list(H3_update_df['3_hour_update'].iloc[:row_size])
# attach the raw-data timestamps and use them as the index
daily_update_df['timestamp'] = timestamps[:row_size]
daily_update_df.index = daily_update_df['timestamp']

In [144]:
# real values against all prediction variants
# The '3_hour_update' column is filled from the '1' (1-hour) window results in this
# section, so it is relabelled for the figure to avoid a misleading legend entry.
sample_df = daily_update_df.head(12*24*7).rename(columns={'3_hour_update': '1_hour_update'})
fig = px.line(sample_df, x=sample_df.timestamp, y=['Real', 'no_update', '1_hour_update', 'daily_update'], title='model predictions')
fig.show()

In [147]:
# real values against the predictions of the model that is never updated
sample_df = daily_update_df.head(12*24*7)
fig = px.line(
    sample_df,
    x=sample_df.timestamp,
    y=['Real', 'no_update'],
    title='model predictions',
)
fig.show()

In [148]:
# real values against the 1-hour update predictions
# The '3_hour_update' column is filled from the '1' (1-hour) window results in this
# section, so it is relabelled for the figure to avoid a misleading legend entry.
sample_df = daily_update_df.head(12*24*7).rename(columns={'3_hour_update': '1_hour_update'})
fig = px.line(sample_df, x=sample_df.timestamp, y=['Real', '1_hour_update'], title='model predictions')
fig.show()

In [156]:
# real values against the daily update predictions
sample_df = daily_update_df.head(12*24*7)
fig = px.line(
    sample_df,
    x=sample_df.timestamp,
    y=['Real', 'daily_update'],
    title='model predictions',
)
fig.show()

In [157]:
# changes in the intersections: first 10 values of stored entry 30 of the 24-hour window
daily_intersections = result_dict[outlier_model_name]['716339']['24']['intersections']
daily_intersections[30].head(10)

716339    1.000000
717460    0.972004
717595    0.960745
764120    0.958814
717459    0.946649
763995    0.942649
760024    0.941608
717572    0.938599
773062    0.938577
717463    0.938545
Name: 716339, dtype: float64

In [158]:
# first 10 values of stored entry 31 of the 24-hour window
daily_intersections = result_dict[outlier_model_name]['716339']['24']['intersections']
daily_intersections[31].head(10)

716339    1.000000
760650    0.839946
717453    0.818448
765171    0.809715
769388    0.809134
769373    0.808420
716953    0.794077
759591    0.793202
764101    0.792632
773013    0.791258
Name: 716339, dtype: float64

# -------------------------- END ----------------------------