In [1]:
import keras
import math
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
from torch.utils.data import Dataset, DataLoader
import pickle
pd.set_option('display.max_rows', 500)
import os
import tensorflow as tf
import plotly.express as px
import torch
import torch.nn as nn
from math import sqrt
# import rmse from sklearn
from sklearn.metrics import mean_squared_error


# define random seeds for Neural Networks
torch.manual_seed(0)
np.random.seed(0)
tf.random.set_seed(0)
# ignore warnings jupyter notebook
import warnings
warnings.filterwarnings('ignore')

# Supporting Functions

In [2]:
def mod_dict(results):
    new_dict = {}
    for key,val in results.items():
        new_key1 = key+'_'+'North'
        new_dict[new_key1] = val['North']
        new_key2 = key+'_'+'South'
        new_dict[new_key2] = val['South']
    return new_dict

In [3]:
def extract_results(results):
    thresholds = [0,0.05,0.1,0.25,0.5, 0.75, 1]
    result_dict = {}
    for val in thresholds:
        result_dict[val] = {}
        result_dict[val]['RMSE'] = []
        result_dict[val]['MAE'] = []
        result_dict[val]['train_time'] = []

    c=0
    for intersection, value1 in results.items():
        c+=1
        if c%1==0:
            for threshold, value2 in value1.items():
                result_dict[threshold]['RMSE'].append(value2['RMSE'])
                result_dict[threshold]['MAE'].append(value2['MAE'])
                result_dict[threshold]['train_time'].append(value2['train_time'])

    for intersection, value1 in results.items():
        for threshold, value2 in value1.items():
            result_dict[threshold]['RMSE'] = np.mean(result_dict[threshold]['RMSE'])
            result_dict[threshold]['MAE'] = np.mean(result_dict[threshold]['MAE'])
            result_dict[threshold]['train_time'] = np.mean(result_dict[threshold]['train_time'])

    return result_dict

In [4]:
def get_results(base_result_path, data_name, exp_name,out_mod):
    load_path = os.path.join(base_result_path, exp_name)
    out_mod = 'LOF' if out_mod == 'ILOF' else out_mod
    with open(load_path, 'rb') as f:
        results = pickle.load(f)

    if data_name == 'hauge':
        results = mod_dict(results)

    result_dict = extract_results(results)
    
    # make dataframe
    df = pd.DataFrame(result_dict).T
    df['model_name'] = out_mod
    df['threshold'] = df.index
    df['threshold'] = df['threshold'].astype(str)
    df.reset_index(drop=True, inplace=True)
    
    return df

In [5]:
def load_real_time_results(load_path, data_name):
    # laod results
    with open(load_path, 'rb') as f:
        results = pickle.load(f)
    if data_name == 'hauge':
        results = mod_dict(results)

    return results

In [6]:
def extract_time_key_results(results_time_dict):
    incremental_weighted_update_real = []
    incremental_weighted_update_predicted = []
    train_time = []
    for key,val in results_time_dict.items():
        incremental_weighted_update_real.extend(val['df']['Real'].to_list())
        incremental_weighted_update_predicted.extend(val['df']['Predicted'].to_list())
        train_time.append(val['train_time'])

    RMSE = sqrt(mean_squared_error(incremental_weighted_update_real,incremental_weighted_update_predicted))
    return RMSE, np.sum(train_time)

# RQ1 - Can we improve Traffic Flow Prediction in Urban Areas by Incorporating an Outlier Detection Model ?

## a.  Hauge data processing

In [139]:
# results save path
outlier_model_name = ["AE", "DAE", "ILOF","PW-AE"]
data_name = 'hauge'
base_result_path = f'../results/{data_name}/LSTM'
exp_non_weighted = f'univariate_ILOF_outlier_non_weighted.pkl'
earth_mover_mode_name = 'PW-AE'
exp_earth_mover = f'univariate_{earth_mover_mode_name}_outlier_weighted_earth_mover.pkl'

In [140]:
non_weighted_result_df = get_results(base_result_path, data_name, exp_non_weighted,'ILOF')
non_weighted_result_df.replace('LOF', 'previous_model', inplace=True)
non_weighted_result_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,14.201854,9.952364,816.109886,previous_model,0.0
1,14.187509,9.940928,714.036277,previous_model,0.05
2,14.093222,9.871032,760.451049,previous_model,0.1
3,13.815778,9.688582,745.636975,previous_model,0.25
4,13.683858,9.614662,881.166588,previous_model,0.5


In [141]:
earth_mover_result_df = get_results(base_result_path, data_name, exp_earth_mover,earth_mover_mode_name)
earth_mover_result_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,14.237238,9.985933,837.051269,PW-AE,0.0
1,14.199468,9.959284,713.855409,PW-AE,0.05
2,14.08234,9.874168,813.275704,PW-AE,0.1
3,13.716329,9.600639,849.626819,PW-AE,0.25
4,13.500914,9.415205,823.755387,PW-AE,0.5


In [142]:
# combine results
weighted_result_df = pd.DataFrame()
for out_mod in outlier_model_name:
    exp_name_weighted = f'univariate_{out_mod}_outlier_weighted.pkl'
    df_weighted = get_results(base_result_path, data_name, exp_name_weighted,out_mod)
    weighted_result_df = weighted_result_df.append(df_weighted)

weighted_result_df = weighted_result_df.append(non_weighted_result_df)
weighted_result_df.head()  

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,14.223739,9.974554,285.320609,AE,0.0
1,14.18864,9.942463,313.574481,AE,0.05
2,14.106318,9.886723,280.069239,AE,0.1
3,13.885396,9.726711,315.691193,AE,0.25
4,13.567951,9.5143,318.464729,AE,0.5


In [143]:
# Accuracy comparison
fig = px.histogram(weighted_result_df, x="threshold", y="RMSE",
             color='model_name', barmode='group',
             height=400, text_auto=True, title='RMSE comparison')
fig.update_yaxes(range=[13, 15])
fig.show()

In [144]:
# LSTM model training time comparison
fig = px.histogram(weighted_result_df, x="threshold", y="train_time",
             color='model_name', barmode='group',
             height=400, title='LSTM model convergence training time comparison', text_auto=True)
fig.update_yaxes(range=[100, 900])
fig.show()

In [145]:
# only keep model_name PW-AE and previous_model
weighted_result_df_top = weighted_result_df[(weighted_result_df['model_name'] == 'PW-AE') | (weighted_result_df['model_name'] == 'previous_model')]
weighted_result_df_top.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,14.202511,9.961286,362.339037,PW-AE,0.0
1,14.210553,9.969166,269.114585,PW-AE,0.05
2,14.134993,9.90093,330.223091,PW-AE,0.1
3,13.773916,9.642527,267.894557,PW-AE,0.25
4,13.623948,9.511959,310.539581,PW-AE,0.5


In [146]:
# Accuracy comparison of best vs previous models
fig = px.histogram(weighted_result_df_top, x="threshold", y="RMSE",
             color='model_name', barmode='group', text_auto=True,
             height=400, title='RMSE comparison of best vs previous models')
fig.update_yaxes(range=[13, 15])
fig.show()

In [147]:
# Accuracy comparison of top 2 models
fig = px.histogram(weighted_result_df_top, x="threshold", y="train_time",
             color='model_name', barmode='group', text_auto=True,
             height=400, title='train time comparison of best vs previous models')
# fig.update_yaxes(range=[13, 15])
fig.show()

In [148]:
# earth mover almost always better than euclidean distance
earth_mover_result_df['type'] = 'Earth_mover_distance'
weighted_result_df['type'] = 'Euclidiean distance'
earth_plot_df = weighted_result_df.append(earth_mover_result_df)
earth_plot_df = earth_plot_df[(earth_plot_df['model_name'] == 'PW-AE')]
earth_plot_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold,type
0,14.202511,9.961286,362.339037,PW-AE,0.0,Euclidiean distance
1,14.210553,9.969166,269.114585,PW-AE,0.05,Euclidiean distance
2,14.134993,9.90093,330.223091,PW-AE,0.1,Euclidiean distance
3,13.773916,9.642527,267.894557,PW-AE,0.25,Euclidiean distance
4,13.623948,9.511959,310.539581,PW-AE,0.5,Euclidiean distance


In [149]:
# earth mover almost always better than euclidean distance
fig = px.histogram(earth_plot_df, x="threshold", y="RMSE",
             color='type', barmode='group', text_auto=True,
             height=400, title='RMSE comparison of PW-AE earth mover and euclidean distance')
fig.update_yaxes(range=[13, 15])
fig.show()

In [150]:
# earth mover almost always better than euclidean distance
fig = px.histogram(earth_plot_df, x="threshold", y="train_time",
             color='type', barmode='group', text_auto=True,
             height=400, title='Time comparison of PW-AE earth mover and euclidean distance')
# fig.update_yaxes(range=[200, 900])
fig.show()

In [151]:
time_and_acc = weighted_result_df[(weighted_result_df['model_name'] == 'PW-AE')]
time_and_acc = time_and_acc[['RMSE', 'MAE', 'train_time', 'threshold']]
time_and_acc

Unnamed: 0,RMSE,MAE,train_time,threshold
0,14.202511,9.961286,362.339037,0.0
1,14.210553,9.969166,269.114585,0.05
2,14.134993,9.90093,330.223091,0.1
3,13.773916,9.642527,267.894557,0.25
4,13.623948,9.511959,310.539581,0.5
5,13.542933,9.457212,297.800454,0.75
6,13.5388,9.49206,245.464726,1.0


In [152]:
# normalize dataframe 
df_min_max_scaled = time_and_acc.copy()
# apply normalization techniques
for column in df_min_max_scaled.columns[:-1]:
    df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min()) 
df_min_max_scaled

Unnamed: 0,RMSE,MAE,train_time,threshold
0,0.988027,0.984608,1.0,0.0
1,1.0,1.0,0.202353,0.05
2,0.887517,0.866716,0.72521,0.1
3,0.350004,0.361975,0.191914,0.25
4,0.126756,0.106937,0.556793,0.5
5,0.006153,0.0,0.447795,0.75
6,0.0,0.068067,0.0,1.0


In [153]:
fig = px.line(df_min_max_scaled, x='threshold', y=['train_time', 'RMSE'],title='RMSE vs Time')
fig.show()

## b. METR data processing 

In [117]:
# results save path
outlier_model_name = ["AE", "DAE", "PW-AE","HST", "ILOF","Kit-Net"]
data_name = 'METR-LA'
base_result_path = f'../results/{data_name}/LSTM'
exp_non_weighted = f'univariate_ILOF_outlier_non_weighted.pkl'
earth_mover_mode_name = 'PW-AE'
exp_earth_mover = f'univariate_{earth_mover_mode_name}_outlier_weighted_earth_mover.pkl'

In [118]:
non_weighted_result_df = get_results(base_result_path, data_name, exp_non_weighted,'ILOF')
non_weighted_result_df.replace('LOF', 'previous_model', inplace=True)
non_weighted_result_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,7.8181,3.594596,117.427032,previous_model,0.0
1,7.37324,3.430944,127.939163,previous_model,0.05
2,7.421433,3.575357,91.373586,previous_model,0.1
3,8.35308,3.891049,84.910483,previous_model,0.25
4,8.804216,4.078267,76.758607,previous_model,0.5


In [119]:
earth_mover_result_df = get_results(base_result_path, data_name, exp_earth_mover,earth_mover_mode_name)
earth_mover_result_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,6.59858,2.663144,126.684436,PW-AE,0.0
1,6.529431,2.611389,124.194214,PW-AE,0.05
2,7.283436,2.957125,78.365214,PW-AE,0.1
3,7.480658,3.100775,86.167378,PW-AE,0.25
4,7.918084,3.470771,81.914097,PW-AE,0.5


In [120]:
# combine results
weighted_result_df = pd.DataFrame()
for out_mod in outlier_model_name:
    exp_name_weighted = f'univariate_{out_mod}_outlier_weighted.pkl'
    df_weighted = get_results(base_result_path, data_name, exp_name_weighted,out_mod)
    weighted_result_df = weighted_result_df.append(df_weighted)

weighted_result_df = weighted_result_df.append(non_weighted_result_df)
weighted_result_df.head()  

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,6.230096,3.050992,42.094377,AE,0.0
1,5.905414,2.8151,56.113401,AE,0.05
2,5.923784,2.870068,48.046008,AE,0.1
3,5.956229,2.957772,54.877143,AE,0.25
4,7.087019,3.457735,44.110559,AE,0.5


In [121]:
# Accuracy comparison
fig = px.histogram(weighted_result_df, x="threshold", y="RMSE",
             color='model_name', barmode='group',
             height=400, text_auto=True, title='RMSE comparison')
fig.update_yaxes(range=[4, 10])
fig.show()

In [122]:
# LSTM model training time comparison
fig = px.histogram(weighted_result_df, x="threshold", y="train_time",
             color='model_name', barmode='group',
             height=400, title='LSTM model convergence training time comparison', text_auto=True)
# fig.update_yaxes(range=[0, 2100])
fig.show()

In [123]:
# only keep model_name PW-AE and previous_model
weighted_result_df_top = weighted_result_df[(weighted_result_df['model_name'] == 'PW-AE') | (weighted_result_df['model_name'] == 'previous_model')]
weighted_result_df_top.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold
0,6.276688,3.114317,56.724551,PW-AE,0.0
1,5.943635,2.921393,36.121481,PW-AE,0.05
2,5.879924,2.761172,46.180924,PW-AE,0.1
3,6.978379,3.412793,59.475604,PW-AE,0.25
4,7.524788,3.491569,37.902906,PW-AE,0.5


In [124]:
# Accuracy comparison of best vs previous models
fig = px.histogram(weighted_result_df_top, x="threshold", y="RMSE",
             color='model_name', barmode='group', text_auto=True,
             height=400, title='RMSE comparison of best vs previous models')
fig.update_yaxes(range=[4, 10])
fig.show()

In [125]:
# train time comparison of previous vs best models
fig = px.histogram(weighted_result_df_top, x="threshold", y="train_time",
             color='model_name', barmode='group', text_auto=True,
             height=400, title='train time comparison of best vs previous models')
# fig.update_yaxes(range=[4, 10])
fig.show()

In [126]:
# earth mover almost always better than euclidean distance
earth_mover_result_df['type'] = 'Earth_mover_distance'
weighted_result_df['type'] = 'Euclidiean distance'
earth_plot_df = weighted_result_df.append(earth_mover_result_df)
earth_plot_df = earth_plot_df[(earth_plot_df['model_name'] == 'PW-AE')]
earth_plot_df.head()

Unnamed: 0,RMSE,MAE,train_time,model_name,threshold,type
0,6.276688,3.114317,56.724551,PW-AE,0.0,Euclidiean distance
1,5.943635,2.921393,36.121481,PW-AE,0.05,Euclidiean distance
2,5.879924,2.761172,46.180924,PW-AE,0.1,Euclidiean distance
3,6.978379,3.412793,59.475604,PW-AE,0.25,Euclidiean distance
4,7.524788,3.491569,37.902906,PW-AE,0.5,Euclidiean distance


In [129]:
# earth mover almost always better than euclidean distance
fig = px.histogram(earth_plot_df, x="threshold", y="RMSE",
             color='type', barmode='group', text_auto=True,
             height=400, title='RMSE comparison of PW-AE earth mover and euclidean distance')
fig.update_yaxes(range=[4, 9])
fig.show()

In [131]:
# earth mover almost always better than euclidean distance
fig = px.histogram(earth_plot_df, x="threshold", y="train_time",
             color='type', barmode='group', text_auto=True,
             height=400, title='train time comparison of PW-AE earth mover and euclidean distance')
# fig.update_yaxes(range=[4, 9])
fig.show()

In [132]:
time_and_acc = weighted_result_df[(weighted_result_df['model_name'] == 'PW-AE')]
time_and_acc = time_and_acc[['RMSE', 'MAE', 'train_time', 'threshold']]
time_and_acc

Unnamed: 0,RMSE,MAE,train_time,threshold
0,6.276688,3.114317,56.724551,0.0
1,5.943635,2.921393,36.121481,0.05
2,5.879924,2.761172,46.180924,0.1
3,6.978379,3.412793,59.475604,0.25
4,7.524788,3.491569,37.902906,0.5
5,8.453616,4.21552,59.078034,0.75
6,7.823115,4.156466,45.320423,1.0


In [133]:
# normalize dataframe 
df_min_max_scaled = time_and_acc.copy()
# apply normalization techniques
for column in df_min_max_scaled.columns[:-1]:
    df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min()) 
df_min_max_scaled

Unnamed: 0,RMSE,MAE,train_time,threshold
0,0.154161,0.24282,0.882203,0.0
1,0.024755,0.110167,0.0,0.05
2,0.0,0.0,0.430735,0.1
3,0.426801,0.44805,1.0,0.25
4,0.639107,0.502216,0.076279,0.5
5,1.0,1.0,0.982976,0.75
6,0.755021,0.959395,0.393889,1.0


In [135]:
fig = px.line(df_min_max_scaled, x='threshold', y=['train_time', 'RMSE'],title='RMSE vs Time')
fig.show()

# RQ.2 - How does the inclusion of outlier-based updating strategy in the LSTM model affect the accuracy of real-time traffic flow predictions?

## a. Hauge data processing

In [195]:
# results save path
outlier_model_name_list = ['AE','DAE','PW-AE','HST','Kit-Net','ILOF']
data_name = 'hauge'
time_window_list = [3, 6, 12, 24, 24*7, 24*30, 24*30*3, 24*30*6]
RMSE_list = ['No_update','3','6','12','24','168','720','2160','4320']
time_list = ['3','6','12','24','168','720','2160','4320']
threshold = 1
base_result_path = f'../results/{data_name}/real_time_modeling'
base_outlier_result_path = f'../results/{data_name}/outlier_scores'

In [196]:
# outlier instance time 
out_time_dict = {}
for outlier_model_name in outlier_model_name_list:
    out_result_path = os.path.join(base_outlier_result_path, outlier_model_name)
    out_result_path = os.path.join(out_result_path,'instance_train_time_seconds.pkl')
    # read pickle file
    with open(out_result_path, 'rb') as f:
        scores = pickle.load(f)
    target_time_taken = np.mean(scores)/2
    out_time_dict[outlier_model_name] = target_time_taken
out_time_dict

{'AE': 0.8592159946759542,
 'DAE': 2.6607353339592614,
 'PW-AE': 1.502115396161874,
 'HST': 29.787844421962898,
 'Kit-Net': 1.4734095161159833,
 'ILOF': 8.94620082527399}

In [197]:
result_dict = {}
for outlier_model_name in outlier_model_name_list:
    # print(f"Processing {outlier_model_name}")
    result_dict[outlier_model_name] = {}
    exp_name = f'univariate_real_time_{outlier_model_name}_threshold_{threshold}.pkl'
    real_time_load_path = os.path.join(base_result_path, exp_name)
    real_time_results = load_real_time_results(real_time_load_path, data_name)
    for target, value in real_time_results.items():
        result_dict[outlier_model_name][target] = {}
        result_dict[outlier_model_name][target]['No_update'] = value[threshold]['No_update']['RMSE']
        for time_window in time_window_list:
            time_key = str(time_window)
            result_dict[outlier_model_name][target][time_key] = {}
            time_key_RMSE, time_key_train_time = extract_time_key_results(value[threshold][time_key]['incremental_weighted_update'])
            result_dict[outlier_model_name][target][time_key]['RMSE'] = time_key_RMSE
            result_dict[outlier_model_name][target][time_key]['train_time'] = time_key_train_time

In [198]:
global_rmse_df = pd.DataFrame(columns=['time_window', 'RMSE', 'model_name'])
global_time_df = pd.DataFrame(columns=['time_window', 'train_time', 'model_name'])

for out_name, target_results in result_dict.items():
    rmse_dict = {}
    time_dict = {}
    for rmse_name in RMSE_list:
        rmse_dict[rmse_name] = []
    for time_name in time_list:
        time_dict[time_name] = []

    for target_name, time_results in target_results.items():
        for rmse_name in RMSE_list:
            if rmse_name == 'No_update':
                rmse_dict[rmse_name].append(time_results[rmse_name])
            else:
                rmse_dict[rmse_name].append(time_results[rmse_name]['RMSE'])
                time_dict[rmse_name].append(time_results[rmse_name]['train_time'])

    avg_rmse_dict = {}
    avg_time_dict = {}
    for rmse_name in RMSE_list:
        avg_rmse_dict[rmse_name] = np.mean(rmse_dict[rmse_name])
    for time_name in time_list:
        avg_time_dict[time_name] = np.mean(time_dict[time_name])

    rmse_df = pd.DataFrame(avg_rmse_dict.items(), columns=['time_window', 'RMSE'])
    rmse_df['model_name'] = out_name
    global_rmse_df = global_rmse_df.append(rmse_df)
    time_df = pd.DataFrame(avg_time_dict.items(), columns=['time_window', 'train_time'])
    time_df['model_name'] = out_name
    global_time_df = global_time_df.append(time_df)

In [199]:
time_window_name = {'No_update':'No_update', '3':'3 hours','6':'6 hours','12':'12 hours','24':'24 hours','168':'1 week','720':'1 month', '2160':'3 months', '4320':'6 months'}
global_rmse_df['time_window'] = global_rmse_df['time_window'].map(time_window_name)
global_time_df['time_window'] = global_time_df['time_window'].map(time_window_name)

In [201]:
fig = px.histogram(global_rmse_df, x="time_window", y="RMSE",
             color='model_name', barmode='group',
             height=400, text_auto=True, title='RMSE comparison for different time windows')
fig.update_yaxes(range=[14, 28])
fig.show()

In [202]:
# add the time to global_time_df for each outlier model
global_time_df['instance_train_time'] = global_time_df['model_name'].map(out_time_dict)
global_time_df['train_time'] = global_time_df['train_time'] + global_time_df['instance_train_time']
global_time_df.head()   

Unnamed: 0,time_window,train_time,model_name,instance_train_time
0,3 hours,272.827856,AE,0.859216
1,6 hours,135.031604,AE,0.859216
2,12 hours,141.656309,AE,0.859216
3,24 hours,107.707638,AE,0.859216
4,1 week,80.378627,AE,0.859216


In [203]:
fig = px.histogram(global_time_df, x="time_window", y="train_time",
             color='model_name', barmode='group',
             height=400, text_auto=True, title='Training time comparison for different time windows')
# fig.update_yaxes(range=[12, 16])
fig.show()

In [204]:
outlier_time_df = pd.DataFrame(out_time_dict.items(), columns=['model_name', 'instance_train_time'])
fig = px.bar(outlier_time_df, x='model_name', y='instance_train_time', text_auto=True, title='outlier model training time')
fig.show()

## b. METR-LA data processing

In [205]:
# results save path
outlier_model_name_list = ['AE','DAE','PW-AE','HST','Kit-Net','ILOF']
data_name = 'METR-LA'
time_window_list = [3, 6, 12, 24, 24*7, 24*30]
RMSE_list = ['No_update','3','6','12','24','168','720']
time_list = ['3','6','12','24','168','720']
threshold = 0.1
base_result_path = f'../results/{data_name}/real_time_modeling'
base_outlier_result_path = f'../results/{data_name}/outlier_scores'

In [206]:
# outlier instance time 
out_time_dict = {}
for outlier_model_name in outlier_model_name_list:
    out_result_path = os.path.join(base_outlier_result_path, outlier_model_name)
    out_result_path = os.path.join(out_result_path,'instance_train_time_seconds.pkl')
    # read pickle file
    with open(out_result_path, 'rb') as f:
        scores = pickle.load(f)
    target_time_taken = np.mean(scores)/2
    out_time_dict[outlier_model_name] = target_time_taken
out_time_dict

{'AE': 0.12805331732339906,
 'DAE': 0.21488652770645952,
 'PW-AE': 0.22971329424116346,
 'HST': 8.261690311961704,
 'Kit-Net': 0.18272715432632372,
 'ILOF': 1.3503299024369981}

In [207]:
result_dict = {}
for outlier_model_name in outlier_model_name_list:
    print(f"Processing {outlier_model_name}")
    # print(f"Processing {outlier_model_name}")
    result_dict[outlier_model_name] = {}
    exp_name = f'univariate_real_time_{outlier_model_name}_threshold_{threshold}.pkl'
    real_time_load_path = os.path.join(base_result_path, exp_name)
    real_time_results = load_real_time_results(real_time_load_path, data_name)
    for target, value in real_time_results.items():
        result_dict[outlier_model_name][target] = {}
        result_dict[outlier_model_name][target]['No_update'] = value[threshold]['No_update']['RMSE']
        for time_window in time_window_list:
            time_key = str(time_window)
            result_dict[outlier_model_name][target][time_key] = {}
            time_key_RMSE, time_key_train_time = extract_time_key_results(value[threshold][time_key]['incremental_weighted_update'])
            result_dict[outlier_model_name][target][time_key]['RMSE'] = time_key_RMSE
            result_dict[outlier_model_name][target][time_key]['train_time'] = time_key_train_time

Processing AE
Processing DAE
Processing PW-AE
Processing HST
Processing Kit-Net
Processing ILOF


In [208]:
global_rmse_df = pd.DataFrame(columns=['time_window', 'RMSE', 'model_name'])
global_time_df = pd.DataFrame(columns=['time_window', 'train_time', 'model_name'])

for out_name, target_results in result_dict.items():
    rmse_dict = {}
    time_dict = {}
    for rmse_name in RMSE_list:
        rmse_dict[rmse_name] = []
    for time_name in time_list:
        time_dict[time_name] = []

    for target_name, time_results in target_results.items():
        for rmse_name in RMSE_list:
            if rmse_name == 'No_update':
                rmse_dict[rmse_name].append(time_results[rmse_name])
            else:
                rmse_dict[rmse_name].append(time_results[rmse_name]['RMSE'])
                time_dict[rmse_name].append(time_results[rmse_name]['train_time'])

    avg_rmse_dict = {}
    avg_time_dict = {}
    for rmse_name in RMSE_list:
        avg_rmse_dict[rmse_name] = np.mean(rmse_dict[rmse_name])
    for time_name in time_list:
        avg_time_dict[time_name] = np.mean(time_dict[time_name])

    rmse_df = pd.DataFrame(avg_rmse_dict.items(), columns=['time_window', 'RMSE'])
    rmse_df['model_name'] = out_name
    global_rmse_df = global_rmse_df.append(rmse_df)
    time_df = pd.DataFrame(avg_time_dict.items(), columns=['time_window', 'train_time'])
    time_df['model_name'] = out_name
    global_time_df = global_time_df.append(time_df)

In [209]:
time_window_name = {'No_update':'No_update', '3':'3 hours','6':'6 hours','12':'12 hours','24':'24 hours','168':'1 week','720':'1 month', '2160':'3 months', '4320':'6 months'}
global_rmse_df['time_window'] = global_rmse_df['time_window'].map(time_window_name)
global_time_df['time_window'] = global_time_df['time_window'].map(time_window_name)

In [211]:
fig = px.histogram(global_rmse_df, x="time_window", y="RMSE",
             color='model_name', barmode='group',
             height=400, text_auto=True, title='RMSE comparison for different time windows')
fig.update_yaxes(range=[2, 14])
fig.show()

In [212]:
# add the time to global_time_df for each outlier model
global_time_df['instance_train_time'] = global_time_df['model_name'].map(out_time_dict)
global_time_df['train_time'] = global_time_df['train_time'] + global_time_df['instance_train_time']
global_time_df.head()   

Unnamed: 0,time_window,train_time,model_name,instance_train_time
0,3 hours,36.340468,AE,0.128053
1,6 hours,19.426848,AE,0.128053
2,12 hours,19.951791,AE,0.128053
3,24 hours,15.946948,AE,0.128053
4,1 week,13.025129,AE,0.128053


In [213]:
fig = px.histogram(global_time_df, x="time_window", y="train_time",
             color='model_name', barmode='group',
             height=400, text_auto=True, title='Training time comparison for different time windows')
# fig.update_yaxes(range=[12, 16])
fig.show()

In [214]:
outlier_time_df = pd.DataFrame(out_time_dict.items(), columns=['model_name', 'instance_train_time'])
fig = px.bar(outlier_time_df, x='model_name', y='instance_train_time', text_auto=True, title='outlier model training time')
fig.show()

# -------------------------- END ----------------------------

# EXTRA

In [None]:
for key, val in results['T1_North'][0.75]['incremental_weighted_update'].items():
    temp = val['df']
    print(key, val['df'])
    break

2019-03-18 00:00:00       Real  Predicted
0      6.0   8.503822
1      2.0   7.115705
2      6.0   6.609240
3      4.0   6.347741
4      1.0   6.762699
...    ...        ...
8623   5.0   7.490222
8624   7.0   7.134953
8625   3.0   7.974505
8626   1.0   6.641669
8627   3.0   4.435976

[8628 rows x 2 columns]


[6.0,
 2.0,
 6.0,
 4.0,
 1.0,
 6.0,
 4.0,
 8.0,
 5.0,
 5.0,
 9.0,
 3.0,
 2.0,
 4.0,
 3.0,
 7.0,
 3.0,
 1.0,
 4.0,
 0.0,
 0.0,
 1.0,
 0.0,
 4.0,
 5.0,
 4.0,
 4.0,
 1.0,
 1.0,
 1.0,
 1.0,
 2.0,
 1.0,
 2.0,
 2.0,
 2.0,
 3.0,
 4.0,
 2.0,
 0.0,
 0.0,
 4.0,
 4.0,
 0.0,
 2.0,
 4.0,
 2.0,
 3.0,
 3.0,
 5.0,
 4.0,
 7.0,
 3.0,
 3.0,
 9.0,
 6.0,
 2.0,
 0.0,
 6.0,
 8.0,
 9.0,
 9.0,
 8.0,
 7.0,
 6.0,
 9.0,
 18.0,
 12.0,
 10.0,
 9.0,
 14.0,
 17.0,
 10.0,
 11.0,
 13.0,
 21.0,
 11.0,
 14.0,
 17.0,
 32.0,
 36.0,
 40.0,
 27.0,
 23.0,
 33.0,
 22.0,
 25.0,
 37.0,
 25.0,
 38.0,
 31.0,
 35.0,
 48.0,
 47.0,
 42.0,
 41.0,
 37.0,
 47.0,
 35.0,
 37.0,
 33.0,
 47.0,
 51.0,
 50.0,
 56.0,
 72.0,
 58.0,
 68.0,
 50.0,
 37.0,
 83.0,
 94.0,
 65.0,
 81.0,
 64.0,
 56.0,
 82.0,
 83.0,
 74.0,
 74.0,
 66.0,
 68.0,
 51.0,
 81.0,
 90.0,
 76.0,
 91.0,
 62.0,
 77.0,
 90.0,
 94.0,
 57.0,
 95.0,
 89.0,
 76.0,
 96.0,
 90.0,
 82.0,
 104.0,
 105.0,
 86.0,
 87.0,
 107.0,
 77.0,
 93.0,
 86.0,
 99.0,
 64.0,
 118.0,
 92.0,
 81.0,
 111.0

In [None]:
for key, val in results['T1_North'][0.75]['No_update'].items():
    print(key, val['RMSE'])

2019-03-18 00:00:00 10.034977205608882
2019-04-17 00:00:00 10.125589612582637
2019-05-17 00:00:00 10.26841978818012
2019-06-16 00:00:00 10.657804582696627
2019-07-16 00:00:00 10.394426290103299
2019-08-15 00:00:00 9.992311955587327
2019-09-14 00:00:00 10.57775355353195
2019-10-14 00:00:00 10.578103362699437
2019-11-13 00:00:00 11.151554946928925
2019-12-13 00:00:00 10.675813408980103
2020-01-12 00:00:00 9.764005725125983
2020-02-11 00:00:00 10.3348357636312
2020-03-12 00:00:00 10.24559931374693


In [None]:
result_dict

{'DAE': {'3H': 41.397123053008265,
  '3H_no_update': 46.75430629091232,
  '6H': 37.36996224538998,
  '6H_no_update': 43.56662402159998,
  '12H': 23.43286523704221,
  '12H_no_update': 23.864920628094133,
  '24H': 18.8146171252517,
  '24H_no_update': 18.199053084826467,
  '168H': 16.640360187183795,
  '168H_no_update': 15.986966450295373,
  '720H': 15.907854474144743,
  '720H_no_update': 15.12322093794813}}

In [None]:
df1 = pd.DataFrame(result_dict['incremental_weighted_update']) # take mean of all columns except timestamp
df1['mean'] = df1.mean(axis=1)
df1.rename(columns={'mean':'incremental_weighted_update'}, inplace=True) # rename mean column to incremental_weighted_update
df1 = df1[['Timestamp','incremental_weighted_update']] # select only timestamp and incremental_weighted_update columns
df1.head()

Unnamed: 0,Timestamp,incremental_weighted_update
0,2019-02-16 03:00:00,65.759712
1,2019-02-16 06:00:00,38.50881
2,2019-02-16 09:00:00,22.557152
3,2019-02-16 12:00:00,78.023195
4,2019-02-16 15:00:00,92.993168


In [None]:
df2 = pd.DataFrame(result_dict['incremental_static_update']) # take mean of all columns except timestamp
df2['mean'] = df2.mean(axis=1)
df2.rename(columns={'mean':'incremental_static_update'}, inplace=True) # rename mean column to incremental_weighted_update
df2 = df2[['Timestamp','incremental_static_update']]
df2.head()

Unnamed: 0,Timestamp,incremental_static_update
0,2019-02-16 03:00:00,65.232043
1,2019-02-16 06:00:00,37.630549
2,2019-02-16 09:00:00,22.580669
3,2019-02-16 12:00:00,78.710645
4,2019-02-16 15:00:00,92.033099


In [None]:
df3 = pd.DataFrame(result_dict['No_update']) # take mean of all columns except timestamp
df3['mean'] = df3.mean(axis=1)
df3.rename(columns={'mean':'No_update'}, inplace=True) # rename mean column to incremental_weighted_update
df3 = df3[['Timestamp','No_update']]
df3.head()

Unnamed: 0,Timestamp,No_update
0,2019-02-16 03:00:00,65.232043
1,2019-02-16 06:00:00,65.964367
2,2019-02-16 09:00:00,34.41229
3,2019-02-16 12:00:00,42.026543
4,2019-02-16 15:00:00,67.290734


In [None]:
# merge dataframes on timestamp
df = pd.merge(df1, df2, on='Timestamp')
df = pd.merge(df, df3, on='Timestamp')
df.head()

Unnamed: 0,Timestamp,incremental_weighted_update,incremental_static_update,No_update
0,2019-02-16 03:00:00,65.759712,65.232043,65.232043
1,2019-02-16 06:00:00,38.50881,37.630549,65.964367
2,2019-02-16 09:00:00,22.557152,22.580669,34.41229
3,2019-02-16 12:00:00,78.023195,78.710645,42.026543
4,2019-02-16 15:00:00,92.993168,92.033099,67.290734


In [None]:
fig = px.line(df, x='Timestamp', y=['incremental_weighted_update', 'incremental_static_update','No_update'],title='RMSE vs time')
fig.show()

In [None]:
# calculate mean of all columns 
temp = pd.DataFrame(df.mean(axis = 0), columns=['RMSE'])
temp['model'] = temp.index
temp.reset_index(drop=True, inplace=True)
temp

Unnamed: 0,RMSE,model
0,41.270614,incremental_weighted_update
1,40.442322,incremental_static_update
2,46.087276,No_update


In [None]:
fig = px.bar(temp, x='model', y='RMSE', text_auto=True, title='RMSE vs model')
fig.show()

#### ---------------------------- METR-LA data processing ---------------------------- 

In [None]:
# results save path
outlier_model_name = 'DAE'
data_name = 'METR-LA'
time_window = 3
threshold = 0.05
base_result_path = f'../results/{data_name}/real_time_modeling'
exp_name = f'univariate_real_time_{outlier_model_name}_threshold_{threshold}_{time_window}H.pkl'
load_path = os.path.join(base_result_path, exp_name)

In [None]:
# laod results
with open(load_path, 'rb') as f:
    results = pickle.load(f)

In [None]:
model_types = ['incremental_weighted_update','incremental_static_update','No_update']
result_dict = {}
result_dict['incremental_weighted_update'] = {}
result_dict['incremental_static_update'] = {}
result_dict['No_update'] = {}
result_dict['incremental_weighted_update']['Timestamp'] = []
result_dict['incremental_static_update']['Timestamp'] = []
result_dict['No_update']['Timestamp'] = []


for intersection, value1 in results.items():
    data = value1[0.05]
    incremental_weighted_update_data = data['incremental_weighted_update']
    incremental_static_update_data = data['incremental_static_update']
    no_update_data = data['No_update']

    result_dict['incremental_weighted_update'][intersection] = []
    result_dict['incremental_static_update'][intersection] = []
    result_dict['No_update'][intersection] = []
    result_dict['incremental_weighted_update']['Timestamp'] = []
    result_dict['incremental_static_update']['Timestamp'] = []
    result_dict['No_update']['Timestamp'] = []

    for key, value in incremental_weighted_update_data.items():
        result_dict['incremental_weighted_update'][intersection].append(value['RMSE'])
        result_dict['incremental_weighted_update']['Timestamp'].append(key)
    for key, value in incremental_static_update_data.items():
        result_dict['incremental_static_update'][intersection].append(value['RMSE'])
        result_dict['incremental_static_update']['Timestamp'].append(key)
    for key, value in no_update_data.items():
        result_dict['No_update'][intersection].append(value['RMSE'])
        result_dict['No_update']['Timestamp'].append(key)


In [None]:
df1 = pd.DataFrame(result_dict['incremental_weighted_update']) # take mean of all columns except timestamp
df1['mean'] = df1.mean(axis=1)
df1.rename(columns={'mean':'incremental_weighted_update'}, inplace=True) # rename mean column to incremental_weighted_update
df1 = df1[['Timestamp','incremental_weighted_update']] # select only timestamp and incremental_weighted_update columns
df1.head()

Unnamed: 0,Timestamp,incremental_weighted_update
0,2012-04-29 15:00:00,15.794091
1,2012-04-29 18:00:00,44.533632
2,2012-04-29 21:00:00,32.097457
3,2012-04-30 00:00:00,29.35802
4,2012-04-30 03:00:00,9.959567


In [None]:
df2 = pd.DataFrame(result_dict['incremental_static_update']) # take mean of all columns except timestamp
df2['mean'] = df2.mean(axis=1)
df2.rename(columns={'mean':'incremental_static_update'}, inplace=True) # rename mean column to incremental_weighted_update
df2 = df2[['Timestamp','incremental_static_update']]
df2.head()

Unnamed: 0,Timestamp,incremental_static_update
0,2012-04-29 15:00:00,15.963201
1,2012-04-29 18:00:00,43.990069
2,2012-04-29 21:00:00,33.783782
3,2012-04-30 00:00:00,28.429096
4,2012-04-30 03:00:00,12.301638


In [None]:
df3 = pd.DataFrame(result_dict['No_update']) # take mean of all columns except timestamp
df3['mean'] = df3.mean(axis=1)
df3.rename(columns={'mean':'No_update'}, inplace=True) # rename mean column to incremental_weighted_update
df3 = df3[['Timestamp','No_update']]
df3.head()

Unnamed: 0,Timestamp,No_update
0,2012-04-29 15:00:00,15.963201
1,2012-04-29 18:00:00,39.637098
2,2012-04-29 21:00:00,12.889817
3,2012-04-30 00:00:00,19.167736
4,2012-04-30 03:00:00,18.787956


In [None]:
# merge dataframes on timestamp
df = pd.merge(df1, df2, on='Timestamp')
df = pd.merge(df, df3, on='Timestamp')
df.head()

Unnamed: 0,Timestamp,incremental_weighted_update,incremental_static_update,No_update
0,2012-04-29 15:00:00,15.794091,15.963201,15.963201
1,2012-04-29 18:00:00,44.533632,43.990069,39.637098
2,2012-04-29 21:00:00,32.097457,33.783782,12.889817
3,2012-04-30 00:00:00,29.35802,28.429096,19.167736
4,2012-04-30 03:00:00,9.959567,12.301638,18.787956


In [None]:
fig = px.line(df, x='Timestamp', y=['incremental_weighted_update', 'incremental_static_update','No_update'],title='RMSE vs time')
fig.show()

In [None]:
# calculate mean of all columns 
temp = pd.DataFrame(df.mean(axis = 0), columns=['RMSE'])
temp['model'] = temp.index
temp.reset_index(drop=True, inplace=True)
temp

Unnamed: 0,RMSE,model
0,13.516418,incremental_weighted_update
1,13.029078,incremental_static_update
2,18.527539,No_update


In [None]:
fig = px.bar(temp, x='model', y='RMSE', text_auto=True, title='RMSE vs model')
fig.show()

In [None]:
# results save path
base_result_path = '../results/METR-LA/LSTM'
exp_name = 'multivariate_AE_weighted_vector_real_time_results.pkl'
results_save_path = os.path.join(base_result_path, exp_name)

In [None]:
columns_names = ['717508','772151','767366','769405','769388','716951','717497','717483','717468','772596','772597','717469','764120','772178','769941','760024','771673','771667','765604','765176','773927','717480','717456','717495','716953','717481','767572','718371','769403','769359','716943','717491','767610','717446','717452','717453','717447','716956','717490','716942','773880','718204','769358','769402','767573','769953','773062','765164','772140','772168','769372','716571','716968','717492','717486','769819','717445','773869','769831','717450','716941','716955','717493','764858','769373','765171','768469','773012','717580','773953','763995','718499','716339','718089','718076','717608','769467','772669','717595','773013','717583','759772','764424','717804','764949','717582','773996','717592','759602','764781','767471','718064','767470','765273','764794','773954','717578','717587','773995','717585','769926','716328','765265','761003','718072','769847','718066','761599','765099','760650','717816','761604','717590','774012','717576','774204','767495','767494','759591','716331','762329','767523','767053','773024','774011','764766','767509','717818','767455','769444','716939','717825','767454','717819','773974','717570','718496','717821','737529','718045','716337','717571','773975','717573','764760','769867','718090','772513','717610','717823','769443','717572','773023','773939','767541','769418','769430','716958','717461','717460','716554','769431','764101','767554','773904','765182','718141','760987','717502','773906','767542','772167','717099','767350','767620','717489','717462','717463','767609','717488','767621','767351','774067','767751','717513','764106','768066','764853','717498','717473','717472','769806','717466','717499','769345','767585','718379','773916','767750','717504','717510','716949','717458','717465','717459','716960','769346']

In [None]:
with open(results_save_path, 'rb') as f:
    results1 = pickle.load(f)

In [None]:
org_df = pd.DataFrame(results1[0]['df']['Real'],columns=columns_names)
pred_df = pd.DataFrame(results1[0]['df']['Predicted'],columns=columns_names)

In [None]:
sqrt(mean_squared_error(org_df['773062'].values,pred_df['773062'].values))

In [None]:
org_df['717445']

# Real-Time result Analysis

In [75]:
target = '737529'
threshold = 0.05
time_window = 24

In [76]:
base_result_path = '../results/METR-LA/real_time_modeling/'
exp_name = f'univariate_AE_real_time_processing_threshold_{threshold}_{time_window}H_results_{target}.pkl'
results_load_path = os.path.join(base_result_path, exp_name)

In [77]:
# load results
with open(results_load_path, 'rb') as f:
    results_real_time = pickle.load(f)

In [78]:
results_real_time[target][threshold].keys()

dict_keys(['incremental_weighted_update', 'incremental_update', 'No_update', 'base_model'])

In [79]:
# get the average rmse and mae from all the experiments
def get_results(results_real_time, target, threshold, exp = 'No_update'):
    rmse = []
    mae = []
    train_time = []
    for key,val in results_real_time[target][threshold][exp].items():
        rmse.append(val['RMSE'])
        mae.append(val['MAE'])
        if exp != 'No_update':
            train_time.append(val['train_time'])
        else:
            train_time.append(0)
    print(f'RMSE: {np.mean(rmse)}')
    print(f'MAE: {np.mean(mae)}')
    print(f'Train time: {np.mean(train_time)}')
    # return rmse, mae, train_time

In [80]:
get_results(results_real_time, target, threshold, exp = 'incremental_weighted_update')

RMSE: 12.044180856281702
MAE: 7.845166206359863
Train time: 0.021019895197981496


In [81]:
get_results(results_real_time, target, threshold, exp = 'incremental_update')

RMSE: 12.00008107819342
MAE: 7.815212726593018
Train time: 0.01784764710119215


In [82]:
get_results(results_real_time, target, threshold, exp = 'No_update')

RMSE: 11.296915006448161
MAE: 7.359277725219727
Train time: 0.0


In [72]:
get_results(results_real_time, target, threshold, exp = 'incremental_weighted_update')

RMSE: 12.335480324885852
MAE: 10.86701488494873
Train time: 0.005507672962389494


In [73]:
get_results(results_real_time, target, threshold, exp = 'incremental_update')

RMSE: 12.402022270190002
MAE: 10.887054443359375
Train time: 0.005451359497873407


In [74]:
get_results(results_real_time, target, threshold, exp = 'No_update')

RMSE: 16.142935782701766
MAE: 12.458470344543457
Train time: 0.0


In [56]:
get_results(results_real_time, target, threshold, exp = 'incremental_weighted_update')

RMSE: 11.751373090139266
MAE: 10.418865203857422
Train time: 0.005428610851890162


In [57]:
get_results(results_real_time, target, threshold, exp = 'incremental_update')

RMSE: 11.72788030884934
MAE: 10.371858596801758
Train time: 0.00535873764439633


In [58]:
get_results(results_real_time, target, threshold, exp = 'No_update')

RMSE: 15.328964989254983
MAE: 12.401187896728516
Train time: 0.0


In [None]:
# time_and_acc = weighted_result_df[(weighted_result_df['model_name'] == 'PW-AE')]
# time_and_acc = time_and_acc[['RMSE', 'MAE', 'train_time', 'threshold']]
# time_and_acc

In [None]:
# # normalize dataframe 
# df_min_max_scaled = time_and_acc.copy()
# # apply normalization techniques
# for column in df_min_max_scaled.columns[:-1]:
#     df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min()) 
# df_min_max_scaled

In [None]:
# fig = px.line(df_min_max_scaled, x='threshold', y=['train_time', 'RMSE'],title='RMSE vs Threshold')
# fig.show()