In [1]:
# this notebook is an extension of the code - https://github.com/lucasczz/DAADS to run the autoencoder models and other outlier models
# import libraries
import torch.nn as nn
import pandas as pd
import json
import os
import numpy as np
import pickle
import json
import scipy
from torch.utils.data import Dataset, DataLoader
import torch
from collections import defaultdict
import time
# import matplotlib.pyplot as plt
# import seaborn as sns
from datetime import timedelta

# define random seeds for Neural Networks
torch.manual_seed(0)
np.random.seed(0)

# ignore warnings jupyter notebook
import warnings
warnings.filterwarnings('ignore')


# importing from daads
import sys
# set path for the DAADS package
sys.path.insert(0, os.getcwd().replace('notebooks', 'DAADS'))
from tools.evaluate import aggregate_dataframe, test_then_train

# ========== FUNCTION ==============

In [2]:
# prepare input for AEs, 
# input is combined list of all intersections (vector -> [fpds + hourofday + weekofday + intersection]) for each week and each hour sorted based on time
# output is -> list of feature vector and time for all intersections combined --> [[fpds + hourofday + weekofday + intersection], time]
def prepare_input_for_PWAE(intersection_data):
    """Flatten bucketed intersection data into one list of model-input rows.

    `intersection_data` is indexed as `intersection_data[i][j]` for
    `i` in 0..6 and `j` in 0..23. Element `[0]` of each bucket is an iterable
    of numpy arrays (the feature vectors) and element `[1]` is the iterable of
    matching time instances.

    Returns a list with one row per feature vector: the vector cast to float32
    and converted to a Python list, followed by its time instance and a
    constant 0. Rows are emitted in bucket order (i, then j); they are NOT
    sorted by time here.
    """
    flattened_rows = []
    for outer_idx in range(7):
        for inner_idx in range(24):
            bucket = intersection_data[outer_idx][inner_idx]
            vectors, time_instances = bucket[0], bucket[1]
            flattened_rows.extend(
                vector.astype(np.float32).tolist() + [time_instance, 0]
                for vector, time_instance in zip(vectors, time_instances)
            )

    return flattened_rows

In [3]:
# function to load data from pickle file
def load_pickle(filename):
    """Read the pickle file at `filename` and return the unpickled object.

    Note: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(filename, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [4]:
# function to save outlier results
def save_outlier_results(outlier_results, path):
    """Write outlier scores to a CSV file indexed by timestamp.

    `outlier_results` is a sequence of `[outlier_score, timestamp]` pairs.
    The timestamps are parsed with `pd.to_datetime`, used as the index, and
    the resulting single-column frame is written to `path`.
    """
    scores_frame = pd.DataFrame(outlier_results, columns=["outlier_score", 'timestamp'])
    indexed_frame = scores_frame.assign(
        timestamp=pd.to_datetime(scores_frame['timestamp'])
    ).set_index('timestamp')
    indexed_frame.to_csv(path)

# =========== MAIN ===========

In [5]:
# provide the models you want to train and, if needed, their hyperparameters; for more reference go to - (https://github.com/lucasczz/DAADS)
# Models to run, in order. Each name is passed as `model=` to `test_then_train`.
MODELS = [
    "AE",
    "DAE",
    "PW-AE",
    "ILOF",
    "HST",
]
# MODELS = ["PW-AE"]

# Per-model keyword arguments forwarded to `test_then_train`.
# Models without an entry here (e.g. "ILOF") run with no extra arguments,
# because the lookup uses `CONFIGS.get(model, {})`.
# "OC-SVM" has an entry but is not listed in MODELS above.
CONFIGS = {
    "AE": dict(lr=0.01, latent_dim=0.2),
    "DAE": dict(lr=0.01),
    "PW-AE": dict(lr=0.01),
    "OC-SVM": dict(),
    "HST": dict(n_trees=25, height=15),
}

# When True, "_EMD" is appended to every model's output directory name.
earths_movers_distance = True

# use EMD or RMSE as the metric for the reconstruction error
# to use this, make the necessary changes in the DAADS code
# the loss function for EMD is already added in the DAADS code, but the code needs to be updated when using EMD
# change the loss function in the DAADS code as follows
# 1. go to "OWRI/DAADS/IncrementalTorch/IncrementalTorch/anomaly.py" and change loss_fn="emd" in all the classes for which you want to use EMD
# 2. the available loss functions (file location - "OWRI/DAADS/IncrementalTorch/IncrementalTorch/utils/module_finder") are -
        # "mse": F.mse_loss,
        # "rmse": rmse_loss,
        # "emd": earth_mover_loss,
        # "mae": F.l1_loss,
        # "smooth_mae": F.smooth_l1_loss,
        # "bce": F.binary_cross_entropy,
        # "ce": F.cross_entropy,
        # "kld": F.kl_div,
        # "huber": F.huber_loss

# 3. EMD is defined at the above location as - 
    # # earth mover's distance loss
    # def earth_mover_loss(input, target, size_average=None, reduce=None, reduction="mean"):
    #     # Compute the distance matrix between the bins of the distributions
    #     distance_matrix = cdist(np.arange(len(input)).reshape(-1, 1).astype('float64'), np.arange(len(target)).reshape(-1, 1).astype('float64'), metric='cityblock')
    #     # Compute the EMD between the two distributions
    #     emd_distance = emd(input.astype('float64'), target.astype('float64'), distance_matrix)
    #     return emd_distance

In [6]:
# load config file for hague
# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying
# on the platform's default text encoding, which differs between systems.
with open('../utils/configs.json', encoding='utf-8') as f:
    config = json.load(f)

## ---------------------------- Hague Outlier Processing ---------------------------- 

In [7]:
# Select the Hague dataset and load its featured FPDs from the processed pickle.
# `data_name` is also read by the model loop in the next cell to build the
# input and output paths.
data_name = "hague"
load_fpds_path = f"../data/{data_name}/processed/featured_fpds_raw.pickle"
featured_fpds = load_pickle(load_fpds_path)

In [None]:
# Run every model in MODELS on every Hague intersection and save, per model:
#   - one CSV of outlier scores per (intersection, direction)
#   - a pickle with the list of run times returned by `test_then_train`

# The input pickle depends only on the dataset, so load it once instead of
# once per model.
load_fpds_path = f"../data/{data_name}/processed/featured_fpds_raw.pickle"
featured_fpds = load_pickle(load_fpds_path)
feature_columns = ['var'+str(i) for i in range(1, 13)]

for model_name in MODELS:

    print(f"processing model - {model_name} ")
    AE_model = model_name
    model_save_path = f"../results/{data_name}/outlier_scores/{AE_model}"
    if earths_movers_distance:
        model_save_path = model_save_path + "_EMD"

    os.makedirs(model_save_path, exist_ok=True)

    time_list = []
    # for each trajectory, direction and intersection, run the model
    for trajectory in config['trajectories']:
        for direction in config['trajectories'][trajectory]:
            for intersection in config['trajectories'][trajectory][direction]:
                print(f"processing trajectory - {trajectory},  direction - {direction}, and intersection - {intersection} ")
                intersection_data = featured_fpds[trajectory][direction]['fpds'][intersection]
                intersection_data_flatten = prepare_input_for_PWAE(intersection_data) # flatten data
                df = pd.DataFrame(intersection_data_flatten, columns=feature_columns+['timestamp','Isanomaly']) # convert to dataframe
                # Sort BEFORE extracting the model input. The flattened rows come
                # out in weekday/hour bucket order; previously the records were
                # taken from the unsorted frame while the timestamps were taken
                # from the sorted one, so each score was paired with the wrong
                # timestamp. A stable sort keeps tied timestamps deterministic.
                df = df.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)
                train_data = df[feature_columns].to_dict('records') # convert to dict
                scores,total_time = test_then_train(train_data, dataset='OWRI',model=AE_model,seed=42,**CONFIGS.get(AE_model, {}))
                time_list.append(total_time)
                # presumably `scores` holds one score per record, in input order — verify against test_then_train
                outlier_results = [[x,y] for x,y in zip(scores, df['timestamp'].to_list())]
                # save outlier scores to csv
                AE_score_save_path = os.path.join(model_save_path,f"{intersection}_{direction}.csv") # path to save outlier scores
                save_outlier_results(outlier_results, AE_score_save_path)
                print('-----------------------------------------------')


    # save time to pickle file
    time_save_path = os.path.join(model_save_path,'instance_train_time_seconds.pkl') # path to save time
    with open(time_save_path, 'wb') as f:
        pickle.dump(time_list, f)

## ---------------------------- METR-LA Outlier Processing ---------------------------- 

In [9]:
# Select the METR-LA dataset. Nothing is loaded in this cell; the next cell
# reads `data_name` to build the pickle path and the results directory.
data_name = "METR-LA"

In [None]:
# AE for metr-la dataset
# an input vector is a 12-dimensional vector of the traffic data of the last 12 points (i.e., 5 minutes)
# input is modified according to the daads implementation
# use debug to print out the necessary steps of the input and output for better understanding

# Run every model in MODELS on every METR-LA intersection and save, per model:
#   - one CSV of outlier scores per intersection
#   - a pickle with the list of run times returned by `test_then_train`

# The input pickle depends only on the dataset, so load it once instead of
# once per model.
load_fpds_path = f"../data/{data_name}/processed/featured_fpds_raw.pickle"
featured_fpds = load_pickle(load_fpds_path)
feature_columns = ['var'+str(i) for i in range(1, 13)]

for model_name in MODELS:

    print(f"processing model - {model_name} ")
    AE_model = model_name
    model_save_path = f"../results/{data_name}/outlier_scores/{AE_model}"
    if earths_movers_distance:
        model_save_path = model_save_path + "_EMD"

    os.makedirs(model_save_path, exist_ok=True)

    time_list = []
    for intersection in featured_fpds['fpds']:
        intersection_data = featured_fpds['fpds'][intersection]
        intersection_data_flatten = prepare_input_for_PWAE(intersection_data) # flatten data
        df = pd.DataFrame(intersection_data_flatten, columns=feature_columns+['timestamp','Isanomaly']) # convert to dataframe
        # Sort BEFORE extracting the model input. The flattened rows come out in
        # weekday/hour bucket order; previously the records were taken from the
        # unsorted frame while the timestamps were taken from the sorted one, so
        # each score was paired with the wrong timestamp. A stable sort keeps
        # tied timestamps deterministic.
        df = df.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)
        train_data = df[feature_columns].to_dict('records') # convert to dict
        scores,total_time = test_then_train(train_data, dataset='OWRI',model=AE_model,seed=42,**CONFIGS.get(AE_model, {}))
        time_list.append(total_time)
        # presumably `scores` holds one score per record, in input order — verify against test_then_train
        outlier_results = [[x,y] for x,y in zip(scores, df['timestamp'].to_list())]
        # save outlier scores to csv; the f-string also works for non-string ids
        AE_score_save_path = os.path.join(model_save_path, f"{intersection}.csv") # path to save outlier scores
        save_outlier_results(outlier_results, AE_score_save_path)
        print('-----------------------------------------------')


    # save time to pickle file
    time_save_path = os.path.join(model_save_path,'instance_train_time_seconds.pkl') # path to save time
    with open(time_save_path, 'wb') as f:
        pickle.dump(time_list, f)

## ---------------------------- PEMS-BAY Outlier Processing ---------------------------- 

In [12]:
# Select the PEMS-BAY dataset. Nothing is loaded in this cell; the next cell
# reads `data_name` to build the pickle path and the results directory.
data_name = "PEMS-BAY"

In [None]:
# AE for pems-bay dataset
# an input vector is a 12-dimensional vector of the traffic data of the last 12 points (i.e., 5 minutes)

# Run every model in MODELS on every PEMS-BAY intersection and save, per model:
#   - one CSV of outlier scores per intersection
#   - a pickle with the list of run times returned by `test_then_train`

# The input pickle depends only on the dataset, so load it once instead of
# once per model.
load_fpds_path = f"../data/{data_name}/processed/featured_fpds_raw.pickle"
featured_fpds = load_pickle(load_fpds_path)
feature_columns = ['var'+str(i) for i in range(1, 13)]

for model_name in MODELS:

    print(f"processing model - {model_name} ")
    AE_model = model_name
    model_save_path = f"../results/{data_name}/outlier_scores/{AE_model}"
    if earths_movers_distance:
        model_save_path = model_save_path + "_EMD"

    os.makedirs(model_save_path, exist_ok=True)

    time_list = []
    for intersection in featured_fpds['fpds']:
        intersection_data = featured_fpds['fpds'][intersection]
        intersection_data_flatten = prepare_input_for_PWAE(intersection_data) # flatten data
        df = pd.DataFrame(intersection_data_flatten, columns=feature_columns+['timestamp','Isanomaly']) # convert to dataframe
        # Sort BEFORE extracting the model input. The flattened rows come out in
        # weekday/hour bucket order; previously the records were taken from the
        # unsorted frame while the timestamps were taken from the sorted one, so
        # each score was paired with the wrong timestamp. A stable sort keeps
        # tied timestamps deterministic.
        df = df.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)
        train_data = df[feature_columns].to_dict('records') # convert to dict
        scores, total_time = test_then_train(train_data, dataset='OWRI',model=AE_model,seed=42,**CONFIGS.get(AE_model, {}))
        time_list.append(total_time)
        # presumably `scores` holds one score per record, in input order — verify against test_then_train
        outlier_results = [[x,y] for x,y in zip(scores, df['timestamp'].to_list())]
        # save outlier scores to csv
        AE_score_save_path = os.path.join(model_save_path, str(intersection)+'.csv') # path to save outlier scores
        save_outlier_results(outlier_results, AE_score_save_path)
        print('-----------------------------------------------')


    # save time to pickle file
    time_save_path = os.path.join(model_save_path,'instance_train_time_seconds.pkl') # path to save time
    with open(time_save_path, 'wb') as f:
        pickle.dump(time_list, f)

## ----------- END --------------