In [None]:
import json
import os
import pickle
import time
from collections import defaultdict
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# define random seeds for Neural Networks
torch.manual_seed(0)
np.random.seed(0)

# ignore warnings jupyter notebook
import warnings
warnings.filterwarnings('ignore')


# importing from daads
import sys
sys.path.insert(0, '/Users/himanshu/Master_Thesis/code/OWRI/DAADS')
from tools.evaluate import aggregate_dataframe, test_then_train

# ========== FUNCTION ==============

In [None]:
def prepare_input_for_AEs(intersection_data, intersection):
    """Flatten one intersection's day/hour buckets into time-sorted rows.

    Parameters
    ----------
    intersection_data : nested sequence indexed as ``[day][hour]`` with
        ``day`` in 0..6 and ``hour`` in 0..23. Each bucket holds the feature
        vectors at index 0 and the matching timestamps at index 1.
    intersection : label copied unchanged into the third column of every row.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(n_samples, 3)`` whose columns are
        ``[float32 feature vector, timestamp, intersection]``, ordered by
        timestamp. An input without samples yields shape ``(0, 3)``.
    """
    rows = [
        (fpd.astype(np.float32), time_instance, intersection)
        for day in range(7)
        for hour in range(24)
        for fpd, time_instance in zip(intersection_data[day][hour][0],
                                      intersection_data[day][hour][1])
    ]
    rows.sort(key=lambda row: row[1])  # stable sort on the timestamp

    # Each row mixes an ndarray with scalars, so np.array(rows) is a ragged
    # conversion (an error on recent NumPy releases). Filling a pre-allocated
    # object array keeps the (n, 3) layout that AutoEncoderDataset slices.
    combined = np.empty((len(rows), 3), dtype=object)
    for row_idx, (fpd, time_instance, label) in enumerate(rows):
        combined[row_idx, 0] = fpd
        combined[row_idx, 1] = time_instance
        combined[row_idx, 2] = label
    return combined

In [None]:
def prepare_input_for_PWAE(intersection_data):
    """Flatten one intersection's day/hour buckets into plain Python rows.

    ``intersection_data`` is indexed as ``[day][hour]`` (7 days x 24 hours);
    each bucket holds the feature vectors at index 0 and their timestamps at
    index 1. Every output row is the float32 feature values followed by the
    timestamp and a constant ``0`` placeholder.

    Rows are returned in bucket order (day, then hour, then position inside
    the bucket); they are NOT sorted by timestamp here.
    """
    return [
        [*fpd.astype(np.float32).tolist(), time_instance, 0]
        for day in range(7)
        for hour in range(24)
        for fpd, time_instance in zip(intersection_data[day][hour][0],
                                      intersection_data[day][hour][1])
    ]

In [None]:
def load_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files from a trusted source.
    """
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

In [None]:
def my_collate(batch):
    """Collate a batch of samples into one tensor per sample field.

    ``batch`` is a sequence of equally structured samples. The fields are
    regrouped column-wise (all first fields together, all second fields
    together, ...) and each group is passed to ``torch.tensor``.
    """
    return [torch.tensor(field_values) for field_values in zip(*batch)]

In [None]:
class AutoEncoderDataset(Dataset):
    """Dataset over an ``(n, 3)`` array of ``[features, timestamp, intersection]`` rows.

    Attributes
    ----------
    data : column 0 of the input (one feature vector per sample).
    timestamp : column 1 of the input.
    intersection : column 2 of the input.
    dim : length of the first feature vector.
    """

    def __init__(self, data):
        self.data, self.timestamp, self.intersection = data[:, 0], data[:, 1], data[:, 2]
        # Read the feature dimension from the first sample (needs >= 1 row).
        self.dim = self.data[0].shape[0]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``[feature vector, str(timestamp), str(intersection)]`` for ``idx``."""
        features = self.data[idx]
        # Timestamp and intersection are stringified before being returned.
        return [features, str(self.timestamp[idx]), str(self.intersection[idx])]

In [None]:
class AE(nn.Module):
    """Fully connected autoencoder with a 16-unit bottleneck.

    The encoder narrows ``input_shape -> 512 -> 256 -> 128 -> 64 -> 32 -> 16``
    and the decoder mirrors it back to ``input_shape``. Every linear layer,
    including the final output layer, is followed by a ReLU, so the
    reconstruction is non-negative.
    """

    def __init__(self, input_shape):
        super().__init__()
        encoder_widths = (input_shape, 512, 256, 128, 64, 32, 16)
        # Encoder is built before the decoder, matching the layer creation
        # order (and therefore the seeded parameter initialisation).
        self.enc = self._linear_relu_stack(encoder_widths)
        self.dec = self._linear_relu_stack(encoder_widths[::-1])

    @staticmethod
    def _linear_relu_stack(widths):
        """Build ``Linear -> ReLU`` pairs for each consecutive pair in ``widths``."""
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.dec(self.enc(x))

In [None]:
def AE_outlier_detection(AE_dataset, device, lr, w_d, epochs, intersection):
    """Train an ``AE`` one sample at a time and record each sample's loss.

    The dataset is read in its stored order (``batch_size=1``,
    ``shuffle=False``). For every sample the reconstruction loss is recorded
    before the optimiser step, so it is the score of a sample the model has
    not been updated on yet in that pass.

    Parameters
    ----------
    AE_dataset : dataset exposing ``dim`` and yielding
        ``[features, timestamp, intersection]`` items.
    device : torch device (or device string) the model and batches are moved to.
    lr : SGD learning rate.
    w_d : SGD weight decay.
    epochs : number of passes over the dataset; each pass appends one entry
        per sample to the result.
    intersection : unused; kept so existing callers keep working.

    Returns
    -------
    list of ``[loss, timestamp]`` pairs, in processing order.
    """
    model = AE(AE_dataset.dim)
    model.to(device)
    criterion = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=w_d)
    train_dataloader = DataLoader(AE_dataset, batch_size=1, shuffle=False)

    # --------------------- TRAINING ---------------------
    outlier_results = []
    start = time.time()
    for _ in range(epochs):
        for batch in train_dataloader:
            inputs = batch[0].to(device)   # move to the device once per step
            sensor_time = batch[1][0]      # batch size is 1: first collated timestamp
            reconstruction = model(inputs)
            loss = criterion(inputs, reconstruction)
            outlier_results.append([loss.item(), sensor_time])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    end = time.time()
    print(f"time taken: {timedelta(seconds=end-start)}")
    return outlier_results

In [None]:
def save_outlier_results(outlier_results, path):
    """Write outlier scores to a CSV file indexed by timestamp.

    Parameters
    ----------
    outlier_results : iterable of ``[score, timestamp]`` pairs; the timestamps
        must be parseable by ``pandas.to_datetime``.
    path : destination passed to ``DataFrame.to_csv``. When it is a filesystem
        path, missing parent directories are created first.
    """
    outlier_df = pd.DataFrame(outlier_results, columns=["outlier_score", 'timestamp'])
    outlier_df['timestamp'] = pd.to_datetime(outlier_df['timestamp'])
    outlier_df = outlier_df.set_index('timestamp')

    # to_csv does not create directories; build the parent folder up front so
    # the first run for a new model name does not fail.
    if isinstance(path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    outlier_df.to_csv(path)

# =========== MAIN ===========

In [None]:
# Configurations for my model
# # define AE model parameters
# lr = 1e-2         # learning rate
# w_d = 1e-5        # weight decay
# epochs = 1       # number of epochs
# device = 'mps' if torch.backends.mps.is_available() else 'cpu'

# DAADS MODELS = ["AE", "DAE","PW-AE"]
# NOTE(review): "AE" appears twice in MODELS, and MODELS is not referenced by
# any other cell visible in this notebook.
MODELS = [
    "AE", "AE", "DAE", "RRCF", "HST",
    "PW-AE", "xStream", "Kit-Net", "ILOF",
]

# Keyword arguments forwarded to test_then_train, keyed by model name.
CONFIGS = {
    "AE": dict(lr=0.02, latent_dim=0.1),
    "DAE": dict(lr=0.02),
    "PW-AE": dict(lr=0.1),
    "OC-SVM": dict(),
    "HST": dict(n_trees=25, height=15),
}

# Model evaluated by the main loop; also used as the results sub-folder name.
AE_model = 'PW-AE'

In [None]:
# # load data from pickle file
# load_fpds_path = f"../data/hauge/processed/featured_fpds_raw.pickle"
# featured_fpds = load_pickle(load_fpds_path)

In [None]:
# # load config file
# with open('../utils/configs.json') as f:
#     config = json.load(f)

In [None]:
# # for each trajectory, direction and intersection, run AE model
# for trajectory in config['trajectories']:
#     for direction in config['trajectories'][trajectory]:
#         for intersection in config['trajectories'][trajectory][direction]:
#             # print trajectory, direction, intersection
#             print(f"processing trajectory - {trajectory},  direction - {direction}, and intersection - {intersection} ")
#             intersection_data = featured_fpds[trajectory][direction]['fpds'][intersection]
#             intersection_data_flatten = prepare_input_for_PWAE(intersection_data) # flatten data
#             df = pd.DataFrame(intersection_data_flatten, columns=['var'+str(i) for i in range(1, 13)]+['timestamp','Isanomaly'])
#             train_data = df[['var'+str(i) for i in range(1, 13)]].to_dict('records')
#             scores,total_time = test_then_train(dataset='OWRI',model=AE_model,seed=42,data = train_data,**CONFIGS.get(AE_model, {}))
#             outlier_results = [[x,y] for x,y in zip(scores, df['timestamp'].to_list())]
#             # AE_dataset = AutoEncoderDataset(intersection_data_flatten) # create AE dataset
#             # outlier_results = AE_outlier_detection(AE_dataset, device, lr, w_d, epochs, intersection) # run AE model
#             # save outlier scores to csv
#             AE_score_save_path = f"../results/hauge/outlier_scores/{AE_model}/{intersection}_{direction}.csv"
#             save_outlier_results(outlier_results, AE_score_save_path)
#             print(f"{intersection} done!")
#             print('-----------------------------------------------')

In [None]:
# Read the METR-LA feature pickle; the main loop below iterates featured_fpds['fpds'].
load_fpds_path = "../data/METR-LA/METR_OWRI/featured_fpds_raw.pickle"
featured_fpds = load_pickle(load_fpds_path)

In [None]:
# AE for metr-la dataset
# For every intersection: flatten its buckets, score the samples in
# chronological order with the selected DAADS model, and save one CSV.
feature_cols = ['var' + str(i) for i in range(1, 13)]
for intersection in featured_fpds['fpds']:
    print(f"processing intersection - {intersection} ")
    intersection_data = featured_fpds['fpds'][intersection]
    intersection_data_flatten = prepare_input_for_PWAE(intersection_data)  # flatten data
    df = pd.DataFrame(intersection_data_flatten, columns=feature_cols + ['timestamp', 'Isanomaly'])

    # Sort BEFORE extracting the training records. The flattened rows come in
    # day/hour bucket order, so sorting afterwards would pair each score with
    # the timestamp of a different sample.
    df = df.sort_values(by='timestamp', kind='stable')
    train_data = df[feature_cols].to_dict('records')

    scores, total_time = test_then_train(dataset='OWRI', model=AE_model, seed=42, data=train_data, **CONFIGS.get(AE_model, {}))
    outlier_results = [[x, y] for x, y in zip(scores, df['timestamp'].to_list())]
    # AE_dataset = AutoEncoderDataset(intersection_data_flatten) # create AE dataset
    # outlier_results = AE_outlier_detection(AE_dataset, device, lr, w_d, epochs, intersection) # run AE model
    # save outlier scores to csv
    AE_score_save_path = f"../results/METR-LA/outlier_scores/{AE_model}/{intersection}.csv"
    save_outlier_results(outlier_results, AE_score_save_path)
    print(f"{intersection} done!")
    print('-----------------------------------------------')


In [None]:
# Preview the frame of the last intersection processed by the loop.
df.head()

# EXTRA

In [None]:
# Pick one example intersection for the manual walkthrough in the cells below.
# NOTE(review): this indexes featured_fpds by trajectory/direction, but the only
# active load cell reads the METR-LA pickle, which the main loop accesses as
# featured_fpds['fpds'][...]. The commented-out hauge loop also goes through a
# 'fpds' key (featured_fpds[trajectory][direction]['fpds'][intersection]).
# Confirm which pickle and layout this cell expects before running it.
direction = "North"
trajectory = "T1"
intersection = "K502"
intersection_data = featured_fpds[trajectory][direction][intersection]

In [None]:
# Total number of data points: sum the feature-vector count of all 7 x 24 buckets.
l = sum(
    len(intersection_data[day][hour][0])
    for day in range(7)
    for hour in range(24)
)
print(f"total number of data points for intersection - {intersection} is {l}")

In [None]:
# Flatten the selected intersection's buckets into time-sorted
# [features, timestamp, intersection] rows for the autoencoder dataset.
intersection_data_flatten = prepare_input_for_AEs(intersection_data, intersection)

In [None]:
# Wrap the flattened rows in the torch Dataset and show its feature dimension.
AE_dataset = AutoEncoderDataset(intersection_data_flatten)
AE_dataset.dim # feature vector dimension

In [None]:
# One sample per batch, in stored order, using the default collate function.
train_dataloader = DataLoader(AE_dataset, batch_size=1, shuffle=False)
# Alternative kept for reference: same loader with the custom my_collate.
# train_dataloader = DataLoader(AE_dataset, batch_size=1, shuffle=False, collate_fn=my_collate)

In [None]:
# Sanity check: print the first batch produced by the dataloader, if any.
first_batch = next(iter(train_dataloader), None)
if first_batch is not None:
    print(first_batch)

In [None]:
# Hyper-parameters for the manual single-intersection run.
lr, w_d, epochs = 1e-2, 1e-5, 1   # learning rate, weight decay, passes over the data

# Accumulators filled by the training cell.
metrics = defaultdict(list)
outlier_loss, outlier_results = [], []

# Use the 'mps' backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'


In [None]:
# Build the autoencoder for this feature dimension and place it on the device.
model = AE(AE_dataset.dim).to(device)

# Mean squared reconstruction error, optimised with plain SGD plus weight decay.
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr,
    weight_decay=w_d,
)

In [None]:
# Train the model one sample at a time. Each sample's reconstruction loss is
# recorded before the optimiser step and later used as its outlier score.
# NOTE: outlier_loss / outlier_results are created in the hyper-parameter cell;
# re-running only this cell appends to the existing lists.
start = time.time()
n_samples = len(AE_dataset)
for epoch in range(epochs):
    ep_start = time.time()
    running_loss = 0.0
    for data in train_dataloader:
        bt = data[0].to(device)      # move the batch to the device once
        sensor_time = data[1][0]     # batch size is 1: first collated timestamp
        sample = model(bt)
        loss = criterion(bt, sample) # calculate loss for input and recreated output
        loss_value = loss.item()     # read the scalar once and reuse it
        outlier_loss.append(loss_value)
        outlier_results.append([loss_value, sensor_time])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss_value
    epoch_loss = running_loss / n_samples
    metrics['train_loss'].append(epoch_loss)
    ep_end = time.time()
    print('-----------------------------------------------')
    print('[EPOCH] {}/{}\n[LOSS] {}'.format(epoch+1,epochs,epoch_loss))
    print('Epoch Complete in {}'.format(timedelta(seconds=ep_end-ep_start)))
end = time.time()
print('-----------------------------------------------')
print('[{} Training Completed: {}]'.format(intersection, timedelta(seconds=end-start)))

In [None]:
# Reconstruction loss per training step, in the order the samples were fed to the model.
# (The figure is bound to `fig` rather than `_`, which IPython uses for the last output.)
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.set_title('Loss')
ax.set_xlabel('training step')
ax.set_ylabel('MSE reconstruction loss')
ax.plot(outlier_loss);

In [None]:
# Loss value used as the upper outlier threshold.
# NOTE(review): the name and the original comment say "top 5%", but the code
# takes the 97th percentile, i.e. the cut-off for the top 3% of losses.
# Confirm which is intended and align the name or the value.
top_5_percent = np.percentile(outlier_loss, 97)
top_5_percent

In [None]:
# Distribution of the per-sample reconstruction losses with both thresholds marked.
lower_threshold = 0.0
upper_threshold = top_5_percent
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Loss Distribution')
# sns.distplot is deprecated; histplot with stat='density' and kde=True is the
# documented replacement for a density histogram with a KDE overlay.
sns.histplot(outlier_loss, bins=100, kde=True, stat='density', color='blue', ax=ax)
# axvline's ymin/ymax are axes fractions in [0, 1]; the defaults span the full height.
ax.axvline(upper_threshold, color='r')
ax.axvline(lower_threshold, color='b')
ax.set_xlabel('reconstruction loss');

In [None]:
# Letter-value (boxen) plot of the recorded reconstruction losses.
# NOTE(review): the meaning of the first positional argument differs between
# seaborn releases — consider passing it by keyword once the version is fixed.
sns.boxenplot(outlier_loss)

In [None]:
# Tabulate the recorded [loss, timestamp] pairs; the loss column is named
# after the current intersection.
outlier_df = pd.DataFrame(outlier_results,columns=[intersection,'timestamp'])
outlier_df.head()

In [None]:
# Parse the stringified timestamps into pandas datetimes.
outlier_df['timestamp'] = pd.to_datetime(outlier_df['timestamp'])

In [None]:
# Make the timestamp the index. This mutates outlier_df in place, so after this
# cell 'timestamp' is no longer a column and re-running the cell raises KeyError.
outlier_df.set_index('timestamp',inplace=True)

In [None]:
# Display the timestamp-indexed outlier scores.
outlier_df

In [None]:
# Output CSV for this intersection's scores (path is relative to the notebook's
# working directory).
AE_score_save_path = f"../results/hauge/outlier_scores/AE/{intersection}.csv"

In [None]:
# Write the scores to disk; to_csv does not create missing parent directories.
outlier_df.to_csv(AE_score_save_path)

In [None]:
# Inspect the timestamp type. 'timestamp' was moved into the index by the
# set_index cell, so it has to be read from the index, not as a column.
type(outlier_df.index[0])

In [None]:
# Replace each cell of the two columns with its first element.
# NOTE(review): no visible cell creates an 'intersection' column on outlier_df,
# and 'timestamp' was moved into the index by the set_index cell, so this looks
# like a leftover from an earlier frame layout — confirm before running.
outlier_df['timestamp'] = outlier_df['timestamp'].apply(lambda x: x[0])
outlier_df['intersection'] = outlier_df['intersection'].apply(lambda x: x[0])

In [None]:
# Create separate columns for each intersection (one row per timestamp).
# NOTE(review): this needs 'timestamp', 'intersection' and 'loss' columns; the
# outlier_df built in the visible cells has none of these columns — confirm
# which frame this pivot is meant to run on.
outlier_df_intersection = outlier_df.pivot(index='timestamp', columns='intersection', values='loss')

In [None]:
# Heatmap of the pairwise correlations between the per-intersection score columns.
# The title is hard-coded to trajectory T1 / direction North.
fig, ax = plt.subplots(facecolor='w')
sns.heatmap(outlier_df_intersection.corr())
plt.title("Correlations for T1 North")
plt.xlabel('Intersection')
plt.ylabel('Intersection')
plt.show()

In [None]:
# Keep the correlation matrix for the ranking cell below and display it.
corr_df = outlier_df_intersection.corr()
corr_df

### Comments on correlated intersections
- Except for K198, all intersections are highly correlated.
- It is hard to build intuition because the feature vector is **[fpds + hourofday + weekofday + intersection]**.

In [None]:
# For each intersection (row of corr_df), sort its correlations in descending
# order and keep the labels at positions 1..3.
# Position 0 is skipped — presumably the intersection's own self-correlation;
# TODO confirm this holds when a row contains ties or NaN values.
top_3_corr = corr_df.apply(lambda x: x.sort_values(ascending=False).index[1:4], axis=1)
top_3_corr