In [None]:
import torch.nn as nn
import pandas as pd
import json
import os
import numpy as np
import pickle
import json
import scipy
from torch.utils.data import Dataset, DataLoader
import torch
from collections import defaultdict
import time
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta

# Seed the PyTorch and NumPy global random generators so repeated
# Restart & Run All executions start from the same random state.
torch.manual_seed(0)
np.random.seed(0)

# Silence every Python warning raised in this notebook.
# NOTE(review): this blanket filter also hides deprecation warnings from the
# libraries used below; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

# ========== FUNCTION ==============

In [None]:
def prepare_input_for_AEs(featured_fpds):
    """Flatten per-intersection features into one time-sorted sample table.

    Parameters
    ----------
    featured_fpds : dict
        Maps an intersection id to a nested container indexed as ``[i][j]``
        with ``i`` in 0..6 and ``j`` in 0..23 (presumably day of week and hour
        of day -- confirm against the producer of the pickle). Each slot holds
        a pair ``(feature_vectors, timestamps)``; the two sequences are paired
        element-wise with ``zip``. Per the original notes each feature vector
        is ``[fpds + hourofday + weekofday + intersection]`` and must support
        ``.astype``.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(n_samples, 3)`` with columns
        ``[float32 feature vector, timestamp, intersection id]``, ordered by
        ascending timestamp. An empty input yields shape ``(0, 3)`` so that
        column slicing such as ``result[:, 0]`` still works.
    """
    rows = []
    for intersection, week_grid in featured_fpds.items():
        for i in range(7):
            for j in range(24):
                slot = week_grid[i][j]
                for vector, stamp in zip(slot[0], slot[1]):
                    rows.append((vector.astype(np.float32), stamp, intersection))

    # list.sort is stable, exactly like the sorted() call it replaces.
    rows.sort(key=lambda row: row[1])

    # Fill a pre-allocated object array cell by cell. Calling np.array() on
    # these ragged rows (ndarray + scalar + str) raises ValueError on
    # NumPy >= 1.24 and only worked through a deprecated code path before.
    table = np.empty((len(rows), 3), dtype=object)
    for r, (vector, stamp, intersection) in enumerate(rows):
        table[r, 0] = vector
        table[r, 1] = stamp
        table[r, 2] = intersection
    return table

In [None]:
def load_pickle(filename):
    """Deserialize and return the object stored in the pickle file ``filename``.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    you trust.
    """
    # pickle needs a binary stream; the context manager closes it on exit.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [None]:
def my_collate(batch):
    """Collate a list of samples into one entry per sample field.

    Parameters
    ----------
    batch : list
        Samples as produced by a dataset's ``__getitem__``; every sample is a
        sequence with the same number of fields.

    Returns
    -------
    list
        One element per field. Numeric fields become a ``torch.Tensor`` built
        from that field across the batch. String/bytes fields are returned as
        a plain list, because ``torch.tensor`` cannot hold text and would
        raise on them.
    """
    collated = []
    # zip(*batch) regroups the data field by field instead of sample by sample.
    for samples in zip(*batch):
        if samples and isinstance(samples[0], (str, bytes)):
            collated.append(list(samples))
        else:
            collated.append(torch.tensor(samples))
    return collated

In [None]:
class AutoEncoderDataset(Dataset):
    """Dataset view over a 2-D array whose columns are
    ``[feature vector, timestamp, intersection]``.

    Attributes are ``data`` (column 0), ``timestamp`` (column 1),
    ``intersection`` (column 2) and ``dim``, the length of the first feature
    vector.
    """

    def __init__(self, data):
        # Split the three columns once so item access is a plain index lookup.
        self.data, self.timestamp, self.intersection = data[:, 0], data[:, 1], data[:, 2]
        # Feature dimension is read from the first sample.
        self.dim = self.data[0].shape[0]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``[feature vector, str(timestamp), str(intersection)]`` for ``idx``."""
        features = self.data[idx]
        # Timestamp and intersection are stringified before being returned.
        when = str(self.timestamp[idx])
        where = str(self.intersection[idx])
        return [features, when, where]

In [None]:
class AE(nn.Module):
    """Fully connected autoencoder with a 16-unit bottleneck.

    The encoder narrows ``input_shape -> 512 -> 256 -> 128 -> 64 -> 32 -> 16``
    and the decoder mirrors it back to ``input_shape``. Every linear layer,
    including the last decoder layer, is followed by a ReLU, so the
    reconstruction is never negative.
    """

    def __init__(self, input_shape):
        super().__init__()
        widths = (input_shape, 512, 256, 128, 64, 32, 16)
        # Encoder is built before the decoder, layer by layer in order.
        self.enc = self._linear_relu_stack(widths)
        self.dec = self._linear_relu_stack(widths[::-1])

    @staticmethod
    def _linear_relu_stack(widths):
        """Chain Linear+ReLU pairs through consecutive entries of ``widths``."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.dec(self.enc(x))

# =========== MAIN ===========

In [None]:
# Choose which pre-computed feature file to analyse; direction and trajectory
# are substituted into the pickle file name below.
direction = "North"
trajectory = "T1"
load_fpds_path = f"../data/hauge/processed/featured_fpds_{direction}_{trajectory}.pickle"
# Load the nested per-intersection feature structure from disk.
featured_fpds = load_pickle(load_fpds_path)

In [None]:
# Flatten the per-intersection features into a single sample array sorted by
# timestamp, ready to be wrapped in a Dataset.
combined_fpds_for_AEs = prepare_input_for_AEs(featured_fpds)

In [None]:
# Wrap the combined samples in a Dataset, then display the feature-vector
# length, which is later used as the autoencoder's input size.
AE_dataset = AutoEncoderDataset(combined_fpds_for_AEs)  # create dataset object
AE_dataset.dim # feature vector dimension

In [None]:
# batch_size=1 means every training step sees exactly one sample, so the loss
# recorded per step is a per-sample reconstruction error; shuffle=False keeps
# the dataset's stored order.
train_dataloader = DataLoader(AE_dataset, batch_size=1, shuffle=False)
# Alternative kept for reference: same loader with the custom collate function.
# train_dataloader = DataLoader(AE_dataset, batch_size=1, shuffle=False, collate_fn=my_collate)

In [None]:
# Sanity check: print the first batch the loader yields, then stop iterating.
for i in train_dataloader:
    print(i)
    break

In [None]:
# Training hyper-parameters and result containers.
lr = 1e-2         # learning rate
w_d = 1e-5        # weight decay
epochs = 1
metrics = defaultdict(list)   # per-epoch aggregates, e.g. metrics['train_loss']
outlier_loss = []             # reconstruction loss of each training step
outlier_results = []          # [loss, timestamp, intersection] per training step

# Pick the best available device. torch.backends.mps is looked up with
# getattr because PyTorch builds without the MPS backend do not have that
# attribute, and CUDA is used when present instead of falling back to CPU.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'


In [None]:
# Build the autoencoder sized to the feature vector and move its parameters
# to the selected device.
model = AE(AE_dataset.dim)
model.to(device)
# Mean squared reconstruction error, minimised with SGD plus weight decay.
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=w_d)

In [None]:
# Train the autoencoder and record the reconstruction loss of every step.
n_batches = len(train_dataloader)
# Report progress about four times per epoch. max() keeps the modulus
# non-zero when there are fewer than four batches (the previous
# `bx % int(len/4)` raised ZeroDivisionError in that case).
log_every = max(1, n_batches // 4)

start = time.time()
for epoch in range(epochs):
    ep_start = time.time()
    running_loss = 0.0
    # Keep only the current epoch's per-step records: re-running this cell or
    # training for several epochs would otherwise store the same
    # (timestamp, intersection) pair more than once, and the later
    # DataFrame.pivot raises on duplicate index/column pairs.
    outlier_loss.clear()
    outlier_results.clear()
    for bx, data in enumerate(train_dataloader):
        bt = data[0].to(device)       # features; moved to the device once
        sensor_time = data[1]
        intersection = data[2]
        sample = model(bt)            # reconstruction of the input
        loss = criterion(sample, bt)  # MSE is symmetric, so the value is unchanged
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_value = loss.item()      # read the scalar once and reuse it
        outlier_loss.append(loss_value)
        outlier_results.append([loss_value, sensor_time, intersection])
        running_loss += loss_value
        # print the running average loss roughly every 25% of the batches
        if bx % log_every == 0:
            print('[EPOCH] {}/{}\t[BATCH] {}/{}\t[LOSS] {}'.format(epoch+1,epochs,bx+1,n_batches,running_loss/(bx+1)))
    # Average over the number of batches actually summed; guard the empty case.
    epoch_loss = running_loss / n_batches if n_batches else float('nan')
    metrics['train_loss'].append(epoch_loss)
    ep_end = time.time()
    print('-----------------------------------------------')
    print('[EPOCH] {}/{}\n[LOSS] {}'.format(epoch+1,epochs,epoch_loss))
    print('Epoch Complete in {}'.format(timedelta(seconds=ep_end-ep_start)))
end = time.time()
print('-----------------------------------------------')
print('[System Complete: {}]'.format(timedelta(seconds=end-start)))

In [None]:
# Reconstruction loss of each training step, in training order.
# The figure is bound to `fig` rather than `_`, because `_` is IPython's
# last-output variable and assigning to it shadows that feature.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.set_title('Loss')
ax.set_xlabel('Training step')
ax.set_ylabel('Reconstruction loss (MSE)')
ax.plot(outlier_loss);  # trailing ';' hides the Line2D repr in the cell output

In [None]:
# Loss value that separates the top 5% of reconstruction losses from the rest.
# The 95th percentile is used so the value matches the "top 5%" name; the
# previous 97th percentile actually selected the top 3%.
top_5_percent = np.percentile(outlier_loss, 95)
top_5_percent

In [None]:
# Histogram + KDE of the per-step losses with the outlier thresholds marked.
lower_threshold = 0.0
upper_threshold = top_5_percent
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Loss Distribution')
ax.set_xlabel('Reconstruction loss (MSE)')
# sns.distplot is deprecated; histplot with stat='density' draws the same
# density-normalised histogram with a KDE overlay.
sns.histplot(outlier_loss, bins=100, kde=True, stat='density', color='blue', ax=ax)
# axvline's ymin/ymax are axes fractions in [0, 1]; the defaults already span
# the full height, so the out-of-range value 10 is dropped.
ax.axvline(upper_threshold, color='r')
ax.axvline(lower_threshold, color='b');

In [None]:
# Boxen (letter-value) plot of the per-step reconstruction losses.
# NOTE(review): the data is passed positionally; how seaborn interprets the
# first positional argument may differ between releases -- TODO confirm and
# consider passing it by keyword.
sns.boxenplot(outlier_loss)

In [None]:
# One row per training step: the reconstruction loss together with the
# timestamp and intersection fields of the batch that produced it.
outlier_df = pd.DataFrame(outlier_results,columns=['loss','timestamp','intersection'])
outlier_df.head()

In [None]:
def _first_if_sequence(value):
    """Return ``value[0]`` for a list/tuple wrapper, otherwise ``value`` unchanged."""
    return value[0] if isinstance(value, (list, tuple)) else value

# The DataLoader delivers each string field wrapped in a one-element
# sequence (batch_size=1). Unwrap only when the cell still holds a sequence,
# so re-running this cell does not reduce the strings to their first character.
outlier_df['timestamp'] = outlier_df['timestamp'].apply(_first_if_sequence)
outlier_df['intersection'] = outlier_df['intersection'].apply(_first_if_sequence)

In [None]:
# Reshape to wide form: one row per timestamp, one column per intersection,
# with the reconstruction loss as the cell value.
outlier_df_intersection = outlier_df.pivot(index='timestamp', columns='intersection', values='loss')

In [None]:
# Heatmap of the pairwise correlation between the intersections' loss series.
fig, ax = plt.subplots(facecolor='w')
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(outlier_df_intersection.corr(), ax=ax)
# Build the title from the selected data set so it cannot drift from the
# file that was actually loaded.
ax.set_title(f"Correlations for {trajectory} {direction}")
ax.set_xlabel('Intersection')
ax.set_ylabel('Intersection')
plt.show()

In [None]:
# Pairwise correlation matrix of the per-intersection loss columns
# (DataFrame.corr with its default settings), displayed as the cell output.
corr_df = outlier_df_intersection.corr()
corr_df

### Comments on correlated intersections
- Except for K198, all intersections are highly correlated.
- It is hard to build an intuition for these correlations because the feature vector is **[fpds + hourofday + weekofday + intersection]**.

In [None]:
# For each intersection, list the three other intersections whose loss series
# correlate most strongly with it. The row's own label is dropped explicitly
# instead of assuming the self-correlation sorts first, which fails when
# another intersection ties at 1.0 or the diagonal is NaN.
top_3_corr = corr_df.apply(
    lambda row: row.drop(labels=row.name).sort_values(ascending=False).index[:3],
    axis=1,
)
top_3_corr
top_3_corr