# First Notebook: training_residual_and_Safe_margin.ipynb
* Training on **January** to **August** data
* Requires:
    * None
* Generates:
    * `optimized_residual_train`
    * `optimized_safe_margin`

In [2]:
%matplotlib inline

In [3]:
%load_ext autoreload

In [4]:
%autoreload 2

In [7]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
import random
import importlib
import os
import json
import time
import importlib
import sys

import numpy as np
import pickle5 as pickle
import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import networkx as nx
import matplotlib.dates as md

from pprint import pprint
from copy import deepcopy
from scipy.stats import hmean
from matplotlib.lines import Line2D
from tqdm.notebook import tqdm

random.seed()

In [9]:
sys.path.append("..")
from src.common_functions import *

### Parameters

In [11]:
clustering_version = '0027'

In [36]:
# Confirm directories are in place
if not os.path.exists(os.path.join(os.getcwd(), '../data')):
    raise OSError("Must first download data, see README.md")
data_dir = os.path.join(os.getcwd(), '../data')

if not os.path.exists(os.path.join(data_dir, 'generated_clusters')):
    os.mkdir(os.path.join(data_dir, 'generated_clusters'))
cluster_dir = os.path.join(data_dir, 'generated_clusters')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned'))
cleaned_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents'))
incidents_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT'))
incidents_GT_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')

if not os.path.exists(os.path.join(data_dir, f'{clustering_version}_results')):
    os.mkdir(os.path.join(data_dir, f'{clustering_version}_results'))
results = os.path.join(data_dir, f'{clustering_version}_results')

In [37]:
print(cleaned_dir)
files = os.listdir(cleaned_dir)
pprint(files)

/home/jovyan/work/Anomaly_Detection_2021/training/../data/generated_clusters/0027_incident_ratios/cleaned
['10_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '09_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '07_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '08_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '02_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '06_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '01_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '11_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '04_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '03_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '12_2019_ratios_0027_gran_5_incidents_cleaned.pkl',
 '05_2019_ratios_0027_gran_5_incidents_cleaned.pkl']


# Parameters
* Be sure to run this at the start

In [21]:
start_time = '06:00'
end_time   = '20:55'
training_months = (0, 8) # January to August
cross_validation_months = (9, 10) # September and October
testing_months = (11, 12) # November and December'
months = {'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5,
          'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10,
          'november': 11, 'december': 12}

# Entire Year Data 
First divide into training and testing to do the rest calculation

In [22]:
info_ratio = []
i = 0
while i< len(files):
    fp = os.path.join(cleaned_dir, files[i])
    with open(fp, 'rb') as handle:
        info_ratio.append( pickle.load(handle))
    i+=1
print(len(info_ratio))
combined_ratio_frame = pd.concat(info_ratio)
print(len(combined_ratio_frame))

12
105108


In [23]:
combined_ratio_frame = combined_ratio_frame.between_time(start_time, end_time)
combined_ratio_frame =  combined_ratio_frame[(combined_ratio_frame.index.month >= months['january']) & (combined_ratio_frame.index.month <= months['august'])]
print(len(combined_ratio_frame))
training = combined_ratio_frame

43740


# Select Clusters here
* Right now it selects the 25 clusters with the most incidents throughout 2019

In [30]:
# Load all clusters

fp = os.path.join(cluster_dir, f'{clustering_version}_clusters.pkl')
with open(fp, 'rb') as handle:
    clusters = pickle.load(handle)

In [39]:
files_GT = os.listdir(incidents_GT_dir)
incident_GT = []
i = 0
while i< len(files_GT):
    fp = os.path.join(incidents_GT_dir, files_GT[i])
    with open(fp, 'rb') as handle:
        incident_GT.append( pickle.load(handle))
    i+=1
incident_GT_Frame = pd.concat(incident_GT)

In [49]:
# Adjust the number of clusters
NUMBER_OF_CLUSTERS = 1

In [50]:
_df = incident_GT_Frame.groupby('cluster_head').sum()\
                       .sort_values('Total_Number_Incidents', ascending=False).head(NUMBER_OF_CLUSTERS)
display(_df.head())
cluster_list = _df.index.tolist()
print(len(cluster_list))

Unnamed: 0_level_0,XDSegID,Total_Number_Incidents
cluster_head,Unnamed: 1_level_1,Unnamed: 2_level_1
1524373007,109521962360,86


1


# Filename generation
> Make sure you run this

In [51]:
new_filename = f"{clustering_version}_{len(cluster_list)}C_{datetime.datetime.now().strftime('%m-%d-%Y')}"
new_filename

'0027_1C_07-09-2021'

In [52]:
fp = os.path.join(results, f'used_clusters_list_{new_filename}.pkl')
with open(fp, 'wb') as handle:
    pickle.dump(cluster_list, handle)

In [53]:
info_ratio = []
i = 0
files = os.listdir(cleaned_dir)
while i < len(files):
    fp = os.path.join(cleaned_dir, files[i])
    with open(fp, 'rb') as handle:
        info_ratio.append(pickle.load(handle))
    i += 1
combined_ratio_frame = pd.concat(info_ratio)

combined_ratio_frame = combined_ratio_frame.between_time(start_time, end_time)
combined_ratio_frame =  combined_ratio_frame[(combined_ratio_frame.index.month >= months['january']) & (combined_ratio_frame.index.month <= months['august'])]
training = combined_ratio_frame

training_cluster_list = training[cluster_list]
training_cluster_list.columns = cluster_list
Q_mean_list = {} # Qmean for each of the cluster 
for column in training_cluster_list:
    Q_mean_list[column] = {}
    mad = training_cluster_list[column].mad()
    std = training_cluster_list[column].std()
    median = training_cluster_list[column].median()
    grouped = training_cluster_list[column].groupby([training_cluster_list[column].index.hour,
                                                     training_cluster_list[column].index.minute])
    Q_mean = {}
    for key,group in grouped:
        Q_mean[key] = group.mean()
    Q_mean_list[column]['Q_mean'] = Q_mean
    Q_mean_list[column]['mad'] = mad
    Q_mean_list[column]['std'] = std
    Q_mean_list[column]['median'] = median

# generate safe_margin for all values of kappa
kappa_L = [0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]
safe_margin = {}
for key in Q_mean_list.keys():
    safe_margin[key] = {}
    for k in kappa_L:
        safe_margin[key][k] = {'upper':{},'lower':{}}
        mad = Q_mean_list[key]['std']

        Q_mean = Q_mean_list[key]['Q_mean']
        for key1 in Q_mean.keys(): 
            safe_margin[key][k]['upper'][key1] = Q_mean[key1] + mad * k
            safe_margin[key][k]['lower'][key1] = Q_mean[key1] - mad * k

fp = os.path.join(results, f'optimized_safe_margin_{new_filename}.pkl')
with open(fp, 'wb') as handle:
    pickle.dump(safe_margin, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'Saved optimized_safe_margin_{new_filename}.pkl')
    
residual = {}

for column in tqdm(training_cluster_list.columns):
    grouped = training_cluster_list[column].groupby([training_cluster_list[column].index.hour,
                                                     training_cluster_list[column].index.minute])
    sm_per_C = safe_margin[column]
    R_per_C = {}
    for key in sm_per_C.keys():
        nabla_dict = calculate_nabla(grouped, sm_per_C[key])

        nabla_frame = pd.DataFrame(list(nabla_dict.items()),columns = ['time','nabla'])
        nabla_frame.set_index('time', inplace=True)
        SF_List = [3,5,7,9]
        RUC = {}
        for sf in SF_List:
            RUC[sf] = faster_calculate_residual(nabla_frame,sf)
        R_per_C[key] = RUC

    residual[column] = R_per_C

fp = os.path.join(results, f'optimized_residual_train_{new_filename}.pkl')
with open(fp, 'wb') as handle:
    pickle.dump(residual, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print(f'Saved optimized_residual_train_{new_filename}.pkl')

Saved optimized_safe_margin_0027_1C_07-09-2021.pkl


  0%|          | 0/1 [00:00<?, ?it/s]

Saved optimized_residual_train_0027_1C_07-09-2021.pkl
