# Seventh Notebook: test_analysis_QR.ipynb
* Cross validating on **October**, **November**, and **December** data
* Requires:
    * `optimized_safe_margin`
    * `optimized_standard_limit`
    * `optimized_residual_Test_QR`
* Generates:
    * `optimized_results`
    * `optimized_actual_detection_frame`
    * `optimized_detection_report_Frame`

In [1]:
%matplotlib inline

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
import random
import importlib
import os
import json
import time
import importlib
import sys

import numpy as np
import pickle5 as pickle
import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import networkx as nx
import matplotlib.dates as md

from pprint import pprint
from copy import deepcopy
from scipy.stats import hmean
from matplotlib.lines import Line2D
from tqdm.notebook import tqdm

# Seed Python's `random` module without an explicit value: the generator is
# initialised from a non-fixed source, so any random draws differ between runs.
# NOTE(review): pass a constant here if reproducible results are required.
random.seed()

In [3]:
# Add the parent directory to the module search path so that the `src` package
# can be imported (presumably the notebook is run from a sub-folder of the
# repository root — confirm).
sys.path.append("..")
# NOTE(review): the wildcard import hides which helpers this notebook uses and
# may shadow names defined above; prefer explicit imports where possible.
from src.common_functions import *

### Parameters
* I couldn't think of a quick way to share the cluster list now that the notebooks are separated, so for now only its length is set here (as a placeholder list); the actual list is loaded from disk further down.

In [4]:
# Daily time window (HH:MM) used to slice the time-indexed frames
start_time, end_time = '06:00', '20:55'

# (first, last) month pairs for each data split
training_months = (0, 8)            # January to August
cross_validation_months = (9, 10)   # September and October
testing_months = (11, 12)           # November and December

# Lower-case month name -> calendar month number (1-12)
months = {
    name: number
    for number, name in enumerate(
        ('january', 'february', 'march', 'april', 'may', 'june',
         'july', 'august', 'september', 'october', 'november', 'december'),
        start=1,
    )
}

In [9]:
# Identifier of the clustering run whose artifacts this notebook reads
clustering_version = '0027'
# Placeholder with the right length (25); the real cluster ids are loaded later
cluster_list = [1 for _ in range(25)]

In [6]:
# Confirm directories are in place.
# The raw data folder must already exist; every derived folder is created on
# demand. `os.makedirs(..., exist_ok=True)` is used instead of `os.mkdir`
# because several targets are nested (`<version>_incident_ratios/<name>`) and
# `os.mkdir` fails when the intermediate folder is missing.
data_dir = os.path.join(os.getcwd(), '../data')
if not os.path.exists(data_dir):
    raise OSError("Must first download data, see README.md")

cluster_dir = os.path.join(data_dir, 'generated_clusters')
os.makedirs(cluster_dir, exist_ok=True)

cleaned_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')
os.makedirs(cleaned_dir, exist_ok=True)

incidents_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')
os.makedirs(incidents_dir, exist_ok=True)

incidents_GT_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')
os.makedirs(incidents_GT_dir, exist_ok=True)

results = os.path.join(data_dir, f'{clustering_version}_results')
os.makedirs(results, exist_ok=True)

# Loading cluster list and regenerating filename

In [12]:
# Date stamp embedded in the artifact filenames. It is hard-coded rather than
# taken from today's date (see the commented alternative) — presumably so the
# names match files written by an earlier run; confirm before changing.
file_datetime = '07-14-2021'
# file_datetime = datetime.datetime.now().strftime('%m-%d-%Y')

# Shared filename stem: <clustering version>_<number of clusters>C_<date stamp>
new_filename = f"{clustering_version}_{len(cluster_list)}C_{file_datetime}"
new_filename

'0027_25C_07-14-2021'

In [13]:
# new_filename = f"{clustering_version}_{len(cluster_list)}C_{datetime.datetime.now().strftime('%m-%d-%Y')}"
# new_filename

In [14]:
# Read the pickled cluster definitions for the configured clustering version
clusters_path = os.path.join(cluster_dir, f'{clustering_version}_clusters.pkl')
with open(clusters_path, 'rb') as clusters_file:
    clusters = pickle.load(clusters_file)

In [15]:
# Replace the placeholder `cluster_list` with the ids stored on disk
used_clusters_path = os.path.join(results, f'used_clusters_list_{new_filename}.pkl')
with open(used_clusters_path, 'rb') as used_clusters_file:
    cluster_list = pickle.load(used_clusters_file)
cluster_list

[1524373007,
 1524331139,
 1524367555,
 1524373538,
 1524313548,
 449629894,
 156110240,
 1524356290,
 1524276985,
 449636438,
 429350149,
 1524355946,
 1524343901,
 160092856,
 441552685,
 449614988,
 449631121,
 1524397645,
 1524563195,
 1524340452,
 449617816,
 449614858,
 449621051,
 449629707,
 441420512]

In [16]:
# Unpickle every file in the ground-truth incident folder and stack the
# resulting frames (in directory-listing order) into one frame.
files_GT = os.listdir(incidents_GT_dir)
incident_GT = []
for gt_file_name in files_GT:
    fp = os.path.join(incidents_GT_dir, gt_file_name)
    with open(fp, 'rb') as handle:
        incident_GT.append(pickle.load(handle))
incident_GT_Frame = pd.concat(incident_GT)

In [17]:
# Unpickle every file in the incidents folder and stack the resulting frames
# (in directory-listing order) into one frame.
test_files = os.listdir(incidents_dir)
info_ratio_incidents = []
for ratio_file_name in test_files:
    fp = os.path.join(incidents_dir, ratio_file_name)
    with open(fp, 'rb') as handle:
        info_ratio_incidents.append(pickle.load(handle))
combined_ratio_frame_incidents = pd.concat(info_ratio_incidents)

In [18]:
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as stream:
        return pickle.load(stream)


# Safe margins
fp_safe_margin = os.path.join(results, f'optimized_safe_margin_{new_filename}.pkl')
safe_margin = _read_pickle(fp_safe_margin)

# Standard limits, also exposed as a DataFrame for filtering
fp_standard_limit = os.path.join(results, f'optimized_standard_limit_{new_filename}.pkl')
standard_limit_5C = _read_pickle(fp_standard_limit)
standard_limit_5C_Frame = pd.DataFrame(standard_limit_5C)

# Test residuals
fp_test_res = os.path.join(results, f'optimized_residual_Test_QR_{new_filename}_test.pkl')
test_residual = _read_pickle(fp_test_res)

# Hyper-parameter mapping (the path variable name is reused from the safe
# margin above, as in the original cell)
fp_safe_margin = os.path.join(results, f'optimized_hyper_mapping_{new_filename}.pkl')
hyper_mapping = _read_pickle(fp_safe_margin)

In [19]:
time_start = time.time()

# Per-cluster hyper-parameters; the keys select which columns are evaluated
cross_validated_kappa_SF = hyper_mapping

# Evaluation window: October through December (inclusive)
first_test_month = months['october']
last_test_month = months['december']

# Ground-truth incidents: daily time window -> selected clusters -> test months
testing_incident_GT = incident_GT_Frame.between_time(start_time, end_time)
in_cluster_list = testing_incident_GT['cluster_head'].isin(cluster_list)
testing_incident_GT_Clist = testing_incident_GT[in_cluster_list]
gt_month = testing_incident_GT_Clist.index.month
testing_incident_GT_Clist = testing_incident_GT_Clist[(gt_month >= first_test_month)
                                                      & (gt_month <= last_test_month)]

# Ratio frame: same daily time window and the same months
testing = combined_ratio_frame_incidents.between_time(start_time, end_time)
ratio_month = testing.index.month
testing = testing[(ratio_month >= first_test_month) & (ratio_month <= last_test_month)]

# Keep only the columns that have an entry in the hyper-parameter mapping
selected_columns = list(cross_validated_kappa_SF.keys())
testing_Clist = testing[selected_columns]
testing_Clist.columns = selected_columns

# Build the raw detection report: one record per (cluster, timestamp) whose
# ratio leaves the safe margin AND whose residual crosses the standard limit.
#
# Fixes relative to the previous version:
#   * the standard-limit filter used the column key 'ka ppa' (typo) instead of
#     'kappa', the key used everywhere else in this cell;
#   * `Series.iteritems()` no longer exists in pandas >= 2.0; `Series.items()`
#     is the equivalent that works on old and new versions;
#   * the standard-limit row depends only on (cluster, kappa, SF), so it is
#     filtered once per cluster instead of once per out-of-margin sample.
detection_report = []
for column in testing_Clist:  # one column per cluster
    cluster_series = testing_Clist[column]
    # group samples by time of day; the safe margins are indexed by that key
    grouped = cluster_series.groupby([cluster_series.index.hour,
                                      cluster_series.index.minute])
    sm_per_C = safe_margin[column]  # safe margins of this cluster
    kappa = cross_validated_kappa_SF[column]['kappa']
    SF = cross_validated_kappa_SF[column]['SF']

    # Threshold row(s) for this cluster / kappa / SF combination
    std_limit = standard_limit_5C_Frame[(standard_limit_5C_Frame['cluster_id'] == column) &
                                        (standard_limit_5C_Frame['kappa'] == kappa) &
                                        (standard_limit_5C_Frame['SF'] == SF)]

    for key1, group in grouped:
        for index, item in group.items():
            if pd.isna(item):
                continue
            # Only samples outside the safe margin are candidates
            if (item > sm_per_C[kappa]['upper'][key1]) or (item < sm_per_C[kappa]['lower'][key1]):
                residual = test_residual[column][kappa][SF][index]
                limit_row = std_limit.index[0]
                if residual > 0:
                    # positive residuals are compared against the upper threshold
                    tau_max = std_limit.at[limit_row, 'tau_max']
                    if residual > tau_max:
                        detection_report.append({'cluster_id': column, 'kappa': kappa, 'SF': SF,
                                                 'time': index, 'RUC': residual, 'tau_max': tau_max})
                else:
                    # non-positive residuals are compared against the lower threshold
                    tau_min = std_limit.at[limit_row, 'tau_min']
                    if residual < tau_min:
                        detection_report.append({'cluster_id': column, 'kappa': kappa, 'SF': SF,
                                                 'time': index, 'RUC': residual, 'tau_min': tau_min})

# Index the report by detection time
detection_report_Frame = pd.DataFrame(detection_report).set_index('time')

# Label every detection against the ground-truth incidents of its cluster.
# detection_type values:
#   0 - no ground-truth incident on the same month/day
#   1 - same-day incident whose hour is within +/- 2 of the detection hour
#   2 - same-day incident outside that window, detection hour in 6-10 or 16-18
#   3 - same-day incident outside that window, any other detection hour
# NOTE(review): only the FIRST same-day incident is consulted (the loop stops
# there), and the year is not compared — confirm both are intended.
group_detection_report_by_cluster_id = detection_report_Frame.groupby('cluster_id')
actual_detection = []
detection_GT = []
for key, group in group_detection_report_by_cluster_id:
    # incidents recorded for this cluster only
    cluster_incidents = testing_incident_GT_Clist[testing_incident_GT_Clist['cluster_head'] == key]
    for detected_at in group.index:
        detection_type = 0
        for incident_at in cluster_incidents.index:
            if detected_at.month != incident_at.month or detected_at.day != incident_at.day:
                continue
            # incident and detection fall on the same day
            if incident_at.hour - 2 <= detected_at.hour <= incident_at.hour + 2:
                # detection close enough to the incident: successful detection
                detection_type = 1
                detection_GT.append({'cluster_id': key, 'time': incident_at})
            elif 6 <= detected_at.hour <= 10 or 16 <= detected_at.hour <= 18:
                detection_type = 2
            else:
                detection_type = 3
            break
        actual_detection.append({'cluster_id': key, 'time': detected_at,
                                 'detection_type': detection_type})

actual_detection_Frame = pd.DataFrame(actual_detection).set_index('time')

def _continues_previous_detection(prev, current):
    """Return True when `current` directly follows `prev` in the same detection.

    Two timestamps belong to the same detection when they share month and day
    and `current` is exactly one 5-minute step after `prev`: either the same
    hour with a minute difference of 5, or the next hour going from minute 55
    to minute 0. Any other gap starts a new detection.
    """
    if prev.month != current.month or prev.day != current.day:
        return False
    if prev.hour == current.hour:
        return current.minute - prev.minute == 5
    return current.hour - prev.hour == 1 and current.minute == 0 and prev.minute == 55


# Number the detections of each cluster: rows forming an unbroken run of
# 5-minute steps share one `detection_number`; numbering starts at 0 for every
# cluster and increases with each new run.
#
# Fixes relative to the previous version:
#   * results were written back with `.at[timestamp, ...]` on the full frame,
#     whose time index repeats across clusters, so one cluster's number also
#     overwrote rows of other clusters with the same timestamp; values are now
#     assigned by row position;
#   * grouping by the list `['cluster_id']` yields tuple keys on recent pandas
#     versions, which broke the `cluster_id == key` comparison;
#   * `prev == None` replaced by an identity check.
cluster_ids = actual_detection_Frame['cluster_id'].to_numpy()
detection_times = actual_detection_Frame.index
detection_numbers = np.zeros(len(actual_detection_Frame), dtype='int64')

for key1 in pd.unique(cluster_ids):
    # row positions of this cluster, visited in chronological order
    cluster_positions = sorted(np.flatnonzero(cluster_ids == key1),
                               key=lambda position: detection_times[position])
    detection = 0
    prev = None
    for position in cluster_positions:
        current = detection_times[position]
        if prev is not None and not _continues_previous_detection(prev, current):
            detection += 1
        detection_numbers[position] = detection
        prev = current

actual_detection_Frame['detection_number'] = detection_numbers

# Summarise the detections per cluster, printing and storing the counts:
#   total       - distinct detection numbers
#   detection   - distinct detection numbers containing a type-1 row
#   c_detection - distinct detection numbers containing a type-2 row
#   fa_alarm    - total minus the two counts above
#   count       - ground-truth incidents with at least one type-1 row on the
#                 same month/day and within +/- 2 hours
report = {}
group_by_cluster = actual_detection_Frame.groupby('cluster_id')
for key, group in group_by_cluster:
    incident_frame = testing_incident_GT_Clist[testing_incident_GT_Clist['cluster_head'] == key]
    total_actual_incident = len(incident_frame)

    print('Cluster Id: ', key)
    print('Total Actual Incident: ', total_actual_incident)
    print("incident length: ", len(incident_frame))

    # keep one row per timestamp before counting
    group = group[~group.index.duplicated(keep='first')]
    total = len(group['detection_number'].unique())

    successful = group[group['detection_type'] == 1]
    count = 0
    for incident_time in incident_frame.index:
        same_day = ((successful.index.month == incident_time.month)
                    & (successful.index.day == incident_time.day))
        near_hour = ((successful.index.hour >= (incident_time.hour - 2))
                     & (successful.index.hour <= (incident_time.hour + 2)))
        if (same_day & near_hour).any():
            count += 1

    detection = len(successful['detection_number'].unique())
    c_detection = len(group[group['detection_type'] == 2]['detection_number'].unique())
    fa_alarm = total - detection - c_detection
    print('total: ',total,' detection: ',detection,' c_detection: ',c_detection,' fa_alarm: ',fa_alarm,' count: ',count)

    report[key] = {
        'cluster_id': key,
        'total_actual_incident': total_actual_incident,
        'incident_frame': len(incident_frame),
        'results': {'total': total, 'detection': detection, 'c_detection': c_detection,
                    'fa_alarm': fa_alarm, 'count': count},
    }

# pprint(report)
   
# Persist the three outputs of this notebook under the results folder
results_path = os.path.join(results, f'optimized_results_{new_filename}.pkl')
with open(results_path, 'wb') as results_file:
    pickle.dump(report, results_file)

actual_detection_path = os.path.join(results, f'optimized_actual_detection_frame_{new_filename}.pkl')
actual_detection_Frame.to_pickle(actual_detection_path)

detection_report_path = os.path.join(results, f'optimized_detection_report_Frame_{new_filename}.pkl')
detection_report_Frame.to_pickle(detection_report_path)

# Wall-clock time since `time_start` was recorded
elapsed_time = time.time() - time_start
print(f"Done in {elapsed_time} s")

Cluster Id:  156110240
Total Actual Incident:  12
incident length:  12
total:  512  detection:  29  c_detection:  27  fa_alarm:  456  count:  11
Done in 10.507197141647339 s
