# Fifth Notebook: analyse_detection_QR.ipynb
* Cross validating on **September** and **October** data
* Requires:
    * `optimized_detection_report`
* Generates:
    * `optimized_hyper_mapping`
    * `optimized_actual_detection_Frame`: For use with graphing

In [1]:
%matplotlib inline

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
import random
import importlib
import os
import json
import time
import importlib
import sys

import numpy as np
import pickle5 as pickle
import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import networkx as nx
import matplotlib.dates as md

from pprint import pprint
from copy import deepcopy
from scipy.stats import hmean
from matplotlib.lines import Line2D
from tqdm.notebook import tqdm

random.seed()

In [5]:
sys.path.append("..")
from src.common_functions import *

### Parameters
* Couldn't think of a quick solution to the cluster list since i separated notebooks. just put the length here first

In [6]:
start_time = '06:00'
end_time   = '20:55'
training_months = (0, 8) # January to August
cross_validation_months = (9, 10) # September and October
testing_months = (11, 12) # November and December'
months = {'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5,
          'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10,
          'november': 11, 'december': 12}

In [7]:
clustering_version = '0027'
cluster_list = [1] * 1

In [8]:
# Confirm directories are in place
if not os.path.exists(os.path.join(os.getcwd(), '../data')):
    raise OSError("Must first download data, see README.md")
data_dir = os.path.join(os.getcwd(), '../data')

if not os.path.exists(os.path.join(data_dir, 'generated_clusters')):
    os.mkdir(os.path.join(data_dir, 'generated_clusters'))
cluster_dir = os.path.join(data_dir, 'generated_clusters')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned'))
cleaned_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents'))
incidents_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT'))
incidents_GT_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')

if not os.path.exists(os.path.join(data_dir, f'{clustering_version}_results')):
    os.mkdir(os.path.join(data_dir, f'{clustering_version}_results'))
results = os.path.join(data_dir, f'{clustering_version}_results')

# Loading cluster list and regenerating filename

In [9]:
new_filename = f"{clustering_version}_{len(cluster_list)}C_{datetime.datetime.now().strftime('%m-%d-%Y')}"
new_filename

'0027_1C_07-09-2021'

In [10]:
# Load all clusters

fp = os.path.join(cluster_dir, f'{clustering_version}_clusters.pkl')
with open(fp, 'rb') as handle:
    clusters = pickle.load(handle)

In [11]:
fp = os.path.join(results, f'used_clusters_list_{new_filename}.pkl')
with open(fp, 'rb') as handle:
    cluster_list = pickle.load(handle)
cluster_list

[1524373007]

In [13]:
files_GT = os.listdir(incidents_GT_dir)
incident_GT = []
i = 0
while i< len(files_GT):
    fp = os.path.join(incidents_GT_dir, files_GT[i])
    with open(fp, 'rb') as handle:
        incident_GT.append( pickle.load(handle))
    i+=1
incident_GT_Frame = pd.concat(incident_GT)

In [14]:
fp_detection_report = os.path.join(results, f"optimized_detection_report_{new_filename}.pkl")
with open(fp_detection_report, 'rb') as handle:
    detection_report_Frame = pickle.load(handle)

testing_incident_GT = incident_GT_Frame.between_time(start_time, end_time)
testing_incident_GT = testing_incident_GT[(testing_incident_GT.index.month >= months['september']) & (testing_incident_GT.index.month <= months['october'])]
testing_incident_GT_Clist = testing_incident_GT[testing_incident_GT['cluster_head'].isin(cluster_list)]

group_detection_report_by_cluster_id = detection_report_Frame.groupby('cluster_id')
group_gt_incident_cluster_head = testing_incident_GT_Clist.groupby('cluster_head')
actual_detection = []
detection_GT = []
for key, gorup in tqdm(group_detection_report_by_cluster_id):
    group_by_kappa_sf = gorup.groupby(['kappa','SF'])
    for (key1,key2), group in group_by_kappa_sf:
        for index,row in group.iterrows():
            detection_type = 0
            if key in group_gt_incident_cluster_head.groups.keys():
                for index1,row1 in group_gt_incident_cluster_head.get_group(key).iterrows():
                    #iterate only incidents happend for the cluster 
                    if((index.month == index1.month) and (index.day == index1.day)):
                        #This means incident and detection are on the same day
                        if((index.hour >= (index1.hour-2)) & (index.hour <= (index1.hour+2))):
                            #this means successful detection of the incident
                            detection_type = 1
                            temp1 = {'cluster_id':key,'kappa':key1,'SF':key2,'time':index1}
                            detection_GT.append(temp1)
                        elif((index.hour >= 6) & (index.hour <= 10) or
                            (index.hour >= 16) & (index.hour <= 18)):
                            #this means detected an incident
                            detection_type = 2
                        else:
                            detection_type =3
                        break
                temp = {'cluster_id':key,'kappa':key1,'SF':key2,'time':index,'detection_type':detection_type}
                actual_detection.append(temp)

actual_detection_Frame = pd.DataFrame(actual_detection)
actual_detection_Frame.set_index('time',inplace = True)
detection_GT_Frame = pd.DataFrame(detection_GT)
detection_GT_Frame.set_index('time',inplace = True)

actual_detection_Frame['detection_number'] = 0
group_actual_detection_Frame = actual_detection_Frame.groupby(['cluster_id'])
for key1, group in group_actual_detection_Frame:
    group_c = group.groupby(['kappa','SF'])
    for (key2, key3), grp in group_c:
        current = None
        detection = 0
        grp.sort_index(inplace=True)
        for index,item in grp.iterrows():
            if((current == None)):
                current = index
            else:
                if((current.month == index.month) & (current.day == index.day)):
                    if(current.hour == index.hour):
                        diff = index.minute - current.minute
                        if(diff == 5):
                            grp.at[index,'detection_number'] = detection
                            current = index
                            continue
                        else:
                            detection = detection + 1
                    else:
                        H_diff = index.hour - current.hour
                        if(H_diff == 1):
                            if((index.minute  == 0) & (current.minute == 55)):
                                grp.at[index,'detection_number'] = detection
                                current = index
                                continue
                            else:
                                detection = detection + 1
                        else:
                            detection = detection + 1
                else: 
                    detection = detection + 1
                grp.at[index,'detection_number'] = detection
                current = index
        for index,item in grp.iterrows():
            actual_detection_Frame.at[index,'detection_number'] = item.detection_number

hyper_mapping = {}
group_actual_detection_Frame = actual_detection_Frame.groupby(['cluster_id'])
for key1, group in group_actual_detection_Frame:
    group_c = group.groupby(['kappa','SF'])
    min_fa =  sys.maxsize
    min_decision_fa =  (-1.0)*sys.maxsize
    total_incident = len(testing_incident_GT_Clist[(testing_incident_GT_Clist['cluster_head']==key1)])
    min_missed = sys.maxsize
    print("CLUSTER: ",key1)
    print('total incident: ',total_incident)
    for (key2,key3),grp in group_c:
        valid_detection = len(list(grp[grp['detection_type'] == 1]['detection_number'].unique())) + len(list(grp[grp['detection_type'] == 2]['detection_number'].unique()))
        total_detection = len(list(grp['detection_number'].unique()))
        false_alarm = total_detection - valid_detection
        df3 = detection_GT_Frame[(detection_GT_Frame['cluster_id']==key1)&
                                            (detection_GT_Frame['kappa']==key2)&
                                            (detection_GT_Frame['SF']==key3)]
        df3 = df3[~df3.index.duplicated(keep='first')]
        print('len(df3):',len(df3))
        detection = len(df3)
        fraction_of_detection = detection /total_incident
        print("fraction_of_detection: ",fraction_of_detection)
        missed = abs(total_incident - detection)
        fraction_FA  = false_alarm/ total_detection
        print('fraction_FA: ',fraction_FA)
        decision_factor = fraction_of_detection - fraction_FA
        print('decision_factor:', decision_factor)
        if((min_decision_fa < decision_factor)):
            min_decision_fa = decision_factor
            hyper_mapping[key1] = {'kappa':key2,'SF':key3}
            print('false alarm: ',false_alarm)

print()
print(hyper_mapping)

print()

# Saving and backing up
fp = os.path.join(results, f"optimized_hyper_mapping_{new_filename}.pkl")
with open(fp, 'wb') as handle:
    pickle.dump(hyper_mapping, handle)
    print(f"Saved optimized_hyper_mapping_{new_filename}.pkl")

# Saving and backing up
fp = os.path.join(results, f"optimized_actual_detection_Frame_{new_filename}.pkl")
actual_detection_Frame.to_pickle(fp)
print(f"Saved optimized_actual_detection_Frame_{new_filename}.pkl")

  0%|          | 0/1 [00:00<?, ?it/s]

CLUSTER:  1524373007
total incident:  11
len(df3): 6
fraction_of_detection:  0.5454545454545454
fraction_FA:  0.5898617511520737
decision_factor: -0.04440720569752832
false alarm:  128
len(df3): 6
fraction_of_detection:  0.5454545454545454
fraction_FA:  0.533678756476684
decision_factor: 0.011775788977861468
false alarm:  103
len(df3): 5
fraction_of_detection:  0.45454545454545453
fraction_FA:  0.5268817204301075
decision_factor: -0.07233626588465297
len(df3): 5
fraction_of_detection:  0.45454545454545453
fraction_FA:  0.553763440860215
decision_factor: -0.09921798631476048
len(df3): 6
fraction_of_detection:  0.5454545454545454
fraction_FA:  0.5535714285714286
decision_factor: -0.008116883116883189
len(df3): 6
fraction_of_detection:  0.5454545454545454
fraction_FA:  0.5283018867924528
decision_factor: 0.01715265866209259
false alarm:  84
len(df3): 5
fraction_of_detection:  0.45454545454545453
fraction_FA:  0.5414012738853503
decision_factor: -0.08685581933989578
len(df3): 5
fraction_of