# Fourth Notebook: detection_QR.ipynb
* Cross validating on **September** and **October** data
* Requires:
    * `optimized_safe_margin`
    * `optimized_standard_limit`
    * `optimized_test_residual`
* Generates:
    * `optimized_detection_report`

In [1]:
# Render matplotlib figures inside the notebook output cells.
%matplotlib inline

In [2]:
# Load IPython's autoreload extension so edited modules can be re-imported without restarting the kernel.
%load_ext autoreload

In [3]:
# Mode 2: reload all imported modules before executing each cell.
%autoreload 2

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
import random
import importlib
import os
import json
import time
import importlib
import sys

import numpy as np
import pickle5 as pickle
import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import networkx as nx
import matplotlib.dates as md

from pprint import pprint
from copy import deepcopy
from scipy.stats import hmean
from matplotlib.lines import Line2D
from tqdm.notebook import tqdm

# Called without an argument, so the seed is not fixed: any use of `random`
# will differ between runs. Pass an explicit integer here if reproducibility is needed.
random.seed()

In [5]:
# Add the parent of the current working directory to the import path so the
# `src` package can be imported from this notebook.
sys.path.append("..")
# Star import: every public name of src.common_functions lands in the notebook
# namespace and may shadow names defined here.
from src.common_functions import *

### Parameters
* Because the notebooks were split, the actual cluster list is not available at this point. As a stopgap, only its length is set here (via a placeholder list) so that the shared file name can be rebuilt.

In [6]:
# Daily time window (HH:MM) used when filtering the data.
start_time, end_time = '06:00', '20:55'

# (first month, last month) of each data split.
# NOTE(review): the training range starts at 0 although the comment says
# January and `months` numbers January as 1 -- confirm how consumers use it.
training_months = (0, 8)            # January to August
cross_validation_months = (9, 10)   # September and October
testing_months = (11, 12)           # November and December

# Lower-case month name -> calendar month number (1-12).
_month_names = ('january', 'february', 'march', 'april', 'may', 'june',
                'july', 'august', 'september', 'october', 'november',
                'december')
months = {name: number for number, name in enumerate(_month_names, start=1)}

In [7]:
# Version tag that prefixes the cluster and result file names.
clustering_version = '0027'
# Placeholder list: only its length (1) is used, to rebuild the file name
# before the real cluster list is loaded.
cluster_list = [1]

In [8]:
# Confirm directories are in place
def _ensure_dir(*parts):
    """Join ``parts`` into a path, create it if needed and return it.

    Uses ``os.makedirs(..., exist_ok=True)`` so that missing parent folders
    (e.g. ``<version>_incident_ratios``) are created as well and an already
    existing directory is not an error.
    """
    path = os.path.join(*parts)
    os.makedirs(path, exist_ok=True)
    return path


# The raw data must have been downloaded beforehand; it is never created here.
data_dir = os.path.join(os.getcwd(), '../data')
if not os.path.exists(data_dir):
    raise OSError("Must first download data, see README.md")

# Folder holding the generated clusters and the per-version incident ratios.
cluster_dir = _ensure_dir(data_dir, 'generated_clusters')

# Sub-folders of this clustering version's incident-ratio directory.
cleaned_dir = _ensure_dir(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')
incidents_dir = _ensure_dir(cluster_dir, f'{clustering_version}_incident_ratios/incidents')
incidents_GT_dir = _ensure_dir(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')

# Folder the result pickles of this clustering version are read from / written to.
results = _ensure_dir(data_dir, f'{clustering_version}_results')

# Loading cluster list and regenerating filename

In [9]:
# Rebuild the file-name stem used for the result pickles:
#   <clustering version>_<number of clusters>C_<MM-DD-YYYY>
# NOTE(review): the stem embeds today's date, and the cells below load files
# by this name -- they will only be found if they were written under the same date.
_today = datetime.datetime.now().strftime('%m-%d-%Y')
new_filename = "{}_{}C_{}".format(clustering_version, len(cluster_list), _today)
new_filename

'0027_1C_07-09-2021'

In [10]:
# Read the pickled clusters of this clustering version from the cluster folder.
fp = os.path.join(cluster_dir, "{}_clusters.pkl".format(clustering_version))
with open(fp, mode='rb') as clusters_file:
    clusters = pickle.load(clusters_file)

In [11]:
# Replace the placeholder cluster list with the one stored in the results
# folder under the rebuilt file name, then display it.
fp = os.path.join(results, "used_clusters_list_{}.pkl".format(new_filename))
with open(fp, mode='rb') as cluster_list_file:
    cluster_list = pickle.load(cluster_list_file)
cluster_list

[1524373007]

In [13]:
def _load_pickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # that this pipeline generated itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _tau_limits(limit_frame, kappa_col, cluster_id, kappa, sf):
    """Return ``(tau_min, tau_max)`` of the first limit row matching the keys.

    Raises:
        LookupError: if ``limit_frame`` has no row for the combination.
    """
    match = limit_frame[(limit_frame['cluster_id'] == cluster_id) &
                        (limit_frame[kappa_col] == kappa) &
                        (limit_frame['SF'] == sf)]
    if match.empty:
        raise LookupError(
            f"No standard limit for cluster_id={cluster_id!r}, "
            f"kappa={kappa!r}, SF={sf!r}")
    return match['tau_min'].iloc[0], match['tau_max'].iloc[0]


# --- Load the inputs listed under "Requires" ---------------------------------
fp_safe_margin = os.path.join(results, f'optimized_safe_margin_{new_filename}.pkl')
safe_margin = _load_pickle(fp_safe_margin)

fp_standard_limit = os.path.join(results, f'optimized_standard_limit_{new_filename}.pkl')
standard_limit_5C = _load_pickle(fp_standard_limit)
standard_limit_5C_Frame = pd.DataFrame(standard_limit_5C)

fp_test_res = os.path.join(results, f'optimized_test_residual_{new_filename}.pkl')
test_residual = _load_pickle(fp_test_res)

# The limit frame was originally filtered on a column spelled 'ka ppa', while
# the report rows built below use 'kappa'. Keep the original spelling when the
# column exists and fall back to 'kappa' otherwise, so a frame carrying the
# conventional name does not fail with KeyError.
# TODO confirm which spelling the notebook that writes the limits uses.
kappa_col = 'ka ppa' if 'ka ppa' in standard_limit_5C_Frame.columns else 'kappa'

# --- Load and combine every incident-ratio frame -----------------------------
# os.listdir returns entries in arbitrary order; sort so the row order of the
# combined frame (and of the report) is the same on every run.
test_files = sorted(os.listdir(incidents_dir))
info_ratio_incidents = [_load_pickle(os.path.join(incidents_dir, name))
                        for name in test_files]
combined_ratio_frame_incidents = pd.concat(info_ratio_incidents)

# Keep the daily time window and the September-October rows only.
testing = combined_ratio_frame_incidents.between_time(start_time, end_time)
testing = testing[(testing.index.month >= months['september']) &
                  (testing.index.month <= months['october'])]
testing_Clist = testing[cluster_list]
testing_Clist.columns = cluster_list

# --- Detection ---------------------------------------------------------------
# A timestamp is reported when (1) its ratio lies outside the safe margin of
# its (hour, minute) slot and (2) the residual at that timestamp exceeds
# tau_max (positive residual) or falls below tau_min (otherwise).
detection_report = []
for column in tqdm(testing_Clist.columns):
    ratios = testing_Clist[column]
    grouped = ratios.groupby([ratios.index.hour, ratios.index.minute])

    sm_per_C = safe_margin[column]  # safe margins of this cluster, one entry per kappa
    for key, margin in sm_per_C.items():
        res_SF = test_residual[column][key]
        # (tau_min, tau_max) per SF; they do not depend on the time slot, so
        # each is looked up once instead of once per group and row.
        limits_by_sf = {}

        for key1, group in grouped:  # key1 == (hour, minute)
            group = group.dropna()
            outside = group[(group > margin['upper'][key1]) |
                            (group < margin['lower'][key1])]
            if outside.empty:
                continue

            for key2, residuals in res_SF.items():
                if key2 not in limits_by_sf:
                    limits_by_sf[key2] = _tau_limits(
                        standard_limit_5C_Frame, kappa_col, column, key, key2)
                tau_min, tau_max = limits_by_sf[key2]

                for index in outside.index:
                    ruc = residuals[index]
                    if ruc > 0:
                        if ruc > tau_max:
                            detection_report.append(
                                {'cluster_id': column, 'kappa': key, 'SF': key2,
                                 'time': index, 'RUC': ruc, 'tau_max': tau_max})
                    elif ruc < tau_min:
                        detection_report.append(
                            {'cluster_id': column, 'kappa': key, 'SF': key2,
                             'time': index, 'RUC': ruc, 'tau_min': tau_min})

if detection_report:
    detection_report_Frame = pd.DataFrame(detection_report).set_index('time')
else:
    # No detections: an empty record list has no 'time' column to index on,
    # so build the empty frame with the report columns explicitly.
    detection_report_Frame = pd.DataFrame(
        columns=['cluster_id', 'kappa', 'SF', 'time', 'RUC', 'tau_max', 'tau_min']
    ).set_index('time')

# Saving and backing up
fp = os.path.join(results, f"optimized_detection_report_{new_filename}.pkl")
detection_report_Frame.to_pickle(fp)
print(f"Saved optimized_detection_report_{new_filename}.pkl")

  0%|          | 0/1 [00:00<?, ?it/s]

Saved optimized_detection_report_0027_1C_07-09-2021.pkl
