# Sixth Notebook: test_residual_QR.ipynb
* Cross validating on **October**, **November**, and **December** data
* Requires:
    * `optimized_safe_margin`
    * `optimized_hyper_mapping`
* Generates:
    * `optimized_residual`: Not to be confused with `optimized_test_residual` generated in notebook three

In [1]:
%matplotlib inline

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [4]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
import random
import importlib
import os
import json
import time
import importlib
import sys

import numpy as np
import pickle5 as pickle
import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import networkx as nx
import matplotlib.dates as md

from pprint import pprint
from copy import deepcopy
from scipy.stats import hmean
from matplotlib.lines import Line2D
from tqdm.notebook import tqdm

# Called without an argument, so the seed comes from system entropy / the current time
# and differs between runs; pass a fixed integer here if repeatable draws are required.
# NOTE(review): `random` is not otherwise used in the visible part of this notebook.
random.seed()

In [5]:
sys.path.append("..")
from src.common_functions import *

### Parameters
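* The real cluster list is not available at this point because the pipeline is split across notebooks, so a placeholder list of the expected length is defined here first; the actual list is loaded from disk further down.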

In [6]:
# Daily observation window (HH:MM); applied further down via DataFrame.between_time.
start_time, end_time = '06:00', '20:55'

# (first, last) month numbers of each data split.
# NOTE(review): these three tuples are not referenced in the visible part of this
# notebook; the test filter below uses months['october']..months['december'] instead.
training_months = (0, 8)  # January to August
cross_validation_months = (9, 10)  # September and October
testing_months = (11, 12)  # November and December

# Lower-case month name -> calendar month number (1-12).
months = {
    month_name: month_number
    for month_number, month_name in enumerate(
        ['january', 'february', 'march', 'april', 'may', 'june',
         'july', 'august', 'september', 'october', 'november', 'december'],
        start=1,
    )
}

In [16]:
# Version tag used as a prefix for cluster directories and result files.
clustering_version = '0027'
# Placeholder of 25 dummy entries: only its length is used (for the filename stem)
# until the real cluster list is loaded from disk later in the notebook.
cluster_list = [1 for _ in range(25)]

In [17]:
# Confirm directories are in place.
# The raw data directory must already exist; everything else is created on demand.
data_dir = os.path.join(os.getcwd(), '../data')
if not os.path.exists(data_dir):
    raise OSError("Must first download data, see README.md")

# Build every path exactly once so the existence check and the assignment cannot drift apart.
cluster_dir = os.path.join(data_dir, 'generated_clusters')
cleaned_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')
incidents_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')
incidents_GT_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')
results = os.path.join(data_dir, f'{clustering_version}_results')

# os.makedirs also creates the intermediate '<version>_incident_ratios' directory;
# the previous os.mkdir calls raised FileNotFoundError when that parent was missing.
# exist_ok=True makes the cell safe to re-run.
for _required_dir in (cluster_dir, cleaned_dir, incidents_dir, incidents_GT_dir, results):
    os.makedirs(_required_dir, exist_ok=True)

# Loading cluster list and regenerating filename

In [31]:
# Date stamp that is part of the artifact filenames loaded below; it is pinned to a
# fixed run so that the matching pickles in the results directory are found.
file_datetime = '07-14-2021'
# Alternative: stamp with today's date instead of the pinned run.
# file_datetime = datetime.datetime.now().strftime('%m-%d-%Y')

# Shared filename stem: <clustering version>_<number of clusters>C_<date stamp>.
# Only len(cluster_list) matters here, so the placeholder list is sufficient.
new_filename = f"{clustering_version}_{len(cluster_list)}C_{file_datetime}"
new_filename

'0027_25C_07-14-2021'

In [32]:
# Read the pickled clusters for this clustering version from the cluster directory.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted artifacts.
clusters_path = os.path.join(cluster_dir, f'{clustering_version}_clusters.pkl')
with open(clusters_path, 'rb') as clusters_file:
    clusters = pickle.load(clusters_file)

In [33]:
# Replace the placeholder cluster_list with the list stored in the results directory.
used_clusters_path = os.path.join(results, f'used_clusters_list_{new_filename}.pkl')
with open(used_clusters_path, 'rb') as used_clusters_file:
    cluster_list = pickle.load(used_clusters_file)
cluster_list

[1524373007,
 1524331139,
 1524367555,
 1524373538,
 1524313548,
 449629894,
 156110240,
 1524356290,
 1524276985,
 449636438,
 429350149,
 1524355946,
 1524343901,
 160092856,
 441552685,
 449614988,
 449631121,
 1524397645,
 1524563195,
 1524340452,
 449617816,
 449614858,
 449621051,
 449629707,
 441420512]

In [34]:
# Load every pickled ground-truth frame and stack them into one DataFrame.
# os.listdir returns names in arbitrary order, so the listing is sorted to make the
# row order of the concatenated frame reproducible across machines and runs.
files_GT = sorted(os.listdir(incidents_GT_dir))
incident_GT = []
for file_name in files_GT:
    fp = os.path.join(incidents_GT_dir, file_name)
    with open(fp, 'rb') as handle:
        incident_GT.append(pickle.load(handle))
# pd.concat raises ValueError if the directory contained no files.
incident_GT_Frame = pd.concat(incident_GT)

# Using the hyper parameters generated through cross validation

In [35]:
# Load the per-cluster hyper parameter mapping stored in the results directory.
fp_hyper_mapping = os.path.join(results, f'optimized_hyper_mapping_{new_filename}.pkl')
with open(fp_hyper_mapping, 'rb') as hyper_mapping_file:
    hyper_mapping = pickle.load(hyper_mapping_file)

In [36]:
# Display the loaded mapping: {cluster id: {'kappa': ..., 'SF': ...}}.
hyper_mapping

{156110240: {'kappa': 0.25, 'SF': 3},
 160092856: {'kappa': 0.25, 'SF': 3},
 429350149: {'kappa': 0.25, 'SF': 3},
 441420512: {'kappa': 2.0, 'SF': 9},
 441552685: {'kappa': 0.25, 'SF': 7},
 449614858: {'kappa': 0.25, 'SF': 5},
 449614988: {'kappa': 0.25, 'SF': 5},
 449617816: {'kappa': 0.25, 'SF': 7},
 449621051: {'kappa': 0.25, 'SF': 3},
 449629707: {'kappa': 0.25, 'SF': 3},
 449629894: {'kappa': 0.25, 'SF': 9},
 449631121: {'kappa': 0.25, 'SF': 5},
 449636438: {'kappa': 0.25, 'SF': 9},
 1524276985: {'kappa': 0.25, 'SF': 5},
 1524313548: {'kappa': 2.0, 'SF': 3},
 1524331139: {'kappa': 0.25, 'SF': 3},
 1524340452: {'kappa': 0.25, 'SF': 7},
 1524343901: {'kappa': 0.25, 'SF': 5},
 1524355946: {'kappa': 0.25, 'SF': 3},
 1524356290: {'kappa': 0.25, 'SF': 3},
 1524367555: {'kappa': 0.25, 'SF': 5},
 1524373007: {'kappa': 0.25, 'SF': 5},
 1524373538: {'kappa': 0.25, 'SF': 7},
 1524397645: {'kappa': 0.25, 'SF': 5},
 1524563195: {'kappa': 1.25, 'SF': 5}}

# Manually selecting hyper parameters for checking
* Just create a new dictionary of the form `{cluster_head: {'kappa': KAPPA, 'SF': SF}}` with one entry for each `cluster_head`
* Try to manually limit the `cluster_list`

In [41]:
# Manual override: replaces the mapping loaded from disk with a single cluster, so
# every later step that iterates over hyper_mapping only processes this cluster.
hyper_mapping = {156110240: dict(kappa=0.25, SF=3)}
hyper_mapping

{156110240: {'kappa': 0.25, 'SF': 3}}

In [42]:
# Alias (same dict object, not a copy) for the hyper parameters used in this run.
cross_validated_kappa_SF = hyper_mapping

# Load the safe margins; they are indexed below as safe_margin[cluster][kappa].
safe_margin_path = os.path.join(results, f'optimized_safe_margin_{new_filename}.pkl')
with open(safe_margin_path, 'rb') as safe_margin_file:
    safe_margin = pickle.load(safe_margin_file)

# Load every pickled incident-ratio frame and stack them into one DataFrame.
# os.listdir returns names in arbitrary order, so the listing is sorted to make the
# row order of the concatenated frame reproducible across machines and runs.
test_files = sorted(os.listdir(incidents_dir))
info_ratio_incidents = []
for file_name in test_files:
    fp = os.path.join(incidents_dir, file_name)
    with open(fp, 'rb') as handle:
        info_ratio_incidents.append(pickle.load(handle))
# pd.concat raises ValueError if the directory contained no files.
combined_ratio_frame_incidents = pd.concat(info_ratio_incidents)

# Keep only rows whose time of day lies inside the configured daily window.
combined_ratio_frame_incidents = combined_ratio_frame_incidents.between_time(start_time, end_time)

# Keep only October through December (inclusive).
# NOTE(review): testing_months in the parameter cell is (11, 12), which does not
# include October -- confirm which month range is intended for testing.
row_month = combined_ratio_frame_incidents.index.month
in_test_months = (row_month >= months['october']) & (row_month <= months['december'])
combined_ratio_frame_incidents = combined_ratio_frame_incidents[in_test_months]
testing = combined_ratio_frame_incidents

# Restrict the columns to the clusters that have hyper parameters.
selected_clusters = list(cross_validated_kappa_SF.keys())
testing_Clist = testing[selected_clusters]
testing_Clist.columns = selected_clusters

test_residual = {}
for column in tqdm(testing_Clist.columns):
    # Bucket this cluster's observations by time of day (hour, minute).
    cluster_series = testing_Clist[column]
    grouped = cluster_series.groupby([cluster_series.index.hour,
                                      cluster_series.index.minute])

    # Hyper parameters selected for this cluster.
    kappa = cross_validated_kappa_SF[column]['kappa']
    SF = cross_validated_kappa_SF[column]['SF']

    # calculate_nabla is not defined in this notebook (presumably it comes from the
    # star import of src.common_functions). It receives the time-of-day groups and
    # this cluster's safe margin for the chosen kappa.
    # NOTE(review): assumed to return a dict of timestamp -> nabla; confirm.
    nabla_dict = calculate_nabla(grouped, safe_margin[column][kappa])
    nabla_frame = pd.DataFrame(list(nabla_dict.items()),
                               columns=['time', 'nabla']).set_index('time')

    # Sum nabla over a sliding window of SF rows, restarting for every calendar day.
    RUCsf = {}
    for _, day_nabla in nabla_frame.groupby(nabla_frame.index.floor('D')):
        windowed = day_nabla.rolling(SF, min_periods=SF).sum()
        # The first SF rows of each day are replaced by the raw nabla values.
        # NOTE(review): with min_periods=SF only the first SF-1 rows are NaN, so row
        # SF-1 already holds a full-window sum that is overwritten here. Confirm the
        # slice bound is intended (and matches how the thresholds were produced).
        windowed[0:SF] = day_nabla[0:SF]
        RUCsf.update(windowed.to_dict()['nabla'])

    # Result layout: {cluster: {kappa: {SF: {time: residual}}}}.
    test_residual[column] = {kappa: {SF: RUCsf}}
    
# Write the residuals to the results directory; any existing file with this name is overwritten.
residual_path = os.path.join(results, f'optimized_residual_Test_QR_{new_filename}_1.pkl')
with open(residual_path, 'wb') as residual_file:
    pickle.dump(test_residual, residual_file)

  0%|          | 0/1 [00:00<?, ?it/s]

In [43]:
# Number of clusters for which residuals were computed.
len(test_residual)

1