# Third Notebook: cross_validate_residual.ipynb
* Cross validating on **September** and **October** data
* Requires:
    * `optimized_safe_margin`
* Generates:
    * `optimized_test_residual`

In [12]:
%matplotlib inline

In [13]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
%autoreload 2

In [15]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import datetime
import random
import importlib
import os
import json
import time
import importlib
import sys

import numpy as np
import pickle5 as pickle
import geopandas as gpd
import matplotlib.pyplot as plt
import osmnx as ox
import networkx as nx
import matplotlib.dates as md

from pprint import pprint
from copy import deepcopy
from scipy.stats import hmean
from matplotlib.lines import Line2D
from tqdm.notebook import tqdm

# Seed Python's global `random` generator without a fixed value (system entropy or
# the current time is used), so anything that draws from `random` will differ
# from one run to the next.
random.seed()

In [16]:
# Add the parent directory to the import path so that the `src` package imported
# on the next line can be found.
sys.path.append("..")
# NOTE(review): `calculate_nabla` and `faster_calculate_residual`, which are called
# later in this notebook, are not defined or imported by name anywhere in it, so
# they presumably come from this star import — confirm against src/common_functions.py.
from src.common_functions import *

### Parameters
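* Since the notebooks were separated, there is no quick way to carry the cluster list over, so `cluster_list` below is only a placeholder: set its length to the number of clusters. The length is used to rebuild the filename, and the real list is then loaded from disk further down.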

In [17]:
# Time-of-day window (HH:MM strings) handed to `between_time` later in the notebook.
start_time, end_time = '06:00', '20:55'

# Month-number ranges as (first, last) tuples.
# NOTE(review): the training range starts at 0 although `months` below numbers
# January as 1 — confirm which lower bound is intended.
training_months = (0, 8)            # January to August
cross_validation_months = (9, 10)   # September and October
testing_months = (11, 12)           # November and December

# Lower-case month name -> calendar month number (1-12).
months = {
    month_name: month_number
    for month_number, month_name in enumerate(
        ('january', 'february', 'march', 'april', 'may', 'june',
         'july', 'august', 'september', 'october', 'november', 'december'),
        start=1,
    )
}

In [18]:
# Version tag of the clustering run; it prefixes the directory and file names
# built in the cells below.
clustering_version = '0027'
# Placeholder only: just its length matters here (it goes into the rebuilt
# filename). The actual cluster ids are loaded from disk further down and
# overwrite this variable. Change the multiplier to the number of clusters.
cluster_list = [1] * 1

In [19]:
# Confirm directories are in place
if not os.path.exists(os.path.join(os.getcwd(), '../data')):
    raise OSError("Must first download data, see README.md")
data_dir = os.path.join(os.getcwd(), '../data')

if not os.path.exists(os.path.join(data_dir, 'generated_clusters')):
    os.mkdir(os.path.join(data_dir, 'generated_clusters'))
cluster_dir = os.path.join(data_dir, 'generated_clusters')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned'))
cleaned_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/cleaned')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents'))
incidents_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents')

if not os.path.exists(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')):
    os.mkdir(os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT'))
incidents_GT_dir = os.path.join(cluster_dir, f'{clustering_version}_incident_ratios/incidents_GT')

if not os.path.exists(os.path.join(data_dir, f'{clustering_version}_results')):
    os.mkdir(os.path.join(data_dir, f'{clustering_version}_results'))
results = os.path.join(data_dir, f'{clustering_version}_results')

# Loading cluster list and regenerating filename

In [20]:
# Rebuild the run-specific filename stem: <version>_<number of clusters>C_<MM-DD-YYYY>.
# The last part is today's date, so the files opened with this stem further down
# must carry today's date in their names.
new_filename = '_'.join([
    clustering_version,
    f'{len(cluster_list)}C',
    datetime.datetime.now().strftime('%m-%d-%Y'),
])
new_filename

'0027_1C_07-09-2021'

In [21]:
# Load all clusters

# Read the pickled clusters object for this clustering version.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
clusters_path = os.path.join(cluster_dir, f'{clustering_version}_clusters.pkl')
with open(clusters_path, 'rb') as clusters_file:
    clusters = pickle.load(clusters_file)

In [22]:
# Replace the placeholder `cluster_list` with the list stored under this run's
# filename stem in the results directory, then display it.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
used_clusters_path = os.path.join(results, f'used_clusters_list_{new_filename}.pkl')
with open(used_clusters_path, 'rb') as used_clusters_file:
    cluster_list = pickle.load(used_clusters_file)
cluster_list

[1524373007]

In [23]:
# Compute the residuals on the September-October (cross-validation) slice for every
# cluster in `cluster_list` and pickle the result.
#
# Result layout: test_residual[cluster column][safe-margin key][sf] holds whatever
# `faster_calculate_residual` returns for that combination.

# --- Load the safe margins ------------------------------------------------------
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
fp_safe_margin = os.path.join(results, f'optimized_safe_margin_{new_filename}.pkl')
with open(fp_safe_margin, 'rb') as handle:
    safe_margin = pickle.load(handle)

# --- Load and combine the incident ratio frames ---------------------------------
# os.listdir returns entries in arbitrary order and also lists sub-directories
# (for example Jupyter's .ipynb_checkpoints). Sorting makes the concatenation order
# reproducible, and skipping non-files avoids calling open() on a directory.
test_files = sorted(
    entry for entry in os.listdir(incidents_dir)
    if os.path.isfile(os.path.join(incidents_dir, entry))
)
info_ratio_incidents = []
for entry in test_files:
    fp = os.path.join(incidents_dir, entry)
    with open(fp, 'rb') as handle:
        info_ratio_incidents.append(pickle.load(handle))
combined_ratio_frame_incidents = pd.concat(info_ratio_incidents)

# --- Restrict to the daily time window and to September-October -----------------
combined_ratio_frame_incidents = combined_ratio_frame_incidents.between_time(start_time, end_time)
row_month = combined_ratio_frame_incidents.index.month
in_cross_validation_months = (row_month >= months['september']) & (row_month <= months['october'])
combined_ratio_frame_incidents = combined_ratio_frame_incidents[in_cross_validation_months]

# Keep only the columns of the clusters in use and label them with `cluster_list`.
testing = combined_ratio_frame_incidents
testing_Clist = testing[cluster_list]
testing_Clist.columns = cluster_list

# --- Residual per cluster, per safe-margin key, per sf value --------------------
# Values passed as the second argument of `faster_calculate_residual`; the list is
# loop-invariant, so it is built once instead of on every inner iteration.
SF_List = [3, 5, 7, 9]

test_residual = {}

for column in tqdm(testing_Clist.columns):
    cluster_series = testing_Clist[column]
    # Group the cluster's values by (hour, minute) of their timestamp.
    grouped = cluster_series.groupby([cluster_series.index.hour,
                                      cluster_series.index.minute])
    sm_per_C = safe_margin[column]
    R_per_C = {}
    for key, margin in sm_per_C.items():
        nabla_dict = calculate_nabla(grouped, margin)

        # Turn the {time: nabla} mapping into a frame indexed by time.
        nabla_frame = pd.DataFrame(list(nabla_dict.items()),
                                   columns=['time', 'nabla']).set_index('time')
        R_per_C[key] = {sf: faster_calculate_residual(nabla_frame, sf) for sf in SF_List}

    test_residual[column] = R_per_C

# --- Save ------------------------------------------------------------------------
fp = os.path.join(results, f'optimized_test_residual_{new_filename}.pkl')
with open(fp, 'wb') as handle:
    pickle.dump(test_residual, handle)
# Report success only once the file has been closed.
print(f'Saved optimized_test_residual_{new_filename}.pkl')

  0%|          | 0/1 [00:00<?, ?it/s]

Saved optimized_test_residual_0027_1C_07-09-2021.pkl
