# Generate all the ground truth incidents

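* This notebook generates the monthly ground-truth incident files `cluster_version_incidents_month.pkl` (written as `{clustering_version}_incidents_{MM}.pkl`, e.g. `maxr085_incidents_01.pkl`)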
* This notebook takes a long time because of the loading and reading of data
* Requires:
    * `speed_data`
    * `cluster_clusters.pkl`
* Generates:
    * `cluster_version_incidents_month.pkl`

In [1]:
import os
import sys
import pickle5 as pickle
import pandas as pd
from pprint import pprint
from tqdm.notebook import tqdm

sys.path.append("..")
from src.utils import Read_DF, Call_Back

In [2]:
# Confirm directories are in place

# The downloaded data folder is a hard prerequisite: fail early if it is missing.
data_dir = os.path.join(os.getcwd(), '../data')
if not os.path.exists(data_dir):
    raise OSError("Must first download data, see README.md")

# Working sub-folders are created on demand. `exist_ok=True` keeps the cell
# safe to re-run and avoids the check-then-create race of exists() + mkdir().
cluster_dir = os.path.join(data_dir, 'generated_clusters')
os.makedirs(cluster_dir, exist_ok=True)

speed_dir = os.path.join(data_dir, 'speed_data')
os.makedirs(speed_dir, exist_ok=True)

# Processing Incidents





### Integrating with cluster information

In [7]:
# Clustering parameters; together they select which saved clustering is loaded.
lower_bound_correlation = 0.7
correlation_threshold = 0.85
clustering_version = f'maxr0{int(correlation_threshold * 100)}'

# Build the pickle path from the parameters above.
clusters_filename = f'optimized_clustering_{lower_bound_correlation}_{correlation_threshold}_{clustering_version}_restricted.pkl'
fp = os.path.join(cluster_dir, clusters_filename)

# NOTE: unpickling can execute arbitrary code; only load files you generated yourself.
with open(fp, 'rb') as handle:
    clusters = pickle.load(handle)

# Report how many clusters were loaded.
print(len(clusters))

618


In [8]:
def locate_segment_cluster(segment, clusters):
    """Return the key of the first cluster that contains ``segment``.

    Parameters
    ----------
    segment
        Segment identifier to look up. A null value (as judged by
        ``pd.isnull``) is never matched.
    clusters
        Mapping-like collection: iterating it yields cluster keys and
        ``clusters[key]`` is a container that supports ``in``.

    Returns
    -------
    The matching cluster key, or ``-1`` when ``segment`` is null or is not
    a member of any cluster.
    """
    if pd.isnull(segment):
        return -1
    # Clusters are scanned in iteration order; the first one containing the
    # segment wins.
    for cluster in clusters:
        if segment in clusters[cluster]:
            return cluster
    return -1

In [9]:
# Setting up output directories
incident_ratio_dir = os.path.join(cluster_dir, f"{clustering_version}_incident_ratios")
ground_truth_dir = os.path.join(incident_ratio_dir, "incidents_GT")

# makedirs also creates the intermediate incident_ratio_dir; exist_ok=True
# keeps the cell safe to re-run and avoids the exists()/mkdir() race.
os.makedirs(ground_truth_dir, exist_ok=True)
ground_truth_dir

'/home/jovyan/work/clustering/../data/generated_clusters/maxr085_incident_ratios/incidents_GT'

#### This has already been generated and is stored in:
* `all_incidents_ground_truth.pkl`

In [13]:
# NOTE(review): this whole cell is commented out and does not run.
# As written, for each month it reads two speed files (`..._{month}_1.gzip` and
# `..._{month}_15.gzip`), keeps rows between 06:00:00 and 21:00:00 with
# Total_Number_Incidents > 0, tags each row with its cluster head and pickles
# the rows whose segment belongs to a cluster.
# NOTE(review): the trailing `break` would stop the loop after the first month
# if this cell were re-enabled -- confirm whether that is intended.

# str_start_time = '06:00:00'
# str_end_time = '21:00:00'

# for month in tqdm(range(1, 13)):

#     fp = os.path.join(speed_dir, f'ALL_5m_DF_2019_{month}_1.gzip')
#     DF_1 = Read_DF(DF_All = fp, Reading_Tag = 'DF_All', MetaData = {})

#     fp = os.path.join(speed_dir, f'ALL_5m_DF_2019_{month}_15.gzip')
#     DF_15 = Read_DF(DF_All = fp, Reading_Tag = 'DF_All', MetaData = {})

#     DF_All = pd.concat([DF_1, DF_15])
#     DF_All = DF_All[['time_local', 'XDSegID', 'Total_Number_Incidents']]
#     DF_All['time'] = DF_All['time_local'].tolist()
#     DF_All = DF_All.set_index('time_local')
#     DF_All = DF_All.between_time(str_start_time, str_end_time)

#     DF_Incidents = DF_All[DF_All['Total_Number_Incidents'] > 0]
#     segs = DF_Incidents['XDSegID'].tolist()
#     cluster_heads = []
#     for s in segs:
#         cluster_heads.append(locate_segment_cluster(s, clusters))
#     DF_Incidents['cluster_head'] = cluster_heads
#     fp = os.path.join(ground_truth_dir, f"{clustering_version}_incidents_{str(month).zfill(2)}.pkl")
#     DF_Incidents[DF_Incidents['cluster_head'] != -1].to_pickle(fp)
#     break

## Avoid reloading all of the speed data just to add the `cluster_head` information
* Load `data/all_incidents_ground_truth.pkl`

In [14]:
# Load the precomputed incident table.
fp = os.path.join(data_dir, 'all_incidents_ground_truth.pkl')
all_incidents_gt = pd.read_pickle(fp)

# Tag every incident row with the cluster containing its segment
# (locate_segment_cluster returns -1 when no cluster contains it).
all_incidents_gt['cluster_head'] = all_incidents_gt['XDSegID'].apply(
    locate_segment_cluster, args=(clusters,)
)

# Keep only the rows whose segment was found in a cluster.
in_cluster = all_incidents_gt['cluster_head'].ne(-1)
all_incidents_gt = all_incidents_gt.loc[in_cluster]
all_incidents_gt

Unnamed: 0_level_0,XDSegID,Total_Number_Incidents,cluster_head
time_local,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01 01:15:00,441420536,1,449614846
2019-01-01 02:00:00,449626583,1,449626584
2019-01-01 02:05:00,1524395080,1,1524308348
2019-01-01 16:30:00,1524581409,1,1524491673
2019-01-01 18:40:00,429334934,1,429334934
...,...,...,...
2019-12-31 17:00:00,1524561318,1,1524535956
2019-12-31 18:20:00,429335034,1,1524529197
2019-12-31 21:50:00,449617510,1,441552690
2019-12-31 21:50:00,449617535,1,449617534


In [15]:
# Write one pickle per calendar month (1..12); the file is written even when
# a month has no rows.
incident_months = all_incidents_gt.index.month
for month in tqdm(range(1, 13)):
    fp = os.path.join(ground_truth_dir, f"{clustering_version}_incidents_{str(month).zfill(2)}.pkl")
    _tdf = all_incidents_gt.loc[incident_months == month]
    _tdf.to_pickle(fp)

  0%|          | 0/12 [00:00<?, ?it/s]

In [16]:
# Display the directory the monthly incident pickles are written to.
ground_truth_dir

'/home/jovyan/work/clustering/../data/generated_clusters/maxr085_incident_ratios/incidents_GT'