In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Suppress all warnings
# NOTE(review): this filter applies to every later cell, so deprecation and
# index-alignment warnings raised by pandas / NumPy will not be shown.
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import boto3
import awswrangler

# Bucket name interpolated into the s3:// paths that later cells read from.
s3_bucket = 'traffic-data-bucket'

In [None]:
# Credentials are imported from the aws_secrets module rather than written in this notebook.
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# One boto3 session, passed to every S3 call in the cells below.
my_session = boto3.Session(**{
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'aws_session_token': aws_session_token,
})

## List and import the models for comparison

In [None]:
# Inspect which files are out there under the model_scoring/ prefix.
s3 = my_session.resource('s3')

# Use the bucket name configured in s3_bucket so it is defined in one place only.
my_bucket = s3.Bucket(s3_bucket)
for s3_object in my_bucket.objects.filter(Prefix='model_scoring/'):
    print(s3_object.key)

In [None]:
# NOTE: the file named GBM_01 actually has model GBM_02 stored in it; it is mislabeled.
s3 = my_session.resource('s3')

# Use the bucket name configured in s3_bucket so it is defined in one place only.
my_bucket = s3.Bucket(s3_bucket)
list_of_files_to_keep = [
    #'model_scoring/individual_model_scores/AutoGluon_Baseline.csv',
    'model_scoring/individual_model_scores/GBM_06.csv',
    'model_scoring/individual_model_scores/GLMnet_03.csv',
    'model_scoring/individual_model_scores/AutoGluon_Full_Training.csv',
]

# Load every requested score file that exists under the prefix.
# scored_frame_dict maps a running model number (1, 2, ...) to that file's frame.
scored_frame_dict = {}
model_number = 1
loaded_keys = set()
for file in my_bucket.objects.filter(Prefix='model_scoring/'):
    if file.key not in list_of_files_to_keep:
        continue
    this_model_df = awswrangler.s3.read_csv(
        path=f's3://{s3_bucket}/{file.key}',
        boto3_session=my_session,
        use_threads=True,
    )
    print(this_model_df.shape)
    scored_frame_dict[model_number] = this_model_df
    model_number = model_number + 1
    loaded_keys.add(file.key)

# A requested file that is absent from S3 would otherwise be skipped silently
# and the model would simply vanish from the comparison, so report it.
missing_keys = [key for key in list_of_files_to_keep if key not in loaded_keys]
if missing_keys:
    print('WARNING: requested score files not found in S3:', missing_keys)

In [None]:
# Stack the per-model score frames into one long frame with a fresh 0..n-1 index.
scored_df = pd.concat(list(scored_frame_dict.values()), ignore_index=True)

In [None]:
# Number of scored rows per model.
scored_df['model_name'].value_counts()

#### Import the modeling data frame to get actual target information

In [None]:
# Modelling table; the actual target column is taken from it in the next cell.
model_data_path = f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet'
model_df = awswrangler.s3.read_parquet(
    path=model_data_path,
    boto3_session=my_session,
    use_threads=True,
)

In [None]:
# Identifier, split and target columns needed to evaluate the scores.
target_columns = [
    'hex_id',
    'collision_year',
    'collision_date',
    'collision_hour',
    'target',
    'ttv_split',
    'collision_dayofweek',
]
target_df = model_df[target_columns]

In [None]:
# Attach the actual target (plus year and day of week) to every scored row.
# Left join: scored rows without a matching target row are kept.
score_join_keys = ['hex_id', 'collision_date', 'collision_hour', 'ttv_split']
scored_df = pd.merge(scored_df, target_df, how='left', on=score_join_keys)

In [None]:
# Validation slice: rows flagged 'Validate' within the 2015-2019 collision years.
validation_years = [2015, 2016, 2017, 2018, 2019]
validate_mask = scored_df.collision_year.isin(validation_years) & (scored_df.ttv_split == 'Validate')
scored_validate_df = scored_df[validate_mask]

# Out-of-time slice: 2020-01-01 (inclusive) up to 2020-04-01 (exclusive).
# The bounds are written as ISO 'YYYY-MM-DD' so they compare correctly whether
# collision_date holds ISO strings or datetimes; a bound such as '2020-0101'
# sorts after every '2020-01-..' string and would drop January.
out_of_time_start = '2020-01-01'
out_of_time_end = '2020-04-01'
out_of_time_mask = (scored_df.collision_date >= out_of_time_start) & (scored_df.collision_date < out_of_time_end)
scored_out_of_time_df = scored_df[out_of_time_mask]

In [None]:
# Display the validation subset of the scored rows for a visual check.
scored_validate_df

In [None]:
# Validation slice of the targets. Both conditions are built from target_df so
# the mask shares target_df's index; mixing in a column of scored_df would align
# on unrelated row labels and select the wrong rows.
validation_years = [2015, 2016, 2017, 2018, 2019]
target_validate_mask = target_df.collision_year.isin(validation_years) & (target_df.ttv_split == 'Validate')
target_validate_df = target_df[target_validate_mask]

# Out-of-time slice: 2020-01-01 (inclusive) up to 2020-04-01 (exclusive),
# written as ISO 'YYYY-MM-DD' so the comparison is correct for both ISO strings
# and datetimes.
target_out_of_time_mask = (target_df.collision_date >= '2020-01-01') & (target_df.collision_date < '2020-04-01')
target_out_of_time_df = target_df[target_out_of_time_mask]

## Calculate AUC on the validation data and create a graph for visual inspection

In [None]:
model_list = scored_validate_df.model_name.unique()
all_target = target_validate_df['target'].astype(int)

# Neutral ("no skill") score: the same constant prediction for every row.
# np.zeros replaces a list built with np.int, an alias that NumPy 1.24 removed.
ns_score_length = len(all_target)
ns_probs = np.zeros(ns_score_length)
ns_auc = roc_auc_score(all_target, ns_probs)
# calculate roc curve
ns_fpr, ns_tpr, _ = roc_curve(all_target, ns_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))

# initialize plot
plt.figure(figsize=(10, 10))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')

for this_model in model_list:
    this_scored_df = scored_validate_df[scored_validate_df.model_name == this_model]

    # Select target and prediction once and reuse them for both metrics.
    this_target = this_scored_df['target'].astype(int)
    this_prediction = this_scored_df['prediction']

    this_auc = roc_auc_score(this_target, this_prediction)
    print(this_model, ': ROC AUC=%.3f' % (this_auc))

    # calculate roc curve
    this_fpr, this_tpr, _ = roc_curve(this_target, this_prediction)

    plt.plot(this_fpr, this_tpr, linestyle='-', label=this_model)

# axis labels and title
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves - validation data')
# show the legend
plt.legend()
# show the plot
plt.show()

## Calculate AUC on the out of time data

In [None]:
model_list = scored_out_of_time_df.model_name.unique()
all_target = target_out_of_time_df['target'].astype(int)

# Neutral ("no skill") score: the same constant prediction for every row.
# np.zeros replaces a list built with np.int, an alias that NumPy 1.24 removed.
ns_score_length = len(all_target)
ns_probs = np.zeros(ns_score_length)
ns_auc = roc_auc_score(all_target, ns_probs)
# calculate roc curve
ns_fpr, ns_tpr, _ = roc_curve(all_target, ns_probs)
print('No Skill: ROC AUC=%.3f' % (ns_auc))

# initialize plot
plt.figure(figsize=(10, 10))
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')

for this_model in model_list:
    this_scored_df = scored_out_of_time_df[scored_out_of_time_df.model_name == this_model]

    # Select target and prediction once and reuse them for both metrics.
    this_target = this_scored_df['target'].astype(int)
    this_prediction = this_scored_df['prediction']

    this_auc = roc_auc_score(this_target, this_prediction)
    print(this_model, ': ROC AUC=%.3f' % (this_auc))

    # calculate roc curve
    this_fpr, this_tpr, _ = roc_curve(this_target, this_prediction)

    plt.plot(this_fpr, this_tpr, linestyle='-', label=this_model)

# axis labels and title
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves - out-of-time data')
# show the legend
plt.legend()
# show the plot
plt.show()

## Create univariate lift charts to compare actual vs predicted collisions

In [None]:
#model_list = scored_validate_df.model_name.unique()
#var = 'collision_dayofweek'
def model_univariate(var, model_list, validation_only = True):
    """Plot actual vs. predicted collision rates across the values of one feature.

    The chart has two y-axes:

    * bars (left axis): share of observations falling in each value of ``var``;
    * lines (right axis): mean ``prediction`` per model for each value of
      ``var``, plus one line labelled 'Actual Collision Mean' with the mean
      ``target``.

    The function reads the notebook-level frames ``model_df`` (features and
    target) and ``scored_df`` (one row per model and scored observation).

    Parameters
    ----------
    var : str
        Column of ``model_df`` to group by; its values are shown as strings.
    model_list : iterable of str or None
        Model names (values of ``scored_df.model_name``) to draw. ``None``
        keeps every model present in ``scored_df``.
    validation_only : bool, default True
        When True, both frames are restricted to ``ttv_split == 'Validate'``.

    Returns
    -------
    None
        The chart is left as the current matplotlib figure so the caller can
        display or save it (for example with ``plt.gcf()``).
    """
    join_keys = ['hex_id', 'collision_date', 'collision_hour']

    fig = plt.figure(figsize=(10, 5))
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    # var may already be one of the fixed columns; dict.fromkeys removes the
    # duplicate while keeping a stable column order.
    model_df_list_to_keep = list(dict.fromkeys(
        ['hex_id', 'collision_year', 'collision_date', 'collision_hour', 'target', 'ttv_split', var]
    ))

    graphs_df = model_df[model_df_list_to_keep]
    if validation_only:
        graphs_df = graphs_df[graphs_df.ttv_split == 'Validate']

    scored_df_to_join = scored_df
    if validation_only:
        scored_df_to_join = scored_df_to_join[scored_df_to_join.ttv_split == 'Validate']
    if model_list is not None:
        scored_df_to_join = scored_df_to_join[scored_df_to_join.model_name.isin(model_list)]
    # Keep only the join keys and the score columns. If scored_df also carried
    # var (or target, ttv_split, ...), the merge would rename both copies with
    # _x/_y suffixes and the groupby on var below would fail.
    scored_df_to_join = scored_df_to_join[join_keys + ['model_name', 'prediction']]

    graphs_joined_df = graphs_df.merge(scored_df_to_join, on = join_keys, how = 'inner')

    # Actual collision rate and row count per value of var.
    target_grp_df = (
        graphs_df.groupby(var)['target']
        .agg(['mean', 'std', 'count'])
        .reset_index()
    )
    target_grp_df.columns = [var, 'collision_mean', 'collision_std', 'count']
    target_grp_df['perc'] = target_grp_df['count'].div(target_grp_df['count'].sum())
    # Use string labels so the bar and line plots share one categorical x-axis.
    target_grp_df[var] = target_grp_df[var].apply(str)

    bar = sns.barplot(x=var,
                      y='perc',
                      data=target_grp_df,
                      color=sns.color_palette("Set2")[7],
                      ax=ax1)

    # Mean prediction per value of var and model.
    predicted_grp_df = (
        graphs_joined_df.groupby([var, 'model_name'])['prediction']
        .agg(['mean', 'std'])
        .reset_index()
    )
    predicted_grp_df.columns = [var, 'model_name', 'collision_mean', 'collision_std']
    predicted_grp_df[var] = predicted_grp_df[var].apply(str)

    # Give the actuals the same layout as the predictions so they plot as one more line.
    target_grp_df['model_name'] = 'Actual Collision Mean'
    target_grp_df_structured = target_grp_df[[var, 'model_name', 'collision_mean', 'collision_std']]

    predicted_grp_df = pd.concat([predicted_grp_df, target_grp_df_structured], axis = 0).reset_index(drop = True)

    max_y = np.max(predicted_grp_df['collision_mean'])

    line = sns.lineplot(x=var,
                        y='collision_mean',
                        data=predicted_grp_df,
                        marker='s',
                        hue='model_name',
                        ax=ax2)

    handles, labels = ax2.get_legend_handles_labels()
    ax2.legend(handles=handles, labels=labels)
    # Leave 20% headroom above the highest line; skip when there is nothing to
    # plot, because matplotlib rejects NaN axis limits.
    if np.isfinite(max_y):
        ax2.set(ylim=(0, max_y*1.2))

    bar.set_xticklabels(bar.get_xticklabels(), rotation=45, horizontalalignment='right')

    bar.set_xlabel("", fontsize = 12)

    bar.set_ylabel("Frequency", fontsize = 12)
    line.set_ylabel("Probability of Collision", fontsize = 12)

    bar.set_title(var, fontsize=13, loc='left')
    fig.set_size_inches(15, 12)

In [None]:
# Feature names grouped by theme.

# Street network and nearby amenities.
street_features = [
    'la_data_city_name',
    'node_street_count',
    'node_stop',
    'node_traffic_signals',
    'edge_speed_kph_max',
    'edge_speek_kph_min',
    'edge_lanes_max',
    'edge_motorway_flag',
    'edge_motorway_link_flag',
    'edge_living_street_flag',
    'edge_bridge_flag',
    'edge_oneway_flag',
    'edge_tunnel_flag',
    'amenities_bar_cnt',
    'amenities_school_cnt',
    'amenities_restaurant_cnt',
    'amenities_college_cnt',
    'drv_edge_lanes_max_imputed_flag',
]

# Calendar / time of day.
time_features = [
    'collision_hour',
    'collision_month',
    'collision_dayofweek',
    'drv_holiday_flag',
]

# Collision history of the hex and its neighbors.
hex_history_features = [
    'prev1_yr_coll_cnt',
    'prev1_yr_coll_neighbor1',
]

# NOAA weather measurements.
weather_features = [
    'noaa_wind_speed',
    'noaa_precipitation',
    'noaa_temperature_average',
    'noaa_temperature_max',
    'noaa_temperature_min',
]


In [None]:
# Models present in the validation scores, and the feature to chart.
model_list = scored_validate_df['model_name'].unique()
var = 'prev1_yr_coll_cnt'

In [None]:
# Draw the univariate chart for the feature chosen above.
model_univariate(
    var=var,
    model_list=model_list,
    validation_only=True,
)

## Save the model outputs to a pdf

In [None]:
import matplotlib.backends.backend_pdf

vars_to_save = ['collision_month',#'accident_count', 'ttv_split',
       'node_street_count', 'node_stop', 'node_traffic_signals',
       'la_data_city_name', 'edge_speed_kph_max', 'edge_speek_kph_min',
       'edge_lanes_max', 'edge_motorway_flag', 'edge_motorway_link_flag',
       'edge_living_street_flag', 'edge_bridge_flag', 'edge_oneway_flag',
       'edge_tunnel_flag', 'amenities_bar_cnt', 'amenities_school_cnt',
       'amenities_restaurant_cnt', 'amenities_college_cnt',
       'prev1_yr_coll_cnt', 'prev2_yr_coll_cnt', 'prev1_yr_coll_neighbor1',
       'prev1_yr_coll_neighbor2', 'prev2_yr_coll_neighbor1',
       'prev2_yr_coll_neighbor2', 'noaa_wind_speed', 'noaa_precipitation',
       'noaa_temperature_average', 'noaa_temperature_max',
       'noaa_temperature_min', 'drv_collision_hour_sin',
       'drv_collision_hour_cos', 'drv_holiday_flag',
       'drv_edge_lanes_max_imputed_flag']

# One page per feature. The context manager finalizes and closes the PDF even
# if one of the charts raises part-way through the loop.
with matplotlib.backends.backend_pdf.PdfPages("validation_univariates.pdf") as pdf:
    for var in vars_to_save:
        model_univariate(var = var, model_list = model_list, validation_only = True)
        # model_univariate leaves its chart as the current figure.
        this_fig = plt.gcf()
        pdf.savefig(this_fig, orientation = 'portrait')
        # Release the figure once it is on its page so the charts do not pile
        # up in memory; they are written to the PDF instead of shown inline.
        plt.close(this_fig)

## Hours vs. days of the week: compare predictions vs. actuals for different models

In [None]:
import altair as alt

heat_cols_to_keep = ['collision_date', 'collision_hour', 'collision_dayofweek', 'source', 'collision_probability']

# Model predictions on the validation rows, one 'source' per model.
# assign() returns a new frame, so scored_validate_df itself is left untouched.
scored_validate_only_df = scored_validate_df[scored_validate_df.ttv_split == 'Validate']
scored_df_to_join = scored_validate_only_df.assign(
    source=scored_validate_only_df.model_name,
    collision_probability=scored_validate_only_df.prediction,
)[heat_cols_to_keep]

# Actual outcomes in the same layout, under source 'actual'.
# assign() avoids writing new columns into target_validate_df (a slice that
# other cells also read); the float cast keeps the concatenated column numeric
# next to the float predictions.
target_df_to_join = target_validate_df.assign(
    collision_probability=target_validate_df.target.astype(float),
    source='actual',
)[heat_cols_to_keep]

graph_concat_df = pd.concat([scored_df_to_join, target_df_to_join], axis = 0)

# Mean probability per hour, day of week and source.
heat_chart = (
    graph_concat_df
    .groupby(['collision_hour', 'collision_dayofweek', 'source'], as_index=False)['collision_probability']
    .mean()
    .rename(columns={'collision_dayofweek': 'collision_weekday'})
)

alt.Chart(heat_chart).mark_rect().encode(
    x='collision_weekday:O',
    y='collision_hour:O',
    color=alt.Color('collision_probability:Q', title = 'collision probability')
).properties(width=200,height=500).facet(
    column='source:N'
)