### Analysis of Potential Model Features

In [None]:
import pandas as pd
from pandas.api.types import is_numeric_dtype

import numpy as np
from random import choices

import matplotlib.pyplot as plt

import seaborn as sns

import boto3
import awswrangler

s3_bucket = 'traffic-data-bucket'


In [None]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# Explicit boto3 session built from the credentials imported from the local
# aws_secrets module; the S3 read below passes it as `boto3_session`.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

In [None]:
# Load the post-transformation modeling dataset from S3 (Parquet).
# The bucket name comes from the `s3_bucket` constant defined with the imports
# so it is configured in exactly one place.
model_df = awswrangler.s3.read_parquet(
    path=f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet',
    boto3_session=my_session,
    use_threads=True,
)

# Show the available columns as the cell output.
model_df.columns

In [None]:
# Restrict the analysis to collision years 2015 through 2019.
analysis_years = [2015, 2016, 2017, 2018, 2019]
model_df = model_df[model_df.collision_year.isin(analysis_years)]

# Sanity check: list the years that actually remain after filtering.
model_df.collision_year.unique()

In [None]:
# Row count per distinct value of the target column.
model_df['target'].value_counts()

In [None]:
def univ_plot(data, var):
    """Plot group frequency (bars) and mean of ``actual`` (line) for one feature.

    Rows of ``data`` are grouped by ``var``. The number of rows in each group
    is drawn as gray bars on the left y-axis, and the per-group mean of the
    ``actual`` column is drawn as a marked line on a twin right y-axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by ``var`` and a column named
        ``actual``. The frame is only read, never modified.
    var : str
        Name of the column to group by; also used as the plot title.

    Returns
    -------
    None
        The figure is created through pyplot and left open as the current
        figure, so callers can retrieve it with ``plt.gcf()``.
    """
    fig = plt.figure(figsize=(10, 5))
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    # Aggregate only the column being summarised; nothing below writes to
    # `data`, so no defensive copy of the whole frame is needed.
    grp_df = data.groupby(var)['actual'].agg(['mean', 'count'])
    grp_df.columns = ['actual', 'count']
    max_y = grp_df['actual'].max()

    # Long format: one row per (group, statistic) with columns
    # [var, 'variable', 'value'].
    tall_df = pd.melt(grp_df.reset_index(), id_vars=var)

    # Group keys are cast to strings before plotting; presumably this makes
    # both layers treat the x-axis as categorical.
    tall_df[var] = tall_df[var].apply(str)

    bar_data = tall_df[tall_df['variable'] == 'count'].reset_index(drop=True)
    line_data = tall_df[tall_df['variable'] == 'actual'].reset_index(drop=True)

    sns.barplot(x=var,
                y='value',
                data=bar_data,
                color='gray',
                ax=ax1)

    sns.lineplot(x=var,
                 y='value',
                 data=line_data,
                 marker='s',
                 hue='variable',
                 ax=ax2)

    handles, labels = ax2.get_legend_handles_labels()
    ax2.legend(handles=handles, labels=labels)

    # Leave 20% headroom above the highest group mean. Skip when the maximum
    # is NaN or not positive: a NaN limit raises and (0, 0) collapses the axis.
    if np.isfinite(max_y) and max_y > 0:
        ax2.set_ylim(0, max_y * 1.2)

    ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, horizontalalignment='right')

    ax1.set_xlabel("", fontsize=12)
    ax1.set_ylabel("Frequency", fontsize=12)
    ax2.set_ylabel("Probability of Collision", fontsize=12)

    ax1.set_title(var, fontsize=13, loc='left')

    # Run the layout pass last so it accounts for the rotated tick labels,
    # both y-axis labels and the title.
    fig.tight_layout()

    return

In [None]:
# univ_plot aggregates a column literally named 'actual', so expose the
# target under that name.
model_df['actual'] = model_df.target

# Distinct values taken by the holiday flag.
model_df.drv_holiday_flag.unique()

In [None]:
# Single-feature check of the plotting helper on the holiday flag.
univ_plot(model_df, 'drv_holiday_flag')

In [None]:
import matplotlib.backends.backend_pdf

# Features to export, one univariate plot per PDF page.
# NOTE(review): 'edge_speek_kph_min' looks like a typo of 'edge_speed_kph_min';
# it is kept as-is on the assumption that it matches the actual column name in
# model_df -- confirm against model_df.columns.
vars_to_save = ['collision_month',
       'collision_dayofweek', 'collision_hour', #'accident_count', 'ttv_split',
       'node_street_count', 'node_stop', 'node_traffic_signals',
       'la_data_city_name', 'edge_speed_kph_max', 'edge_speek_kph_min',
       'edge_lanes_max', 'edge_motorway_flag', 'edge_motorway_link_flag',
       'edge_living_street_flag', 'edge_bridge_flag', 'edge_oneway_flag',
       'edge_tunnel_flag', 'amenities_bar_cnt', 'amenities_school_cnt',
       'amenities_restaurant_cnt', 'amenities_college_cnt',
       'prev1_yr_coll_cnt', 'prev2_yr_coll_cnt', 'prev1_yr_coll_neighbor1',
       'prev1_yr_coll_neighbor2', 'prev2_yr_coll_neighbor1',
       'prev2_yr_coll_neighbor2', 'noaa_wind_speed', 'noaa_precipitation',
       'noaa_temperature_average', 'noaa_temperature_max',
       'noaa_temperature_min', 'drv_collision_hour_sin',
       'drv_collision_hour_cos', 'drv_holiday_flag',
       'drv_edge_lanes_max_imputed_flag']

# The context manager finalizes the PDF even if one of the plots raises.
with matplotlib.backends.backend_pdf.PdfPages("pre-modeling_univariates.pdf") as pdf:
    for var in vars_to_save:
        univ_plot(data=model_df, var=var)
        # univ_plot leaves its figure as pyplot's current figure.
        fig = plt.gcf()
        pdf.savefig(fig, orientation='portrait')
        # Close each figure once it is on its page so dozens of figures are
        # not held open at the same time; the plots end up in the PDF only.
        plt.close(fig)

