In [None]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_integer_dtype 
import numpy as np

from glmnet import LogitNet
from sklearn.metrics import roc_auc_score
#import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
#from glmnet import glmnet; from glmnetPlot import glmnetPlot
#from glmnetPrint import glmnetPrint; from glmnetCoef import glmnetCoef; from glmnetPredict import glmnetPredict
#from cvglmnet import cvglmnet; from cvglmnetCoef import cvglmnetCoef
#from cvglmnetPlot import cvglmnetPlot; from cvglmnetPredict import cvglmnetPredict

from sklearn.preprocessing import OneHotEncoder

#plt.rc("figure", figsize=(16,8))
#plt.rc("font", size=14)

import seaborn as sns
import matplotlib.pyplot as plt

import boto3
import awswrangler

# S3 bucket this notebook reads the model data from and writes scores/artifacts to.
s3_bucket = 'traffic-data-bucket'


In [None]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# One explicit boto3 session built from the credentials exposed by the local
# aws_secrets module; the S3 calls further down pass this session.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

In [None]:
# Read the post-transformation modelling table from S3 into a DataFrame.
df = awswrangler.s3.read_parquet(
    path=f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet',
    boto3_session=my_session,
    use_threads=True,
)

In [None]:
def get_categorical_indicies(X):
    """Return the column positions of the non-numeric columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        One ``X.columns.get_loc(name)`` result for every column whose dtype is
        not numeric according to ``is_numeric_dtype``, in column order.
    """
    return [
        X.columns.get_loc(name)
        for name in X.columns
        if not is_numeric_dtype(X[name])
    ]

In [None]:
# Feature configuration: one list per feature family, concatenated into
# model_features at the end.

# Collision years; not referenced again in the visible part of the notebook.
collision_year_list = [2015, 2016, 2017, 2018, 2019]

# NOTE(review): 'edge_speek_kph_min' looks like a misspelling of "speed". It is
# kept unchanged because model_features is used to select columns from the
# loaded data - confirm the column name upstream before renaming.
street_features = [
    'la_data_city_name',
    'node_street_count',
    'node_stop',
    'node_traffic_signals',
    'edge_speed_kph_max',
    'edge_speek_kph_min',
    'edge_lanes_max',
    'edge_motorway_flag',
    'edge_motorway_link_flag',
    'edge_living_street_flag',
    'edge_bridge_flag',
    'edge_oneway_flag',
    'edge_tunnel_flag',
    'amenities_bar_cnt',
    'amenities_school_cnt',
    'amenities_restaurant_cnt',
    'amenities_college_cnt',
    'drv_edge_lanes_max_imputed_flag',
]

time_features = [
    'collision_hour',
    'collision_month',
    'collision_dayofweek',
    'drv_holiday_flag',
]

hex_history_features = [
    'prev1_yr_coll_cnt',
    'prev1_yr_coll_neighbor1',
]

weather_features = [
    'noaa_wind_speed',
    'noaa_precipitation',
    'noaa_temperature_average',
    'noaa_temperature_max',
    'noaa_temperature_min',
]

# Order matters downstream: street, time, hex history, weather.
model_features = [
    *street_features,
    *time_features,
    *hex_history_features,
    *weather_features,
]


In [None]:
# Drop the 2014 rows. The explicit .copy() makes `df` an independent frame, so
# the column assignments made on `df` in later cells do not operate on a slice
# of the loaded data (which triggers pandas' SettingWithCopyWarning).
df = df.loc[df.collision_year != 2014].copy()
df.collision_year.unique()

In [None]:
# Cast the calendar fields to strings so the non-numeric check used later
# treats them as categorical. collision_year is cast as well; the later
# train/test filter compares it against the string '2014'.
for calendar_col in ('collision_month', 'collision_dayofweek', 'collision_year', 'collision_hour'):
    df[calendar_col] = df[calendar_col].map(str)

In [None]:
# Work on a copy so the uncapped values remain available in `df`.
df_caps = df.copy()

# Upper bound per feature; values above the bound are clipped down to it.
upper_caps = {
    'edge_lanes_max': 8,
    'node_traffic_signals': 13,
    'node_street_count': 6,
    'node_stop': 20,
    'amenities_bar_cnt': 2,
    'amenities_school_cnt': 3,
    'amenities_restaurant_cnt': 3,
    'amenities_college_cnt': 1,
    'prev1_yr_coll_cnt': 70,
}
for capped_col, cap in upper_caps.items():
    df_caps[capped_col] = df_caps[capped_col].clip(upper=cap)

# Remove the hex's own prior-year count from the neighbourhood count.
# NOTE(review): this subtracts the count *after* it was capped at 70, so hexes
# above the cap keep part of their own count in the neighbour feature -
# confirm this is intended.
df_caps['prev1_yr_coll_neighbor1'] = (
    df_caps['prev1_yr_coll_neighbor1'] - df_caps['prev1_yr_coll_cnt']
)

In [None]:
# Squared (capped) prior-year collision count as an additional feature.
df_caps['prev1_yr_coll_cnt_pw2'] = df_caps['prev1_yr_coll_cnt'].pow(2)
#df_caps['prev1_yr_coll_cnt_pw3'] = df_caps['prev1_yr_coll_cnt']**3

In [None]:
# Register the squared-count feature. The membership check keeps this cell
# idempotent: re-running it would otherwise append the name a second time and
# yield duplicate columns when df_caps[model_features] is selected.
if 'prev1_yr_coll_cnt_pw2' not in model_features:
    model_features = model_features + ['prev1_yr_coll_cnt_pw2']

In [None]:
# Positions, then names, of the non-numeric model features.
cat_index = get_categorical_indicies(df_caps[model_features])
cat_col_list = list(df_caps[model_features].columns[cat_index])
cat_col_list

In [None]:
# Partitions come from the precomputed ttv_split label; the year filter
# compares against the string '2014'. all_df keeps every row of df_caps.
train_df = df_caps[(df_caps.ttv_split == 'Train') & (df_caps.collision_year != '2014')]
test_df = df_caps[(df_caps.ttv_split == 'Test') & (df_caps.collision_year != '2014')]
all_df = df_caps.copy()

In [None]:
# Categorical part of each partition (input to the one-hot encoder).
train_cat_df = train_df[cat_col_list]
test_cat_df = test_df[cat_col_list]
all_cat_df = all_df[cat_col_list]

# Numeric part: the model features with the categorical columns removed.
train_num_df = train_df[model_features].drop(columns=cat_col_list)
test_num_df = test_df[model_features].drop(columns=cat_col_list)
all_num_df = all_df[model_features].drop(columns=cat_col_list)


In [None]:
# The encoder is fitted on the training categories only and then applied to all
# three partitions; handle_unknown='ignore' avoids errors on unseen categories.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(train_cat_df)
col_names = enc.get_feature_names_out()

# Densify each encoded matrix and label its columns with the encoder's names.
train_one_hot_df = pd.DataFrame(enc.transform(train_cat_df).toarray(), columns=col_names)
test_one_hot_df = pd.DataFrame(enc.transform(test_cat_df).toarray(), columns=col_names)
all_one_hot_df = pd.DataFrame(enc.transform(all_cat_df).toarray(), columns=col_names)

In [None]:
# Place the one-hot block and the numeric block side by side. Both indexes are
# reset first so rows are matched by position, not by their original labels.
train_cat_num_joined_df = pd.concat(
    [train_one_hot_df.reset_index(drop=True), train_num_df.reset_index(drop=True)],
    axis=1,
)
test_cat_num_joined_df = pd.concat(
    [test_one_hot_df.reset_index(drop=True), test_num_df.reset_index(drop=True)],
    axis=1,
)
all_cat_num_joined_df = pd.concat(
    [all_one_hot_df.reset_index(drop=True), all_num_df.reset_index(drop=True)],
    axis=1,
)

In [None]:
def convert_int_to_float(X_frame):
    """Return ``X_frame`` with every integer-typed column cast to float.

    The cast goes through ``DataFrame.astype``, which builds a new frame, so the
    caller's object is never modified. The previous implementation assigned the
    converted columns back into ``X_frame``, which changed the caller's data and
    raised ``SettingWithCopyWarning`` when a slice of another frame was passed.

    Parameters
    ----------
    X_frame : pandas.DataFrame
        Frame to convert.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index; columns for which
        ``is_integer_dtype`` is true have dtype float, all others are unchanged.
    """
    int_cols = [col for col in X_frame.columns if is_integer_dtype(X_frame[col])]
    # An empty mapping is fine: astype then simply returns an unchanged copy.
    return X_frame.astype({col: 'float' for col in int_cols})

In [None]:
# Cast the integer columns of each design matrix to float and rebind the names
# to the returned frames.
# NOTE(review): presumably done because the glmnet fit expects floating-point
# input - confirm.
train_cat_num_joined_df = convert_int_to_float(train_cat_num_joined_df)
test_cat_num_joined_df = convert_int_to_float(test_cat_num_joined_df)
all_cat_num_joined_df = convert_int_to_float(all_cat_num_joined_df)

In [None]:
#train_cat_num_joined_df = train_cat_num_joined_df.fillna(0)
#test_cat_num_joined_df = test_cat_num_joined_df.fillna(0)

In [None]:
# Design matrices: the joined one-hot + numeric frames.
X_train = train_cat_num_joined_df
X_test = test_cat_num_joined_df
X_all = all_cat_num_joined_df

# Targets: single-column frames taken from each partition, with an integer
# target cast to float.
y_train = convert_int_to_float(train_df[['target']])
y_test = convert_int_to_float(test_df[['target']])
y_all = convert_int_to_float(all_df[['target']])

In [None]:
# Candidate alpha values for the grid search: 0.0, 0.05, ..., 0.95.
alpha_range = [step / 1000 for step in range(0, 1000, 50)]

### Find the best alpha and lambda - run this section only if a new grid search is necessary. Otherwise skip ahead to the fixed-parameter model below.

In [None]:
#alph_list = list()
#lambda_list = list()
#best_train_score_list = list()
#best_test_score_list = list()
#
#
#thisReg = LogitNet(scoring='roc_auc')
#for a in alpha_range:
#    print(a)
#    thisReg = LogitNet(scoring='roc_auc', alpha = a)
#    thisReg.fit(X_train, y_train.values.ravel())
#    best_lambda = thisReg.lambda_best_
#    best_mask = thisReg.lambda_path_ == thisReg.lambda_best_
#    best_index = [i for i, x in enumerate(best_mask) if x]
#    best_train_score = thisReg.cv_mean_score_[best_index][0]
#    best_test_score = roc_auc_score(y_test.astype(int), thisReg.predict_proba(X_test)[:,1])
#    alph_list.append(a)
#    lambda_list.append(best_lambda)
#    best_train_score_list.append(best_train_score)
#    best_test_score_list.append(best_test_score)

In [None]:
#best_train_score_list

In [None]:
#alph_list

In [None]:
#best_test_auc_mask = max(best_test_score_list) == best_test_score_list
#best_test_auc_index = [i for i, x in enumerate(best_test_auc_mask) if x]
#best_test_auc_index = best_test_auc_index[0]
#best_alpha = alph_list[best_test_auc_index]
#best_alpha
##0.85

In [None]:
#best_grid_test_auc = max(best_test_score_list)
#best_grid_test_auc
##0.7650582500286744

In [None]:
#best_lambda = lambda_list[best_test_auc_index]
#best_lambda = best_lambda[0]
#best_lambda
##0.0015224243404703295

In [None]:
# Fixed hyperparameters for the final fit. The values match the results noted in
# the commented-out grid-search cells above (alpha 0.85 and its best lambda).
# NOTE(review): that search selected alpha by AUC on the test split, so the test
# AUC reported further down is presumably optimistic - confirm before quoting it.
best_lambda = 0.0015224243404703295
best_alpha = 0.85

In [None]:
#made the model worse
#X_train['prev1_yr_coll_neighbor1_flag'] = 0
#X_train.loc[X_train.prev1_yr_coll_neighbor1 == 0, 'prev1_yr_coll_neighbor1_flag'] = 1
#X_train['prev1_yr_coll_cnt_flag'] = 0
#X_train.loc[X_train.prev1_yr_coll_cnt == 0, 'prev1_yr_coll_cnt_flag'] = 1
#X_test['prev1_yr_coll_neighbor1_flag'] = 0
#X_test.loc[X_test.prev1_yr_coll_neighbor1 == 0, 'prev1_yr_coll_neighbor1_flag'] = 1
#X_test['prev1_yr_coll_cnt_flag'] = 0
#X_test.loc[X_test.prev1_yr_coll_cnt == 0, 'prev1_yr_coll_cnt_flag'] = 1

In [None]:
# Fit the model at the single fixed (alpha, lambda) point: the lambda path
# contains only best_lambda.
glmnet = LogitNet(
    scoring='roc_auc',
    alpha=best_alpha,
    lambda_path=[best_lambda],
)
glmnet.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# One row per design-matrix column: fitted coefficient plus the column name,
# ordered from most negative to most positive coefficient.
coef_df = (
    pd.DataFrame(np.transpose(glmnet.coef_), columns=['coefficient'])
    .assign(variables=X_train.columns)
    .sort_values('coefficient')
)
coef_df.shape

In [None]:
# Number of features whose coefficient is exactly zero in the fitted model.
int((coef_df['coefficient'] == 0).sum())

In [None]:
# ROC AUC on the test partition, scored on column 1 of predict_proba.
roc_auc_score(
    y_test.astype(int),
    glmnet.predict_proba(X_test)[:, 1],
)

In [None]:
# Score every row of the full design matrix; keep column 1 of predict_proba.
predictions = glmnet.predict_proba(X_all)[:, 1]

In [None]:
# Attach the scores to the filtered source frame. X_all was built from df_caps,
# a row-for-row copy of df, so the assignment relies on `predictions` having
# one entry per row of df in the same order.
# NOTE(review): assumes predict_proba returns a plain positional array (no
# index alignment) - confirm.
df['prediction'] = predictions

In [None]:
# Keep only the identifying columns and the score for the export.
df_output = df[[
    'hex_id',
    'collision_date',
    'collision_hour',
    'ttv_split',
    'prediction',
]]

In [None]:
# Eyeball five random rows of the export (unseeded, so they differ per run).
df_output.sample(n=5)

In [None]:
model_name = "GLMnet_03"

# df_output is a column slice of df, so assigning a new column into it in place
# raises SettingWithCopyWarning. assign() returns a new frame carrying the
# column instead, which also keeps this cell safe to re-run.
df_output = df_output.assign(model_name=model_name)

# Write the scores for this model to its own CSV under individual_model_scores.
awswrangler.s3.to_csv(
    df=df_output,
    path=f"s3://{s3_bucket}/model_scoring/individual_model_scores/{model_name}.csv",
    index=False,
    boto3_session=my_session,
    use_threads=True,
)

In [None]:
import boto3
import pickle


# Persist the fitted estimator next to its scores. The key is derived from
# model_name: the previous hard-coded 'GLMnet_v02.pkl' did not match the
# "GLMnet_03" scores written above and overwrote the earlier version's artifact.
# NOTE(review): if a downstream job still reads GLMnet_v02.pkl, point it at the
# new key.
key = f'model_scoring/individual_model_scores/{model_name}.pkl'
pickle_byte_obj = pickle.dumps(glmnet)
#s3_resource = boto3.resource('s3')
s3_resource = my_session.resource('s3')

# Pickles execute code when loaded; only unpickle this object from a trusted bucket.
s3_resource.Object(s3_bucket, key).put(Body=pickle_byte_obj)