### GBM Notebook

In [None]:
#!pip install catboost

In [None]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_integer_dtype

import numpy as np

import catboost as cb
from catboost import CatBoostClassifier

from sklearn.metrics import roc_auc_score

# Suppress all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so it also silences any pandas
# chained-assignment warnings that later cells may trigger when they assign
# into frames derived from `df` — confirm that hiding them is intended.
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import boto3
import awswrangler

# Name of the S3 bucket the notebook reads the model data from and writes the scores to.
s3_bucket = 'traffic-data-bucket'


In [None]:
# AWS credentials are imported from the local `aws_secrets` module rather than
# being written into the notebook.
from aws_secrets import (
    aws_access_key_id,
    aws_secret_access_key,
    aws_session_token,
)

# boto3 session passed to the awswrangler S3 calls in this notebook.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

In [None]:
# Load the post-transformation model data from S3 into a DataFrame.
df = awswrangler.s3.read_parquet(
    path=f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet',
    boto3_session=my_session,
    use_threads=True,
)

In [None]:
# Show the distinct collision years present in the loaded data.
df.collision_year.unique()

In [None]:
def get_categorical_indicies(X):
    """Return the column positions of every non-numeric column in ``X``.

    Args:
        X: DataFrame whose columns are inspected.

    Returns:
        list: the ``X.columns.get_loc(...)`` result for each column whose
        dtype is not numeric, in column order. Empty when every column is
        numeric.
    """
    non_numeric_columns = [
        name for name in X.columns if not is_numeric_dtype(X[name])
    ]
    return [X.columns.get_loc(name) for name in non_numeric_columns]
#categorical_indicies = get_categorical_indicies(X)

In [None]:
def convert_cats(X_frame):
    """Cast every non-numeric column of ``X_frame`` to the ``category`` dtype.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment (``X = convert_cats(X)``).

    Args:
        X_frame: DataFrame to convert.

    Returns:
        The same ``X_frame`` object, with its non-numeric columns converted.
    """
    # Decide which columns to convert before touching the frame.
    non_numeric_columns = [
        col for col in X_frame.columns if not is_numeric_dtype(X_frame[col])
    ]
    for col in non_numeric_columns:
        X_frame[col] = X_frame[col].astype('category')
    return X_frame

In [None]:
def convert_int_to_float(X_frame):
    """Cast every integer-typed column of ``X_frame`` to ``float``.

    The frame is modified in place and the same object is returned.

    Args:
        X_frame: DataFrame to convert.

    Returns:
        The same ``X_frame`` object, with its integer columns cast to float.
    """
    integer_columns = [
        name for name in X_frame.columns if is_integer_dtype(X_frame[name])
    ]
    for name in integer_columns:
        X_frame[name] = X_frame[name].astype('float')
    return X_frame

In [None]:
# Initialise the modelling configuration: the list of collision years and the
# feature groups that are combined into `model_features`.
collision_year_list = list(range(2015, 2020))

# NOTE(review): 'edge_speek_kph_min' looks like a misspelling of "speed", but it
# is presumably the column name as it exists in the data — confirm before renaming.
street_features = [
    'la_data_city_name',
    'node_street_count',
    'node_stop',
    'node_traffic_signals',
    'edge_speed_kph_max',
    'edge_speek_kph_min',
    'edge_lanes_max',
    'edge_motorway_flag',
    'edge_motorway_link_flag',
    'edge_living_street_flag',
    'edge_bridge_flag',
    'edge_oneway_flag',
    'edge_tunnel_flag',
    'amenities_bar_cnt',
    'amenities_school_cnt',
    'amenities_restaurant_cnt',
    'amenities_college_cnt',
    'drv_edge_lanes_max_imputed_flag',
]

time_features = [
    'drv_collision_hour_sin',
    'drv_collision_hour_cos',
    'collision_month',
    'collision_dayofweek',
    'drv_holiday_flag',
]

hex_history_features = [
    'prev1_yr_coll_cnt',
    'prev1_yr_coll_neighbor1',
]

weather_features = [
    'noaa_wind_speed',
    'noaa_precipitation',
    'noaa_temperature_average',
    'noaa_temperature_max',
    'noaa_temperature_min',
]

# Full candidate feature list: street, then time, then history, then weather.
model_features = [
    *street_features,
    *time_features,
    *hex_history_features,
    *weather_features,
]


**CatBoost hyperparameters** (argument with an example value — description):
* `iterations=500` — The maximum number of trees that can be built when solving machine learning problems. Fewer may be used.
* `learning_rate=0.03` — Used for reducing the gradient step. It affects the overall training time: the smaller the value, the more iterations are required for training.
* `depth=6` — Depth of the tree. Can be any integer up to 32. Good values are in the range 1–10.
* `l2_leaf_reg=3` — Try different values for the regularizer to find the best one. Any positive value is allowed.
* `border_count=32` — The number of splits for numerical features. Allowed values are integers from 1 to 255 inclusive.

In [None]:
# Hyperparameter grid for the grid search. Every entry lists a single
# candidate value, so the grid contains exactly one configuration.
param_grid = dict(
    depth=[12],
    iterations=[400],
    learning_rate=[0.2],
    l2_leaf_reg=[10],
    border_count=[40],
)

In [None]:
# Drop every 2014 row, then list the remaining years as a quick check.
df = df.loc[df['collision_year'] != 2014]
df['collision_year'].unique()

In [None]:
#df['collision_month'] = df['collision_month'].map(str)
#df['collision_dayofweek'] = df['collision_dayofweek'].map(str)
#df['collision_year'] = df['collision_year'].map(str)

In [None]:
# Replace each city name by its category code, stored as a string.
# NOTE(review): re-running this cell re-encodes the already-encoded strings, so
# the codes may change on a second execution — run it once per kernel session.
df['la_data_city_name'] = pd.Categorical(df['la_data_city_name'])
city_codes = df['la_data_city_name'].cat.codes
df['la_data_city_name'] = city_codes.map(str)

In [None]:
# Work on a separate copy so the value capping applied next does not alter `df`.
df_caps = df.copy()

In [None]:
# Upper bound per feature: values above the bound are replaced by the bound.
upper_caps = {
    'edge_lanes_max': 8,
    'node_traffic_signals': 13,
    'node_street_count': 6,
    'node_stop': 20,
    'amenities_bar_cnt': 2,
    'amenities_school_cnt': 3,
    'amenities_restaurant_cnt': 3,
    'amenities_college_cnt': 1,
    'prev1_yr_coll_cnt': 70,
    'prev1_yr_coll_neighbor1': 350,
}
for capped_column, cap in upper_caps.items():
    df_caps[capped_column] = df_caps[capped_column].clip(upper=cap)

# After capping, divide the neighbour count by 7 and round to whole numbers.
# NOTE: re-running this cell divides by 7 again; re-create df_caps first.
df_caps['prev1_yr_coll_neighbor1'] = df_caps['prev1_yr_coll_neighbor1'].divide(7).round()

In [None]:
# Features left out of this model version.
excluded_features = (
    'noaa_precipitation',
    'noaa_temperature_max',
    'noaa_temperature_average',
    'edge_speek_kph_min',
    'edge_living_street_flag',
)

# Filter the list in place instead of calling list.remove(): the cell stays
# re-runnable, because a second execution is a no-op rather than a ValueError
# for a name that is already gone. Relative order of the kept features is
# unchanged. A name in `excluded_features` that is not in the list is ignored.
model_features[:] = [
    name for name in model_features if name not in excluded_features
]

In [None]:
#v5 try isolating the training window
# Rows from 2020 are excluded from both the train and the test frame.
not_2020 = df_caps['collision_year'] != 2020

train_df = df_caps[(df_caps['ttv_split'] == 'Train') & not_2020]
test_df = df_caps[(df_caps['ttv_split'] == 'Test') & not_2020]

# Unfiltered copy (every split and year), used later to score all rows.
all_df = df_caps.copy()

In [None]:
# Feature matrices for the train split, the test split and the full frame.
X_train, X_test, X_all = (
    frame[model_features] for frame in (train_df, test_df, all_df)
)
# Targets for the two labelled splits.
y_train, y_test = train_df['target'], test_df['target']

In [None]:
# Cast the holiday flag to float in all three feature frames.
for split_frame in (X_train, X_test, X_all):
    split_frame['drv_holiday_flag'] = split_frame['drv_holiday_flag'].astype(float)

In [None]:
# Show every column when frames are displayed, then peek at two random training rows.
pd.set_option('display.max_columns', None)
X_train.sample(2)

In [None]:
# Positions of the non-numeric feature columns, taken from the training frame.
categorical_indicies = get_categorical_indicies(X_train)

# Cast the non-numeric columns to the category dtype in all three frames.
X_train, X_test, X_all = map(convert_cats, (X_train, X_test, X_all))

In [None]:
# Cast the integer feature columns to float in all three frames.
X_test, X_train, X_all = map(convert_int_to_float, (X_test, X_train, X_all))

In [None]:
#X_train.dtypes

In [None]:
# CatBoost pools for the train and test splits; categorical columns are
# identified by their position in `model_features`.
train_dataset = cb.Pool(
    data=X_train[model_features],
    label=y_train,
    cat_features=np.array(categorical_indicies),
)
test_dataset = cb.Pool(
    data=X_test[model_features],
    label=y_test,
    cat_features=np.array(categorical_indicies),
)

In [None]:
# CatBoost regressor optimised with RMSE loss while AUC is reported as the
# evaluation metric; output is silenced and metrics are computed every 50 iterations.
# NOTE(review): CatBoostClassifier is imported but a regressor is used, and the
# target is later scored with roc_auc_score — confirm the regressor is intended.
model = cb.CatBoostRegressor(loss_function='RMSE', eval_metric='AUC:hints=skip_train~false', silent=True, metric_period=50)

# Search `param_grid` on the training pool. Later cells call predict() on
# `model`, so they rely on grid_search leaving it fitted — TODO confirm.
model.grid_search(param_grid,train_dataset,verbose=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: importance value for each feature.
        names: feature names, aligned element-wise with ``importance``.
        model_type: text placed in front of 'FEATURE IMPORTANCE' in the title.

    Returns:
        None. The chart is drawn on a new 10x8 matplotlib figure.
    """
    # Pair each name with its importance and rank from most to least important.
    ranked = pd.DataFrame(
        {
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        }
    ).sort_values(by=['feature_importance'], ascending=False)

    # New figure for the seaborn bar chart; all bars share one palette colour.
    plt.figure(figsize=(10,8))
    bar_colour = sns.color_palette("Set2")[7]
    sns.barplot(
        x=ranked['feature_importance'],
        y=ranked['feature_names'],
        color=bar_colour,
    )

    # Title and axis labels.
    plt.title(model_type + 'FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')
In [None]:
# Plot the fitted model's feature importances against the training column names.
plot_feature_importance(model.get_feature_importance(),X_train.columns,'CATBOOST ')

In [None]:
# Score every row of the full feature frame (not only the train/test splits).
predictions = model.predict(X_all)

In [None]:
# Attach the scores to `df`.
# NOTE(review): assumes `df` has the same rows, in the same order, as X_all
# (which was built from a copy of `df`) — TODO confirm.
df['prediction'] = predictions

In [None]:
# Columns written to the scoring file. The explicit copy makes df_output an
# independent frame, so assigning new columns to it does not depend on pandas'
# view-versus-copy behaviour.
df_output = df[['hex_id', 'collision_date', 'collision_hour', 'ttv_split', 'prediction']].copy()

In [None]:
# Peek at five random rows of the output frame.
df_output.sample(5)

In [None]:
# ROC AUC of the model's predictions on the test split, with the target cast to int.
roc_auc_score(y_test.astype(int), model.predict(X_test))
# Presumably the score recorded from an earlier run:
#0.7885608825082839

In [None]:
# Tag the scores with this model's identifier and upload them to S3 as CSV.
model_name = "GBM_06"
df_output['model_name'] = model_name

awswrangler.s3.to_csv(
    df=df_output,
    path=f"s3://{s3_bucket}/model_scoring/individual_model_scores/{model_name}.csv",
    index=False,
    boto3_session=my_session,
    use_threads=True,
)