### GBM Notebook

In [None]:
#!pip install catboost

In [None]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_integer_dtype

import numpy as np

import catboost as cb
from catboost import CatBoostClassifier

# Suppress all warnings
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

import boto3
import awswrangler

# Name of the S3 bucket the model data is read from and the scores are written to.
s3_bucket = 'traffic-data-bucket'

In [None]:
from aws_secrets import aws_access_key_id, aws_secret_access_key, aws_session_token

# Explicit boto3 session built from the credentials imported from aws_secrets;
# it is passed to the awswrangler S3 read and write calls in this notebook.
my_session = boto3.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_session_token,
)

In [None]:
# Load the post-transformation model data from S3 into a DataFrame.
df = awswrangler.s3.read_parquet(
    path=f's3://{s3_bucket}/model_data/model_data_post_transformation.parquet',
    boto3_session=my_session,
    use_threads=True,
)

In [None]:
# Distinct collision years present in the loaded data.
df['collision_year'].unique()

In [None]:
def get_categorical_indicies(X):
    """Return the positional indices of the non-numeric columns of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns are inspected with ``is_numeric_dtype``.

    Returns
    -------
    list
        ``X.columns.get_loc(name)`` for every column that is not numeric,
        in column order.
    """
    non_numeric = [name for name in X.columns if not is_numeric_dtype(X[name])]
    return [X.columns.get_loc(name) for name in non_numeric]
#categorical_indicies = get_categorical_indicies(X)

In [None]:
def convert_cats(X_frame):
    """Cast every non-numeric column of ``X_frame`` to the ``category`` dtype.

    The conversion is applied to a copy, so the caller's frame is never
    modified. This makes the function safe to call on a slice of another
    DataFrame and safe to re-run.

    Parameters
    ----------
    X_frame : pandas.DataFrame
        Frame whose non-numeric columns (as judged by ``is_numeric_dtype``)
        should become categorical.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order; numeric
        columns are left as they are.
    """
    converted = X_frame.copy()
    for col in X_frame.columns:
        if not is_numeric_dtype(X_frame[col]):
            converted[col] = converted[col].astype('category')
    return converted

In [None]:
def convert_inst_to_float(X_frame):
    """Cast every integer-typed column of ``X_frame`` to float.

    The conversion is applied to a copy, so the caller's frame is never
    modified. This makes the function safe to call on a slice of another
    DataFrame and safe to re-run.

    Parameters
    ----------
    X_frame : pandas.DataFrame
        Frame whose integer columns (as judged by ``is_integer_dtype``)
        should be converted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order; non-integer
        columns are left as they are.
    """
    converted = X_frame.copy()
    for col in X_frame.columns:
        if is_integer_dtype(X_frame[col]):
            converted[col] = converted[col].astype('float')
    return converted

In [None]:
# Show the column names of the loaded frame (reference for the feature lists defined below).
df.columns

In [None]:
# Collision years. NOTE(review): not referenced by any later cell shown here.
collision_year_list = [2015, 2016, 2017, 2018, 2019]

# Street-network and amenity features.
street_features = [
    # 'la_data_city_name',
    'node_street_count',
    'node_stop',
    'node_traffic_signals',
    'edge_speed_kph_max',
    # NOTE(review): 'speek' looks like a typo of 'speed'; kept verbatim because
    # it has to match the column name in the data - confirm upstream.
    'edge_speek_kph_min',
    'edge_lanes_max',
    'edge_motorway_flag',
    'edge_motorway_link_flag',
    'edge_living_street_flag',
    'edge_bridge_flag',
    'edge_oneway_flag',
    'edge_tunnel_flag',
    'amenities_bar_cnt',
    'amenities_school_cnt',
    'amenities_restaurant_cnt',
    'amenities_college_cnt',
    'drv_edge_lanes_max_imputed_flag',
]

# Time-of-collision features.
time_features = [
    'drv_collision_hour_sin',
    'drv_collision_hour_cos',
    'collision_month',
    'collision_dayofweek',
    'drv_holiday_flag',
]

# Previous-year collision-count features.
hex_history_features = [
    'prev1_yr_coll_cnt',
    'prev1_yr_coll_neighbor1',
]

# NOAA weather features.
weather_features = [
    'noaa_wind_speed',
    'noaa_precipitation',
    'noaa_temperature_average',
    'noaa_temperature_max',
    'noaa_temperature_min',
]

# Full, ordered list of model inputs; the order fixes the column positions
# used for the categorical indices.
model_features = [
    *street_features,
    *time_features,
    *hex_history_features,
    *weather_features,
]


argument	description
* iterations=500	The maximum number of trees that can be built when solving machine learning problems. Fewer may be used.
* learning_rate=0.03	Used to reduce the gradient step. It affects the overall training time: the smaller the value, the more iterations are required for training.
* depth=6	Depth of the tree. Can be any integer up to 16 (on CPU). Good values are in the range 1 - 10.
* l2_leaf_reg=3	Coefficient of the L2 regularization term. Try different values to find the best one; any positive value is allowed.
* loss_function='Logloss'	For 2-class classification use 'Logloss' or 'CrossEntropy'. For multiclass use 'MultiClass'.
* border_count=32	The number of splits for numerical features. Allowed values are integers from 1 to 255 inclusively.
* ctr_border_count=50	The number of splits for categorical features. Allowed values are integers from 1 to 255 inclusively.

In [None]:
# Hyper-parameter grid handed to model.grid_search. Every value has to be a
# list of candidates, including parameters held at a single value
# (thread_count), otherwise the grid cannot be iterated.
# NOTE: the full grid below is 10 * 4 * 6 * 5 * 7 * 6 = 50,400 combinations.
param_grid = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations':[250,100,500,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200],
          'ctr_border_count':[50,5,10,20,100,200],
          'thread_count':[4]}

In [None]:
# Turn the calendar fields into strings so they are non-numeric; the ones used
# as features are then picked up as categorical columns.
for calendar_col in ('collision_month', 'collision_dayofweek', 'collision_year'):
    df[calendar_col] = df[calendar_col].map(str)

In [None]:
# Split the rows by the precomputed ttv_split label.
train_df = df.loc[df['ttv_split'] == 'Train']
test_df = df.loc[df['ttv_split'] == 'Test']

In [None]:
# Feature matrices and targets for the two splits.
# The feature frames are explicit copies: later cells assign columns into
# X_train / X_test, and doing that on a slice of train_df / test_df is a chained
# assignment (its SettingWithCopyWarning is hidden by the global warnings filter).
X_train = train_df[model_features].copy()
X_test = test_df[model_features].copy()
y_train = train_df['target']
y_test = test_df['target']

In [None]:
# Cast the holiday flag to float in both feature frames.
for feature_frame in (X_train, X_test):
    feature_frame['drv_holiday_flag'] = feature_frame['drv_holiday_flag'].astype(float)

In [None]:
# Show every column when displaying frames, then preview two random training rows.
pd.set_option('display.max_columns', None)
X_train.sample(2)

In [None]:
# Positions of the non-numeric feature columns; passed to cb.Pool as cat_features in a later cell.
categorical_indicies = get_categorical_indicies(X_train)
# Cast the non-numeric columns of both feature frames to the pandas 'category' dtype.
X_train = convert_cats(X_train)
X_test = convert_cats(X_test)

In [None]:
# Cast every integer-typed feature column to float in both frames.
X_test = convert_inst_to_float(X_test)
X_train = convert_inst_to_float(X_train)

In [None]:
# Inspect the column dtypes after the category / float conversions.
X_train.dtypes

In [None]:
# Wrap features and labels in CatBoost Pools, flagging the categorical columns by position.
train_dataset = cb.Pool(
    data=X_train[model_features],
    label=y_train,
    cat_features=np.array(categorical_indicies),
)
test_dataset = cb.Pool(
    data=X_test[model_features],
    label=y_test,
    cat_features=np.array(categorical_indicies),
)

In [None]:
# Binary classifier trained with Logloss and evaluated with F1; training output is silenced.
model = cb.CatBoostClassifier(loss_function='Logloss', eval_metric='F1', silent=True)

# Search param_grid on the training pool.
# NOTE(review): later cells call model.get_feature_importance() and model.predict(),
# so this relies on grid_search leaving `model` fitted (refit on the best
# parameters is presumably the default) - confirm for the installed CatBoost version.
model.grid_search(param_grid,train_dataset,verbose=False)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_feature_importance(importance, names, model_type):
    """Plot feature importances as a bar chart, largest importance first.

    Parameters
    ----------
    importance : array-like
        Importance value of each feature.
    names : array-like
        Feature names, aligned with ``importance``.
    model_type : str
        Prefix prepended to the chart title.
    """
    # Pair names with importances and rank them in descending order.
    ranked = pd.DataFrame(
        {
            'feature_names': np.array(names),
            'feature_importance': np.array(importance),
        }
    ).sort_values(by=['feature_importance'], ascending=False)

    # Importance on the x-axis, feature names on the y-axis.
    plt.figure(figsize=(10, 8))
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    # Title and axis labels.
    plt.title(model_type + 'FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

In [None]:
# Chart the importances of the fitted model against the training feature names.
plot_feature_importance(
    importance=model.get_feature_importance(),
    names=X_train.columns,
    model_type='CATBOOST ',
)

In [None]:
# Build the scoring frame from a copy of the full data set and apply the same
# preprocessing the training features received.
model_output = df.copy()
# Mirror the training-time cast of the holiday flag so the scored features have
# the same dtype the model was fitted on.
model_output['drv_holiday_flag'] = model_output['drv_holiday_flag'].astype(float)
# NOTE(review): the two helpers run over every column of the copy, not only
# model_features, so the identifier columns written out later (hex_id,
# collision_date, collision_hour, ttv_split) are converted too - confirm intended.
model_output = convert_inst_to_float(model_output)
model_output = convert_cats(model_output)

In [None]:
# Score every row of the scoring frame (all ttv_split values) on the model features.
model_output['prediction'] = model.predict(model_output[model_features])

In [None]:
# Tag each row with this model's identifier; the same name is used for the output file.
model_output['model_name'] = "GBM_01"

In [None]:
# Keep only the row identifiers, the split label, the prediction and the model tag.
model_output = model_output[
    [
        'hex_id',
        'collision_date',
        'collision_hour',
        'ttv_split',
        'prediction',
        'model_name',
    ]
]

In [None]:
# Write the scored rows to S3 as a CSV without the index column.
awswrangler.s3.to_csv(
    df=model_output,
    path=f"s3://{s3_bucket}/model_scoring/individual_model_scores/GBM_01.csv",
    index=False,
    boto3_session=my_session,
    use_threads=True,
)