In [1]:
from bayes_opt import BayesianOptimization

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session. Note that this is global:
# warnings raised by any library in later cells are hidden as well.
warnings.filterwarnings('ignore')
seed = 101 # Lucky seed; reused as random_state for the split, the forest in target() and the optimizer

Load and inspect the San Francisco Crime Classification dataset.

In [2]:
# Read the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('./data/train.csv')
# Preview the first rows to check the column names and raw value formats.
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


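Numerically encode the target and the categorical features. Each label is replaced by its frequency rank, so code 0 is the most common label in a column.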

In [3]:
# Columns that hold string labels and must be turned into integer codes.
# 'Category' is later used as the prediction target, the rest as features.
cat_feats = ['Category', 'Descript', 'DayOfWeek', 'PdDistrict', 'Resolution']

# Build one {label: code} lookup per column. value_counts() lists labels from
# most to least frequent, so the code is the label's frequency rank (0 = most
# common). Missing values are not counted and therefore get no code.
cat_dicts = {}
for feat in cat_feats:
    cat_dicts[feat] = {
        label: code
        for code, label in enumerate(df[feat].value_counts().index)
    }

In [4]:
# Replace the string labels of every categorical column with their integer code.
#
# This cell overwrites `df` in place, so it must be safe to run more than once:
# mapping an already-encoded (numeric) column through a lookup keyed by the
# original strings matches nothing and would silently turn the column into NaN.
# Columns that are already numeric are therefore skipped.
# NOTE(review): assumes every column in cat_dicts starts out as strings, as in
# the preview of the raw data above.
for feat in cat_dicts:
    if not pd.api.types.is_numeric_dtype(df[feat]):
        df[feat] = df[feat].map(cat_dicts[feat])
df.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,7,5,1,2,1,OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,1,44,1,2,1,OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,1,44,1,2,1,VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,0,0,1,2,0,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,0,0,1,8,0,100 Block of BRODERICK ST,-122.438738,37.771541


Select the feature matrix and the target, then split them into train (70%) and test (30%) sets.

In [5]:
# Feature matrix: the encoded categorical columns plus the two coordinates.
# NOTE(review): 'Descript' is a finer-grained description of the incident
# (e.g. "WARRANT ARREST" for category WARRANTS), so it very likely leaks the
# target -- confirm that using it as a feature is intended.
X = df[
    ['Descript', 'DayOfWeek', 'PdDistrict', 'Resolution', 'X', 'Y']
].values
# Target: the encoded crime category.
y = df['Category'].values

# Hold out 30% of the rows for the final evaluation; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
)

Define the objective function (mean 3-fold cross-validated negative log loss of a random forest) and set up the Bayesian optimizer that maximizes it.

In [6]:
def target(**params):
    """Objective for the Bayesian optimizer: cross-validated negative log loss.

    Every hyperparameter is truncated with ``int()`` before it is handed to
    the random forest, because the search space is continuous.

    Keyword arguments
    -----------------
    n_estimators, max_depth, min_samples_split
        Truncated to int and forwarded to ``RandomForestClassifier``.
    class_weight
        Truncated to int and used as a switch: 0 -> ``None``,
        1 -> ``'balanced'``. Any other value raises ``KeyError``.

    Returns
    -------
    Mean of the 3-fold ``'neg_log_loss'`` cross-validation scores of the
    forest on ``(X_train, y_train)``.
    """
    forest_kwargs = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'min_samples_split': int(params['min_samples_split']),
    }

    # Decode the numeric switch into the actual class_weight setting.
    weight_options = {0: None, 1: 'balanced'}
    forest_kwargs['class_weight'] = weight_options[int(params['class_weight'])]

    # Same seed for every evaluation, so scores differ only through the
    # hyperparameters.
    forest = RandomForestClassifier(n_jobs=-1, random_state=seed, **forest_kwargs)
    fold_scores = cross_val_score(
        forest, X_train, y_train, scoring='neg_log_loss', cv=3
    )
    return fold_scores.mean()

In [7]:
# Search space: one (lower, upper) bound per hyperparameter of target().
params = {
    'n_estimators': (10, 200),
    'max_depth': (1, 20),
    'min_samples_split': (2, 100),
    # target() truncates this to an int: values below 1 select None,
    # values from 1 up to 1.99 select 'balanced'.
    'class_weight': (0, 1.99),
}
bo = BayesianOptimization(f=target, pbounds=params, random_state=seed)
# 5 initialization evaluations, then 10 optimization steps using the
# 'ucb' acquisition function.
bo.maximize(init_points=5, n_iter=10, acq='ucb')

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   class_weight |   max_depth |   min_samples_split |   n_estimators | 
    1 | 00m43s | [35m  -1.17155[0m | [32m        0.4624[0m | [32m    16.8440[0m | [32m            56.3143[0m | [32m      108.1157[0m | 
    2 | 00m33s |   -1.94870 |         0.1663 |      6.8324 |             36.5089 |       118.4268 | 
    3 | 00m08s |   -1.18467 |         1.2011 |     17.9786 |             19.8255 |        15.4101 | 
    4 | 00m18s |   -1.51822 |         1.4507 |     14.7093 |             78.9890 |        42.5891 | 
    5 | 00m32s |   -2.16718 |         0.5497 |      4.6088 |             96.6174 |       140.2026 | 
[31mBayesian Optimization[0m
[94m---------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   class_weight |   max_depth |   min_samples_s

In [8]:
# Best objective value found and the hyperparameters that produced it.
# The parameters are reported as floats; target() truncates them to ints.
bo.res['max']

{'max_params': {'class_weight': 1.99,
  'max_depth': 20.0,
  'min_samples_split': 2.0,
  'n_estimators': 200.0},
 'max_val': -0.84328138630702298}

Retrain the model on the full training set with the best hyperparameters found, then score it on the held-out test set.

In [9]:
# Refit on the whole training set with the best hyperparameters reported by
# the optimizer (class_weight 1.99 truncates to 1, i.e. 'balanced').
# random_state=seed matches the forest configuration evaluated in target();
# without it the fitted model and the test score change on every run.
model = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=2,
                               class_weight='balanced', n_jobs=-1, random_state=seed)
model.fit(X_train, y_train)

In [11]:
# predict_proba returns one column per entry of model.classes_, in that order,
# so those are the labels log_loss must be given. Taking them from the fitted
# model (instead of the full encoding table) keeps labels and probability
# columns aligned even if a rare category never made it into the training split.
labels = model.classes_
log_loss(y_test, model.predict_proba(X_test), labels=labels)

0.81389066175198699