## Hyperparameter Optimization of LightGBM with Focal Loss

Here I will quicky show how to use [Hyperopt](https://github.com/hyperopt/hyperopt) to optimize all LightGBM's hyperparameters and $\alpha$ and $\gamma$ for the Focal Loss. 

I am going to assume that we want to optimise "against" a standard metric for imbalanced datasets such as the F1 score

We first need to code that metric to be passed to LightGBM

In [7]:
import numpy as np
import lightgbm as lgb
import pickle

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from scipy.misc import derivative
from hyperopt import hp, tpe, fmin, Trials

def sigmoid(x): return 1./(1. +  np.exp(-x))

def focal_loss_lgb(y_pred, dtrain, alpha, gamma):
    """
    Focal Loss for lightgbm

    Parameters:
    -----------
    y_pred: numpy.ndarray
        array with the predictions
    dtrain: lightgbm.Dataset
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf
    """
    a,g = alpha, gamma
    y_true = dtrain.label
    def fl(x,t):
        p = 1/(1+np.exp(-x))
        return -( a*t + (1-a)*(1-t) ) * (( 1 - ( t*p + (1-t)*(1-p)) )**g) * ( t*np.log(p)+(1-t)*np.log(1-p) )
    partial_fl = lambda x: fl(x, y_true)
    grad = derivative(partial_fl, y_pred, n=1, dx=1e-6)
    hess = derivative(partial_fl, y_pred, n=2, dx=1e-6)
    return grad, hess

def lgb_focal_f1_score(preds, lgbDataset):
    """
    When using custom losses the row prediction needs to passed through a
    sigmoid to represent a probability

    Parameters:
    -----------
    preds: numpy.ndarray
        array with the predictions
    lgbDataset: lightgbm.Dataset
    """
    preds = sigmoid(preds)
    binary_preds = [int(p>0.5) for p in preds]
    y_true = lgbDataset.get_label()
    return 'f1', f1_score(y_true, binary_preds), True

Let's now define our objective function

In [15]:
def objective(params):
    """
    objective function for lightgbm.
    """
    # hyperopt casts as float
    params['num_boost_round'] = int(params['num_boost_round'])
    params['num_leaves'] = int(params['num_leaves'])

    # need to be passed as parameter
    params['verbose'] = -1
    params['seed'] = 1

    focal_loss = lambda x,y: focal_loss_lgb(x, y,
        params['alpha'], params['gamma'])
    # if you do not want an annoying warning related to the unrecognised param
    # 'alpha', simple pop them out from the dict params here and insert them
    # back before return. For this particular notebook I can live  with it, so
    # I will leave it
    cv_result = lgb.cv(
        params,
        train,
        num_boost_round=params['num_boost_round'],
        fobj = focal_loss,
        feval = lgb_focal_f1_score,
        nfold=3,
        stratified=True,
        early_stopping_rounds=20)
    # I save the length or the results (i.e. the number of estimators) because
    # it might have stopped earlier and is always useful to have that
    # information 
    early_stop_dict[objective.i] = len(cv_result['f1-mean'])
    score = round(cv_result['f1-mean'][-1], 4)
    objective.i+=1
    return -score

Now the parameter space that we are going to be exploring:

In [5]:
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    'num_boost_round': hp.quniform('num_boost_round', 50, 500, 20),
    'num_leaves': hp.quniform('num_leaves', 31, 255, 4),
    'min_child_weight': hp.uniform('min_child_weight', 0.1, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.),
    'subsample': hp.uniform('subsample', 0.5, 1.),
    'reg_alpha': hp.uniform('reg_alpha', 0.01, 0.1),
    'reg_lambda': hp.uniform('reg_lambda', 0.01, 0.1),
    'alpha': hp.uniform('alpha', 0.1, 0.75),
    'gamma': hp.uniform('gamma', 0.5, 5)
    }

And we are ready, let's just load some data and run the whole thing

In [9]:
PATH = Path("../data/")
databunch = pickle.load(open(PATH/'adult_databunch.p', 'rb'))
colnames = databunch.colnames
categorical_columns = databunch.categorical_columns + databunch.crossed_columns
X = databunch.data
y = databunch.target
# you know, in real life, train, valid AND test, and you keep it somewhere safe...
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.25,
    random_state=1, stratify=y)
# let's have a look:
X.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,education_occupation,native_country_occupation
11961,0.287671,0,0,0,0,0,0,0,0.0,0.0,0.397959,0,0,0
1230,0.09589,1,1,0,1,0,0,1,0.0,0.0,0.397959,0,1,1
16067,0.589041,1,1,1,1,1,0,1,0.0,0.0,0.193878,0,1,1
12914,0.452055,1,1,2,2,2,0,0,0.0,0.0,0.479592,0,2,2
6343,0.205479,1,2,2,3,2,0,0,0.0,0.0,0.397959,0,3,3


In [11]:
print(y.values)

[1 0 0 ... 0 0 1]


In [12]:
train = lgb.Dataset(
    X_tr, y_tr,
    feature_name=colnames,
    categorical_feature = categorical_columns,
    free_raw_data=False)

In [16]:
# the error bar looks better in the terminal...
objective.i=0
trials = Trials()
early_stop_dict = {}
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=5,
            trials=trials)

  0%|          | 0/5 [00:00<?, ?it/s, best loss: ?]





 20%|██        | 1/5 [00:25<01:40, 25.01s/it, best loss: -0.481]





 40%|████      | 2/5 [00:34<01:00, 20.32s/it, best loss: -0.7066]





 60%|██████    | 3/5 [00:52<00:39, 19.54s/it, best loss: -0.7066]





 80%|████████  | 4/5 [01:24<00:23, 23.33s/it, best loss: -0.7066]





100%|██████████| 5/5 [01:35<00:00, 19.63s/it, best loss: -0.7066]


**Note:** using the fscore (or any score) is normally more expensive than a loss. 

In [19]:
best['num_boost_round'] = early_stop_dict[trials.best_trial['tid']]
best['num_leaves'] = int(best['num_leaves'])
best['verbose'] = -1
focal_loss = lambda x,y: focal_loss_lgb(x, y, best['alpha'], best['gamma'])
model = lgb.train(best, train, fobj=focal_loss)
preds = model.predict(X_val)
preds = sigmoid(preds)
preds = (preds > 0.5).astype('int')



In [20]:
print(f1_score(y_val, preds))

0.7121898206846586
