## How to code and use the Focal Loss with LightGBM

The [Focal Loss](https://arxiv.org/pdf/1708.02002.pdf) for LightGBM can be coded as:

In [67]:
import numpy as np
import pickle
import lightgbm as lgb

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from scipy.misc import derivative

In [68]:
def focal_loss_lgb(y_pred, dtrain, alpha, gamma):
    """
    Focal Loss for lightgbm (custom objective: gradient and hessian)

    With ``p = sigmoid(y_pred)`` and ``t = dtrain.label`` the per-sample loss is

        -(a*t + (1-a)*(1-t)) * (t*(1-p) + (1-t)*p)**g * (t*log(p) + (1-t)*log(1-p))

    Its first and second derivatives with respect to the raw score are
    evaluated in closed form, so no numerical differentiation is involved
    and the result stays finite even when the sigmoid saturates.

    Parameters:
    -----------
    y_pred: numpy.ndarray
        array with the predictions (raw scores, i.e. before the sigmoid)
    dtrain: lightgbm.Dataset
        dataset whose ``label`` attribute holds the ground truth
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf

    Returns:
    --------
    grad, hess: numpy.ndarray
        first and second derivative of the loss w.r.t. each raw score
    """
    a, g = alpha, gamma
    x = np.asarray(y_pred, dtype=float)
    t = np.asarray(dtrain.label, dtype=float)

    # log(p) and log(1 - p) through logaddexp: finite for every finite score,
    # whereas log(1 / (1 + exp(-x))) hits log(0) once the sigmoid saturates.
    log_p = -np.logaddexp(0.0, -x)
    log_q = -np.logaddexp(0.0, x)
    p = np.exp(log_p)
    q = np.exp(log_q)  # 1 - p, computed without cancellation

    weight = a * t + (1 - a) * (1 - t)  # alpha balancing term
    sign = 2 * t - 1
    miss = t * q + (1 - t) * p  # 1 - p_t, base of the modulating factor
    pq = p * q  # derivative of the sigmoid

    # pq <= miss always holds, so the ratio lies in [0, 1]. When both
    # underflow to zero the ratio is set to 0; every term that uses it is
    # multiplied by miss**g or by g, which is zero in that situation.
    ratio = np.divide(pq, miss, out=np.zeros_like(miss), where=miss > 0)

    # Modulating factor miss**g and its first two derivatives.
    mod = miss ** g
    d_mod = -g * sign * ratio * mod
    d2_mod = g * ratio * mod * ((g - 1) * sign ** 2 * ratio - sign * (q - p))

    # Log-likelihood term t*log(p) + (1-t)*log(1-p) and its derivatives.
    ce = t * log_p + (1 - t) * log_q
    d_ce = t * q - (1 - t) * p  # equals t - p, without cancellation
    d2_ce = -pq

    # Product rule applied to loss = -weight * mod * ce.
    grad = -weight * (d_mod * ce + mod * d_ce)
    hess = -weight * (d2_mod * ce + 2 * d_mod * d_ce + mod * d2_ce)
    return grad, hess

If we are going to use the Focal Loss as our custom objective, we also need a matching custom evaluation function:

In [69]:
def focal_loss_lgb_eval_error(y_pred, dtrain, alpha, gamma):
    """
    Adaptation of the Focal Loss for lightgbm to be used as evaluation loss

    Parameters:
    -----------
    y_pred: numpy.ndarray
        array with the predictions (raw scores, i.e. before the sigmoid)
    dtrain: lightgbm.Dataset
        dataset whose ``label`` attribute holds the ground truth
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf

    Returns:
    --------
    tuple
        ``('focal_loss', mean loss, False)``; the trailing ``False`` marks the
        metric as "lower is better"
    """
    a, g = alpha, gamma
    x = np.asarray(y_pred, dtype=float)
    t = np.asarray(dtrain.label, dtype=float)

    # log(p) and log(1 - p) through logaddexp: finite for every finite score,
    # which avoids the 0 * log(0) = nan of the naive sigmoid + log form.
    log_p = -np.logaddexp(0.0, -x)
    log_q = -np.logaddexp(0.0, x)

    weight = a * t + (1 - a) * (1 - t)
    # 1 - (t*p + (1-t)*(1-p)) rewritten as t*(1-p) + (1-t)*p
    modulating = (t * np.exp(log_q) + (1 - t) * np.exp(log_p)) ** g
    loss = -weight * modulating * (t * log_p + (1 - t) * log_q)
    return 'focal_loss', np.mean(loss), False

To use them, we first need to turn them into partial functions of **only** `y_pred` and `dtrain`, since this is the signature LightGBM requires for custom objectives and metrics. Then we simply pass them as arguments.

Let me first load some processed data:

In [76]:
PATH = Path("../data/")
# Open the pickle inside a context manager so the file handle is closed as
# soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(PATH/'adult_databunch.p', 'rb') as databunch_file:
    databunch = pickle.load(databunch_file)
colnames = databunch.colnames
# The crossed columns are treated as categorical features as well.
categorical_columns = databunch.categorical_columns + databunch.crossed_columns
X = databunch.data
y = databunch.target
# In real life you would split into train, valid AND test, and keep the test
# set somewhere safe; here only a stratified 75/25 train/valid split is made.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.25,
    random_state=1, stratify=y)
# let's have a look:
X.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,education_occupation,native_country_occupation
11961,0.287671,0,0,0,0,0,0,0,0.0,0.0,0.397959,0,0,0
1230,0.09589,1,1,0,1,0,0,1,0.0,0.0,0.397959,0,1,1
16067,0.589041,1,1,1,1,1,0,1,0.0,0.0,0.193878,0,1,1
12914,0.452055,1,1,2,2,2,0,0,0.0,0.0,0.479592,0,2,2
6343,0.205479,1,2,2,3,2,0,0,0.0,0.0,0.397959,0,3,3


In [78]:
# Print the underlying values of the target `y` to inspect the labels.
print(y.values)

[1 0 0 ... 0 0 1]


LightGBM with Focal Loss

In [71]:
# Training set: the raw data is kept (free_raw_data=False) and the categorical
# columns are declared by name.
lgtrain = lgb.Dataset(
    data=X_tr,
    label=y_tr,
    feature_name=colnames,
    categorical_feature=categorical_columns,
    free_raw_data=False,
)
# Validation set derived from the training set via create_valid.
lgvalid = lgtrain.create_valid(data=X_val, label=y_val)

In [72]:
def focal_loss(y_pred, dtrain):
    """Focal Loss objective with alpha=0.25 and gamma=2 as a (y_pred, dtrain) callable."""
    return focal_loss_lgb(y_pred, dtrain, 0.25, 2.)


def eval_error(y_pred, dtrain):
    """Focal Loss evaluation metric with alpha=0.25 and gamma=2 as a (y_pred, dtrain) callable."""
    return focal_loss_lgb_eval_error(y_pred, dtrain, 0.25, 2.)


params = {'learning_rate': 0.1, 'num_boost_round': 10}
# NOTE(review): newer LightGBM releases (4.0+) reportedly no longer accept
# `fobj` and expect the callable in params['objective'] instead; confirm
# against the installed version.
model = lgb.train(
    params,
    lgtrain,
    valid_sets=[lgvalid],
    fobj=focal_loss,
    feval=eval_error,
)

New categorical_feature is ['education', 'education_occupation', 'gender', 'marital_status', 'native_country', 'native_country_occupation', 'occupation', 'race', 'relationship', 'workclass']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's focal_loss: 0.098781
[2]	valid_0's focal_loss: 0.0898273
[3]	valid_0's focal_loss: 0.0821333
[4]	valid_0's focal_loss: 0.0755058
[5]	valid_0's focal_loss: 0.0697994
[6]	valid_0's focal_loss: 0.064839
[7]	valid_0's focal_loss: 0.0605124
[8]	valid_0's focal_loss: 0.0567805
[9]	valid_0's focal_loss: 0.0534902
[10]	valid_0's focal_loss: 0.0506304


### Sklearn's API

If you prefer to use LightGBM's sklearn API, simply replace `dtrain` with `y_true`, and swap the predictions and ground truth order, like:

In [73]:
def focal_loss_lgb_sk(y_true, y_pred, alpha, gamma):
    """
    Focal Loss for lightgbm's sklearn API (custom objective: gradient and hessian)

    With ``p = sigmoid(y_pred)`` and ``t = y_true`` the per-sample loss is

        -(a*t + (1-a)*(1-t)) * (t*(1-p) + (1-t)*p)**g * (t*log(p) + (1-t)*log(1-p))

    Its first and second derivatives with respect to the raw score are
    evaluated in closed form, so no numerical differentiation is involved
    and the result stays finite even when the sigmoid saturates.

    Parameters:
    -----------
    y_true: numpy.ndarray
        array with the ground truth
    y_pred: numpy.ndarray
        array with the predictions (raw scores, i.e. before the sigmoid)
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf

    Returns:
    --------
    grad, hess: numpy.ndarray
        first and second derivative of the loss w.r.t. each raw score
    """
    a, g = alpha, gamma
    x = np.asarray(y_pred, dtype=float)
    t = np.asarray(y_true, dtype=float)

    # log(p) and log(1 - p) through logaddexp: finite for every finite score,
    # whereas log(1 / (1 + exp(-x))) hits log(0) once the sigmoid saturates.
    log_p = -np.logaddexp(0.0, -x)
    log_q = -np.logaddexp(0.0, x)
    p = np.exp(log_p)
    q = np.exp(log_q)  # 1 - p, computed without cancellation

    weight = a * t + (1 - a) * (1 - t)  # alpha balancing term
    sign = 2 * t - 1
    miss = t * q + (1 - t) * p  # 1 - p_t, base of the modulating factor
    pq = p * q  # derivative of the sigmoid

    # pq <= miss always holds, so the ratio lies in [0, 1]. When both
    # underflow to zero the ratio is set to 0; every term that uses it is
    # multiplied by miss**g or by g, which is zero in that situation.
    ratio = np.divide(pq, miss, out=np.zeros_like(miss), where=miss > 0)

    # Modulating factor miss**g and its first two derivatives.
    mod = miss ** g
    d_mod = -g * sign * ratio * mod
    d2_mod = g * ratio * mod * ((g - 1) * sign ** 2 * ratio - sign * (q - p))

    # Log-likelihood term t*log(p) + (1-t)*log(1-p) and its derivatives.
    ce = t * log_p + (1 - t) * log_q
    d_ce = t * q - (1 - t) * p  # equals t - p, without cancellation
    d2_ce = -pq

    # Product rule applied to loss = -weight * mod * ce.
    grad = -weight * (d_mod * ce + mod * d_ce)
    hess = -weight * (d2_mod * ce + 2 * d_mod * d_ce + mod * d2_ce)
    return grad, hess

In [74]:
def focal_loss_lgb_eval_error_sk(y_true, y_pred, alpha, gamma):
    """
    Adaptation of the Focal Loss for lightgbm's sklearn API to be used as
    evaluation loss

    Parameters:
    -----------
    y_true: numpy.ndarray
        array with the ground truth
    y_pred: numpy.ndarray
        array with the predictions (raw scores, i.e. before the sigmoid)
    alpha, gamma: float
        See original paper https://arxiv.org/pdf/1708.02002.pdf

    Returns:
    --------
    tuple
        ``('focal_loss', mean loss, False)``; the trailing ``False`` marks the
        metric as "lower is better"
    """
    a, g = alpha, gamma
    x = np.asarray(y_pred, dtype=float)
    t = np.asarray(y_true, dtype=float)

    # log(p) and log(1 - p) through logaddexp: finite for every finite score,
    # which avoids the 0 * log(0) = nan of the naive sigmoid + log form.
    log_p = -np.logaddexp(0.0, -x)
    log_q = -np.logaddexp(0.0, x)

    weight = a * t + (1 - a) * (1 - t)
    # 1 - (t*p + (1-t)*(1-p)) rewritten as t*(1-p) + (1-t)*p
    modulating = (t * np.exp(log_q) + (1 - t) * np.exp(log_p)) ** g
    loss = -weight * modulating * (t * log_p + (1 - t) * log_q)
    return 'focal_loss', np.mean(loss), False

In [75]:
def focal_loss(y_true, y_pred):
    """Focal Loss objective with alpha=0.25 and gamma=2 as a (y_true, y_pred) callable."""
    return focal_loss_lgb_sk(y_true, y_pred, 0.25, 2.)


def eval_error(y_true, y_pred):
    """Focal Loss evaluation metric with alpha=0.25 and gamma=2 as a (y_true, y_pred) callable."""
    return focal_loss_lgb_eval_error_sk(y_true, y_pred, 0.25, 2.)


# The custom objective goes to the constructor, the custom metric to fit().
model = lgb.LGBMClassifier(
    objective=focal_loss,
    learning_rate=0.1,
    num_boost_round=10,
)
model.fit(
    X_tr,
    y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric=eval_error,
)

[1]	valid_0's focal_loss: 0.0988352
[2]	valid_0's focal_loss: 0.0899494
[3]	valid_0's focal_loss: 0.0823239
[4]	valid_0's focal_loss: 0.0757314
[5]	valid_0's focal_loss: 0.0700502
[6]	valid_0's focal_loss: 0.0651475
[7]	valid_0's focal_loss: 0.0608702
[8]	valid_0's focal_loss: 0.0571672
[9]	valid_0's focal_loss: 0.0539455
[10]	valid_0's focal_loss: 0.051152


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_boost_round=10, num_leaves=31,
               objective=<function <lambda> at 0x115bc1950>, random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)