# LightGBM Optuna

## Load data

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [3]:
cat_cols = [x for x in train.columns if x.startswith('cat')]
cat_cols

['cat0',
 'cat1',
 'cat2',
 'cat3',
 'cat4',
 'cat5',
 'cat6',
 'cat7',
 'cat8',
 'cat9',
 'cat10',
 'cat11',
 'cat12',
 'cat13',
 'cat14',
 'cat15',
 'cat16',
 'cat17',
 'cat18']

In [4]:
x_train = train.drop(columns=['id','target'])
y_train = train['target']

In [5]:
from categorical_transform import CategoricalTransform
ct = CategoricalTransform(cat_cols)
x_train = ct.fit_transform(x_train)

## Optuna optimierung

In [47]:
from sklearn.metrics import roc_auc_score
from lightgbm.sklearn import LGBMClassifier
from sklearn.pipeline import Pipeline 
import optuna
from categorical_transform import CategoricalTransform
import numpy as np
from sklearn.model_selection import KFold
    
def objective(trial):
    params = {'n_estimators': trial.suggest_int("n_estimators", 10, 20000, log=True),
              'min_child_samples':trial.suggest_int("min_child_samples", 1, 5000, log=True),
              'max_depth':trial.suggest_int("max_depth", 1, 50),
              'reg_lambda': trial.suggest_float("reg_lambda", 0.0001, 25, log=True),
              'reg_alpha': trial.suggest_float("reg_alpha", 0.0001, 25, log=True),
              'colsample_bytree': trial.suggest_float("colsample_bytree", 0.1, 1),
              'num_leaves': trial.suggest_int("num_leaves", 5, 5000, log=True),
              'cat_smooth': trial.suggest_int("cat_smooth", 0, 100),
              'class_weight':trial.suggest_categorical('class_weight',['balanced',None])
             }
    
    lgbm = LGBMClassifier(objective='binary', **params)
    kf = KFold(n_splits=5, shuffle=True)
    roc_test = []
    for train_index, test_index in kf.split(x_train):
        x_train_fold, x_test_fold = x_train.loc[train_index], x_train.loc[test_index]
        y_train_fold, y_test_fold = y_train.loc[train_index], y_train.loc[test_index]
        lgbm.fit(x_train_fold, y_train_fold, 
                 eval_set = (x_test_fold, y_test_fold), early_stopping_rounds=200, verbose=0,
                 eval_metric='auc')    
        #lgbm.fit(x_train_fold, y_train_fold)        
        proba = lgbm.predict_proba(x_test_fold)[:,1]
        roc_test.append(roc_auc_score(y_test_fold, proba))
    
    return np.mean(roc_test)

In [48]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=600)
print(study.best_trial)

[32m[I 2021-03-09 20:13:28,973][0m A new study created in memory with name: no-name-f387ce15-85b4-4f7a-951e-17373d9bb733[0m
[32m[I 2021-03-09 20:14:31,690][0m Trial 0 finished with value: 0.893308095934992 and parameters: {'n_estimators': 4245, 'min_child_samples': 21, 'max_depth': 3, 'reg_lambda': 0.0002470655344589969, 'reg_alpha': 2.2750741446107647, 'colsample_bytree': 0.9728656238206234, 'num_leaves': 6, 'cat_smooth': 72, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.893308095934992.[0m
[32m[I 2021-03-09 20:15:06,993][0m Trial 1 finished with value: 0.8934520772587234 and parameters: {'n_estimators': 945, 'min_child_samples': 93, 'max_depth': 6, 'reg_lambda': 1.6328197478211002, 'reg_alpha': 7.80054923971875, 'colsample_bytree': 0.6498498321239707, 'num_leaves': 1817, 'cat_smooth': 13, 'class_weight': None}. Best is trial 1 with value: 0.8934520772587234.[0m
[32m[I 2021-03-09 20:15:34,198][0m Trial 2 finished with value: 0.8927651182691136 and parameters: {

[32m[I 2021-03-09 20:16:20,923][0m Trial 3 finished with value: 0.8933864776418957 and parameters: {'n_estimators': 6696, 'min_child_samples': 479, 'max_depth': 29, 'reg_lambda': 0.8650991471214148, 'reg_alpha': 0.06564981933430411, 'colsample_bytree': 0.49840323614780657, 'num_leaves': 13, 'cat_smooth': 8, 'class_weight': None}. Best is trial 1 with value: 0.8934520772587234.[0m
[32m[I 2021-03-09 20:16:25,256][0m Trial 4 finished with value: 0.8862869304765508 and parameters: {'n_estimators': 17, 'min_child_samples': 1025, 'max_depth': 10, 'reg_lambda': 0.0011409581755915596, 'reg_alpha': 0.42545882635128784, 'colsample_bytree': 0.42980279920072617, 'num_leaves': 673, 'cat_smooth': 8, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.8934520772587234.[0m
[32m[I 2021-03-09 20:17:04,672][0m Trial 5 finished with value: 0.8928162979934318 and parameters: {'n_estimators': 1625, 'min_child_samples': 18, 'max_depth': 26, 'reg_lambda': 0.00015963551173416208, 'reg_alpha': 0.

[32m[I 2021-03-09 20:17:44,850][0m Trial 6 finished with value: 0.89502835294109 and parameters: {'n_estimators': 13342, 'min_child_samples': 458, 'max_depth': 6, 'reg_lambda': 0.010249670571339088, 'reg_alpha': 0.2281864463471308, 'colsample_bytree': 0.35182941929555944, 'num_leaves': 3798, 'cat_smooth': 78, 'class_weight': 'balanced'}. Best is trial 6 with value: 0.89502835294109.[0m
[32m[I 2021-03-09 20:18:19,721][0m Trial 7 finished with value: 0.8934522658845301 and parameters: {'n_estimators': 18782, 'min_child_samples': 1, 'max_depth': 6, 'reg_lambda': 0.00047064673055634466, 'reg_alpha': 0.0005959941469127894, 'colsample_bytree': 0.3244880845239001, 'num_leaves': 42, 'cat_smooth': 12, 'class_weight': None}. Best is trial 6 with value: 0.89502835294109.[0m
[32m[I 2021-03-09 20:19:02,055][0m Trial 8 finished with value: 0.8940530944916499 and parameters: {'n_estimators': 2103, 'min_child_samples': 207, 'max_depth': 9, 'reg_lambda': 0.014367429687839373, 'reg_alpha': 0.001

[32m[I 2021-03-09 20:19:55,078][0m Trial 10 finished with value: 0.8919340024606015 and parameters: {'n_estimators': 152, 'min_child_samples': 3838, 'max_depth': 49, 'reg_lambda': 0.05891703084077512, 'reg_alpha': 0.02122434117709964, 'colsample_bytree': 0.18807062382508571, 'num_leaves': 3977, 'cat_smooth': 95, 'class_weight': 'balanced'}. Best is trial 6 with value: 0.89502835294109.[0m
[32m[I 2021-03-09 20:20:53,703][0m Trial 11 finished with value: 0.8950006549461538 and parameters: {'n_estimators': 9916, 'min_child_samples': 3234, 'max_depth': 38, 'reg_lambda': 0.009727653225310103, 'reg_alpha': 0.31213136953958504, 'colsample_bytree': 0.10422180964203245, 'num_leaves': 3840, 'cat_smooth': 37, 'class_weight': 'balanced'}. Best is trial 6 with value: 0.89502835294109.[0m
[32m[I 2021-03-09 20:22:07,146][0m Trial 12 finished with value: 0.895235303580943 and parameters: {'n_estimators': 17508, 'min_child_samples': 4679, 'max_depth': 43, 'reg_lambda': 0.012154135473120976, 're

[32m[I 2021-03-09 20:22:18,546][0m Trial 13 finished with value: 0.8900340396906241 and parameters: {'n_estimators': 151, 'min_child_samples': 4856, 'max_depth': 50, 'reg_lambda': 0.11854628855642513, 'reg_alpha': 0.024359589694754415, 'colsample_bytree': 0.12077086957492594, 'num_leaves': 4271, 'cat_smooth': 93, 'class_weight': 'balanced'}. Best is trial 12 with value: 0.895235303580943.[0m
[32m[I 2021-03-09 20:22:22,282][0m Trial 14 finished with value: 0.883983717442848 and parameters: {'n_estimators': 11, 'min_child_samples': 316, 'max_depth': 41, 'reg_lambda': 0.004821616070249156, 'reg_alpha': 0.06448992536122702, 'colsample_bytree': 0.35100586137365364, 'num_leaves': 130, 'cat_smooth': 42, 'class_weight': 'balanced'}. Best is trial 12 with value: 0.895235303580943.[0m
[32m[I 2021-03-09 20:23:02,935][0m Trial 15 finished with value: 0.8953761018691926 and parameters: {'n_estimators': 19901, 'min_child_samples': 1225, 'max_depth': 17, 'reg_lambda': 0.32146282595088194, 're

[32m[I 2021-03-09 20:23:21,089][0m Trial 16 finished with value: 0.8943900442628216 and parameters: {'n_estimators': 239, 'min_child_samples': 2327, 'max_depth': 16, 'reg_lambda': 0.12393009772215906, 'reg_alpha': 15.587554294300741, 'colsample_bytree': 0.18449828912105348, 'num_leaves': 127, 'cat_smooth': 29, 'class_weight': 'balanced'}. Best is trial 15 with value: 0.8953761018691926.[0m
[32m[I 2021-03-09 20:24:09,994][0m Trial 17 finished with value: 0.8938332314507876 and parameters: {'n_estimators': 4788, 'min_child_samples': 5, 'max_depth': 19, 'reg_lambda': 19.63824519744629, 'reg_alpha': 2.911115448622843, 'colsample_bytree': 0.10368025933713763, 'num_leaves': 382, 'cat_smooth': 54, 'class_weight': 'balanced'}. Best is trial 15 with value: 0.8953761018691926.[0m


FrozenTrial(number=15, values=[0.8953761018691926], datetime_start=datetime.datetime(2021, 3, 9, 20, 22, 22, 283175), datetime_complete=datetime.datetime(2021, 3, 9, 20, 23, 2, 934681), params={'n_estimators': 19901, 'min_child_samples': 1225, 'max_depth': 17, 'reg_lambda': 0.32146282595088194, 'reg_alpha': 4.211636870856943, 'colsample_bytree': 0.21531358139125134, 'num_leaves': 314, 'cat_smooth': 76, 'class_weight': 'balanced'}, distributions={'n_estimators': IntLogUniformDistribution(high=20000, low=10, step=1), 'min_child_samples': IntLogUniformDistribution(high=5000, low=1, step=1), 'max_depth': IntUniformDistribution(high=50, low=1, step=1), 'reg_lambda': LogUniformDistribution(high=25, low=0.0001), 'reg_alpha': LogUniformDistribution(high=25, low=0.0001), 'colsample_bytree': UniformDistribution(high=1, low=0.1), 'num_leaves': IntLogUniformDistribution(high=5000, low=5, step=1), 'cat_smooth': IntUniformDistribution(high=100, low=0, step=1), 'class_weight': CategoricalDistribution

In [49]:
study.best_params

{'n_estimators': 19901,
 'min_child_samples': 1225,
 'max_depth': 17,
 'reg_lambda': 0.32146282595088194,
 'reg_alpha': 4.211636870856943,
 'colsample_bytree': 0.21531358139125134,
 'num_leaves': 314,
 'cat_smooth': 76,
 'class_weight': 'balanced'}

In [50]:
study.best_value

0.8953761018691926

## Retrain with optimal parameters

In [26]:
pipe = Pipeline([('cat_trans', CategoricalTransform(cat_cols)), 
              ('lgbm', LGBMClassifier(**study.best_params))])

In [27]:
pipe.fit(x_train, y_train)



Pipeline(steps=[('cat_trans',
                 CategoricalTransform(cat_cols=['cat0', 'cat1', 'cat2', 'cat3',
                                                'cat4', 'cat5', 'cat6', 'cat7',
                                                'cat8', 'cat9', 'cat10',
                                                'cat11', 'cat12', 'cat13',
                                                'cat14', 'cat15', 'cat16',
                                                'cat17', 'cat18'])),
                ('lgbm',
                 LGBMClassifier(cat_smooth=5,
                                colsample_bytree=0.6049537084498485,
                                max_depth=24, min_data_in_leaf=4961,
                                n_estimators=294, num_leaves=458,
                                reg_lambda=0.18247393656457597))])

In [28]:
pred = pipe.predict_proba(x_test.drop(columns='id'))[:,1]

In [29]:
submission = pd.DataFrame({'id':test['id'], 'target':pred})
submission.head()

Unnamed: 0,id,target
0,5,0.118467
1,6,0.357551
2,8,0.008229
3,9,0.263619
4,11,0.095954


In [30]:
submission.to_csv('submission_lightgbm_optuna.csv', index=False)