In [1]:
import numpy as np
import pandas as pd


from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.model_selection import KFold, cross_val_score

from bayes_opt import BayesianOptimization
from bayes_opt.util import Colours

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('data/train_transf.csv')
test = pd.read_csv('data/test_transf.csv')

В ходе решения сравнивались несколько алгоритмов, также тестировался стэкинг нескольких алгоритмов. Лучше всего себя показал LGB с байесовской оптимизацией гиперпараметров

In [3]:
features = list(filter(lambda x: ('field' in x), train.columns))


X = train[features]
y = train['goal1']

## Hyperparameters tuning

In [6]:
def lgb_cv(n_estimators, learning_rate, max_depth,\
           num_leaves,\
           colsample_bytree,\
           subsample,\
           reg_alpha,\
           reg_lambda,\
           
           data, targets):

    estimator = LGBMClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves= num_leaves,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        
        random_state=2
    )
    cval = cross_val_score(estimator, data, targets,
                               scoring='neg_log_loss', cv=4)
    return cval.mean()

In [7]:
def optimize_lgb(data, targets):
    """Apply Bayesian Optimization to Lgb parameters."""
    def lgb_crossval(n_estimators, learning_rate, max_depth, 
                    num_leaves,
                    colsample_bytree,
                    subsample,
                    reg_alpha,
                    reg_lambda):
        """Wrapper of Lgb validation.
        Notice how we ensure n_estimators and min_samples_split are casted
        to integer before we pass them along. Moreover, to avoid max_features
        taking values outside the (0, 1) range, we also ensure it is capped
        accordingly.
        """
        return lgb_cv(
            n_estimators=int(n_estimators),
            learning_rate=learning_rate,
            max_depth=int(max_depth),
            
            num_leaves= int(num_leaves),
            colsample_bytree=colsample_bytree,
            subsample=subsample,
            reg_alpha=int(reg_alpha),
            reg_lambda=int(reg_lambda) ,            
            
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=lgb_crossval,
        pbounds={
            "n_estimators": (200, 250),
            "learning_rate": (0.01, 0.05),
            "max_depth":(1, 8),
            
            'num_leaves':(6,50),
            'colsample_bytree':(0.5,0.7),
            'subsample':(0.6,0.8),
            'reg_alpha':(0,100),
            'reg_lambda':(0,100),
        },
        random_state=1234,
        verbose=2
    )
    optimizer.maximize(n_iter=5)

    print("Final result:", optimizer.max)

In [8]:
print(Colours.green("--- Optimizing Algo ---"))
optimize_lgb(X, y)

[92m--- Optimizing Algo ---[0m
|   iter    |  target   | colsam... | learni... | max_depth | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1014  [0m | [0m 0.5383  [0m | [0m 0.03488 [0m | [0m 4.064   [0m | [0m 239.3   [0m | [0m 40.32   [0m | [0m 27.26   [0m | [0m 27.65   [0m | [0m 0.7604  [0m |
| [0m 2       [0m | [0m-0.102   [0m | [0m 0.6916  [0m | [0m 0.04504 [0m | [0m 3.505   [0m | [0m 225.0   [0m | [0m 36.07   [0m | [0m 71.27   [0m | [0m 37.03   [0m | [0m 0.7122  [0m |
| [0m 3       [0m | [0m-0.1025  [0m | [0m 0.6006  [0m | [0m 0.01055 [0m | [0m 6.41    [0m | [0m 244.1   [0m | [0m 22.05   [0m | [0m 61.54   [0m | [0m 7.538   [0m | [0m 0.6738  [0m |
| [0m 4       [0m | [0m-0.1019  [0m | [0m 0.6866  [0m | [0m 0.03606 [0m | [0m 3.78    [0m | [0m 239.4   [

## Model fit and submit

In [9]:
%%time


lgb = LGBMClassifier(n_estimators=247, learning_rate = 0.03, max_depth = 6,
                        colsample_bytree = 0.626,
                        subsample = 0.668,
                        num_leaves = 46,
                        reg_alpha = 0.4,
                        reg_lambda = 98
                        )
scores = cross_val_score(lgb, X, y, scoring='roc_auc', cv=3)
print(f"{scores.mean()} ± {scores.std()}")

0.6864370237863696 ± 0.008963339327280133
Wall time: 16.9 s


In [10]:
#submit
lgb.fit(X, y)
pred = lgb.predict_proba(test[features])[:,1]

pd.DataFrame(pred, columns=['proba'], index=test['orderid']).to_csv('submit_0.csv')

Данного решения хватило для получения бронзовой медали в первой задаче контеста. Модель можно улучшить, если продолжить более детальный анализ закодированных признаков. 