In [9]:
import pandas as pd
from sklearn.metrics import log_loss
import lightgbm as lgbm
# import optuna.integration.lightgbm as lgb

In [10]:
# Load the feature matrices and target vectors from feather files.
# Every file carries a 'sku' column that becomes the row index; the targets
# are reduced to a Series by taking the first remaining column.
X_train, X_val = (
    pd.read_feather(feather_path).set_index('sku')
    for feather_path in ('X_train.f', 'X_val.f')
)
y_train, y_val = (
    pd.read_feather(feather_path).set_index('sku').iloc[:, 0]
    for feather_path in ('y_train.f', 'y_val.f')
)

# Display the shapes of all four objects as the cell output.
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((432587, 40), (144196, 40), (432587,), (144196,))

In [11]:
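# The model was first tuned with Optuna's LightGBM tuner; that code is kept below
# (commented out) for reference only, and the best parameters it found are
# hard-coded in the next cell.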

# dtrain = lgb.Dataset(
#     data=X_train,
#     label=y_train
# )
# dval = lgb.Dataset(
#     data=X_val,
#     label=y_val
# )

# params = {
#     'objective': 'multiclass',
#     'num_classes': 30,
#     'metric': 'multi_logloss',
#     'boosting': 'gbdt',
#     'random_seed': 0,
#     'deterministic': True
# }  

# # using Optuna LightGBM Tuner
# model = lgb.train(
#     params,
#     dtrain,
#     valid_sets=[dtrain, dval],
#     early_stopping_rounds=20
# )

In [12]:
# Wrap each split in a LightGBM Dataset for use with the native lgbm.train API.
dtrain = lgbm.Dataset(X_train, label=y_train)
dval = lgbm.Dataset(X_val, label=y_val)

# Parameter set passed to lgbm.train: the fixed task settings plus the tuned
# values (key order is kept as in the original literal).
best_params = dict(
    # task definition
    objective='multiclass',
    num_classes=30,
    metric='multi_logloss',
    boosting='gbdt',
    # reproducibility
    random_seed=0,
    deterministic=True,
    feature_pre_filter=False,
    # regularisation and tree shape
    lambda_l1=0.00037397899681396743,
    lambda_l2=8.75904356834923,
    num_leaves=19,
    # feature / row sampling
    feature_fraction=0.8,
    bagging_fraction=1.0,
    bagging_freq=0,
    min_child_samples=25,
    # boosting length and early-stopping patience
    num_iterations=140,
    early_stopping_round=20,
)

# Train the final booster with the tuned parameters, evaluating on both the
# training and the validation Dataset after every iteration.
#
# Early stopping is configured solely through best_params['early_stopping_round'].
# The separate `early_stopping_rounds=20` keyword that used to be passed here
# duplicated that setting, was deprecated in LightGBM 3.3 and removed in 4.0
# (where it raises TypeError), so it is no longer passed.
model = lgbm.train(
    best_params,
    dtrain,
    valid_sets=[dtrain, dval],
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6640
[LightGBM] [Info] Number of data points in the train set: 432587, number of used features: 40
[LightGBM] [Info] Start training from score -3.336781
[LightGBM] [Info] Start training from score -3.308508
[LightGBM] [Info] Start training from score -3.334118
[LightGBM] [Info] Start training from score -3.400128
[LightGBM] [Info] Start training from score -3.461701
[LightGBM] [Info] Start training from score -3.520495
[LightGBM] [Info] Start training from score -3.374346
[LightGBM] [Info] Start training from score -3.283230
[LightGBM] [Info] Start training from score -3.260140
[LightGBM] [Info] Start training from score -3.278189
[LightGBM] [Info] Start training from score -3.360135
[LightGBM] [Info] Start training from score -3.478267
[LightGBM] [Info] Start training from score -3.519792
[LightGBM] [Info] Start training from score -3.3

[87]	training's multi_logloss: 3.06218	valid_1's multi_logloss: 3.12085
[88]	training's multi_logloss: 3.06149	valid_1's multi_logloss: 3.12076
[89]	training's multi_logloss: 3.06081	valid_1's multi_logloss: 3.12069
[90]	training's multi_logloss: 3.06018	valid_1's multi_logloss: 3.12061
[91]	training's multi_logloss: 3.05954	valid_1's multi_logloss: 3.12058
[92]	training's multi_logloss: 3.05887	valid_1's multi_logloss: 3.12053
[93]	training's multi_logloss: 3.05823	valid_1's multi_logloss: 3.12048
[94]	training's multi_logloss: 3.05757	valid_1's multi_logloss: 3.12043
[95]	training's multi_logloss: 3.05695	valid_1's multi_logloss: 3.12038
[96]	training's multi_logloss: 3.05632	valid_1's multi_logloss: 3.12034
[97]	training's multi_logloss: 3.05569	valid_1's multi_logloss: 3.12025
[98]	training's multi_logloss: 3.05507	valid_1's multi_logloss: 3.12023
[99]	training's multi_logloss: 3.05449	valid_1's multi_logloss: 3.12017
[100]	training's multi_logloss: 3.05387	valid_1's multi_logloss:

In [13]:
%%time

# predictions on best model
probs_train = model.predict(X_train)
probs_val = model.predict(X_val)

log_train = log_loss(y_train, probs_train)
log_val = log_loss(y_val, probs_val)
print('Log loss (train):', log_train)
print('Log loss (val):', log_val)

# scoring_function expects target in [1, 30] not in [0, 29]
rps_train = scoring_function(y_train + 1, probs_train)
rps_val = scoring_function(y_val + 1, probs_val)
print('RPS (train):', rps_train)
print('RPS (val):', rps_val)

Log loss (train): 3.030849956990119
Log loss (val): 3.1194339837202962
RPS (train): 3.626246687440311
RPS (val): 3.6679233366046873
CPU times: user 6min 9s, sys: 2.22 s, total: 6min 11s
Wall time: 1min 51s


In [16]:
# Write the validation-set probabilities to CSV; the frame keeps pandas'
# default integer row index and column labels.
pd.DataFrame(data=probs_val).to_csv(path_or_buf='2.1-probs-tuned-lgbm.csv')