In [1]:
# Uncomment to install LightGBM into the running kernel's environment:
# %pip install lightgbm

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import sklearn.metrics
import lightgbm as lgb
# Raise pandas' display limits so the wide feature frame shown further down is
# not cut off column-wise (up to 500 rows / 500 columns, 150-character lines).
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 150

from pathlib import Path
import sys
# Location of the data directory, relative to the notebook's working directory.
data_path = "../../data"

# Put the directory two levels above the working directory on the import path
# so that the project-level `utils` module can be imported below.
# NOTE(review): the variable is called `pathlib`, like the stdlib module; the
# name is kept as-is in case other cells refer to it.
pathlib = f"{Path().resolve()}/../../"
sys.path.append(pathlib)

from utils import Timer

In [3]:
# Read the pre-processed training frame and replace whatever index was stored
# with it by a fresh 0..n-1 range.
train_df = pd.read_parquet(
    os.path.join(data_path, "1_LearningFE", "train_processed.parquet")
).reset_index(drop=True)

# Column holding the binary target the model is fitted on.
target_label = 'is_installed'

# Columns that must never be fed to the model: both label columns plus
# f_0, f_1 and f_7.
# NOTE(review): f_7 is excluded here, yet the derived f_7_idx column is still
# selected below -- confirm that this is intended.
excluded_features = ['f_0', 'f_7', 'f_1', "is_clicked", "is_installed"]

# Candidate inputs, in the order they are handed to LightGBM:
#   'dow', the base columns f_0..f_79, the `_CE` variants of eleven columns,
#   and the `_idx` variants of columns 2..22 except 2, 4, 6 and 15.
# (The `_CE` / `_idx` columns come from an upstream step not visible here.)
selected_features = (
    ['dow']
    + [f"f_{n}" for n in range(80)]
    + [f"f_{n}_CE" for n in (2, 4, 6, 13, 15, 18, 78, 75, 50, 20, 24)]
    + [f"f_{n}_idx" for n in range(2, 23) if n not in (2, 4, 6, 15)]
)

# Keep only candidates that actually exist in the frame and are not excluded.
features_train = [
    col
    for col in selected_features
    if col in train_df.columns and col not in excluded_features
]
display(train_df[features_train])

# LightGBM training configuration, passed unchanged to lgb.train().
lgbm_parms = dict(
    # Task: gradient-boosted trees for binary classification, scored by log loss.
    boosting_type='gbdt',
    objective='binary',
    metric=['binary_logloss'],
    # Tree / histogram shape.
    num_leaves=63,
    max_bin=255,
    # Number of boosting rounds (supplied through the params dict rather than
    # as an argument to lgb.train).
    num_trees=10000,
    # Leaf-level regularisation.
    min_data_in_leaf=20,
    min_sum_hessian_in_leaf=5.0,
    is_enable_sparse=True,
    # Small step size, paired with the large number of rounds above.
    learning_rate=0.01,
    # Column and row subsampling; bagging_freq controls how often rows are re-drawn.
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
)

# Fit the booster on the full training frame, timing the whole step with the
# project's Timer helper.
#
# The training set is also the only evaluation set, so the binary_logloss that
# gets logged is an in-sample figure and says nothing about generalisation.
with Timer("Training"):
    # Wrap the selected feature columns and the target column for LightGBM.
    dtrain = lgb.Dataset(
        data=train_df[features_train],
        label=train_df[target_label],
    )

    # Progress is reported every 1000 rounds through the log_evaluation
    # callback. The former `verbose_eval=1000` keyword was removed from
    # lgb.train() in LightGBM 4.0 and raises a TypeError there; the callback
    # produces the same "[1000] training's binary_logloss: ..." lines.
    model = lgb.train(
        lgbm_parms,
        train_set=dtrain,
        valid_sets=[dtrain],
        callbacks=[lgb.log_evaluation(period=1000)],
    )

Unnamed: 0,dow,f_2,f_3,f_4,f_5,f_6,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,f_31,f_32,f_33,f_34,f_35,f_36,f_37,f_38,f_39,f_40,f_41,f_42,f_43,f_44,f_45,f_46,f_47,f_48,f_49,f_50,f_51,f_52,f_53,f_54,f_55,f_56,f_57,f_58,f_59,f_60,f_61,f_62,f_63,f_64,f_65,f_66,f_67,f_68,f_69,f_70,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,f_2_CE,f_4_CE,f_6_CE,f_13_CE,f_15_CE,f_18_CE,f_78_CE,f_75_CE,f_50_CE,f_20_CE,f_24_CE,f_3_idx,f_5_idx,f_7_idx,f_8_idx,f_9_idx,f_10_idx,f_11_idx,f_12_idx,f_13_idx,f_14_idx,f_16_idx,f_17_idx,f_18_idx,f_19_idx,f_20_idx,f_21_idx,f_22_idx
0,3,3346,22294,6767,21545,1159,19203,6675,21574,27833,4473,18614,27291,2038,18162,12554,21865,29982,27961,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.308513,0.308513,0.308513,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.539897,0.000000,0.0,0.0,0.0,201888,94474,4065,427984,20006,446231,3394351,14554,3319591,323920,3260294,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,3,20095,563,31686,15908,590,19203,6675,19343,27833,30670,11359,20899,25365,14709,3249,20452,26758,4222,4625,0,0,0,0,0,0,0,0,0,0,3,0,1,0,1,1,1,0,0,0,26.570643,0.000060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.398702,0.000000,0.000000,0.192820,0.000000,0.000000,2.159588,1.605064,2.994627,0.0,0.591404,0.000000,1.065663,90779409,1677.072375,0.000050,8.146457,1.058418,1.712723,0.000000,0.0,0.571121,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,1007519,397980,397980,4526,842,36666,3394351,2810343,3319591,1008800,3260294,1,1,2,2,2,1,2,1,1,1,1,1,1,1,1,1,2
2,3,5156,22294,18971,25604,30192,19203,6675,19343,32266,4473,17705,26082,29887,14709,11918,12020,29982,27961,4740,0,0,0,0,0,0,0,0,0,0,3,0,1,1,1,1,1,0,0,0,15.657012,1.620103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.909948,0.000000,0.000000,0.038564,0.038564,0.038564,2.198152,0.830089,0.302038,0.0,1.626360,0.000000,0.000000,13087224,23.439561,1.495782,7.993264,0.000000,0.000000,1.148976,0.0,2.855607,0.571121,0.115692,1.156922,0.269948,0.0,0.0,0.0,95835,82898,7705,708988,5860,1340010,3394351,2810343,3319591,323920,3260294,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,3,3346,22294,6767,21545,10208,21621,6675,19343,32266,4473,18614,20496,11481,14709,6395,21865,29982,27961,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,1,0,0,0,1.272614,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.192820,0.192820,0.0,0.0,0.0,201888,94474,9149,427984,8169,446231,3394351,15787,3319591,323920,3260294,2,2,3,1,3,2,1,2,2,1,2,1,2,2,2,2,3
4,3,3346,22294,5579,15908,11774,21218,6675,22970,27957,4473,20634,20899,908,14709,13234,12020,29982,27961,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,1,1,0,0,0,21.673006,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.115692,0.038564,0.115692,0.347077,0.154256,0.694153,0.000000,0.000000,0.0,1.478509,0.258599,1.420884,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,201888,37540,8565,332611,3791,1340010,3394351,2810343,3319591,323920,3260294,3,2,4,1,4,1,1,3,1,2,3,1,1,3,3,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3485847,3,26325,7152,21563,21545,28565,21621,6675,22970,11004,4255,4230,26485,1590,25546,11918,12020,16851,30153,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,1,0,0,0,29.308692,2.225864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.762581,0.000000,0.000000,0.000000,0.347077,0.000000,0.347077,1.229138,1.756662,0.0,4.287676,0.129300,0.000000,51532956,556.191364,2.157136,8.087705,4.223591,11.651090,2.177379,0.0,0.000000,0.000000,0.000000,1.156922,0.269948,0.0,0.0,0.0,174464,35939,1345,819496,1150,1340010,3394351,2810343,3319591,222250,3260294,12196,28260,69862,4666,68162,22586,7573,5160,15965,14587,19546,7988,26330,8656,8580,34659,59129
3485848,3,20095,563,31686,19475,590,19203,6675,21574,11810,11402,18614,27291,14827,25546,28694,21865,26758,4222,4625,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,1,0,0,0,28.537411,0.000801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.270802,0.038564,0.000000,0.038564,4.511996,0.000000,5.591790,1.707634,1.979011,0.0,0.147851,0.000000,0.000000,255183433,2255.701154,0.000692,8.190964,0.199623,0.488804,0.000204,0.0,0.000000,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,1007519,397980,397980,427984,6378,446231,3394351,2810343,3319591,1008800,3260294,19203,6139,69863,25945,68163,6331,8234,5052,9700,6954,19547,4398,10064,19429,19202,19217,59130
3485849,3,20095,563,22861,25604,21280,21218,6675,19343,11407,30670,30214,20899,24120,25546,3249,12020,26758,4222,4625,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,1,1,0,0,0,22.675672,0.000801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.270802,0.000000,0.000000,0.269948,0.000000,0.000000,0.192820,1.707634,1.979011,0.0,0.000000,0.129300,0.355221,255183433,2255.701154,0.000692,8.190964,0.199623,0.488804,0.000204,0.0,0.000000,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,1007519,495055,306828,505713,1535,1340010,3394351,2810343,3319591,1008800,3260294,19204,24656,69864,22564,68164,40947,8510,11688,10152,21603,19548,11055,26331,19430,19203,19218,59131
3485850,3,26325,22294,4896,21545,26484,19203,6675,22970,11810,8659,17705,26082,24692,18162,8881,12020,16851,30153,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,14.924294,2.225864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.762581,0.000000,0.000000,0.000000,0.000000,0.000000,0.462769,1.229138,1.756662,0.0,0.443553,2.068795,0.000000,51532956,556.191364,2.157136,8.087705,4.223591,11.651090,2.177379,0.0,3.426729,0.000000,0.000000,1.156922,0.269948,0.0,0.0,0.0,174464,50239,18760,708988,13234,1340010,3394351,2810343,3319591,222250,3260294,32547,28261,69865,25946,68165,22587,8235,8113,13593,7974,12154,3880,26332,8657,8581,34660,59132


Training started ...




[LightGBM] [Info] Number of positive: 606602, number of negative: 2879250
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11238
[LightGBM] [Info] Number of data points in the train set: 3485852, number of used features: 106
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.174018 -> initscore=-1.557412
[LightGBM] [Info] Start training from score -1.557412
[1000]	training's binary_logloss: 0.295715
[2000]	training's binary_logloss: 0.289539
[3000]	training's binary_logloss: 0.286587
[4000]	training's binary_logloss: 0.284283
[5000]	training's binary_logloss: 0.282224
[6000]	training's binary_logloss: 0.280285
[7000]	training's binary_logloss: 0.278458
[8000]	training's binary_logloss: 0.276724
[9000]	training's binary_logloss: 0.275037
[10000]	training's binary_logloss: 0.273412
Training took 828.749081721995 sec


In [4]:
# Write the fitted booster to disk next to the processed training data.
# save_model() hands back the Booster itself, which is why its repr is echoed
# as this cell's output.
model.save_model(
    os.path.join(data_path, "1_LearningFE", "lgbm_trained_HPO1.mdl")
)

<lightgbm.basic.Booster at 0x7fb1d6bc12b0>