### Import

In [20]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import category_encoders as ce
import lightgbm as lgb

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [21]:
# 동일한 결과 보장을 위해 Seed값을 고정합니다
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed를 42로 고정

In [22]:
# 제공된 train 데이터와 test 데이터를 불러옵니다
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [23]:
# 빠른 훈련을 위해 데이터 갯수 줄이기
# train = train.sample(100000)
# train.info()

### EDA 1 : Sparse and Dense

In [24]:
train.head()

Unnamed: 0,ID,Click,F01,F02,F03,F04,F05,F06,F07,F08,...,F30,F31,F32,F33,F34,F35,F36,F37,F38,F39
0,TRAIN_00000000,1,NSLHFNS,AVKQTCL,DTZFPRW,114.0,ISVXFVA,1,PQZBVMG,LPYPUNA,...,NZGEZLW,GTISJWW,380.0,2.0,AXQFZWC,IRUDRFB,,TFJMLCZ,0.0,AURZYDY
1,TRAIN_00000001,0,VGIVWZQ,LSUSMVO,PQGWFJZ,26.0,NFRVLWS,43,IMPIGJT,MIGYEEG,...,NZGEZLW,GTISJWW,466.0,1.0,DRVVDHZ,IRUDRFB,19.0,AUGTURV,0.0,LUZRMLU
2,TRAIN_00000002,0,JCDXFYU,PILDDJU,IAGJDOH,119.0,LFPUEOV,0,FFUTIRZ,OFKQGTY,...,VHXETCF,KHZNEZF,197.0,0.0,QMOULXS,IRUDRFB,8.0,ZVSTLNM,0.0,MHBRSQK
3,TRAIN_00000003,1,PSMFWTP,ZYAVJHP,,15.0,ATQPZSJ,26,ZDTZNSB,THBWWCD,...,IVIRTPR,GTISJWW,8640.0,0.0,IZLJUJS,IRUDRFB,14.0,ZBSRLCQ,0.0,GAZBSSZ
4,TRAIN_00000004,0,SLCRICD,QPQWGXA,,13.0,CHZGJZR,20,PQZBVMG,MIGYEEG,...,NZGEZLW,WHSRKIM,41774.0,0.0,BHBIZCL,IRUDRFB,13.0,QHYLSBX,0.0,QTATWAY


### EDA 2 : Imbalance

In [25]:
click = train['Click'].value_counts(normalize=True)

click_figure = px.bar(click,
             x=['Not Clicked : 0', 'Clicked : 1'],
             y=click.values.tolist(),
             labels={'x': 'Value', 'y': 'Percentage'},
             width = 450,
             height = 500
            )

# 그래프 표시
click_figure.show()

### Data Preprocessing 1 : Select x, y

In [26]:
train_x = train.drop(columns=['ID', 'Click'])
train_y = train['Click']

test_x = test.drop(columns=['ID'])

### Data Preprocessing 2 : Fill NaN

In [27]:
for col in tqdm(train_x.columns):
    if train_x[col].isnull().sum() != 0:
        train_x[col].fillna(0, inplace=True)
        test_x[col].fillna(0, inplace=True)

100%|██████████| 39/39 [00:37<00:00,  1.05it/s]


### Data Preprocessing 3 : Count Encoding

In [28]:
encoding_target = list(train_x.dtypes[train_x.dtypes == "object"].index)

enc = ce.CountEncoder(cols = encoding_target).fit(train_x, train_y)
X_train_encoded = enc.transform(train_x)
X_test_encoded = enc.transform(test_x)

### Model Setting

In [30]:
def objectiveLGBM(trial: Trial, X, y):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'learning_rate': trial.suggest_loguniform("learning_rate", 1e-8, 1e-2),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        # "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'device':"gpu",
        'gpu_use_dp':True
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        
    model = lgb.LGBMClassifier(**param)
    model = model.fit(X_train, y_train)
    
    score = mean_squared_error(model.predict(X_test), y_test, squared=False)
    return score

In [31]:
study = optuna.create_study(direction='minimize')
study.optimize(lambda trial : objectiveLGBM(trial, X_train_encoded, train_y), n_trials=20)
print('Best trial: score {},\nparams {}'.format(study.best_trial.value,study.best_trial.params))

optuna.visualization.plot_param_importances(study) # 파라미터 중요도 확인 그래프
optuna.visualization.plot_optimization_history(study) # 최적화 과정 시각화

[I 2024-05-26 18:39:16,578] A new study created in memory with name: no-name-7cd29153-27c5-46f9-9b74-d1920e1f795e



suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013320, number of negative: 20731531
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6456
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.405041 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194731 -> initscore=-1.419557
[LightGBM] [Info] Start training from score -1.419557


[I 2024-05-26 18:51:07,211] Trial 0 finished with value: 0.4410869199253874 and parameters: {'n_estimators': 1190, 'max_depth': 4, 'num_leaves': 184, 'learning_rate': 0.00015159740418497643, 'subsample': 0.6552358447870762, 'min_child_samples': 74}. Best is trial 0 with value: 0.4410869199253874.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012009, number of negative: 20732842
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6446
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.423944 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194680 -> initscore=-1.419882
[LightGBM] [Info] Start training from score -1.419882


[I 2024-05-26 19:04:11,430] Trial 1 finished with value: 0.4416061322089296 and parameters: {'n_estimators': 2176, 'max_depth': 2, 'num_leaves': 27, 'learning_rate': 9.251505795794353e-07, 'subsample': 0.5838351938584765, 'min_child_samples': 53}. Best is trial 0 with value: 0.4410869199253874.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013274, number of negative: 20731577
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6454
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.382889 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194729 -> initscore=-1.419569
[LightGBM] [Info] Start training from score -1.419569


[I 2024-05-26 19:24:10,773] Trial 2 finished with value: 0.44100488315891995 and parameters: {'n_estimators': 1910, 'max_depth': 15, 'num_leaves': 17, 'learning_rate': 0.0008484771240600346, 'subsample': 0.41653658182535913, 'min_child_samples': 19}. Best is trial 2 with value: 0.44100488315891995.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013334, number of negative: 20731517
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6437
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.388767 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194732 -> initscore=-1.419554
[LightGBM] [Info] Start training from score -1.419554


[I 2024-05-26 19:27:34,708] Trial 3 finished with value: 0.44108137202612185 and parameters: {'n_estimators': 168, 'max_depth': 18, 'num_leaves': 136, 'learning_rate': 7.530901371235788e-08, 'subsample': 0.3630463965274233, 'min_child_samples': 63}. Best is trial 2 with value: 0.44100488315891995.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013032, number of negative: 20731819
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6440
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.389816 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194720 -> initscore=-1.419629
[LightGBM] [Info] Start training from score -1.419629


[I 2024-05-26 19:44:56,920] Trial 4 finished with value: 0.43592052973437484 and parameters: {'n_estimators': 2474, 'max_depth': 5, 'num_leaves': 5, 'learning_rate': 0.005983263492178158, 'subsample': 0.38838401753667173, 'min_child_samples': 9}. Best is trial 4 with value: 0.43592052973437484.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013827, number of negative: 20731024
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6455
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.389878 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194751 -> initscore=-1.419432
[LightGBM] [Info] Start training from score -1.419432


[I 2024-05-26 19:49:34,760] Trial 5 finished with value: 0.44088596220196014 and parameters: {'n_estimators': 344, 'max_depth': 5, 'num_leaves': 86, 'learning_rate': 2.1458057757919182e-05, 'subsample': 0.47775516076429536, 'min_child_samples': 16}. Best is trial 4 with value: 0.43592052973437484.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012825, number of negative: 20732026
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6460
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.434780 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194712 -> initscore=-1.419680
[LightGBM] [Info] Start training from score -1.419680


[I 2024-05-26 20:18:16,743] Trial 6 finished with value: 0.4412830329614446 and parameters: {'n_estimators': 1749, 'max_depth': 18, 'num_leaves': 157, 'learning_rate': 1.4890458099045205e-05, 'subsample': 0.944954216534468, 'min_child_samples': 55}. Best is trial 4 with value: 0.43592052973437484.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012059, number of negative: 20732792
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6450
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.381041 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194682 -> initscore=-1.419870
[LightGBM] [Info] Start training from score -1.419870


[I 2024-05-26 20:21:53,470] Trial 7 finished with value: 0.44158634125991697 and parameters: {'n_estimators': 498, 'max_depth': 2, 'num_leaves': 124, 'learning_rate': 6.622951634106303e-07, 'subsample': 0.9779315589883253, 'min_child_samples': 57}. Best is trial 4 with value: 0.43592052973437484.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012497, number of negative: 20732354
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6452
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.423296 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194699 -> initscore=-1.419761
[LightGBM] [Info] Start training from score -1.419761


[I 2024-05-26 20:32:24,205] Trial 8 finished with value: 0.44141293461401176 and parameters: {'n_estimators': 1080, 'max_depth': 4, 'num_leaves': 39, 'learning_rate': 2.8266971543604505e-08, 'subsample': 0.5229072179823006, 'min_child_samples': 43}. Best is trial 4 with value: 0.43592052973437484.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013490, number of negative: 20731361
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6465
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.419319 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194738 -> initscore=-1.419515
[LightGBM] [Info] Start training from score -1.419515


[I 2024-05-26 21:03:56,878] Trial 9 finished with value: 0.4301252443347253 and parameters: {'n_estimators': 2614, 'max_depth': 7, 'num_leaves': 222, 'learning_rate': 0.0030536837768420934, 'subsample': 0.7404920654016923, 'min_child_samples': 62}. Best is trial 9 with value: 0.4301252443347253.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013222, number of negative: 20731629
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6442
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.427903 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194727 -> initscore=-1.419582
[LightGBM] [Info] Start training from score -1.419582


[I 2024-05-26 21:41:54,015] Trial 10 finished with value: 0.42273687798622045 and parameters: {'n_estimators': 2998, 'max_depth': 10, 'num_leaves': 254, 'learning_rate': 0.008995512773662645, 'subsample': 0.7459245755203137, 'min_child_samples': 89}. Best is trial 10 with value: 0.42273687798622045.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5011890, number of negative: 20732961
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6455
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.421135 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194675 -> initscore=-1.419912
[LightGBM] [Info] Start training from score -1.419912


[I 2024-05-26 22:20:01,036] Trial 11 finished with value: 0.42286999677013476 and parameters: {'n_estimators': 2929, 'max_depth': 10, 'num_leaves': 255, 'learning_rate': 0.009500982385282346, 'subsample': 0.7679562086220375, 'min_child_samples': 98}. Best is trial 10 with value: 0.42273687798622045.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012502, number of negative: 20732349
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6455
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.403291 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194699 -> initscore=-1.419760
[LightGBM] [Info] Start training from score -1.419760


[I 2024-05-26 23:13:04,589] Trial 12 finished with value: 0.44121767163482467 and parameters: {'n_estimators': 2946, 'max_depth': 12, 'num_leaves': 238, 'learning_rate': 0.0003239456911744623, 'subsample': 0.777476601621226, 'min_child_samples': 96}. Best is trial 10 with value: 0.42273687798622045.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012156, number of negative: 20732695
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6464
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.401622 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194686 -> initscore=-1.419846
[LightGBM] [Info] Start training from score -1.419846


[I 2024-05-26 23:51:03,046] Trial 13 finished with value: 0.4226157117248266 and parameters: {'n_estimators': 2939, 'max_depth': 10, 'num_leaves': 256, 'learning_rate': 0.00947599327270648, 'subsample': 0.7989612028795253, 'min_child_samples': 99}. Best is trial 13 with value: 0.4226157117248266.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013008, number of negative: 20731843
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6452
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.418987 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194719 -> initscore=-1.419635
[LightGBM] [Info] Start training from score -1.419635


[I 2024-05-27 00:32:09,904] Trial 14 finished with value: 0.44121054072252186 and parameters: {'n_estimators': 2354, 'max_depth': 10, 'num_leaves': 200, 'learning_rate': 8.889797450153279e-05, 'subsample': 0.8693235221511064, 'min_child_samples': 82}. Best is trial 13 with value: 0.4226157117248266.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013443, number of negative: 20731408
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6439
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.399963 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194736 -> initscore=-1.419527
[LightGBM] [Info] Start training from score -1.419527


[I 2024-05-27 01:17:21,720] Trial 15 finished with value: 0.4293133534645231 and parameters: {'n_estimators': 2778, 'max_depth': 13, 'num_leaves': 209, 'learning_rate': 0.0018211884426246924, 'subsample': 0.8549106763117804, 'min_child_samples': 84}. Best is trial 13 with value: 0.4226157117248266.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012479, number of negative: 20732372
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6453
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.438509 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194698 -> initscore=-1.419766
[LightGBM] [Info] Start training from score -1.419766


[I 2024-05-27 01:50:30,722] Trial 16 finished with value: 0.4396981638679627 and parameters: {'n_estimators': 2017, 'max_depth': 8, 'num_leaves': 256, 'learning_rate': 0.0006594260048252692, 'subsample': 0.6858770036331815, 'min_child_samples': 38}. Best is trial 13 with value: 0.4226157117248266.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012748, number of negative: 20732103
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6462
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.397454 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194709 -> initscore=-1.419699
[LightGBM] [Info] Start training from score -1.419699


[I 2024-05-27 02:33:59,343] Trial 17 finished with value: 0.44131353160101033 and parameters: {'n_estimators': 2554, 'max_depth': 15, 'num_leaves': 171, 'learning_rate': 3.784795797856084e-06, 'subsample': 0.8623815681634901, 'min_child_samples': 86}. Best is trial 13 with value: 0.4226157117248266.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5013154, number of negative: 20731697
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6470
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.420083 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194725 -> initscore=-1.419599
[LightGBM] [Info] Start training from score -1.419599


[I 2024-05-27 02:52:13,401] Trial 18 finished with value: 0.4411526968413599 and parameters: {'n_estimators': 1201, 'max_depth': 8, 'num_leaves': 75, 'learning_rate': 4.3326470701978e-05, 'subsample': 0.5721351803300325, 'min_child_samples': 100}. Best is trial 13 with value: 0.4226157117248266.

suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.



[LightGBM] [Info] Number of positive: 5012964, number of negative: 20731887
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 6435
[LightGBM] [Info] Number of data points in the train set: 25744851, number of used features: 39
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070 Ti, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 35 dense feature groups (883.88 MB) transferred to GPU in 0.417220 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194717 -> initscore=-1.419646
[LightGBM] [Info] Start training from score -1.419646


[I 2024-05-27 03:16:57,321] Trial 19 finished with value: 0.4238687045501581 and parameters: {'n_estimators': 1576, 'max_depth': 20, 'num_leaves': 227, 'learning_rate': 0.008780273761983077, 'subsample': 0.7172519358168575, 'min_child_samples': 73}. Best is trial 13 with value: 0.4226157117248266.


Best trial: score 0.4226157117248266,
params {'n_estimators': 2939, 'max_depth': 10, 'num_leaves': 256, 'learning_rate': 0.00947599327270648, 'subsample': 0.7989612028795253, 'min_child_samples': 99}


In [32]:
optuna.visualization.plot_param_importances(study)

In [33]:
model = lgb.LGBMClassifier(**study.best_trial.params)
model.fit(X_train_encoded, train_y)

[LightGBM] [Info] Number of positive: 5569860, number of negative: 23035531
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.442915 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6454
[LightGBM] [Info] Number of data points in the train set: 28605391, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.194714 -> initscore=-1.419668
[LightGBM] [Info] Start training from score -1.419668


### Model Train and Inference

In [34]:
pred = model.predict_proba(X_test_encoded)
display(model.classes_)
display(pred)

array([0, 1], dtype=int64)

array([[0.78186456, 0.21813544],
       [0.87413343, 0.12586657],
       [0.88083507, 0.11916493],
       ...,
       [0.9210273 , 0.0789727 ],
       [0.80686313, 0.19313687],
       [0.72118189, 0.27881811]])

### Submission

In [35]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0
1,TEST_0000001,0
2,TEST_0000002,0
3,TEST_0000003,0
4,TEST_0000004,0
...,...,...
4538536,TEST_4538536,0
4538537,TEST_4538537,0
4538538,TEST_4538538,0
4538539,TEST_4538539,0


In [36]:
sample_submission['Click'] = pred[:,1]
sample_submission

Unnamed: 0,ID,Click
0,TEST_0000000,0.218135
1,TEST_0000001,0.125867
2,TEST_0000002,0.119165
3,TEST_0000003,0.407183
4,TEST_0000004,0.451049
...,...,...
4538536,TEST_4538536,0.220562
4538537,TEST_4538537,0.289496
4538538,TEST_4538538,0.078973
4538539,TEST_4538539,0.193137


In [37]:
sample_submission.to_csv('baseline_submission.csv', index=False)