In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path
import gc

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column in the training data.
target = 'target'

# Flip to True for a quick smoke run with tiny training settings.
DEBUG = False

# Seeds are identical in debug and full mode.
SEED = 2017
CVSEED = 2017

# Training / cross-validation settings: (debug value) if DEBUG else (full value).
N_SPLITS = 2 if DEBUG else 5
N_ESTIMATORS = 1 if DEBUG else 20000
EARLY_STOPPING_ROUNDS = 1 if DEBUG else 300
VERBOSE = 100 if DEBUG else 1000
#N_ITERS = 2 if DEBUG else 10

In [None]:
def set_seed(seed=2017):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator (default 2017).

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting
        it here affects child processes, not the hashing of the kernel that is
        already running.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = f"{seed}"
    np.random.seed(seed)
    
# Seed the global generators once, up front, with the configured SEED.
set_seed(SEED)

# Datasets

In [None]:
# Directory that holds the competition CSV files.
INPUT = Path("../input/tabular-playground-series-oct-2021")

# Only the training file is read in this notebook; loading of the test set
# and the sample submission is disabled.
train = pd.read_csv(INPUT.joinpath("train.csv"))
#test = pd.read_csv(INPUT.joinpath("test.csv"))
#submission = pd.read_csv(INPUT.joinpath("sample_submission.csv"))

In [None]:
# Drop the first column and keep a reproducible random 50% of the rows.
# NOTE: this cell rebinds `train`, so it is not idempotent -- running it a
# second time removes another column and halves the data again. Re-run the
# loading cell first if this cell has to be executed again.
train = train.iloc[:, 1:].sample(frac=0.5, random_state=SEED)
#test = test.iloc[:, 1:]

# Preprocessing

In [None]:
# Every column whose name contains the letter 'f' is treated as a feature;
# the label column 'target' contains no 'f' and is therefore left out.
features = list(filter(lambda name: 'f' in name, train.columns))

In [None]:
# Columns read as float64 are treated as continuous, everything else as discrete.
cont_features = [name for name in features if train[name].dtype == 'float64']
disc_features = [name for name in features if not (train[name].dtype == 'float64')]

# Reorder the feature list: discrete columns first, then continuous ones.
features = disc_features + cont_features

In [None]:
# Downcast to smaller dtypes: continuous features to float32, discrete
# features and the label to uint8.
for columns, dtype in (
    (cont_features, 'float32'),
    (disc_features, 'uint8'),
    (target, 'uint8'),
):
    train[columns] = train[columns].astype(dtype)

# The test set is not loaded in this notebook; the same casts would apply to it:
#test[cont_features] = test[cont_features].astype('float32')
#test[disc_features] = test[disc_features].astype('uint8')

In [None]:
# Preview the feature matrix (pandas truncates the rendered table by default).
train.loc[:, features]

In [None]:
# Preview the label column.
train.loc[:, target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: validation AUC of a LightGBM classifier on fold 0.

    Samples one hyper-parameter set from narrow ranges, trains an
    ``LGBMClassifier`` with early stopping on the first fold of a stratified
    K-fold split, and returns the ROC AUC on that fold's validation part.
    Only fold 0 is evaluated; the remaining folds of the split are not used.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature frame (pandas DataFrame, indexed positionally via
            ``.iloc``). Must contain the columns listed in ``disc_features``.
        y: Label series aligned with ``X``.

    Returns:
        ROC AUC of the seed-averaged predictions on the fold-0 validation rows.
    """
    param_space = {
        # Fixed settings.
        'objective': 'binary',
        'n_estimators': N_ESTIMATORS,
        'importance_type': 'gain',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_jobs': -1,

        # Tuned settings. `suggest_float` replaces the deprecated
        # `suggest_uniform` / `suggest_loguniform` (log=True) calls.
        'learning_rate': trial.suggest_float('learning_rate', 3.81e-3, 3.91e-3, log=True),
        'subsample': trial.suggest_float('subsample', 0.582, 0.584),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1955, 0.1965),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.16e-2, 1.18e-2, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0469, 0.0519, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 16.75, 16.85),
        'min_child_samples': trial.suggest_int('min_child_samples', 411, 413),
        # NOTE(review): with max_depth <= 6 a tree can have at most 2**6 = 64
        # leaves, so this num_leaves range is probably never the binding limit.
        'num_leaves': trial.suggest_int('num_leaves', 545, 547),
        'max_depth': trial.suggest_int('max_depth', 4, 6),
        'cat_smooth': trial.suggest_float('cat_smooth', 36.35, 36.45),
        'cat_l2': trial.suggest_float('cat_l2', 12.96, 13.00, log=True),
        # Not tuned here: max_bin, min_split_gain, class_weight, GPU device options.
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)

    # Only the first fold is evaluated, so take it directly instead of
    # iterating over every fold and skipping all but one.
    fold = 0
    trn_idx, val_idx = next(kf.split(X, y))
    print(f"===== fold {fold} =====")

    # Slice the arguments (not the global frame) so that X / y are honoured
    # and no extra full-size column copies are made.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    # Validation predictions, averaged over the models trained per seed.
    val_pred = np.zeros(len(val_idx))

    start = time.time()
    for inseed in seed_list:
        param_space['random_state'] = inseed

        model = lgb.LGBMClassifier(**param_space)
        # NOTE: `early_stopping_rounds` and `verbose` were removed from
        # `fit()` in LightGBM 4.0; on such versions pass
        # callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)] instead.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            categorical_feature=disc_features,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        # Probability of the last class, i.e. the positive class.
        val_pred += model.predict_proba(X_valid)[:, -1] / len(seed_list)

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, val_pred)

    print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    return auc

In [None]:
# The default sampler draws its own random seed, so set_seed() alone does not
# make the search repeatable; seed the sampler explicitly with SEED.
sampler = optuna.samplers.TPESampler(seed=SEED)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
# The best objective value is the fold-0 AUC returned by objective().
print('Best value:', study.best_value)
print('Best trial:', study.best_trial.params)

In [None]:
# Hyper-parameters of the best trial found by the study.
study.best_trial.params

# Log

0.8564088847951197 ver1

Half of the training data (50% sample):
0.8564245100943821 ver2
0.8564532577523489 ver4
0.8564973394412352 ver6
0.8565029746525981 ver7
0.8565050246567316 ver8
0.8565710723899103 ver9
0.8565790316059593 ver10
0.8566150868786613 ver11
0.8566269837026499 ver12
no improvement ver13