In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path
import gc

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

#import lightgbm as lgb
import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column.
target = 'target'

# Set to True for a quick smoke run with tiny training budgets.
DEBUG = False

# Both modes share the same seeds: SEED is passed to set_seed() and to the
# model's random_state, CVSEED to the StratifiedKFold splitter.
SEED = 2017
CVSEED = 2017

if not DEBUG:
    # Full run: many boosting rounds, stopped early on the validation metric.
    N_SPLITS, N_ESTIMATORS = 5, 20000
    EARLY_STOPPING_ROUNDS, VERBOSE = 300, 1000
else:
    # Smoke run: a single boosting round on a 2-fold split.
    N_SPLITS, N_ESTIMATORS = 2, 1
    EARLY_STOPPING_ROUNDS, VERBOSE = 1, 100

In [None]:
def set_seed(seed=2017):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    writes the seed to the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator (default 2017).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Apply the configured seed once, before any data is loaded or sampled.
set_seed(SEED)

# Datasets

In [None]:
# Directory holding the competition CSV files.
INPUT = Path("../input/tabular-playground-series-oct-2021")

# Only the training set is read; the hyperparameter search below never
# touches the test set or the sample submission.
train = pd.read_csv(INPUT.joinpath("train.csv"))

In [None]:
# Drop the first column and keep a seeded 50% random sample of the rows.
# NOTE: this rebinds `train`, so re-running the cell drops another column and
# halves the data again -- reload train.csv before re-executing it.
train = train.iloc[:, 1:].sample(frac=0.5, random_state=SEED)

# Preprocessing

In [None]:
# Feature columns are the ones whose name contains the letter 'f'.
features = list(filter(lambda name: 'f' in name, train.columns))

In [None]:
# Partition the feature columns by dtype: float64 columns are treated as
# continuous, everything else as discrete.
float64_cols = {col for col in features if train[col].dtype == 'float64'}

cont_features = [col for col in features if col in float64_cols]
disc_features = [col for col in features if col not in float64_cols]

# Reorder the feature list so discrete columns come first.
features = disc_features + cont_features

In [None]:
# Down-cast to narrower dtypes to reduce memory: continuous features become
# float32, discrete features and the label become uint8.
for cols, narrow_dtype in (
    (cont_features, 'float32'),
    (disc_features, 'uint8'),
    ([target], 'uint8'),
):
    train[cols] = train[cols].astype(narrow_dtype)

In [None]:
# Display the feature columns as a visual check after the dtype casts.
train[features]

In [None]:
# Display the label column as a visual check after the dtype cast.
train[target]

In [None]:
# The per-dtype column lists were only needed for the casts above; the merged
# `features` list is what the rest of the notebook uses.
del disc_features, cont_features

gc.collect()

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: validation AUC of an XGBoost classifier on one fold.

    A hyperparameter set is sampled from ``trial``, one ``XGBClassifier`` per
    seed in ``seed_list`` is trained with early stopping on the first
    stratified fold of ``(X, y)``, and the seed-averaged validation
    predictions are scored with ROC AUC. Only fold 0 is used, so each trial
    costs a single train/validation run per seed.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame. Defaults to ``train[features]`` as it was when this
            function was defined.
        y: Label series aligned with ``X``. Defaults to ``train[target]`` as
            it was when this function was defined.

    Returns:
        ROC AUC of the averaged predictions on the validation part of fold 0.
    """
    param_space = {
        # Fixed settings.
        'objective': 'binary:logistic',
        'n_estimators': N_ESTIMATORS,
        'importance_type': 'total_gain',
        'booster': 'gbtree',
        'n_jobs': -1,
        'tree_method': 'hist',

        # Searched settings. `suggest_float` replaces the deprecated
        # `suggest_uniform` / `suggest_loguniform` with the same ranges.
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1, log=True),
        'subsample': trial.suggest_float('subsample', 0.603, 0.803),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.141, 0.341),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 3, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 3, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 39.56, 51.56),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'max_bin': trial.suggest_int('max_bin', 193, 293),
        'gamma': trial.suggest_float('gamma', 1.35, 7.35),
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)

    # Only the first fold is evaluated, so take it directly instead of
    # iterating over (and discarding) the remaining folds.
    fold, (trn_idx, val_idx) = next(enumerate(kf.split(X, y)))
    print(f"===== fold {fold} =====")

    # Slice the data that was passed in, so caller-supplied X / y are honoured.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    pred = np.zeros(y_valid.shape[0])

    start = time.time()
    for inseed in seed_list:
        param_space['random_state'] = inseed

        model = xgb.XGBClassifier(**param_space)
        # NOTE(review): newer xgboost releases expect `eval_metric` and
        # `early_stopping_rounds` on the constructor rather than on fit() --
        # confirm against the installed version.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        # Average the last predict_proba column over all seeds.
        pred += model.predict_proba(X_valid)[:, -1] / len(seed_list)

        del model
        gc.collect()

    elapsed = time.time() - start

    # The training slices are released only after every seed has been fitted;
    # deleting them inside the loop would break any seed_list with more than
    # one entry.
    del X_train, y_train

    auc = roc_auc_score(y_valid, pred)

    del X_valid, y_valid, pred
    gc.collect()

    print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    return auc

In [None]:
# Seed the sampler explicitly: Optuna's sampler keeps its own random state, so
# the global seeding done by set_seed() does not make the sequence of
# suggested hyperparameters repeatable. TPESampler is Optuna's default
# sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Hyperparameters of the best trial, shown as the cell output.
study.best_params

# Log

0.8558899481568522 ver1-2

Half of the training data (50% sample):
0.8564765979994122 ver3
0.8565336187143888 ver5
ver6: no improvement
