In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

#import lightgbm as lgb
import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column in train.csv.
target = 'claim'

# Flip to True for a quick smoke run (tiny model, two folds).
DEBUG = False

# Seeds are identical in debug and full mode.
SEED = 2017      # model / global RNG seed
CVSEED = 2017    # seed for the cross-validation split

# Each setting reads "<debug value> if DEBUG else <full-run value>".
N_ESTIMATORS = 1 if DEBUG else 20000
N_SPLITS = 2 if DEBUG else 10
EARLY_STOPPING_ROUNDS = 1 if DEBUG else 300
VERBOSE = 100 if DEBUG else 1000

In [None]:
def set_seed(seed=2017):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator (default 2017).
    """
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so this export
    # only influences child processes launched afterwards, not this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, with the configured SEED.
set_seed(SEED)

# Datasets

In [None]:
# Directory that holds the competition CSV files.
INPUT = Path("../input/tabular-playground-series-sep-2021")

# Only the training set is read here; test.csv and sample_solution.csv are
# not loaded because nothing below uses them.
train_csv = INPUT / "train.csv"
train = pd.read_csv(train_csv)

# Preprocessing

In [None]:
# Select the raw feature columns by name prefix. A substring test
# (`'f' in col`) would also pick up any other column that merely contains
# the letter "f" somewhere in its name.
# NOTE(review): assumes every feature column name starts with 'f' —
# confirm against the header of train.csv.
features = [col for col in train.columns if col.startswith('f')]

In [None]:
# Row-wise count of missing values across the raw feature columns.
# This has to run before the median imputation further down, otherwise every
# count is zero.
# 'n_missing' itself is left out of the count and is registered in `features`
# only once, so re-running this cell does not duplicate the column name
# (duplicate names would make `train[features]` return the column twice).
raw_features = [col for col in features if col != 'n_missing']
train['n_missing'] = train[raw_features].isna().sum(axis=1)

if 'n_missing' not in features:
    features.append('n_missing')

In [None]:
# Replace missing values with the per-column median.
# NOTE(review): the medians are computed on the full training frame, not
# separately inside each cross-validation fold.
feature_medians = train[features].median()
train[features] = train[features].fillna(feature_medians)

In [None]:
# Shared scaler instance; `objective` re-fits it on the training part of every fold.
ss = StandardScaler()


In [None]:
# Preview of the feature columns (raw features plus 'n_missing') after imputation.
train[features]

In [None]:
# Preview of the label column that the model predicts.
train[target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: mean cross-validated ROC AUC of an XGBoost classifier.

    For each of the ``N_SPLITS`` stratified folds the features are
    standardised with a scaler fitted on the training part only, one
    ``XGBClassifier`` per seed in ``seed_list`` is trained with early stopping
    on the validation part, and the seed-averaged validation probabilities
    are scored with ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature DataFrame. The default ``train[features]`` is evaluated
            once, when this cell is executed.
        y: Target Series aligned with ``X``. The default ``train[target]`` is
            evaluated once, when this cell is executed.

    Returns:
        float: Mean of the per-fold AUC values. (The previous version returned
        only the AUC of the last fold, discarding all other folds.)
    """
    param_space = {
        # Fixed settings.
        'objective': 'binary:logistic',
        'n_estimators': N_ESTIMATORS,
        'importance_type': 'total_gain',
        'booster': 'gbtree',
        'n_jobs': -1,
        'tree_method': 'hist',
        # Searched settings. `suggest_float` replaces the deprecated
        # `suggest_uniform` / `suggest_loguniform`; the ranges are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 0.89e-2, 1.39e-2, log=True),
        'subsample': trial.suggest_float('subsample', 0.646, 0.652),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.307, 0.313),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.8, 1.9, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 5.906e-3, 6.406e-3, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 32.194, 32.694),
        'max_depth': trial.suggest_int('max_depth', 4, 6),
        'max_bin': trial.suggest_int('max_bin', 275, 281),
        'gamma': trial.suggest_float('gamma', 4.626, 4.726),
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
    fold_aucs = []

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"===== fold {fold} =====")

        y_train = y.iloc[trn_idx]
        y_valid = y.iloc[val_idx]

        # Fit the scaler on the training part only and build fresh frames, so
        # nothing is written into a slice of the caller's data and no global
        # scaler object is mutated.
        scaler = StandardScaler()
        X_train = pd.DataFrame(
            scaler.fit_transform(X.iloc[trn_idx]),
            index=X.index[trn_idx],
            columns=X.columns,
        )
        X_valid = pd.DataFrame(
            scaler.transform(X.iloc[val_idx]),
            index=X.index[val_idx],
            columns=X.columns,
        )

        # Validation probabilities, averaged over all seeds.
        pred = np.zeros(y_valid.shape[0])

        start = time.time()
        for inseed in seed_list:
            model = xgb.XGBClassifier(**{**param_space, 'random_state': inseed})
            # NOTE(review): newer XGBoost releases expect `eval_metric` and
            # `early_stopping_rounds` on the constructor instead of `fit` —
            # confirm against the installed version.
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_valid, y_valid)],
                eval_metric='auc',
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                verbose=VERBOSE,
            )
            pred += model.predict_proba(X_valid)[:, -1] / len(seed_list)

        elapsed = time.time() - start
        auc = roc_auc_score(y_valid, pred)
        fold_aucs.append(auc)

        # The metric is AUC; the old message mislabelled it as "rmse".
        print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    mean_auc = float(np.mean(fold_aucs))
    print(f"mean cv xgb auc: {mean_auc:.6f}")

    return mean_auc

In [None]:
# Search for the parameter set that maximises the value returned by `objective`.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

finished_trials = len(study.trials)
best_trial_params = study.best_trial.params
print('Number of finished trials:', finished_trials)
print('Best trial:', best_trial_params)

In [None]:
# Hyper-parameters of the best trial found by the study.
study.best_params

# Log

0.8153011885518184 ver7 of ver4-7

0.8155223791713297 ver8 of ver8-11 (4 trials only) continue ver12-14 (6 trials)

0.8156953001633805 ver15 of ver15-18 (5 trials only) continue ver19-22 (9 trials)

0.8158065866366064 ver25 of ver23-26 (6 trials only) continue ver27-30 (7 trials)

0.8160744917112783 ver36 of ver31-34 (6 trials only) continue ver35-38 (5 trials)

0.8161638760838015 ver40 of ver39-42 (1 trial only) continue ver43-46 (1 trial) continue cut folds ver47-50 (3 trials) continue ver51-54 (4 trials) continue ver55-56 (2 trials)

0.8162280313549242 ver59 of ver57-59 (2 trials only) continue ver60-63 (4 trials) continue ver64-67 (3 trials) continue ver68-70 (1 trial)

ver71-73 (3 trials) continue ver74-77 (4 trials) continue ver78-81 (4 trials)