In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

#import lightgbm as lgb
#import xgboost as xgb
import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column.
target = 'claim'

# Set to True for a quick smoke run (one tree, two folds, immediate early stop).
DEBUG = False

# Seeds are identical in both modes.
SEED = 2017
CVSEED = 2017

# Training budget and logging cadence: (debug value) if DEBUG else (full-run value).
N_ESTIMATORS = 1 if DEBUG else 20000
N_SPLITS = 2 if DEBUG else 10
EARLY_STOPPING_ROUNDS = 1 if DEBUG else 300
VERBOSE = 100 if DEBUG else 1000

In [None]:
def set_seed(seed=2017):
    """Seed the global RNGs used by this notebook.

    Writes `seed` into the PYTHONHASHSEED environment variable and seeds both
    Python's `random` module and NumPy's global random state.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, before any data is loaded or any CV split is drawn.
set_seed(SEED)

# Datasets

In [None]:
INPUT = Path("../input/tabular-playground-series-sep-2021")

# Only the label column is kept from train.csv, so ask pandas to parse just that
# column instead of reading every raw feature and discarding it afterwards.
# The result is the same single-column DataFrame as before.
train = pd.read_csv(INPUT / "train.csv", usecols=[target])
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

In [None]:
# Folder holding the saved out-of-fold predictions of the base tree models.
BASE_TREES_DIR = "../input/tps-sep-lv0-base-trees"

# (train column, file stem) pairs, listed in the order the columns must be created.
# Each stem has an "<stem>_oof.npy" file; the disabled test-set path reads "<stem>_pred.npy".
tree_oof_sources = (
    ('lgb_pred', 'agg_lgb'),
    ('lgb2_pred', 'agg_lgb2'),
    ('xgb_pred', 'agg_xgb'),
    ('lgb_bizen_pred', 'lgb_bizen'),
    ('lgb_dmitry_pred', 'lgb_dmitry'),
    ('xgb_dmitry_pred', 'xgb_dmitry'),
    ('lgb_manav_pred', 'lgb_manav'),
    ('xgb_manav_pred', 'xgb_manav'),
)

for column, stem in tree_oof_sources:
    train[column] = np.load(f"{BASE_TREES_DIR}/{stem}_oof.npy")
    #test[column] = np.load(f"{BASE_TREES_DIR}/{stem}_pred.npy")

In [None]:
# Out-of-fold predictions saved by the ridge stacking notebook become one more input column.
ridge_oof = np.load("../input/tps-sep-2021-stacking-ridge-lv0/ridge_oof.npy")
train['ridge_pred'] = ridge_oof
#test['ridge_pred'] = np.load("../input/tps-sep-2021-stacking-ridge-lv0/ridge_pred.npy")

# Preprocessing

In [None]:
# Model inputs are the columns whose name contains 'pred'; the label column
# ('claim') does not match, so it is left out.
is_pred_col = train.columns.str.contains('pred', regex=False)
features = train.columns[is_pred_col].tolist()

In [None]:
# Shared scaler: objective() re-fits it on each fold's training rows and then
# applies it to that fold's validation rows.
ss = StandardScaler()

In [None]:
# Inspect the prediction columns that will be fed to the meta-model.
train[features]

In [None]:
# Inspect the label column.
train[target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: out-of-fold AUC of a CatBoost classifier trained on `X`/`y`.

    For every stratified fold the features are standardized (scaler fitted on the
    training rows only), one CatBoost model per seed in `seed_list` is trained with
    early stopping on the validation rows, and the validation probabilities are
    averaged into the out-of-fold vector.

    Note: the `X` and `y` defaults are bound when this cell is executed; re-run the
    cell after changing `train` or `features`.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: pandas DataFrame of input features (positional row access via `.iloc`).
        y: pandas Series of binary labels aligned with `X`.

    Returns:
        ROC AUC of the out-of-fold predictions over all rows of `X`.
    """
    param_space = {
        # Fixed settings.
        'objective': 'CrossEntropy',
        'n_estimators': N_ESTIMATORS,
        'thread_count': -1,
        'bootstrap_type': 'Bernoulli',
        'eval_metric': 'AUC',

        # Searched settings. suggest_float replaces the deprecated
        # suggest_uniform / suggest_loguniform; the sampled distributions are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 1.8e-2, 6.8e-2, log=True),
        'subsample': trial.suggest_float('subsample', 0.833, 0.883),
        # NOTE(review): Optuna records this value as 'colsample_bytree' while it is
        # passed to CatBoost as 'colsample_bylevel'. The Optuna name is kept so that
        # existing consumers of study.best_params keep working; rename the key
        # before feeding best_params back into CatBoost.
        'colsample_bylevel': trial.suggest_float('colsample_bytree', 0.800, 0.850),
        'max_depth': trial.suggest_int('max_depth', 2, 4),
        'min_child_samples': trial.suggest_int('min_child_samples', 203, 227),
        'max_bin': trial.suggest_int('max_bin', 322, 346),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1e-2, log=True),
        'random_strength': trial.suggest_float('random_strength', 18.8, 21.4),
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
    ctb_oof = np.zeros(len(X))
    #ctb_pred = np.zeros(test.shape[0])

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"===== fold {fold} =====")

        # Use the X / y that were actually passed in (the split above is drawn from
        # them), not the notebook-level `train` frame.
        y_train = y.iloc[trn_idx]
        y_valid = y.iloc[val_idx]

        # Fit the scaler on the training rows only, then build fresh frames from the
        # scaled arrays instead of assigning into .iloc slices (chained assignment).
        X_train = pd.DataFrame(
            ss.fit_transform(X.iloc[trn_idx]),
            columns=X.columns,
            index=X.index[trn_idx],
        )
        X_valid = pd.DataFrame(
            ss.transform(X.iloc[val_idx]),
            columns=X.columns,
            index=X.index[val_idx],
        )
        #X_test[features] = ss.transform(X_test[features])

        start = time.time()
        for inseed in seed_list:
            param_space['random_state'] = inseed

            model = ctb.CatBoostClassifier(**param_space)
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_valid, y_valid)],
                use_best_model=True,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                verbose=VERBOSE,
            )

            # Average the positive-class probability over the seeds.
            ctb_oof[val_idx] += model.predict_proba(X_valid)[:, -1] / len(seed_list)

        elapsed = time.time() - start
        auc = roc_auc_score(y_valid, ctb_oof[val_idx])

        # The reported metric is ROC AUC (the message previously said "rmse").
        print(f"fold {fold} - ctb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    auc_oof = roc_auc_score(y, ctb_oof)
    print(f"oof ctb_auc = {auc_oof}")

    return auc_oof

In [None]:
# Number of hyper-parameter configurations to evaluate.
N_TRIALS = 5

# TPE is the sampler Optuna uses by default for a single-objective study; creating
# it explicitly with a fixed seed makes the sequence of sampled hyper-parameters
# reproducible across runs (the global seeds set earlier do not control it).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=N_TRIALS)
print('Number of finished trials:', len(study.trials))
print('Best value:', study.best_value)
print('Best trial:', study.best_trial.params)

In [None]:
# Show the best trial's sampled hyper-parameters as the cell output.
study.best_params

# Log

/////// 8 trees + ridge /////////

0.8169438781582106 ver1

0.8169542033535098 ver2

0.8169630119148773 ver3

0.8169836304419348 ver4 of ver4-6

no improvement in ver7-9