In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from os.path import join, isfile
from os import path, scandir, listdir

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

#import lightgbm as lgb
import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column in the training data.
target = 'target'

# Set to True for a quick smoke run with minimal training settings.
DEBUG = False

# Both modes share the same seeds: SEED drives the model / global RNGs,
# CVSEED drives the cross-validation split.
SEED = 2017
CVSEED = 2017

# Values are (N_SPLITS, N_ESTIMATORS, EARLY_STOPPING_ROUNDS, VERBOSE):
# the first tuple is the debug setting, the second the full run.
N_SPLITS, N_ESTIMATORS, EARLY_STOPPING_ROUNDS, VERBOSE = (
    (2, 1, 1, 100) if DEBUG else (5, 20000, 300, 1000)
)
#N_ITERS = 2 if DEBUG else 10

In [None]:
def set_seed(seed=2017):
    """
    Seed Python's `random` module and NumPy's global RNG with `seed`, and
    store the same value (as a string) in the PYTHONHASHSEED environment variable.

    - seed: integer seed, 2017 by default
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the RNGs once, up front, with the SEED constant from the parameters cell.
set_seed(SEED)

# Datasets

In [None]:
# Directory holding the competition CSV files.
INPUT = Path("../input/tabular-playground-series-oct-2021")

# Only the label column is needed from train.csv: every model input used in
# this notebook is an out-of-fold prediction added to `train` further down.
# `usecols` makes read_csv parse just that column instead of loading every
# feature and discarding it afterwards; the result is the same one-column frame.
train = pd.read_csv(INPUT / "train.csv", usecols=[target])
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

In [None]:
def list_all_files(location='../input/tps-oct-lv0', pattern=None, recursive=True):
    """
    Return the paths of the files found under a directory.

    - location: path to the directory to be searched
    - pattern: substring that a file's name must contain (ex. pattern='.csv'
      keeps the csv files). Only the file name is tested, never the
      directories leading to it. None or '' keeps every file.
    - recursive: boolean, if True the subdirectories are searched as well

    Returns a list of paths, each one `location` joined with the entry name:
    the files directly inside `location` first, then the files of each
    subdirectory.
    """
    files = []
    subdirectories = []
    # A single scandir pass classifies every entry; the context manager
    # releases the directory handle even if an entry check raises.
    with scandir(location) as entries:
        for entry in entries:
            if entry.is_dir():
                subdirectories.append(entry.path)
            elif entry.is_file():
                files.append(entry.path)

    if pattern:
        # Match on the base name only, so that a directory whose name happens
        # to contain the pattern does not pull all of its files into the result.
        files = [f for f in files if pattern in path.basename(f)]

    if recursive:
        for directory in subdirectories:
            # Hand the filter down so every level applies the same rule.
            files.extend(list_all_files(directory, pattern=pattern, recursive=True))
    return files

In [None]:
# Name tags used to pick out each base model's out-of-fold prediction files.
names = [
    'bizen',
    'henke',
    'hamza',
    '28smiles',
    'kashif',
    'kosta',
    'kaveh',
    'dlaststark',
    'pca',
    'xgb2',
]

In [None]:
# Paths of every file matching 'oof' under list_all_files' default location.
pred = list_all_files(pattern='oof')

# One stacked feature per base model: the element-wise mean of all loaded
# arrays whose fourth '/'-separated path component contains the model's name.
for model_name in names:
    matching_oofs = [
        np.load(oof_path)
        for oof_path in pred
        if model_name in oof_path.split('/')[3]
    ]
    train[model_name] = np.mean(matching_oofs, axis=0)

train.columns

In [None]:
# LightGBM feature: element-wise mean of the five files prefixed 2017..2021
# (presumably one per training seed — TODO confirm).
train['lgb'] = np.mean(
    [np.load("../input/tps-oct-lv0/" + str(prefix) + "lgb_oof.npy") for prefix in range(2017, 2022)],
    axis=0,
)

# XGBoost feature: element-wise mean of the five files agg1..agg5.
train['xgb'] = np.mean(
    [np.load("../input/tps-oct-lv0/agg" + str(run) + "_xgb_oof.npy") for run in range(1, 6)],
    axis=0,
)

In [None]:
# Inspect the assembled frame: the target column plus one column per base model.
train

# Preprocessing

In [None]:
# Every column except the label is a model input. Dropping the label by name
# (rather than slicing off position 0) keeps this correct even if the column
# order of `train` ever changes.
features = train.columns.drop(target)

In [None]:
# Preview the model inputs (the columns selected by `features`).
train[features]

In [None]:
# Preview the label column.
train[target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """
    Optuna objective: validation AUC of an XGBoost classifier on the first
    fold of a stratified K-fold split.

    - trial: optuna trial used to sample the tunable hyper-parameters
    - X: feature frame; defaults to train[features] as it is when this
      function is defined (default arguments are evaluated once, at def time)
    - y: labels aligned row-by-row with X; defaults to train[target]

    Only fold 0 is trained and scored. The validation predictions are averaged
    over the seeds in `seed_list`, and the ROC AUC of that average is returned.
    """
    param_space = {
        # Fixed settings.
        'objective': 'binary:logistic',
        'n_estimators': N_ESTIMATORS,
        'importance_type': 'total_gain',
        'booster': 'gbtree',
        'n_jobs': -1,
        'tree_method': 'hist',

        # Tuned settings.
        # NOTE(review): the learning-rate range extends up to 5 — confirm this
        # upper bound is intentional.
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 5),
        'subsample': trial.suggest_uniform('subsample', 0.519, 0.919),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.372, 0.772),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 12),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 12),
        'min_child_weight': trial.suggest_uniform('min_child_weight', 1e-4, 18.7),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'max_bin': trial.suggest_int('max_bin', 364, 564),
        'gamma': trial.suggest_uniform('gamma', 6, 30),
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)

    # Only the first fold is evaluated, so take it directly instead of
    # iterating over every split and skipping the rest.
    fold = 0
    trn_idx, val_idx = next(kf.split(X, y))
    print(f"===== fold {fold} =====")

    # Slice the arguments that were used to build the split (not the global
    # frame), so the indices and the data always refer to the same rows.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]

    pred = np.zeros(y_valid.shape[0])

    start = time.time()
    for inseed in seed_list:
        param_space['random_state'] = inseed

        model = xgb.XGBClassifier(**param_space)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        # Positive-class probability, averaged over the seeds.
        pred += model.predict_proba(X_valid)[:, -1] / len(seed_list)

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, pred)

    print(f"fold {fold} - xgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    return auc

In [None]:
# TPE is already Optuna's default single-objective sampler; creating it
# explicitly only adds a seed, so the sampler's own random draws are
# repeatable across runs like the other seeded components of this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=200)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Hyper-parameters of the best trial found by the study.
study.best_params

# Log

/////// best average /////////
0.8566642366353803 ver2
no improvement ver15

/// best raw //////
0.856687427907201 ver3
no improvement ver4

//// all average /////
0.8566818882519887 ver6
0.8566826756027294 ver12
0.8566971988663943 ver13
no improvement ver14

//// all raw ////
0.8566598743812758 ver7
0.8566729572935855 ver8
0.8566756253960961 ver9
0.8566930640625039 ver10
no improvement ver11