In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from os.path import join, isfile
from os import path, scandir, listdir

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column in train.csv.
target = 'target'

# Set to True for a quick smoke run (one tree, two folds).
DEBUG = False

# Seeds are identical in both modes: SEED feeds set_seed() and the model's
# random_state, CVSEED feeds the StratifiedKFold shuffle.
SEED = 2017
CVSEED = 2017

# Mode-dependent training budget.
N_SPLITS, N_ESTIMATORS, EARLY_STOPPING_ROUNDS, VERBOSE = (
    (2, 1, 1, 100) if DEBUG else (5, 20000, 300, 1000)
)
#N_ITERS = 2 if DEBUG else 10

In [None]:
def set_seed(seed=2017):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and writes ``str(seed)`` to the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter
    start-up, so setting it here would affect child processes rather than the
    running kernel -- confirm if hash ordering matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, with the configured SEED.
set_seed(SEED)

# Datasets

In [None]:
# Root folder of the competition data.
INPUT = Path("../input/tabular-playground-series-oct-2021")

# Only the label column is needed here: the features of this notebook are the
# out-of-fold predictions loaded further down. `usecols` makes pandas parse just
# that column instead of reading the whole file and discarding the rest; the
# result is the same single-column DataFrame.
train = pd.read_csv(INPUT / "train.csv", usecols=[target])
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

In [None]:
def list_all_files(location='../input/tps-oct-lv0', pattern=None, recursive=True):
    """
    Return the paths of the files found at a given location, in a deterministic order.

    - location: path to the directory to be searched
    - pattern: substring to look for (ex. pattern='.csv' would return all the csv files).
      It is tested against the whole joined path, not only the file name, so a
      matching directory name also selects the files below it.
    - recursive: boolean, if True the function calls itself for every subdirectory it finds

    Files directly inside `location` come first, followed by the content of each
    subdirectory; within a directory both files and subdirectories are visited in
    name order. The raw directory listing order is filesystem dependent, and the
    callers number the loaded columns by list position, so the listing is sorted
    to keep those numbers stable between runs and machines.
    """
    # One scandir pass yields both files and subdirectories; the context manager
    # releases the directory handle as soon as the listing has been read.
    with scandir(location) as listing:
        entries = sorted(listing, key=lambda entry: entry.name)
        files = [entry.path for entry in entries if entry.is_file()]
        subdirectories = [entry.path for entry in entries if entry.is_dir()]

    if recursive:
        for directory in subdirectories:
            # Collect everything below; the pattern is applied once, at the end,
            # on the complete list.
            files.extend(list_all_files(directory, pattern=None, recursive=True))

    if pattern:
        files = [f for f in files if pattern in f]
    return files

In [None]:
# Substrings used to pick the lv0 out-of-fold files; each one also becomes the
# prefix of the columns loaded for it.
names = [
    'bizen',
    'henke',
    'hamza',
    '28smiles',
    'kashif',
    'kosta',
    'kaveh',
    'dlaststark',
    'pca',
    'xgb2',
]

In [None]:
# Substrings used to pick the out-of-fold files of the '../input/tps-oct-lv0-sec' dataset.
namesec = [
    'lonnie',
    'hgb',
    'xgb_d2s',
    'ctb_d2s',
]

In [None]:
def _attach_oof_columns(frame, files, model_names):
    """Load matching out-of-fold arrays into `frame` as new columns (in place).

    For every name in `model_names`, in order, each path in `files` whose fourth
    '/'-separated component contains that name is read with `np.load` and stored
    as column `<name><k>`, where k counts from 1 separately for each name.
    """
    for model_name in model_names:
        hits = (f for f in files if model_name in f.split('/')[3])
        for k, oof_path in enumerate(hits, start=1):
            frame[model_name + str(k)] = np.load(oof_path)


# Out-of-fold files of the default lv0 dataset.
pred = list_all_files(pattern='oof')
_attach_oof_columns(train, pred, names)

# Out-of-fold files of the '-sec' lv0 dataset.
pred = list_all_files(location='../input/tps-oct-lv0-sec', pattern='oof')
_attach_oof_columns(train, pred, namesec)

train.columns

In [None]:
# Five LightGBM and five XGBoost lv0 out-of-fold arrays, added alternately
# (lgblv01, xgblv01, lgblv02, ...). The LightGBM files are numbered 2017..2021,
# the XGBoost files 1..5.
for run in range(1, 6):
    train[f'lgblv0{run}'] = np.load(f"../input/tps-oct-lv0/{run + 2016}lgb_oof.npy")
    #test['lgb'+str(i+1)] = np.load("../input/tps-oct-lv0/"+str(i+2017)+"lgb_pred.npy")

    train[f'xgblv0{run}'] = np.load(f"../input/tps-oct-lv0/agg{run}_xgb_oof.npy")
    #test['xgb'+str(i+1)] = np.load("../input/tps-oct-lv0/agg"+str(i+1)+"_xgb_pred.npy")

In [None]:
# lv1: out-of-fold predictions read from '../input/tps-oct-lv1-backup'.

nameslv1 = ['ridge', 'gnb', 'lgb', 'xgb', 'rf', 'nn']

pred = list_all_files(location='../input/tps-oct-lv1-backup', pattern='oof')

for model_name in nameslv1:
    # Paths whose fourth '/'-separated component contains the model name,
    # numbered from 1 in listing order.
    matching = (f for f in pred if model_name in f.split('/')[3])
    for cnt, oof_path in enumerate(matching, start=1):
        train[model_name + str(cnt)] = np.load(oof_path)

train.columns

In [None]:
# Display the assembled frame: the target plus every out-of-fold column loaded above.
train

# Preprocessing

In [None]:
# Every column except the label is a model input. Selecting by name rather than
# by position keeps this correct even if the target is not the first column.
features = train.columns[train.columns != target]

In [None]:
# Display the feature columns only.
train[features]

In [None]:
# Display the label column.
train[target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: validation AUC of a LightGBM classifier on the first CV fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature matrix. The default is bound when this cell is executed, so
        re-run the cell after changing `train` or `features`.
    y : pandas.Series
        Binary labels aligned with `X`.

    Returns
    -------
    float
        ROC AUC on the validation part of the first stratified fold, computed
        from the predictions averaged over `seed_list`.
    """
    param_space = {
        # Fixed settings.
        'objective': 'binary',
        'n_estimators': N_ESTIMATORS,
        'importance_type': 'gain',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_jobs': -1,

        # Searched settings. suggest_float replaces the deprecated
        # suggest_uniform / suggest_loguniform; ranges and sampling scale are unchanged.
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 1e-1, log=True),
        'subsample': trial.suggest_float('subsample', 0.273, 0.373),
        'subsample_freq': trial.suggest_int('subsample_freq', 3, 5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.54, 0.64),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 19.2, 25.2),
        'min_child_samples': trial.suggest_int('min_child_samples', 425, 475),
        # NOTE(review): with max_depth <= 4 a tree presumably cannot reach 317+
        # leaves, so num_leaves may be an inert search dimension -- confirm.
        'num_leaves': trial.suggest_int('num_leaves', 317, 367),
        'max_depth': trial.suggest_int('max_depth', 2, 4),
        #'max_bin':trial.suggest_int('max_bin', 50, 500),
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
    #lgb_oof = np.zeros(train.shape[0])
    #lgb_pred = np.zeros(test.shape[0])

    #X_tr, X_va, y_tr, y_va = train_test_split(X,y,test_size=.2,random_state=CVSEED)

    # Only the first fold is scored, so take it directly instead of looping
    # over every split and skipping the others.
    fold = 0
    trn_idx, val_idx = next(kf.split(X, y))
    print(f"===== fold {fold} =====")

    # Slice the arguments (not the global frame) so that a caller-supplied
    # X / y is actually the data being trained on.
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    #X_test = test[features]

    pred = np.zeros(y_valid.shape[0])

    start = time.time()
    for inseed in seed_list:
        param_space['random_state'] = inseed

        model = lgb.LGBMClassifier(**param_space)
        # NOTE(review): the early_stopping_rounds / verbose fit arguments were
        # dropped in newer LightGBM releases in favour of callbacks -- confirm
        # the installed version before upgrading.
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric='auc',
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            verbose=VERBOSE,
        )

        # Average the positive-class probability over the seeds.
        pred += model.predict_proba(X_valid)[:, -1] / len(seed_list)

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, pred)

    print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    return auc

In [None]:
# TPE is the sampler create_study uses by default; passing it explicitly with a
# seed makes the sequence of suggested parameters repeatable between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Display the hyper-parameters of the best trial.
study.best_params

# Log

/////// lv0+lv1 average /////////
0.856703267072104 ver7
0.856715435183553 ver13
no improvement in ver14

//// lv0+lv1 raw //// 
0.8567077460263183 ver1
0.8567155940337026 ver8
0.8567208099886101 ver19
0.856731474998645 ver21
0.8567409273075384 ver22
no improvement in ver23-24

/////// lv0 best+lv1 average ///////// 
0.8566859160557784 ver6
0.8566939820633677 ver12
no improvement in ver15

/////// lv0 best+lv1 raw ///////// 
0.8566760457964915 ver2
0.8566872031069894 ver9
no improvement in ver18

/////// lv1 average ///////// 
0.856700716419704 ver5
0.8567110046293843 ver11
no improvement in ver16

//// lv1 raw ////
0.8566982133173489 ver4
0.8567063121249692 ver10
0.8567077575763291 ver17
no improvement in ver20

----------- rectify xgb, lgb overlap ---------------


//// lv0+lv1 raw //// 
0.8567024172713045 ver30 of ver30-33
0.8567057899744778 ver34 of ver34-37
0.8567237764414013 ver40 of ver38-41
no improvement in ver42-45