In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path
import gc

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the binary label column in the training data.
target = 'claim'

# Flip to True for a fast smoke run (tiny model, two folds).
DEBUG = False

# Seeds are the same in debug and full runs.
SEED = 2017
CVSEED = 2017

# (boosting rounds, CV folds, early-stopping patience, log period)
N_ESTIMATORS, N_SPLITS, EARLY_STOPPING_ROUNDS, VERBOSE = (
    (1, 2, 1, 100) if DEBUG else (20000, 10, 300, 1000)
)
#N_ITERS = 2 if DEBUG else 10

In [None]:
def set_seed(seed=2017):
    """Seed the stdlib and NumPy global random generators.

    Also exports ``PYTHONHASHSEED`` with the same value.

    Args:
        seed: Integer seed applied to every generator (default 2017).
    """
    # Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
    # only influences processes spawned after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, with the configured SEED.
set_seed(SEED)

# Datasets

In [None]:
# Directory holding the competition CSV files.
INPUT = Path("../input/tabular-playground-series-sep-2021")

# Only the training split is needed for the hyper-parameter search.
train = pd.read_csv(INPUT.joinpath("train.csv"))
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

In [None]:
# Pre-computed feature table for the training rows, read from the
# feature-extraction dataset.
trainfeat = pd.read_csv(
    filepath_or_buffer="../input/tps-sep-2021-kmeans-gmm-features-extraction/trainfeat.csv"
)
#testfeat = pd.read_csv("../input/tps-sep-2021-kmeans-gmm-features-extraction/testfeat.csv")

In [None]:
# Display the loaded feature table as a visual sanity check.
trainfeat

# Preprocessing

In [None]:
# Row-wise count of missing cells, taken over every column currently in `train`.
train['n_missing'] = train.isnull().sum(axis='columns')
#test['n_missing'] = test[features].isna().sum(axis=1)


In [None]:
# Rebuild `train` as: missing-value count + trailing trainfeat columns + target.
# NOTE(review): the 118 offset presumably skips trainfeat columns that are not
# wanted as model inputs — confirm against the feature-extraction notebook.
train = pd.concat(
    objs=[train['n_missing'], trainfeat.iloc[:, 118:], train[target]],
    axis='columns',
)

# The feature table is no longer referenced below; release it.
del trainfeat
gc.collect()

In [None]:
# Model inputs: every column whose name contains the letter 'f', followed by
# the missing-value count column.
features = [name for name in train.columns if 'f' in name] + ['n_missing']

In [None]:
#train[features] = train[features].fillna(train[features].median())
#test[features] = test[features].fillna(test[features].median())

In [None]:
# Module-level scaler instance; it is (re)fitted inside the Optuna objective.
ss = StandardScaler()


In [None]:
# Preview the model input columns.
train[features]

In [None]:
# Preview the target column.
train[target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: mean cross-validated AUC of a LightGBM classifier.

    For each stratified fold the features are standardised (scaler fitted on
    the training part only), one model per seed in ``seed_list`` is trained
    with early stopping on the validation part, and the seed-averaged
    validation probabilities are scored with ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature DataFrame (defaults to ``train[features]`` as it existed
            when this function was defined).
        y: Target Series aligned with ``X`` (defaults to ``train[target]``).

    Returns:
        The mean validation AUC over all folds. Averaging matters: returning
        a single fold's score would make the search optimise that fold only.
    """
    param_space = {
        # Fixed settings
        'objective': 'binary',
        'n_estimators': N_ESTIMATORS,
        'importance_type': 'gain',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'n_jobs': -1,

        # Search space
        # NOTE(review): an upper bound of 20 is far above commonly used
        # boosting learning rates — confirm this range is intended.
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 20),
        'subsample': trial.suggest_uniform('subsample', 0.1, 1),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.1, 1),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-4, 50),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-4, 50),
        'min_child_weight': trial.suggest_uniform('min_child_weight', 1e-4, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 50, 500),
        'num_leaves': trial.suggest_int('num_leaves', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'max_bin': trial.suggest_int('max_bin', 50, 500),

        # Disabled options kept for reference:
        # 'device': 'gpu',  # Use GPU acceleration
        # 'gpu_platform_id': 0,
        # 'gpu_device_id': 0,
        # 'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.005),
        # 'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        # 'cat_smooth': trial.suggest_int('cat_smooth', 5, 100),
        # 'cat_l2': trial.suggest_loguniform('cat_l2', 1e-3, 100)
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
    fold_scores = []

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"===== fold {fold} =====")

        # Slice the arguments (not the global frame) so callers can pass
        # their own data.
        y_train = y.iloc[trn_idx]
        y_valid = y.iloc[val_idx]

        # Fit the scaler on the training part only, then build fresh frames
        # instead of assigning into slices (avoids chained assignment).
        X_train = pd.DataFrame(
            ss.fit_transform(X.iloc[trn_idx]),
            columns=X.columns,
            index=X.index[trn_idx],
        )
        X_valid = pd.DataFrame(
            ss.transform(X.iloc[val_idx]),
            columns=X.columns,
            index=X.index[val_idx],
        )
        gc.collect()

        # Seed-averaged probability of the positive class.
        pred = np.zeros(y_valid.shape[0])

        start = time.time()
        for inseed in seed_list:
            param_space['random_state'] = inseed
            gc.collect()

            model = lgb.LGBMClassifier(**param_space)
            # NOTE(review): `early_stopping_rounds` and `verbose` as fit()
            # keywords belong to the LightGBM 3.x API; newer releases expect
            # callbacks instead — confirm the installed version.
            model.fit(
                X_train,
                y_train,
                eval_set=[(X_valid, y_valid)],
                eval_metric='auc',
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                verbose=VERBOSE,
            )

            pred += model.predict_proba(X_valid)[:, -1] / len(seed_list)

        elapsed = time.time() - start
        auc = roc_auc_score(y_valid, pred)
        fold_scores.append(auc)

        print(f"fold {fold} - lgb auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    mean_auc = float(np.mean(fold_scores))
    print(f"mean cv auc over {len(fold_scores)} folds: {mean_auc:.6f}")

    return mean_auc

In [None]:
# Maximise the objective's returned score over 30 trials.
study = optuna.create_study(direction='maximize')
study.optimize(func=objective, n_trials=30)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Show the hyper-parameters of the best trial found by the study.
study.best_params

# Log

0.8079328887811741 ver5 (8 trials only)
continue 0.8087624898317669 ver6 (3 trials)

////////

0.8158272768581242 ver6 of ver6-8

0.8159326658825895 ver9 of ver9-11

no improvement ver12-14