In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import RidgeClassifier
from sklearn.utils.extmath import softmax

#import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
# Name of the label column read from train.csv.
target = 'claim'

# Flip to True for a quick smoke run with tiny fold / iteration counts.
DEBUG = False

# Both modes use the same seeds: SEED for the models, CVSEED for the fold split.
SEED = CVSEED = 2017

# Mode-dependent settings, in the order they are unpacked below.
N_SPLITS, N_ESTIMATORS, EARLY_STOPPING_ROUNDS, VERBOSE = (
    (2, 1, 1, 100) if DEBUG else (10, 20000, 300, 1000)
)

In [None]:
def set_seed(seed=2017):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Parameters
    ----------
    seed : int, default 2017
        Value handed to every seeding call; stored as a string in the
        ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once with the notebook-wide SEED.
set_seed(SEED)

In [None]:
class RidgeClassifierwithProba(RidgeClassifier):
    """``RidgeClassifier`` extended with a softmax-based ``predict_proba``.

    ``RidgeClassifier`` only exposes ``decision_function``; this subclass turns
    those scores into rows that sum to 1 so the model can be used where a
    ``predict_proba`` method is expected.
    """

    def predict_proba(self, X):
        """Return softmax-normalised decision scores for ``X``.

        Parameters
        ----------
        X : array-like
            Samples, passed unchanged to ``decision_function``.

        Returns
        -------
        numpy.ndarray
            2-D array with one row per sample. For 1-D (binary) scores ``d``
            the columns are ``softmax([-d, d])``; for 2-D (multiclass) scores
            the softmax is taken over the score columns as they are.
        """
        d = self.decision_function(X)
        if d.ndim == 1:
            # Binary case: a single score per sample, so build the two-column
            # matrix [-d, d] before normalising.
            d = np.c_[-d, d]
        # Multiclass scores are already (n_samples, n_classes); stacking
        # [-d, d] there would produce 2 * n_classes meaningless columns.
        return softmax(d)

# Datasets

In [None]:
# Directory holding the competition CSV files.
INPUT = Path("../input/tabular-playground-series-sep-2021")

# Only the label column is used from train.csv (the features come from the
# saved prediction arrays loaded below), so ask the parser for that column
# alone instead of reading every column and discarding the rest.
train = pd.read_csv(INPUT / "train.csv", usecols=[target])
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

In [None]:
# Tree-model `*_oof.npy` arrays from the lv0 dataset, keyed by the train column
# they populate. Insertion order is kept because it fixes the column order.
# (The disabled test-set counterparts were loaded from "<stem>_pred.npy".)
_lv0_tree_files = {
    'lgb_pred': 'agg_lgb',
    'lgb2_pred': 'agg_lgb2',
    'xgb_pred': 'agg_xgb',
    'lgb_bizen_pred': 'lgb_bizen',
    'lgb_dmitry_pred': 'lgb_dmitry',
    'xgb_dmitry_pred': 'xgb_dmitry',
    'lgb_manav_pred': 'lgb_manav',
    'xgb_manav_pred': 'xgb_manav',
}
for _col, _stem in _lv0_tree_files.items():
    train[_col] = np.load(f"../input/tps-sep-lv0-base-trees/{_stem}_oof.npy")


In [None]:
# Ridge and NN `*_oof.npy` arrays from the same lv0 dataset, column -> file stem.
# (The disabled test-set counterparts were loaded from "<stem>_pred.npy".)
_lv0_other_files = {
    'ridge_pred': 'ridge',
    'nn_pred': 'agg_nn',
}
for _col, _stem in _lv0_other_files.items():
    train[_col] = np.load(f"../input/tps-sep-lv0-base-trees/{_stem}_oof.npy")

In [None]:
# lv1 predictions: `*_oof.npy` arrays from the lv1-nn dataset, column -> file stem.
# (The disabled test-set counterparts were loaded from "<stem>_pred.npy".)
_lv1_files = {
    'lgb_lv1n_pred': 'lgb_lv1',
    'xgb_lv1n_pred': 'xgb_lv1',
    'ctb_lv1n_pred': 'ctb_lv1',
    'ridge_lv1n_pred': 'ridge_lv1',
    'nn_lv1n_pred': 'agg_nn_lv1',
}
for _col, _stem in _lv1_files.items():
    train[_col] = np.load(f"../input/tps-sep-lv1-nn/{_stem}_oof.npy")

In [None]:
# Own lv2 ridge outputs fed back in as features; each column name matches its
# file stem plus "_pred".
# (The disabled test-set counterparts were loaded from "<stem>_pred.npy".)
for _stem in ('ridge_lv2n', 'ridge_lv2nre', 'ridge_lv2nre2'):
    train[f"{_stem}_pred"] = np.load(f"../input/tps-sep-lv2/{_stem}_oof.npy")


# Preprocessing

In [None]:
# Every loaded prediction column (name contains 'pred') is a stacking feature.
features = [name for name in train.columns if 'pred' in name]

In [None]:
# Shared scaler instance; objective() refits it on each fold's training rows.
ss = StandardScaler()

In [None]:
# Display the stacked prediction features as a sanity check.
train[features]

In [None]:
# Display the target column as a sanity check.
train[target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: out-of-fold ROC AUC of a ridge stacker on ``(X, y)``.

    Samples ``alpha`` and ``solver`` for ``RidgeClassifierwithProba``, fits the
    model on every ``StratifiedKFold`` split with per-fold standard scaling,
    and scores the assembled out-of-fold predictions.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature matrix. Defaults to ``train[features]`` as it was when this
        function was defined.
    y : pandas.Series
        Labels aligned row-for-row with ``X``. Defaults to ``train[target]``
        as it was when this function was defined.

    Returns
    -------
    float
        ROC AUC computed over all out-of-fold predictions.
    """
    param_space = {
        'max_iter': N_ESTIMATORS,
        'tol': 1e-8,
        # Log-uniform sampling; replaces the deprecated suggest_loguniform.
        'alpha': trial.suggest_float('alpha', 1e-4, 12, log=True),
        'solver': trial.suggest_categorical(
            'solver',
            ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        ),
    }

    seed_list = [SEED]
    columns = list(X.columns)

    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)
    ridge_oof = np.zeros(len(X))

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"===== fold {fold} =====")

        # Index the arguments (not the global frame) so the rows being fitted
        # are the rows that were split. Explicit copies keep the scaled values
        # out of the caller's data and avoid chained assignment.
        X_train = X.iloc[trn_idx].copy()
        y_train = y.iloc[trn_idx]
        X_valid = X.iloc[val_idx].copy()
        y_valid = y.iloc[val_idx]

        # Fit the scaler on the training rows only, then apply it to the
        # held-out rows.
        X_train[columns] = ss.fit_transform(X_train[columns])
        X_valid[columns] = ss.transform(X_valid[columns])

        start = time.time()
        for inseed in seed_list:
            param_space['random_state'] = inseed

            model = RidgeClassifierwithProba(**param_space)
            model.fit(X_train, y_train)

            # Average the last probability column over the seeds.
            ridge_oof[val_idx] += model.predict_proba(X_valid)[:, -1] / len(seed_list)
        elapsed = time.time() - start

        auc = roc_auc_score(y_valid, ridge_oof[val_idx])
        print(f"fold {fold} - ridge auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    auc_oof = roc_auc_score(y, ridge_oof)
    print(f"oof ridge_auc = {auc_oof}")

    return auc_oof

In [None]:
# objective() returns an AUC, so the study maximises it.
# NOTE(review): no sampler or seed is passed here, so presumably the sampled
# hyper-parameters differ from run to run — confirm this is intended.
study = optuna.create_study(direction='maximize')
# Only 3 trials are run per execution of this cell.
study.optimize(objective,n_trials= 3)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Show the hyper-parameters of the best trial as the cell output.
study.best_params

# Log

/////// 8 trees + ridge /////////

0.8171131381161691 ver2
0.8171131389008243 ver3 of ver3-5
no improve ver6-8

/////// 8 trees + ridge + nn ///////// 
0.8171307543975791 ver11 of ver10-13
0.8171307549555561 ver14 of ver14-17
no improve ver18-21

/// re-input lv2 ridge nn ///
0.8171424458040443 ver24 of ver23-26
0.8171424622948814 ver35 of ver27-35
no improve ver36-44


/// re-input2 lv2 ridge nn ///
0.8171440428346225 ver46 of ver46-54
0.8171440915006832 ver57 of ver55-63
no improve ver64-72

/// re-input3 lv2 ridge nn ///
0.8171433205333313 ver76 of ver74-82
no improve ver89-94

only re-input
0.8171435786500273 ver85 of ver83-85

only last re-input
0.8171436931181472 ver87 of ver86-88