In [None]:
import numpy as np
import pandas as pd
import random
import os
import time
from pathlib import Path

from os.path import join, isfile
from os import path, scandir, listdir

from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import RidgeClassifier
from sklearn.utils.extmath import softmax

#import lightgbm as lgb
#import xgboost as xgb
#import catboost as ctb

#import matplotlib.pyplot as plt
#import seaborn as sns

import warnings
warnings.simplefilter('ignore')

# Parameters

In [None]:
target = 'target'

# Set to True for a quick smoke run (fewest splits / iterations).
DEBUG = False

# The seeds are the same in both run modes.
SEED = 2017
CVSEED = 2017

if not DEBUG:
    # Full-size run.
    N_SPLITS = 5
    N_ESTIMATORS = 20000
    EARLY_STOPPING_ROUNDS = 300
    VERBOSE = 1000
    #N_ITERS = 10
else:
    # Smoke-test settings.
    N_SPLITS = 2
    N_ESTIMATORS = 1
    EARLY_STOPPING_ROUNDS = 1
    VERBOSE = 100
    #N_ITERS = 2

In [None]:
def set_seed(seed=2017):
    """Seed the global random number generators used by the notebook.

    Seeds Python's `random` module and NumPy's legacy global RNG with `seed`,
    and stores `str(seed)` in the `PYTHONHASHSEED` environment variable.

    Note: the hash seed of the running interpreter is fixed at start-up, so the
    environment variable only influences processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# Seed the global RNGs once, up front, with the notebook-wide SEED.
set_seed(SEED)

In [None]:
class RidgeClassifierwithProba(RidgeClassifier):
    """RidgeClassifier extended with a `predict_proba` method.

    RidgeClassifier only exposes `decision_function`; this subclass turns those
    scores into pseudo-probabilities so the model can be scored like a
    probabilistic classifier.
    """

    def predict_proba(self, X):
        """Return the row-wise softmax of the stacked scores `[-d, d]`.

        `d` is `decision_function(X)`; the last column of the result is the
        score assigned to the positive side of the decision boundary.
        """
        scores = self.decision_function(X)
        stacked = np.column_stack((-scores, scores))
        return softmax(stacked)

# Datasets

In [None]:
INPUT = Path("../input/tabular-playground-series-oct-2021")

# Only the label column of train.csv is used in this notebook (the features are
# the level-0 predictions loaded further down), so let the parser read just that
# column instead of loading every column and discarding all but one.
# The result is a one-column DataFrame, exactly as before.
train = pd.read_csv(INPUT / "train.csv", usecols=[target])
#test = pd.read_csv(INPUT / "test.csv")
#submission = pd.read_csv(INPUT / "sample_solution.csv")

In [None]:
def list_all_files(location='../input/tps-oct-lv0', pattern=None, recursive=True):
    """
    Return the paths of the regular files found under a directory.

    - location: path of the directory to search
    - pattern: optional substring filter; a path is kept only if `pattern`
      occurs anywhere in the full path (directory part included), e.g.
      pattern='.csv' keeps the csv files
    - recursive: boolean, if True every subdirectory is searched as well

    Files directly inside `location` come first, followed by the files of each
    subdirectory in turn.
    """
    with scandir(location) as entries:
        child_dirs = [entry.path for entry in entries if entry.is_dir()]

    found = []
    for name in listdir(location):
        candidate = join(location, name)
        if isfile(candidate):
            found.append(candidate)

    if recursive:
        for child in child_dirs:
            # Nested calls collect everything; filtering happens once, below.
            found += list_all_files(child)

    if not pattern:
        return found
    return [candidate for candidate in found if pattern in candidate]

In [None]:
# Substrings identifying the level-0 models whose out-of-fold files are loaded
# below; each one is matched against the path component located directly under
# the level-0 input folder.
names = ['bizen', 'henke', 'hamza', '28smiles','kashif', 'kosta', 'kaveh', 'dlaststark', 'pca', 'xgb2']

In [None]:
# Folder holding the level-0 out-of-fold prediction files.
LV0_DIR = '../input/tps-oct-lv0'

# NOTE: `pred` holds the *paths* of the out-of-fold files, not predictions.
# The list is sorted so that the numeric suffix given to each column below does
# not depend on the arbitrary order in which the OS lists directory entries.
pred = sorted(list_all_files(location=LV0_DIR, pattern='oof'))

for name in names:
    cnt = 1
    for oof_path in pred:
        # First path component below LV0_DIR: either a sub-folder or, for files
        # stored directly in LV0_DIR, the file name itself. Deriving it from the
        # relative path avoids hard-coding the folder depth and the separator.
        top_level = Path(oof_path).relative_to(LV0_DIR).parts[0]
        if name in top_level:
            # One feature column per matching file: <name>1, <name>2, ...
            train[f'{name}{cnt}'] = np.load(oof_path)
            cnt += 1

train.columns

In [None]:
# Add five LightGBM and five XGBoost out-of-fold columns (lgb1..lgb5, xgb1..xgb5).
# LightGBM files are named by the number 2017..2021, XGBoost files by run 1..5.
lv0_prefix = "../input/tps-oct-lv0/"

for i in range(5):
    run = str(i + 1)
    lgb_tag = str(i + 2017)

    train['lgb' + run] = np.load(lv0_prefix + lgb_tag + "lgb_oof.npy")
    #test['lgb' + run] = np.load(lv0_prefix + lgb_tag + "lgb_pred.npy")

    train['xgb' + run] = np.load(lv0_prefix + "agg" + run + "_xgb_oof.npy")
    #test['xgb' + run] = np.load(lv0_prefix + "agg" + run + "_xgb_pred.npy")

In [None]:
# Display the assembled frame: the target plus one column per loaded OOF file.
train

# Preprocessing

In [None]:
# `train` was created with the target as its only (hence first) column, so every
# column after position 0 is a loaded level-0 prediction used as a feature.
features = train.columns[1:]

In [None]:
# Single scaler instance shared through the notebook; `objective` re-fits it on
# the training part of the fold each time it is called.
ss = StandardScaler()

In [None]:
# Preview of the feature matrix (every column except the target).
train[features]

In [None]:
# Preview of the label column passed to `objective` as its default `y`.
train[target]

# Optuna

In [None]:
# Optuna for parameter search
!pip install -q optuna

import optuna
import pickle

In [None]:
def objective(trial, X=train[features], y=train[target]):
    """Optuna objective: validation AUC of the ridge stacker on the first CV fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `alpha` (log scale) and `solver`.
    X : array-like of shape (n_samples, n_features)
        Feature matrix. The default is evaluated once, when this function is
        defined, from the notebook-level `train[features]`.
    y : array-like of shape (n_samples,)
        Labels. The default is `train[target]`, also evaluated at definition.

    Returns
    -------
    float
        ROC AUC on the validation part of the first stratified fold.

    Notes
    -----
    Reads the notebook-level settings N_ESTIMATORS, N_SPLITS, SEED and CVSEED,
    and re-fits the shared scaler `ss` on the training part of the fold.
    Only the first of the N_SPLITS folds is evaluated.
    """
    param_space = {
        'max_iter': N_ESTIMATORS,
        'tol': 1e-8,
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples the same log-uniform range.
        'alpha': trial.suggest_float('alpha', 14200, 19200, log=True),
        'solver': trial.suggest_categorical(
            'solver',
            ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
        ),
    }

    seed_list = [SEED]
    kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=CVSEED)

    # Work on the data that was actually passed in (previously the arguments
    # were used for splitting only and the global `train` for fitting).
    X_all = np.asarray(X)
    y_all = np.asarray(y)

    # Only the first fold is scored, so take it directly instead of looping
    # over folds that are then skipped.
    fold = 0
    trn_idx, val_idx = next(kf.split(X_all, y_all))
    print(f"===== fold {fold} =====")

    # Scale with statistics from the training part only. Plain arrays are used
    # so nothing is assigned into a slice of a DataFrame.
    X_train = ss.fit_transform(X_all[trn_idx])
    X_valid = ss.transform(X_all[val_idx])
    y_train = y_all[trn_idx]
    y_valid = y_all[val_idx]

    pred = np.zeros(y_valid.shape[0])

    start = time.time()
    for inseed in seed_list:
        param_space['random_state'] = inseed

        model = RidgeClassifierwithProba(**param_space)
        model.fit(X_train, y_train)

        # Average the positive-class score over the seeds.
        pred += model.predict_proba(X_valid)[:, -1] / len(seed_list)

    elapsed = time.time() - start
    auc = roc_auc_score(y_valid, pred)

    print(f"fold {fold} - ridge auc: {auc:.6f}, elapsed time: {elapsed:.2f}sec\n")

    return auc

In [None]:
# Seed the sampler explicitly: the TPE sampler draws from its own random
# generator, so the global seeds set by set_seed() alone do not make the
# sequence of suggested parameters repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=200)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Parameters (`alpha`, `solver`) of the best-scoring trial of the study.
study.best_params

# Log

/////// best average /////////

0.8565849212607524 ver2
0.8565858063615852 ver3
0.8565862040619594 ver4
0.856587385563071 ver5
0.8565896194651729 ver6
0.8565948154700619 ver7
0.8566155528895737 ver8
0.8566327244057305 ver9
no improve ver10

//// best raw ////
0.856625492498926 ver11
0.856632966305958 ver20
0.8566329720059634 ver21
0.8566329724059638 ver22
no improve ver23

/// all raw ///
0.8565996074745708 ver12
0.8566164331904019 ver17
0.8566280215013055 ver18
no improve ver19

//// all average ///////
0.8566273223006475 ver13
0.8566273390006633 ver14
0.856627340800665 ver15
no improve ver16
