In [1]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from functools import partial
from itertools import combinations
import gc

## sklearn imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    KFold
)
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn import ensemble
from category_encoders import (
    OneHotEncoder,
    OrdinalEncoder,
    CountEncoder,
    CatBoostEncoder
)
from sklearn.linear_model import LogisticRegression

from imblearn.under_sampling import RandomUnderSampler

## hyperparameter optimizer
import optuna

## boosting libraries
import xgboost as xgb
from catboost import (
    CatBoost,
    CatBoostClassifier,
    CatBoostRegressor
)
from catboost import Pool

# supress warnings
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# configure pandas
pd.set_option('display.max_columns', None)

In [2]:
# load data
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

target_col = 'Class'

In [3]:
# prepare train and test data
X_train = df_train.drop([f'{target_col}'], axis=1).reset_index(drop=True)
y_train = df_train[f'{target_col}'].reset_index(drop=True)
X_test = df_test.reset_index(drop=True)

print(f"X_train shape :{X_train.shape} , y_train shape :{y_train.shape}")
print(f"X_test shape :{X_test.shape}")

# Delete the train and test dataframes to free up memory
del df_train, df_test

X_train shape :(117564, 9) , y_train shape :(117564,)
X_test shape :(78377, 9)


In [4]:
class Preprocessor:
    def __init__(self, numeric_columns=None, max_pattern=2):
        self.numeric_columns = numeric_columns
        self.max_pattern = max_pattern
        self.scaler = None

    def preprocess(self, X_train, X_test):
        X_train = self.create_numeric_combinations(X_train)
        X_test = self.create_numeric_combinations(X_test)

        numeric_columns = [_ for _ in X_train.columns if X_train[_].dtype=='float']
        scaler = StandardScaler()
        X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
        X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

        return X_train, X_test
    
    def create_numeric_combinations(self, df):
        new_cols = []
        for comb in range(2, len(self.numeric_columns) + 1):
            for col in combinations(self.numeric_columns, comb):
                if len(col) > self.max_pattern:
                    break
                col_names = list(col)
                new_col = '_'.join(col_names) + '_mult'
                df[new_col] = df[col_names[0]]
                for c in col_names[1:]:
                    df[new_col] *= df[c]
                new_cols.append(new_col)

        return df

In [5]:
numeric_columns = [_ for _ in X_test.columns if 'is_generated' not in _]
pp = Preprocessor(numeric_columns=numeric_columns)
X_train, X_test = pp.preprocess(X_train=X_train, X_test=X_test)
print(f"X_train shape :{X_train.shape}", f"X_test shape :{X_test.shape}")

X_train shape :(117564, 45) X_test shape :(78377, 45)


In [6]:
class Splitter:
    def __init__(self, test_size=0.2, kfold=True, n_splits=5):
        self.test_size = test_size
        self.kfold = kfold
        self.n_splits = n_splits

    def split_data(self, X, y, random_state_list):
        if self.kfold:
            for random_state in random_state_list:
                kf = StratifiedKFold(
                    n_splits=self.n_splits,
                    random_state=random_state,
                    shuffle=True
                )
                for train_index, val_index in kf.split(X, y):
                    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                    yield X_train, X_val, y_train, y_val
        else:
                for random_state in random_state_list:
                    X_train, X_val, y_train, y_val = train_test_split(
                        X,
                        y, 
                        test_size=self.test_size, 
                        random_state=random_state
                    )
                    yield X_train, X_val, y_train, y_val

In [7]:
#random.randint(1000,9999)

In [8]:
kfold = True
n_splits = 10
random_state = 317
random_state_list = [1792, 7145, 7237]
n_estimators = 9999
early_stopping_rounds = 100
verbose = False
device = 'cpu'

splitter = Splitter(kfold=kfold, n_splits=n_splits)

In [17]:
class StackingClassifier:
    def __init__(self, n_estimators=100, device='cpu', random_state=0):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.base_models = self._define_base_models()
        self.meta_models = self._define_class_models()
        self.len_base = len(self.base_models)
        self.len_meta = len(self.meta_models)

    def _define_class_models(self,):
        xgb_params = {
            'n_estimators': self.n_estimators,
            'max_depth': 4,
            'learning_rate': 0.06333221939055333,
            'min_child_weight': 4,
            'gamma': 5.301218558776368e-08,
            'subsample': 0.41010429946197946,
            'colsample_bytree': 0.8298539920447499,
            'reg_alpha': 0.000517878113716743,
            'reg_lambda': 0.00030121415155097723,
            'n_jobs': -1,
            'objective': 'binary:logistic',
            'verbosity': 0,
            'eval_metric': 'logloss',
            'random_state': self.random_state
        }
        if self.device == 'gpu':
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['predictor'] = 'gpu_predictor'
        
        cb_params = {
            'iterations': self.n_estimators,
            'depth': 3,
            'learning_rate': 0.15687380686250746,
            'l2_leaf_reg': 4.0368544113430485,
            'random_strength': 0.1279482215776108,
            'max_bin': 238,
            'od_wait': 49,
            'one_hot_max_size': 39,
            'grow_policy': 'SymmetricTree',
            'bootstrap_type': 'Bayesian',
            'od_type': 'Iter',
            'loss_function': 'Logloss',
            'task_type': self.device.upper(),
            'random_state': self.random_state
        }

        class_models = {
            'xgb_class': xgb.XGBClassifier(**xgb_params),
            'cat_class': CatBoostClassifier(**cb_params)
        }

        return class_models
        
    def _define_reg_model(self):

        xgb_params = {
            'n_estimators': self.n_estimators,
            'max_depth': 4,
            'learning_rate': 0.06604482627857397,
            'min_child_weight': 9,
            'gamma': 2.785627092225762e-06,
            'subsample': 0.3600730418583202,
            'colsample_bytree': 0.643296031751869,
            'reg_alpha': 0.00048086062508489406,
            'reg_lambda': 8.080844212784364e-06,
            'n_jobs': -1,
            'objective': 'reg:logistic',
            'verbosity': 0,
            'eval_metric': 'rmse',
            'random_state': self.random_state
        }
        if self.device == 'gpu':
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['predictor'] = 'gpu_predictor'
        
        cb_params = {
            'iterations': self.n_estimators,
            'depth': 5,
            'learning_rate': 0.12947105266151432,
            'l2_leaf_reg': 0.6169164517797081,
            'random_strength': 0.21235850198764036,
            'max_bin': 212,
            'od_wait': 67,
            'one_hot_max_size': 73,
            'grow_policy': 'Depthwise',
            'bootstrap_type': 'Bayesian',
            'od_type': 'Iter',
            'loss_function': 'RMSE',
            'task_type': self.device.upper(),
            'random_state': self.random_state
        }

        reg_models = {
            'xgb_reg': xgb.XGBRegressor(**xgb_params),
            'cat_reg': CatBoostRegressor(**cb_params)
        }

        return reg_models
    
    def _define_add_model(self):

        add_models = {
            'hgbc_class': ensemble.HistGradientBoostingClassifier(max_iter=500, max_depth=4, random_state=self.random_state),
            'lr_class': LogisticRegression(max_iter=1000, n_jobs=-1),
            'rf_class': ensemble.RandomForestClassifier(n_estimators=100, max_depth=4, random_state=self.random_state, n_jobs=-1)
        }

        return add_models
    
    def _define_base_models(self,):

        class_models = self._define_class_models()
        reg_models = self._define_reg_model()
        add_models = self._define_add_model()
        base_models = {
            **class_models,
            **reg_models,
            **add_models
        }

        return base_models

In [10]:
## define optuna optimizer for weights
class OptunaWeights:

    def __init__(self, random_state):

        self.study = None
        self.weights = None
        self.random_state = random_state

    def _objective(self, trial, y_true, y_preds):

        # define weights for the predictions of each model
        weights = [trial.suggest_float(f'weight{n}', 0, 1) for n in range(len(y_preds))]

        # calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=weights)

        # calculate the Logloss score for the weighted prediction
        score = log_loss(y_true, weighted_pred)

        return score
    
    def fit(self, y_true, y_preds, n_trials=300):

        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        self.study = optuna.create_study(sampler=sampler, study_name="OptunaWeights", direction='minimize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=n_trials)
        self.weights = [self.study.best_params[f'weight{n}'] for n in range(len(y_preds))]

    def predict(self, y_preds):
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=self.weights)

        return weighted_pred
    
    def fit_predict(self, y_true, y_preds, n_trials=300):
        self.fit(y_true, y_preds, n_trials=n_trials)
        return self.predict(y_preds)
    
    def weights(self):
        return self.weights

In [18]:
# base model and evaluating using stacking class
base_model_num = StackingClassifier().len_base

# initialize empty list and arrays for storing model objects
# best iterations, scores, and predictions
models =[]
best_iterations = []
scores = []
oof_predss = np.zeros((X_train.shape[0], base_model_num))
test_predss = np.zeros((X_test.shape[0], base_model_num))

# loop over each split and train base models using training data and
# evaluate on validation data
for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(X_train, y_train, random_state_list=random_state_list)):
    n = i % n_splits
    m = i // n_splits

    stacking_clf = StackingClassifier(n_estimators, device, random_state)
    base_models = stacking_clf.base_models

    # initialize lists to store oof and predictions for each base model
    oof_preds = []
    test_preds = []

    for name, model in base_models.items():
        if name in ['rf_class', 'hgbc_class', 'lr_class']:
            model.fit(X_train_, y_train_)
        else:
            model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], early_stopping_rounds=early_stopping_rounds, verbose=verbose)
        
        if 'class' in name:
            y_val_pred = model.predict_proba(X_val)[:, 1]
            test_pred = model.predict_proba(X_test)[:, 1]
        else:
            y_val_pred = model.predict(X_val)
            test_pred = model.predict(X_test)
        
        score = log_loss(y_val, y_val_pred)
        print(f'Base MODEL {name} [FOLD-{n} SEED-{random_state_list[m]}] LogLoss score: {score:.5f}')

        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)

    # stack oof and test preds horizontally for each base model and store in oof_predss and test_predss respectively
    oof_preds = np.column_stack(oof_preds)
    oof_predss[X_val.index] = oof_preds
    test_preds = np.column_stack(test_preds)
    test_predss += test_preds / (n_splits * len(random_state_list))

    i += 1

gc.collect() 

Base MODEL xgb_class [FOLD-0 SEED-1792] LogLoss score: 0.03320
Base MODEL cat_class [FOLD-0 SEED-1792] LogLoss score: 0.03384
Base MODEL xgb_reg [FOLD-0 SEED-1792] LogLoss score: 0.03362
Base MODEL cat_reg [FOLD-0 SEED-1792] LogLoss score: 0.03487
Base MODEL hgbc_class [FOLD-0 SEED-1792] LogLoss score: 0.03544
Base MODEL lr_class [FOLD-0 SEED-1792] LogLoss score: 0.05491
Base MODEL rf_class [FOLD-0 SEED-1792] LogLoss score: 0.03814
Base MODEL xgb_class [FOLD-1 SEED-1792] LogLoss score: 0.02799
Base MODEL cat_class [FOLD-1 SEED-1792] LogLoss score: 0.02860
Base MODEL xgb_reg [FOLD-1 SEED-1792] LogLoss score: 0.02810
Base MODEL cat_reg [FOLD-1 SEED-1792] LogLoss score: 0.02970
Base MODEL hgbc_class [FOLD-1 SEED-1792] LogLoss score: 0.03005
Base MODEL lr_class [FOLD-1 SEED-1792] LogLoss score: 0.03649
Base MODEL rf_class [FOLD-1 SEED-1792] LogLoss score: 0.03239
Base MODEL xgb_class [FOLD-2 SEED-1792] LogLoss score: 0.03102
Base MODEL cat_class [FOLD-2 SEED-1792] LogLoss score: 0.03155
Ba

432

In [24]:
# stacking model with bas and meta models
meta_test_predss = np.zeros(X_test.shape[0])
ensemble_score = []
weights = []

# evaluate on validation data and store predictions on test
for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(X_train, y_train, random_state_list=random_state_list)):
    n = i % n_splits
    m = i // n_splits

    train_index, val_index = X_train_.index, X_val.index

    # use predictions from base models as input features
    X_train_ = oof_predss[train_index]
    X_val = oof_predss[val_index]

    # get a set of base models and meta model using the get_model_function
    stacking_clf = StackingClassifier(n_estimators, device, random_state)
    meta_models = stacking_clf.meta_models

    # initialize lists to store oof and test predictions for each base model
    oof_preds = []
    test_preds = []

    # loop over each base model and fit on train data, eval on validation
    for name, model in meta_models.items():
        if name in ['rf_class', 'hgbc_class', 'lr_class']:
            model.fit(X_train_, y_train_)
        else:
            model.fit(X_train_, y_train_, eval_set=[(X_val, y_val)], early_stopping_rounds=early_stopping_rounds, verbose=verbose)

        y_val_pred = model.predict_proba(X_val)[:, 1]
        test_pred = model.predict_proba(test_predss)[:, 1]
        score = log_loss(y_val, y_val_pred)
        print(f'Meta MODEL {name} [FOLD-{n} SEED-{random_state_list[m]}] LogLoss score: {score:.5f}')

        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)

    # use optuna to find the best ensemble weights
    optweights = OptunaWeights(random_state=random_state)
    y_val_pred = optweights.fit_predict(y_val.values, oof_preds)
    score = log_loss(y_val, y_val_pred)
    print(f'Ensemble MODEL [FOLD-{n} SEED-{random_state_list[m]}] LogLoss score {score:.5f}')

    ensemble_score.append(score)
    weights.append(optweights.weights)
    meta_test_predss += optweights.predict(test_preds) / (n_splits * len(random_state_list))

    i += 1

gc.collect()


ValueError: Found input variables with inconsistent numbers of samples: [105807, 117564]