In [1]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
import numpy as np

import xgboost as xgb
import lightgbm as lgbm
import catboost as cat

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
df = fetch_california_housing(as_frame=True)["frame"]
print(df.shape)
df.head()

(20640, 9)


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
from sklearn.model_selection import train_test_split

trainX, testX, trainy, testy = train_test_split(df.drop(columns=['MedHouseVal']), df.MedHouseVal, test_size=0.15, shuffle=True, random_state=42)

In [9]:
from sklearn.model_selection import KFold
class Splitter:
    def __init__(self, kfold=True, n_splits=5):
        self.n_splits = n_splits
        self.kfold = kfold
        #self.greeks = greeks

    def split_data(self, X, y, random_state_list):
        if self.kfold == 'skf':
            for random_state in random_state_list:
                kf = KFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)
                for train_index, val_index in kf.split(X, y):
                    if type(X) is np.ndarray:
                        X_train, X_val = X[train_index], X[val_index]
                        y_train, y_val = y[train_index], y[val_index]
                    else:
                        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
                    yield X_train, X_val, y_train, y_val
        else:
            raise ValueError(f"Invalid kfold: Must be True")

In [43]:
class Classifier:
    def __init__(self, n_estimators=100, device="cpu", random_state=42):
        self.n_estimators = n_estimators
        self.device = device
        self.random_state = random_state
        self.models = self._define_model()
        self.models_name = list(self._define_model().keys())
        self.len_models = len(self.models)
        
    def _define_model(self):
        xgb_params = {}
        param = {'n_estimators': 850, 'max_depth': 7, 'learning_rate': 0.035579334505517195, 
         'subsample': 0.7433060185168757, 'colsample_bytree': 0.9985782631581257, 'gamma': 0.014679853880810986}

        if self.device == 'gpu':
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['predictor'] = 'gpu_predictor'
       
        models = {
            'xgb': xgb.XGBRegressor(**param),
            'lgbm': lgbm.LGBMRegressor(),
            'cat': cat.CatBoostRegressor(verbose=-1)
            #add some models with default params to "simplify" ensemble
            #'svc': SVC(random_state=self.random_state, probability=True),
            #'brf': BalancedRandomForestClassifier(random_state=self.random_state),
            #'lr': LogisticRegression(random_state=self.random_state)
        }
        return models

In [7]:
from sklearn.metrics import mean_squared_error
from functools import partial
import optuna

class OptunaWeights:
    def __init__(self, random_state, n_trials=1000):
        self.study = None
        self.weights = None
        self.random_state = random_state
        self.n_trials = n_trials

    def _objective(self, trial, y_true, y_preds):
        # Define the weights for the predictions from each model
        weights = [trial.suggest_float(f"weight{n}", 1e-14, 1) for n in range(len(y_preds))]

        # Calculate the weighted prediction
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=weights)

        # Calculate the score for the weighted prediction
        # score = log_loss(y_true, weighted_pred)
        score = mean_squared_error(y_true, weighted_pred)
        return score

    def fit(self, y_true, y_preds):
        optuna.logging.set_verbosity(optuna.logging.ERROR)
        sampler = optuna.samplers.CmaEsSampler(seed=self.random_state)
        pruner = optuna.pruners.HyperbandPruner()
        self.study = optuna.create_study(sampler=sampler, pruner=pruner, study_name="OptunaWeights", direction='minimize')
        objective_partial = partial(self._objective, y_true=y_true, y_preds=y_preds)
        self.study.optimize(objective_partial, n_trials=self.n_trials)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(len(y_preds))]

    def predict(self, y_preds):
        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict'
        weighted_pred = np.average(np.array(y_preds).T, axis=1, weights=self.weights)
        return weighted_pred

    def fit_predict(self, y_true, y_preds):
        self.fit(y_true, y_preds)
        return self.predict(y_preds)
    
    def weights(self):
        return self.weights

In [15]:
trained_models = {'xgb':[], 'cat':[], 'lgbm':[]}

trained_models.keys()

dict_keys(['xgb', 'cat', 'lgbm'])

In [35]:
%%time

import random
from copy import deepcopy
from catboost import Pool
import gc

kfold = 'skf'
n_splits = 5
n_reapts = 5
random_state = 42
n_estimators = 99999
early_stopping_rounds = 99
verbose = False
device = 'cpu'
# Fix seed
random.seed(random_state)
random_state_list = random.sample(range(9999), n_reapts)
#random_state_list = [42]

# Initialize an array for storing test predictions
classifier = Classifier(n_estimators, device, random_state)
test_predss = np.zeros((testX.shape[0]))
oof_predss = np.zeros((trainX.shape[0], n_reapts))
ensemble_score, ensemble_score_ = [], []
weights = []
oof_each_predss = []
oof_each_preds = np.zeros((trainX.shape[0], classifier.len_models))

test_each_predss = []
test_each_preds = np.zeros((testX.shape[0], classifier.len_models))

trained_models = {'xgb':[], 'cat':[], 'lgbm':[]}
score_dict = dict(zip(classifier.models_name, [[] for _ in range(classifier.len_models)]))

splitter = Splitter(kfold=kfold, n_splits=n_splits)

for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(trainX.reset_index(drop=True), trainy.reset_index(drop=True), random_state_list=random_state_list)):
    n = i % n_splits
    m = i // n_splits
            
    # Get a set of classifier models
    classifier = Classifier(n_estimators, device, random_state_list[m])
    models = classifier.models
    
    # Initialize lists to store oof and test predictions for each base model
    oof_preds = []
    test_preds = []
    
    # Loop over each base model and fit it to the training data, evaluate on validation data, and store predictions
    for name, model in models.items():
        if ('xgb' in name) or ('lgbm' in name) or ('cat' in name):
            #train_w0, train_w1 = calc_log_loss_weight(y_train_)
            #valid_w0, valid_w1 = calc_log_loss_weight(y_val)
            if 'xgb' in name:
                model.fit(
                    X_train_, y_train_, 
                    eval_set=[(X_val, y_val)],
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
            elif 'lgbm' in name:
                model.fit(
                    X_train_, y_train_, 
                    eval_set=[(X_val, y_val)],
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
            elif 'cat' in name:
                model.fit(
                    Pool(X_train_, y_train_), 
                    eval_set=Pool(X_val, y_val), 
                    early_stopping_rounds=early_stopping_rounds, verbose=verbose)
            
        else:
            model.fit(X_train_, y_train_)
          
        if name in trained_models.keys():
            trained_models[f'{name}'].append(deepcopy(model))
        
        test_pred = model.predict(testX)
        y_val_pred = model.predict(X_val)
        
        # Calculate recall and precision scores
        mse = mean_squared_error(y_val, y_val_pred)
        print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] MSE score: {mse:.5f}')
        print('-'*50)
        #score = balanced_log_loss(y_val, y_val_pred)
        #score_dict[name].append(score)
        #print(f'{name} [FOLD-{n} SEED-{random_state_list[m]}] BalancedLogLoss score: {score:.5f}')
        #print('-'*50)
        
        oof_preds.append(y_val_pred)
        test_preds.append(test_pred)
    
    # Use Optuna to find the best ensemble weights
    optweights = OptunaWeights(random_state=random_state_list[m])
    y_val_pred = optweights.fit_predict(y_val.values, oof_preds)
    
    #score = balanced_log_loss(y_val, y_val_pred)
    #score_ = roc_auc_score(y_val, y_val_pred)
    #print(f'--> Ensemble [FOLD-{n} SEED-{random_state_list[m]}] BalancedLogLoss score {score:.5f}')
    #print('='*50)
    #ensemble_score.append(score)
    #ensemble_score_.append(score_)
    weights.append(optweights.weights)
    
    # Predict to X_test by the best ensemble weights
    test_predss += optweights.predict(test_preds) / (n_splits * len(random_state_list))
    
    oof_predss[X_val.index, m] += optweights.predict(oof_preds)
    
    oof_each_preds[X_val.index] = np.stack(oof_preds).T
    test_each_preds += np.array(test_preds).T / n_splits
    
    if n == (n_splits - 1):
        oof_each_predss.append(oof_each_preds)
        oof_each_preds = np.zeros((trainX.shape[0], classifier.len_models))
        test_each_predss.append(test_each_preds)
        test_each_preds = np.zeros((testX.shape[0], classifier.len_models))
    
    gc.collect()
    
oof_each_predss = np.mean(np.array(oof_each_predss), axis=0)
test_each_predss = np.mean(np.array(test_each_predss), axis=0)
oof_each_predss = np.concatenate([oof_each_predss, np.mean(oof_predss, axis=1).reshape(-1, 1)], axis=1)
test_each_predss = np.concatenate([test_each_predss, test_predss.reshape(-1, 1)], axis=1)



xgb [FOLD-0 SEED-1824] MSE score: 0.23793
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-0 SEED-1824] MSE score: 0.23761
--------------------------------------------------
cat [FOLD-0 SEED-1824] MSE score: 0.22380
--------------------------------------------------




xgb [FOLD-1 SEED-1824] MSE score: 0.21666
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-1 SEED-1824] MSE score: 0.20925
--------------------------------------------------
cat [FOLD-1 SEED-1824] MSE score: 0.18692
--------------------------------------------------




xgb [FOLD-2 SEED-1824] MSE score: 0.22993
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-2 SEED-1824] MSE score: 0.21895
--------------------------------------------------
cat [FOLD-2 SEED-1824] MSE score: 0.19961
--------------------------------------------------




xgb [FOLD-3 SEED-1824] MSE score: 0.21769
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-3 SEED-1824] MSE score: 0.21173
--------------------------------------------------
cat [FOLD-3 SEED-1824] MSE score: 0.18964
--------------------------------------------------




xgb [FOLD-4 SEED-1824] MSE score: 0.22462
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-4 SEED-1824] MSE score: 0.21850
--------------------------------------------------
cat [FOLD-4 SEED-1824] MSE score: 0.19439
--------------------------------------------------




xgb [FOLD-0 SEED-409] MSE score: 0.20056
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-0 SEED-409] MSE score: 0.19470
--------------------------------------------------
cat [FOLD-0 SEED-409] MSE score: 0.17390
--------------------------------------------------




xgb [FOLD-1 SEED-409] MSE score: 0.23562
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-1 SEED-409] MSE score: 0.23343
--------------------------------------------------
cat [FOLD-1 SEED-409] MSE score: 0.21173
--------------------------------------------------




xgb [FOLD-2 SEED-409] MSE score: 0.22881
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-2 SEED-409] MSE score: 0.21931
--------------------------------------------------
cat [FOLD-2 SEED-409] MSE score: 0.20465
--------------------------------------------------




xgb [FOLD-3 SEED-409] MSE score: 0.23081
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-3 SEED-409] MSE score: 0.22295
--------------------------------------------------
cat [FOLD-3 SEED-409] MSE score: 0.19942
--------------------------------------------------




xgb [FOLD-4 SEED-409] MSE score: 0.22942
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-4 SEED-409] MSE score: 0.21809
--------------------------------------------------
cat [FOLD-4 SEED-409] MSE score: 0.19888
--------------------------------------------------




xgb [FOLD-0 SEED-4506] MSE score: 0.22762
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-0 SEED-4506] MSE score: 0.21572
--------------------------------------------------
cat [FOLD-0 SEED-4506] MSE score: 0.19422
--------------------------------------------------




xgb [FOLD-1 SEED-4506] MSE score: 0.20970
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-1 SEED-4506] MSE score: 0.19797
--------------------------------------------------
cat [FOLD-1 SEED-4506] MSE score: 0.18274
--------------------------------------------------




xgb [FOLD-2 SEED-4506] MSE score: 0.22367
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-2 SEED-4506] MSE score: 0.21831
--------------------------------------------------
cat [FOLD-2 SEED-4506] MSE score: 0.19638
--------------------------------------------------




xgb [FOLD-3 SEED-4506] MSE score: 0.23838
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-3 SEED-4506] MSE score: 0.23514
--------------------------------------------------
cat [FOLD-3 SEED-4506] MSE score: 0.22132
--------------------------------------------------




xgb [FOLD-4 SEED-4506] MSE score: 0.22563
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-4 SEED-4506] MSE score: 0.22039
--------------------------------------------------
cat [FOLD-4 SEED-4506] MSE score: 0.19979
--------------------------------------------------




xgb [FOLD-0 SEED-4012] MSE score: 0.22717
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-0 SEED-4012] MSE score: 0.22207
--------------------------------------------------
cat [FOLD-0 SEED-4012] MSE score: 0.20209
--------------------------------------------------




xgb [FOLD-1 SEED-4012] MSE score: 0.24960
--------------------------------------------------
Finished loading model, total used 99 iterations
lgbm [FOLD-1 SEED-4012] MSE score: 0.23860
--------------------------------------------------
cat [FOLD-1 SEED-4012] MSE score: 0.21252
--------------------------------------------------




xgb [FOLD-2 SEED-4012] MSE score: 0.23881
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-2 SEED-4012] MSE score: 0.22867
--------------------------------------------------
cat [FOLD-2 SEED-4012] MSE score: 0.20527
--------------------------------------------------




xgb [FOLD-3 SEED-4012] MSE score: 0.23335
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-3 SEED-4012] MSE score: 0.23277
--------------------------------------------------
cat [FOLD-3 SEED-4012] MSE score: 0.20695
--------------------------------------------------




xgb [FOLD-4 SEED-4012] MSE score: 0.20043
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-4 SEED-4012] MSE score: 0.18864
--------------------------------------------------
cat [FOLD-4 SEED-4012] MSE score: 0.16937
--------------------------------------------------




xgb [FOLD-0 SEED-3657] MSE score: 0.24186
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-0 SEED-3657] MSE score: 0.23968
--------------------------------------------------
cat [FOLD-0 SEED-3657] MSE score: 0.21903
--------------------------------------------------




xgb [FOLD-1 SEED-3657] MSE score: 0.23549
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-1 SEED-3657] MSE score: 0.22670
--------------------------------------------------
cat [FOLD-1 SEED-3657] MSE score: 0.20221
--------------------------------------------------




xgb [FOLD-2 SEED-3657] MSE score: 0.21868
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-2 SEED-3657] MSE score: 0.21451
--------------------------------------------------
cat [FOLD-2 SEED-3657] MSE score: 0.19335
--------------------------------------------------




xgb [FOLD-3 SEED-3657] MSE score: 0.20922
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-3 SEED-3657] MSE score: 0.20834
--------------------------------------------------
cat [FOLD-3 SEED-3657] MSE score: 0.18467
--------------------------------------------------




xgb [FOLD-4 SEED-3657] MSE score: 0.22124
--------------------------------------------------
Finished loading model, total used 100 iterations
lgbm [FOLD-4 SEED-3657] MSE score: 0.21177
--------------------------------------------------
cat [FOLD-4 SEED-3657] MSE score: 0.18905
--------------------------------------------------
CPU times: user 8min 25s, sys: 39.4 s, total: 9min 4s
Wall time: 6min 12s


In [25]:
trainX.shape

(17544, 8)

In [28]:
df.shape

(20640, 9)

In [36]:
# Calculate the mean score of the ensemble
#mean_score = np.mean(ensemble_score)
#std_score = np.std(ensemble_score)
#print(f'Mean Optuna Ensemble {mean_score:.5f} ± {std_score:.5f} \n')

print('--- Optuna Weights---')
mean_weights = np.mean(weights, axis=0)
std_weights = np.std(weights, axis=0)
for name, mean_weight, std_weight in zip(models.keys(), mean_weights, std_weights):
    print(f'{name}: {mean_weight:.5f} ± {std_weight:.5f}')

--- Optuna Weights---
xgb: 0.15833 ± 0.07208
lgbm: 0.04257 ± 0.05067
cat: 0.78652 ± 0.10304


In [44]:
%%time

stack_test_predss = np.zeros((testX.shape[0]))
stack_scores = []
stack_models = []
splitter = Splitter(kfold=kfold, n_splits=n_splits)
for i, (X_train_, X_val, y_train_, y_val) in enumerate(splitter.split_data(oof_each_predss, trainy.reset_index(drop=True), random_state_list=random_state_list)):
    n = i % n_splits
    m = i // n_splits
    classifier = Classifier(n_estimators, device, random_state_list[m])
    models = classifier.models
    model = models['xgb']
    model.fit(X_train_, y_train_, 
    eval_set=[(X_val, y_val)], 
    early_stopping_rounds=early_stopping_rounds,
    verbose=verbose)
    
    #train_w0, train_w1 = calc_log_loss_weight(y_train_)
    #valid_w0, valid_w1 = calc_log_loss_weight(y_val)
    '''
    if 'xgb' in one_model:
        model.fit(
        X_train_, y_train_, sample_weight=y_train_.map({0: train_w0, 1: train_w1}),
        eval_set=[(X_val, y_val)],
       # eval_metric='logloss',
        sample_weight_eval_set=[y_val.map({0: valid_w0, 1: valid_w1})],
        early_stopping_rounds=early_stopping_rounds,
        verbose=verbose)
    elif 'tab' in one_model:
        model.fit(X_train_, y_train_, overwrite_warning =True)
    '''
    #model.fit(X_train_, y_train_, overwrite_warning =True)
    
    test_pred = model.predict(test_each_predss)
    y_val_pred = model.predict(X_val)

    mse = mean_squared_error(y_val, y_val_pred)

    #score = balanced_log_loss(y_val, y_val_pred)
    #stack_scores.append(score)
    #stack_models.append(deepcopy(model))
    
    stack_test_predss += test_pred / (n_splits * len(random_state_list))

# Calculate the mean LogLoss score of the ensemble
#mean_score = np.mean(ensemble_score)
#std_score = np.std(ensemble_score)
#print(f'Ensemble BalancedLogLoss score {mean_score:.5f} ± {std_score:.5f}')
# Print the mean and standard deviation of the ensemble weights for each model
print('--- Model Weights ---')
mean_weights = np.mean(weights, axis=0)
std_weights = np.std(weights, axis=0)
for name, mean_weight, std_weight in zip(models.keys(), mean_weights, std_weights):
    print(f'{name}: {mean_weight:.5f} ± {std_weight:.5f}')
print('')

# Calculate the mean LogLoss score of the ensemble
#mean_score = np.mean(stack_scores)
#std_score = np.std(stack_scores)
#print(f'Stacking BalancedLogLoss score {mean_score:.5f} ± {std_score:.5f}\n')

--- Model Weights ---
xgb: 0.15833 ± 0.07208
lgbm: 0.04257 ± 0.05067
cat: 0.78652 ± 0.10304

CPU times: user 4min 1s, sys: 7.1 s, total: 4min 9s
Wall time: 1min 50s


In [39]:
stack_test_predss

array([0.57150565, 0.86836469, 4.92260692, ..., 1.89486144, 1.99892141,
       1.51265009])

In [42]:
mean_squared_error(pd.Series(testy), stack_test_predss)

0.1942095818325349

In [41]:
pd.DataFrame({'predy':stack_test_predss, 'testy':pd.Series(testy)}).head(20)

Unnamed: 0,predy,testy
20046,0.571506,0.477
3024,0.868365,0.458
15663,4.922607,5.00001
20484,2.414125,2.186
9814,2.413817,2.78
13311,1.616256,1.587
7113,2.23878,1.982
7668,1.612635,1.575
18246,2.967142,3.4
5723,4.865237,4.466


In [45]:
mean_squared_error(pd.Series(testy), stack_test_predss)

0.19404162894645924

In [48]:
pd.DataFrame({'predy':stack_test_predss, 'testy':pd.Series(testy)}).sort_index().head(20)

Unnamed: 0,predy,testy
3,2.981289,3.413
6,2.345647,2.992
17,1.626399,1.555
31,1.236008,1.152
34,1.858446,1.097
35,1.369074,0.972
41,1.657917,1.5
42,1.255657,1.188
46,1.720502,1.425
57,0.785316,0.853
