In [None]:
"""
[V1]
* Stack model for XGBoost

Reference:
https://www.kaggle.com/code/act18l/stacked-model-extratrees-adaboost
"""

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from warnings import filterwarnings
filterwarnings(action='ignore')

import matplotlib.pyplot as plt 
%matplotlib inline

In [3]:
from sklearn.datasets import fetch_california_housing

# Fetch the California housing data and assemble a single frame holding the
# feature columns followed by the regression target.
housing = fetch_california_housing()
df = pd.DataFrame(housing['data'], columns=housing['feature_names']).assign(
    target=housing['target']
)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [4]:
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor

# Single-step pipeline wrapping a LightGBM regressor.
# NOTE: `pipeline` is rebound by the stacked pipeline built in a later cell.
lgb = LGBMRegressor(verbose=1)
steps = [('lgb', lgb)]
pipeline = Pipeline(steps=steps)

In [8]:
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# Two stacking stages followed by a final ExtraTrees regressor.
# NOTE(review): going by its name, StackingEstimator presumably forwards each
# stage's predictions to the next step as an extra feature -- confirm against
# the TPOT documentation.
stacking_stages = [
    StackingEstimator(
        estimator=ExtraTreesRegressor(
            bootstrap=False,
            max_features=0.8,
            min_samples_leaf=5,
            min_samples_split=3,
            n_estimators=100,
        )
    ),
    StackingEstimator(estimator=XGBRegressor()),
    # Disabled experiments:
    # StackingEstimator(estimator=AdaBoostRegressor(learning_rate=1.0, loss="linear", n_estimators=100)),
    # StackingEstimator(estimator=AdaBoostRegressor(learning_rate=0.001, loss="square", n_estimators=100)),
    # PCA(iterated_power=8, svd_solver="randomized"),
]
final_regressor = ExtraTreesRegressor(
    bootstrap=False,
    max_features=0.3,
    min_samples_leaf=4,
    min_samples_split=19,
    n_estimators=100,
)
pipeline = make_pipeline(*stacking_stages, final_regressor)

In [9]:
from sklearn.model_selection import train_test_split

# 80% of the rows go to training; the remainder becomes the test split.
# The target is selected with a list so it stays a one-column DataFrame,
# which the 2-D `.iloc[rows, :]` indexing in the cross-validation loop needs.
features = df.drop(columns=['target'])
target = df[['target']]
trainX, testX, trainy, testy = train_test_split(
    features, target, train_size=0.8, random_state=0
)

In [10]:
from copy import deepcopy

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

n_splits = 5
kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)

# Validation MSE of every fold, in fold order.
fold_losses = []
# Test-set predictions averaged over the per-fold fits. Despite the name these
# are not out-of-fold predictions; the name is kept because a later cell reads it.
oof_each_predss = np.zeros(len(testX))
# Snapshot of the pipeline from the fold with the lowest validation MSE.
best_model = None
# Lower MSE is better, so start from +inf. The builtin float is used because
# the `np.float` alias no longer exists in current NumPy releases.
best_error = float('inf')
for fold, (train_index, val_index) in enumerate(kf.split(trainX, trainy)):
    X_fold_train, X_fold_val = trainX.iloc[train_index, :], trainX.iloc[val_index, :]
    y_fold_train, y_fold_val = trainy.iloc[train_index, :], trainy.iloc[val_index, :]

    # `pipeline` is refit in place on every fold, as before.
    pipeline.fit(X_fold_train, y_fold_train)
    y_val_pred = pipeline.predict(X_fold_val)
    mse = mean_squared_error(y_fold_val, y_val_pred)
    fold_losses.append(mse)
    print(f"Fold {fold+1}: MSE = {mse}")
    if mse < best_error:
        best_error = mse
        # Deep-copy the fitted pipeline: storing a bare reference would be
        # overwritten by the next fold's fit.
        best_model = deepcopy(pipeline)

    # Each fold's fit contributes an equal share of the final test prediction.
    oof_each_predss += pipeline.predict(testX) / n_splits

Fold 1: MSE = 0.23547452406863237
Fold 2: MSE = 0.2667457052720998
Fold 3: MSE = 0.24718668014165326
Fold 4: MSE = 0.22771541525286507
Fold 5: MSE = 0.25582173613498366


In [11]:
# Test-set MSE of the predictions averaged over the cross-validation folds.
mean_squared_error(y_true=testy, y_pred=oof_each_predss)

0.23009559635279803

In [12]:
# Baseline for comparison: a default XGBRegressor fit on the whole training
# split and scored on the same held-out test split.
model = XGBRegressor()
model.fit(trainX, trainy)
baseline_test_pred = model.predict(testX)
mean_squared_error(y_true=testy, y_pred=baseline_test_pred)

0.21379308523299043