In [65]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.display import Image

import os, sys, re, datetime, time
from pathlib import Path

pj_dir = Path(os.getcwd()).parents[0]
data_dir = pj_dir/'data'
img_dir = pj_dir/'images'
src_dir = pj_dir/'src'
sys.path.append(str(src_dir))

from matplotlib import pyplot as plt
import japanize_matplotlib
import seaborn as sns
plt.style.use("bmh")
import numpy as np
import pandas as pd
import dask.dataframe as dd

from tqdm import tqdm_notebook
from dotenv import load_dotenv

In [15]:
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [47]:
from catboost import CatBoostClassifier, CatBoostRegressor, Pool, cv
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

In [1]:
from sklearn.model_selection import train_test_split
from catboost.datasets import titanic
import numpy as np

In [42]:
from sklearn.datasets import load_boston

# From tutorial

# Stacking

In [9]:
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf,model_lr)
stack_ds = pipeline.stack(k=10,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
results = stacker.validate(k=10,scorer=mean_absolute_error)

Metric: mean_absolute_error
Folds accuracy: [2.6547554270930753, 1.530020684497064, 1.9954728439069227, 2.038517331082455, 2.590018535101311, 2.7001268670192555, 1.6441529920970588, 2.5659198546681714, 2.472405404399628, 2.394314223788695]
Mean accuracy: 2.2585704163653633
Standard Deviation: 0.4057517903676271
Variance: 0.16463451538653479


## Blending

In [10]:
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train,y_train,X_test)

# initialize RandomForest & LinearRegression
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')

# Stack two models
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf,model_lr)
stack_ds = pipeline.blend(proportion=0.2,seed=111)

# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
# Validate results using 10 fold cross-validation
results = stacker.validate(k=10,scorer=mean_absolute_error)

Metric: mean_absolute_error
Folds accuracy: [2.592903546519196, 1.3830221732582768, 1.1235856379677305, 3.8563833889182817, 1.925512881703217, 1.2012079643334295, 3.5078089382585635, 3.455189841415516, 1.2933569204511624, 3.2398309632533264]
Mean accuracy: 2.35788022560787
Standard Deviation: 1.0361893017777448
Variance: 1.0736882691186505


## Weight average

In [14]:
# load boston dataset from sklearn
from sklearn.datasets import load_boston
data = load_boston()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=111)

# create dataset
dataset = Dataset(X_train,y_train,X_test)

model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 151},name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True},name='lr')
model_knn = Regressor(dataset=dataset, estimator=KNeighborsRegressor, parameters={'n_neighbors': 15},name='knn')

pipeline = ModelsPipeline(model_rf,model_lr,model_knn)

weights = pipeline.find_weights(mean_absolute_error)
result = pipeline.weight(weights)

Best Score (mean_absolute_error): 2.0970365230474086
Best Weights: [8.99402665e-01 1.00597335e-01 1.31295661e-16]


# Own

In [3]:
seed = 42

### Load Data

In [24]:
import pandas as pd
from sklearn.datasets import load_wine

In [121]:
X, y = load_boston(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.05, random_state=seed)

### some models

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

In [59]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [125]:
models_dic = {
    'rf': RandomForestRegressor(n_estimators=50, random_state=seed),
    'lr': LinearRegression(normalize=True),
    'kr': KNeighborsRegressor(),
    'cr': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')
}

In [126]:
for name, model in models_dic.items():
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
    print(name, ': ', - np.mean(cv_results))

rf :  2.2253083333333334
lr :  3.4667269295247807
kr :  4.448708333333333
cr :  2.195290136923679


### Stacking 

In [127]:
dataset = Dataset(X_train, y_train, X_test)

models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')
]

pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)

stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

cv_results = []
for y_true, y_pred in zip(y_trues, y_preds):
    cv_result = mean_absolute_error(y_true, y_pred)
    cv_results.append(cv_result)
print('stacking: ', np.mean(cv_results))

stacking:  2.1563606299364295


### Blending

In [128]:
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.blend(proportion=0.2, seed=seed)

stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

cv_results = []
for y_true, y_pred in zip(y_trues, y_preds):
    cv_result = mean_absolute_error(y_true, y_pred)
    cv_results.append(cv_result)
print('blending: ', np.mean(cv_results))

stacking:  3.0442182736964476


### Weighted Average

In [134]:
pipeline = ModelsPipeline(*models)
weights = pipeline.find_weights(mean_absolute_error)
pipeline_apply = pipeline.weight(weights)
cv_results = pipeline_apply.validate(scorer=mean_absolute_error, k=10)
print('weighted average: ', np.mean(cv_results))

Best Score (mean_absolute_error): 1.9893582436999917
Best Weights: [2.18887575e-01 1.94297608e-17 7.96579612e-18 7.81112425e-01]
Metric: mean_absolute_error
Folds accuracy: [1.978630110432124, 2.137259891149993, 2.145816457861829, 1.4496619629760552, 2.3527743239776573, 2.910698591290202, 2.0799869355876592, 1.74754010963542, 3.114361238261376, 2.3891019916145555]
Mean accuracy: 2.2305831612786875
Standard Deviation: 0.47211251675696475
Variance: 0.22289022847859533
weighted average:  2.2305831612786875


# pystacknet

In [136]:
from pystacknet.pystacknet import StackNetRegressor

In [142]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

In [144]:
models=[ 
    # 1st level
    [
        RandomForestRegressor(n_estimators=50, random_state=seed),
        LinearRegression(normalize=True),
        CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')     
    ],
    # 2nd level
    [
        Ridge(normalize=True),
        ExtraTreesRegressor(random_state=seed),
        XGBRegressor(random_state=seed)
    ],
    [
        LinearRegression(normalize=True)
    ]
]

In [146]:
model = StackNetRegressor(
    models, folds=10,
    restacking=False, use_retraining=True,
    random_state=seed, n_jobs=-1, verbose=1
)
kfold = KFold(n_splits=10, random_state=seed)
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
print(name, ': ', -np.mean(cv_results))

Input Dimensionality 13 at Level 0 
3 models included in Level 0 
Fold 1/10 , model 0 , custom===4.279341 
Fold 1/10 , model 1 , custom===6.597792 
Fold 1/10 , model 2 , custom===4.060826 
Fold 2/10 , model 0 , custom===3.346264 
Fold 2/10 , model 1 , custom===5.585476 
Fold 2/10 , model 2 , custom===3.643938 
Fold 3/10 , model 0 , custom===2.557439 
Fold 3/10 , model 1 , custom===3.693772 
Fold 3/10 , model 2 , custom===3.141906 
Fold 4/10 , model 0 , custom===3.528272 
Fold 4/10 , model 1 , custom===4.962491 
Fold 4/10 , model 2 , custom===3.101538 
Fold 5/10 , model 0 , custom===4.177779 
Fold 5/10 , model 1 , custom===5.794933 
Fold 5/10 , model 2 , custom===3.672188 
Fold 6/10 , model 0 , custom===4.033046 
Fold 6/10 , model 1 , custom===6.208602 
Fold 6/10 , model 2 , custom===3.420754 
Fold 7/10 , model 0 , custom===3.373084 
Fold 7/10 , model 1 , custom===4.740010 
Fold 7/10 , model 2 , custom===4.007555 
Fold 8/10 , model 0 , custom===3.706942 
Fold 8/10 , model 1 , custom===3

Fold 5/10 , model 0 , custom===3.734073 
Fold 5/10 , model 1 , custom===4.351380 
Fold 5/10 , model 2 , custom===4.588915 
Fold 6/10 , model 0 , custom===5.819524 
Fold 6/10 , model 1 , custom===5.105008 
Fold 6/10 , model 2 , custom===5.193358 
Fold 7/10 , model 0 , custom===5.123734 
Fold 7/10 , model 1 , custom===4.705629 
Fold 7/10 , model 2 , custom===5.005370 
Fold 8/10 , model 0 , custom===3.415053 
Fold 8/10 , model 1 , custom===2.968655 
Fold 8/10 , model 2 , custom===2.831130 
Fold 9/10 , model 0 , custom===2.955297 
Fold 9/10 , model 1 , custom===3.273024 
Fold 9/10 , model 2 , custom===2.837420 
Fold 10/10 , model 0 , custom===4.185284 
Fold 10/10 , model 1 , custom===4.080353 
Fold 10/10 , model 2 , custom===3.419387 
Output dimensionality of level 1 is 3 
 level 1 lasted 4.106238 seconds 
Input Dimensionality 3 at Level 2 
1 models included in Level 2 
Fold 1/10 , model 0 , custom===4.746332 
Fold 2/10 , model 0 , custom===3.020624 
Fold 3/10 , model 0 , custom===2.637416

Input Dimensionality 13 at Level 0 
3 models included in Level 0 
Fold 1/10 , model 0 , custom===4.124555 
Fold 1/10 , model 1 , custom===7.705135 
Fold 1/10 , model 2 , custom===5.666679 
Fold 2/10 , model 0 , custom===3.013523 
Fold 2/10 , model 1 , custom===3.858647 
Fold 2/10 , model 2 , custom===2.724388 
Fold 3/10 , model 0 , custom===2.479396 
Fold 3/10 , model 1 , custom===3.743807 
Fold 3/10 , model 2 , custom===2.801582 
Fold 4/10 , model 0 , custom===3.422669 
Fold 4/10 , model 1 , custom===3.818408 
Fold 4/10 , model 2 , custom===2.265144 
Fold 5/10 , model 0 , custom===5.016259 
Fold 5/10 , model 1 , custom===5.801329 
Fold 5/10 , model 2 , custom===3.204026 
Fold 6/10 , model 0 , custom===3.422615 
Fold 6/10 , model 1 , custom===6.279948 
Fold 6/10 , model 2 , custom===4.561954 
Fold 7/10 , model 0 , custom===3.676677 
Fold 7/10 , model 1 , custom===4.764594 
Fold 7/10 , model 2 , custom===4.954520 
Fold 8/10 , model 0 , custom===3.239308 
Fold 8/10 , model 1 , custom===4

Fold 5/10 , model 0 , custom===4.116713 
Fold 5/10 , model 1 , custom===3.633432 
Fold 5/10 , model 2 , custom===3.552511 
Fold 6/10 , model 0 , custom===4.818413 
Fold 6/10 , model 1 , custom===4.576131 
Fold 6/10 , model 2 , custom===3.623979 
Fold 7/10 , model 0 , custom===4.999775 
Fold 7/10 , model 1 , custom===3.588170 
Fold 7/10 , model 2 , custom===3.898403 
Fold 8/10 , model 0 , custom===4.562606 
Fold 8/10 , model 1 , custom===3.994746 
Fold 8/10 , model 2 , custom===3.966182 
Fold 9/10 , model 0 , custom===3.050604 
Fold 9/10 , model 1 , custom===3.254689 
Fold 9/10 , model 2 , custom===2.801138 
Fold 10/10 , model 0 , custom===4.377358 
Fold 10/10 , model 1 , custom===3.253777 
Fold 10/10 , model 2 , custom===3.629442 
Output dimensionality of level 1 is 3 
 level 1 lasted 4.354584 seconds 
Input Dimensionality 3 at Level 2 
1 models included in Level 2 
Fold 1/10 , model 0 , custom===3.551156 
Fold 2/10 , model 0 , custom===2.796484 
Fold 3/10 , model 0 , custom===2.570291

Input Dimensionality 13 at Level 0 
3 models included in Level 0 
Fold 1/10 , model 0 , custom===2.595892 
Fold 1/10 , model 1 , custom===5.560950 
Fold 1/10 , model 2 , custom===3.497438 
Fold 2/10 , model 0 , custom===2.597480 
Fold 2/10 , model 1 , custom===3.960238 
Fold 2/10 , model 2 , custom===2.583975 
Fold 3/10 , model 0 , custom===4.978665 
Fold 3/10 , model 1 , custom===4.656291 
Fold 3/10 , model 2 , custom===3.046633 
Fold 4/10 , model 0 , custom===2.959359 
Fold 4/10 , model 1 , custom===4.828208 
Fold 4/10 , model 2 , custom===2.908432 
Fold 5/10 , model 0 , custom===4.601909 
Fold 5/10 , model 1 , custom===6.596757 
Fold 5/10 , model 2 , custom===3.910417 
Fold 6/10 , model 0 , custom===3.393147 
Fold 6/10 , model 1 , custom===5.727082 
Fold 6/10 , model 2 , custom===3.856140 
Fold 7/10 , model 0 , custom===2.159628 
Fold 7/10 , model 1 , custom===3.717978 
Fold 7/10 , model 2 , custom===2.850319 
Fold 8/10 , model 0 , custom===2.637633 
Fold 8/10 , model 1 , custom===4

Fold 5/10 , model 0 , custom===5.785021 
Fold 5/10 , model 1 , custom===4.326220 
Fold 5/10 , model 2 , custom===4.096095 
Fold 6/10 , model 0 , custom===4.890062 
Fold 6/10 , model 1 , custom===3.934046 
Fold 6/10 , model 2 , custom===3.960173 
Fold 7/10 , model 0 , custom===2.924537 
Fold 7/10 , model 1 , custom===3.117777 
Fold 7/10 , model 2 , custom===2.557212 
Fold 8/10 , model 0 , custom===3.337310 
Fold 8/10 , model 1 , custom===2.282507 
Fold 8/10 , model 2 , custom===2.115388 
Fold 9/10 , model 0 , custom===3.944140 
Fold 9/10 , model 1 , custom===3.487633 
Fold 9/10 , model 2 , custom===3.177442 
Fold 10/10 , model 0 , custom===4.135136 
Fold 10/10 , model 1 , custom===3.414205 
Fold 10/10 , model 2 , custom===3.085677 
Output dimensionality of level 1 is 3 
 level 1 lasted 4.274445 seconds 
Input Dimensionality 3 at Level 2 
1 models included in Level 2 
Fold 1/10 , model 0 , custom===3.611385 
Fold 2/10 , model 0 , custom===3.188989 
Fold 3/10 , model 0 , custom===4.521355

Input Dimensionality 13 at Level 0 
3 models included in Level 0 
Fold 1/10 , model 0 , custom===4.715888 
Fold 1/10 , model 1 , custom===5.293446 
Fold 1/10 , model 2 , custom===4.534604 
Fold 2/10 , model 0 , custom===2.684008 
Fold 2/10 , model 1 , custom===2.771786 
Fold 2/10 , model 2 , custom===2.723575 
Fold 3/10 , model 0 , custom===4.523356 
Fold 3/10 , model 1 , custom===4.296880 
Fold 3/10 , model 2 , custom===2.615705 
Fold 4/10 , model 0 , custom===2.969597 
Fold 4/10 , model 1 , custom===4.340736 
Fold 4/10 , model 2 , custom===3.352516 
Fold 5/10 , model 0 , custom===4.778472 
Fold 5/10 , model 1 , custom===7.032370 
Fold 5/10 , model 2 , custom===5.024373 
Fold 6/10 , model 0 , custom===2.909357 
Fold 6/10 , model 1 , custom===4.639648 
Fold 6/10 , model 2 , custom===2.899871 
Fold 7/10 , model 0 , custom===2.627274 
Fold 7/10 , model 1 , custom===3.949347 
Fold 7/10 , model 2 , custom===2.882348 
Fold 8/10 , model 0 , custom===2.604371 
Fold 8/10 , model 1 , custom===5