In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import numpy as np

In [2]:
# Single random seed passed to the train/validation split, the CV splitters
# and every model in this notebook that accepts one.
seed: int = 42

# Data loading

In [3]:
# Load the Boston housing data as a (features, target) pair.
# NOTE(review): load_boston is deprecated in newer scikit-learn releases and
# later removed -- confirm the pinned scikit-learn version still ships it.
X, y = load_boston(return_X_y=True)

# Hold out 5% of the rows as a validation split; the remaining 95% is the
# training split used by every cross-validation run below.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=seed,
)

# Baseline

In [4]:
%%time
models_dic = {
    'random forest': RandomForestRegressor(n_estimators=50, random_state=seed),
    'linear regression': LinearRegression(normalize=True),
    'knn': KNeighborsRegressor(),
    'catboost': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')
}

for name, model in models_dic.items():
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
    print(f'{name} : {-np.mean(cv_results):.2f}')

random forest : 2.23
linear regression : 3.47
knn : 4.45




catboost : 2.20
CPU times: user 1min 43s, sys: 16.9 s, total: 2min
Wall time: 46.8 s


# heamy

In [5]:
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

### Stacking

In [6]:
%%time
# datasetを準備
dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる

# アンサンブルに使うモデルを定義
models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')
]

# pipelineを定義
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)

# modelを作ってvalidation
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

# 精度出力
cv_results = []
for y_true, y_pred in zip(y_trues, y_preds):
    cv_result = mean_absolute_error(y_true, y_pred)
    cv_results.append(cv_result)
print(f'stacking: {-np.mean(cv_results):.2f}')

stacking: -2.16
CPU times: user 34.1 ms, sys: 6.63 ms, total: 40.7 ms
Wall time: 114 ms


### Blending

In [7]:
%%time
# datasetを準備
dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる

# アンサンブルに使うモデルを定義
models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')
]

# pipelineを定義
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.blend(proportion=0.2, seed=seed)

# modelを作ってvalidation
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

# 精度出力
cv_results = []
for y_true, y_pred in zip(y_trues, y_preds):
    cv_result = mean_absolute_error(y_true, y_pred)
    cv_results.append(cv_result)
print(f'blending: {-np.mean(cv_results):.2f}')

blending: -3.04
CPU times: user 30.5 ms, sys: 4.41 ms, total: 34.9 ms
Wall time: 60 ms


### Weighted Average

In [8]:
%%time
# datasetを準備
dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる

# アンサンブルに使うモデルを定義
models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')
]

# pipelineを定義
pipeline = ModelsPipeline(*models)

# 最適な重みを探索
weights = pipeline.find_weights(mean_absolute_error)
pipeline_apply = pipeline.weight(weights)

# 精度を出力
cv_results = pipeline_apply.validate(scorer=mean_absolute_error, k=10)
print(f'weighted averag: {-np.mean(cv_results):.2f}')

Best Score (mean_absolute_error): 1.9893582436999917
Best Weights: [2.18887575e-01 1.94297608e-17 7.96579612e-18 7.81112425e-01]
Metric: mean_absolute_error
Folds accuracy: [1.978630110432124, 2.137259891149993, 2.145816457861829, 1.4496619629760552, 2.3527743239776573, 2.910698591290202, 2.0799869355876592, 1.74754010963542, 3.114361238261376, 2.3891019916145555]
Mean accuracy: 2.2305831612786875
Standard Deviation: 0.47211251675696475
Variance: 0.22289022847859533
weighted averag: -2.23
CPU times: user 50.4 ms, sys: 9.36 ms, total: 59.7 ms
Wall time: 94.9 ms


# pystacknet

In [9]:
from pystacknet.pystacknet import StackNetRegressor

In [10]:
%%time
# アンサンブルに使うモデルを定義
models=[ 
    # 1st level
    [
        RandomForestRegressor(n_estimators=50, random_state=seed),
        LinearRegression(normalize=True),
        CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')     
    ],
    # 2nd level
    [
        Ridge(normalize=True),
        ExtraTreesRegressor(random_state=seed),
        XGBRegressor(random_state=seed)
    ],
    [
        LinearRegression(normalize=True)
    ]
]

# StackNetモデルを作成
model = StackNetRegressor(
    models, folds=10,
    restacking=False, use_retraining=True,
    random_state=seed, n_jobs=-1
)

# 精度出力
kfold = KFold(n_splits=10, random_state=seed)
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
print(name, ': ', -np.mean(cv_results))

catboost :  2.2010970199601947
CPU times: user 30.3 s, sys: 18.8 s, total: 49.1 s
Wall time: 11min 31s
