In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
import numpy as np

In [None]:
# Single random seed reused by the train/validation split, the seeded estimators
# and the ensembling steps in this notebook.
seed = 60

# Data loading

In [None]:
# Load the Boston housing data as plain (features, target) arrays.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release to run.
X, y = load_boston(return_X_y=True)

# Hold out 5% of the rows as X_val / y_val; everything else is used for training and CV.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=seed,
)

# Baseline

In [None]:
%%time
models_dic = {
    'random forest': RandomForestRegressor(n_estimators=50, random_state=seed),
    'linear regression': LinearRegression(normalize=True),
    'knn': KNeighborsRegressor(),
    'catboost': CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')
}

for name, model in models_dic.items():
    kfold = KFold(n_splits=10, random_state=seed)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
    print(f'{name} : {-np.mean(cv_results):.2f}')

# heamy

In [None]:
from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

### Stacking

In [None]:
%%time
# datasetを準備
dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる

# アンサンブルに使うモデルを定義
models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')
]

# pipelineを定義、2nd levelデータセットの作成
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.stack(k=10, seed=seed)

# modelを作ってvalidation
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

# 精度出力
cv_results = []
for y_true, y_pred in zip(y_trues, y_preds):
    cv_result = mean_absolute_error(y_true, y_pred)
    cv_results.append(cv_result)
print(f'stacking: {np.mean(cv_results):.2f}')

# X_testを使ってpredict
y_pred = stacker.predict()

### Blending

In [None]:
%%time
# datasetを準備
dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる

# アンサンブルに使うモデルを定義
models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')
]

# pipelineを定義、2nd levelデータセットの作成
pipeline = ModelsPipeline(*models)
stack_ds = pipeline.blend(proportion=0.2, seed=seed)

# modelを作ってvalidation
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
y_trues, y_preds = stacker.validate(k=10)

# 精度出力
cv_results = []
for y_true, y_pred in zip(y_trues, y_preds):
    cv_result = mean_absolute_error(y_true, y_pred)
    cv_results.append(cv_result)
print(f'blending: {np.mean(cv_results):.2f}')

# X_testを使ってpredict
y_pred = stacker.predict()

### Weighted Average

In [None]:
%%time
# datasetを準備
dataset = Dataset(X_train, y_train, X_val) # X_testは今回使わないが入れないとエラーになる

# アンサンブルに使うモデルを定義
models = [
    Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50, 'random_state': seed}, name='rf'),
    Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr'),
    Regressor(dataset=dataset, estimator=KNeighborsRegressor, name='kr'),
    Regressor(dataset=dataset, estimator=CatBoostRegressor, parameters={'custom_metric': ['MAE'], 'random_seed': seed, 'logging_level': 'Silent'}, name='cr')
]

# pipelineを定義
pipeline = ModelsPipeline(*models)

# 最適な重みを探索
weights = pipeline.find_weights(mean_absolute_error)
pipeline_apply = pipeline.weight(weights)

# 精度を出力
cv_results = pipeline_apply.validate(scorer=mean_absolute_error, k=10)
print(f'weighted average: {np.mean(cv_results):.2f}')

# X_testを使ってpredict
y_pred = pipeline_apply.execute()

# pystacknet

In [None]:
%%time
from pystacknet.pystacknet import StackNetRegressor

# アンサンブルに使うモデルを定義
models=[ 
    # 1st level
    [
        RandomForestRegressor(n_estimators=50, random_state=seed),
        LinearRegression(normalize=True),
        CatBoostRegressor(custom_metric=['MAE'], random_seed=seed, logging_level='Silent')     
    ],
    # 2nd level
    [
        Ridge(normalize=True),
        ExtraTreesRegressor(random_state=seed),
        XGBRegressor(random_state=seed)
    ],
    [
        LinearRegression(normalize=True)
    ]
]

# StackNetモデルを作成
model = StackNetRegressor(
    models, folds=10,
    restacking=False, use_retraining=True,
    random_state=seed, n_jobs=-1
)

# 精度出力
kfold = KFold(n_splits=10, random_state=seed)
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring="neg_mean_absolute_error")
print(name, ': ', -np.mean(cv_results))

In [None]:
import logging

import matplotlib
import matplotlib.pyplot as plt

# Default figure size for subsequent plots.
matplotlib.rcParams["figure.figsize"] = (16, 4)
# Font family for all text; presumably chosen so the Japanese labels in the chart render.
plt.rcParams["font.family"] = "IPAexGothic"

# Emit log records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [None]:
# Summary chart: run time versus error for each approach.
# The numbers are hard-coded; presumably copied from the %%time output and printed
# scores of the cells above (TODO confirm when re-running).
# Named explicitly instead of `x` / `y` so the dataset target `y` loaded earlier
# is not overwritten by this cell.
elapsed_seconds = [49.5, 54.1, 5.37, 64, 691]
mae_scores = [2.26, 2.14, 3.04, 2.16, 2.20]
labels = ['単純比較', 'Stacking', 'Blending', 'Weighted Average', 'Stacknet']

fig, ax = plt.subplots(figsize=(16, 8))
ax.scatter(elapsed_seconds, mae_scores, s=600, c="pink", alpha=0.5)

for label, seconds, mae in zip(labels, elapsed_seconds, mae_scores):
    if label == 'Stacking':
        # Drawn below its point and in a larger font, unlike the other labels
        # (presumably to keep it clear of the nearby 'Weighted Average' label).
        ax.annotate(label, (seconds, mae - .04), fontsize=20)
    else:
        ax.annotate(label, (seconds, mae + .01), fontsize=15)

# The plotted metric is mean absolute error (was mislabelled "Absolute mean error").
ax.set_ylabel('Mean absolute error', fontsize=20)
ax.set_xlabel('秒数', fontsize=20)
# Log scale because the timings span from about 5 s to about 700 s.
ax.set_xscale('log')