### Referência
[Aumentando o Poder Preditivo de Seus Modelos de Machine Learning com Stacking Ensembles](http://mariofilho.com/tutorial-aumentando-o-poder-preditivo-de-seus-modelos-de-machine-learning-com-stacking-ensembles/)

In [11]:
import numpy as np
import pandas as pd

### Carregando os dados

In [34]:
# Load the raw training table, using the 'Id' column as the row index.
train = pd.read_csv("data/ames_raw.csv", index_col='Id')

# Split into the feature matrix and an independent copy of the target column.
y = train['SalePrice'].copy()
X = train.drop(columns='SalePrice')

# Preview: first five rows, first five columns.
train.iloc[:5, :5]

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,60,RL,65.0,8450,Pave
2,20,RL,80.0,9600,Pave
3,60,RL,68.0,11250,Pave
4,70,RL,60.0,9550,Pave
5,60,RL,84.0,14260,Pave


### Funções auxiliares

In [41]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.ensemble import RandomForestRegressor

def rmsle(estimator, X, y):
    """Root mean squared logarithmic error of ``estimator`` on ``(X, y)``.

    Uses the ``(estimator, X, y)`` call signature, so it can be passed
    directly as ``scoring=`` to ``cross_val_score``.

    Parameters
    ----------
    estimator : fitted object exposing ``predict``.
    X : features passed to ``estimator.predict``.
    y : true target values on the original (untransformed) scale.

    Returns
    -------
    Square root of the mean squared error between ``log1p(y)`` and
    ``log1p(predictions)``. This is an error: lower is better.
    """
    predicted = estimator.predict(X)
    log_true = np.log1p(y)
    log_predicted = np.log1p(predicted)
    return np.sqrt(mean_squared_error(log_true, log_predicted))

def rmsle_log_y(estimator, X, y):
    """RMSE between ``y`` and the predictions, with no transformation applied.

    In this notebook it is called with a target that was already passed
    through ``np.log1p``, in which case the plain RMSE computed here is the
    RMSLE on the original scale.

    Parameters
    ----------
    estimator : fitted object exposing ``predict``.
    X : features passed to ``estimator.predict``.
    y : target values, compared to the predictions as given.

    Returns
    -------
    Square root of the mean squared error between ``y`` and the predictions.
    """
    predicted = estimator.predict(X)
    squared_error = mean_squared_error(y, predicted)
    return np.sqrt(squared_error)

def rmsle_sqrt_y(estimator, X, y):
    """RMSLE after squaring both the target and the predictions.

    In this notebook it is called with a target that was passed through
    ``np.sqrt``; squaring undoes that transformation before the logarithmic
    error is computed.

    Parameters
    ----------
    estimator : fitted object exposing ``predict``.
    X : features passed to ``estimator.predict``.
    y : target values; squared before scoring.

    Returns
    -------
    Square root of the mean squared error between ``log1p(y ** 2)`` and
    ``log1p(predictions ** 2)``.
    """
    predicted = estimator.predict(X)
    # Square both sides before taking logs.
    y_squared = np.power(y, 2)
    predicted_squared = np.power(predicted, 2)
    log_true = np.log1p(y_squared)
    log_predicted = np.log1p(predicted_squared)
    return np.sqrt(mean_squared_error(log_true, log_predicted))

# Shared 5-fold splitter; shuffling with a fixed seed means every
# cross_val_score call that receives `kf` evaluates on the same folds.
kf = KFold(random_state=1, shuffle=True, n_splits=5)

### Feature set 1: variáveis "numéricas"

In [45]:
# Feature set 1: keep only the numeric columns and fill missing values with -1.
numeric_features = X.select_dtypes(include=[np.number])
X1 = numeric_features.fillna(-1)
print('Dims', X1.shape)

# Cross-validated RMSLE of a random forest on the raw target.
model = RandomForestRegressor(random_state=0, n_estimators=1000)
fold_scores = cross_val_score(model, X1, y, scoring=rmsle, cv=kf, n_jobs=-1)
error = fold_scores.mean()
print(f"RMSLE: {error}")

Dims (1460, 36)
RMSLE: 0.14582352617986846


### Feature set 2: Ordinal Encoding

In [49]:
from sklearn.preprocessing import LabelEncoder

X2 = X.copy()

# Feature set 2: replace every object-typed column with integer codes.
# Missing entries are first mapped to the literal category 'Missing'.
object_columns = [col for col in X2.columns if X2[col].dtype == object]
for col in object_columns:
    enc = LabelEncoder()
    filled = X[col].fillna('Missing')
    X2[col] = enc.fit_transform(filled)

print('Dims', X2.shape)

# Any remaining missing values are filled with -1.
X2 = X2.fillna(-1)

model = RandomForestRegressor(random_state=0, n_estimators=1000)
fold_scores = cross_val_score(model, X2, y, scoring=rmsle, cv=kf, n_jobs=-1)
error = fold_scores.mean()
print(f"RMSLE: {error}")

Dims (1460, 79)
RMSLE: 0.14383736485915208


### Transformações do Target

#### Log


In [53]:
# Log target: fit on log1p(y) and score with plain RMSE on that scale.
y_log = np.log1p(y)

model = RandomForestRegressor(random_state=0, n_estimators=1000)
fold_scores = cross_val_score(model, X1, y_log, scoring=rmsle_log_y, cv=kf, n_jobs=-1)
error = fold_scores.mean()
print('RF, X1, log-target RMSLE:', error)

model = RandomForestRegressor(random_state=0, n_estimators=1000)
fold_scores = cross_val_score(model, X2, y_log, scoring=rmsle_log_y, cv=kf, n_jobs=-1)
error = fold_scores.mean()
print('RF, X2, log-target RMSLE:', error)

RF, X1, log-target RMSLE: 0.14516026930052636
RF, X2, log-target RMSLE: 0.14209480789088058


#### Raiz Quadrada

In [54]:
# Square-root target: fit on sqrt(y); the scorer squares target and
# predictions before computing the RMSLE.
#
# Create the estimator in this cell instead of reusing whatever `model` an
# earlier cell left in the kernel, so the cell does not depend on hidden state.
model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X1, np.sqrt(y), cv=kf, scoring=rmsle_sqrt_y, n_jobs=-1).mean()
print('RF, X1, sqrt-target RMSLE:', error)

model = RandomForestRegressor(n_estimators=1000, random_state=0)
error = cross_val_score(model, X2, np.sqrt(y), cv=kf, scoring=rmsle_sqrt_y, n_jobs=-1).mean()
print('RF, X2, sqrt-target RMSLE:', error)

RF, X1, sqrt-target RMSLE: 0.14565293448427205
RF, X2, sqrt-target RMSLE: 0.14300460013198157


### Outro Modelo: GBM

In [56]:
# log
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting on the log1p-transformed target, one run per feature set.
y_log = np.log1p(y)

model = GradientBoostingRegressor(random_state=0)
fold_scores = cross_val_score(model, X1, y_log, scoring=rmsle_log_y, cv=kf, n_jobs=-1)
error = fold_scores.mean()
print('GBM, X1, log-target RMSLE:', error)

model = GradientBoostingRegressor(random_state=0)
fold_scores = cross_val_score(model, X2, y_log, scoring=rmsle_log_y, cv=kf, n_jobs=-1)
error = fold_scores.mean()
print('GBM, X2, log-target RMSLE:', error)

GBM, X1, log-target RMSLE: 0.13349245491356662
GBM, X2, log-target RMSLE: 0.12980689048155078


In [57]:
# raiz quadrada
from sklearn.ensemble import GradientBoostingRegressor
                
# Gradient boosting on the sqrt-transformed target, one run per feature set.
# n_jobs=-1 matches the other cross_val_score calls in this notebook; the
# scores are unaffected because the estimator and the folds are seeded.
model = GradientBoostingRegressor(random_state=0)
error = cross_val_score(model, X1, np.sqrt(y), cv=kf, scoring=rmsle_sqrt_y, n_jobs=-1).mean()
print('GBM, X1, sqrt-target RMSLE:', error)

model = GradientBoostingRegressor(random_state=0)
error = cross_val_score(model, X2, np.sqrt(y), cv=kf, scoring=rmsle_sqrt_y, n_jobs=-1).mean()
print('GBM, X2, sqrt-target RMSLE:', error)

GBM, X1, sqrt-target RMSLE: 0.13425897281342522
GBM, X2, sqrt-target RMSLE: 0.1309192356821107


# Stack

In [82]:
from itertools import product
from sklearn.linear_model import Ridge

# Nested cross-validation for the stacked ensemble:
#   kf_out - outer folds used to measure the stacker's error;
#   kf_in  - inner folds (different seed) used to produce the out-of-fold
#            base-model predictions the stacker is trained on.
kf_out = KFold(n_splits=5, shuffle=True, random_state=1)
kf_in = KFold(n_splits=5, shuffle=True, random_state=2)

cv_mean = []
for fold, (tr, ts) in enumerate(kf_out.split(X, y)):
    y_train, y_test = y.iloc[tr], y.iloc[ts]
    X1_train, X1_test = X1.iloc[tr], X1.iloc[ts]
    X2_train, X2_test = X2.iloc[tr], X2.iloc[ts]

    # Base learners are every (model, target transform, feature set) combination.
    modelos = [GradientBoostingRegressor(random_state=0), RandomForestRegressor(random_state=0, n_estimators=10)]
    targets = [np.log1p, np.sqrt]
    feature_sets = [(X1_train, X1_test), (X2_train, X2_test)]

    predictions_cv = []
    predictions_test = []
    for model, target, feature_set in product(modelos, targets, feature_sets):
        features_train, features_test = feature_set
        y_transformed = target(y_train)

        # Out-of-fold predictions on the outer-training rows (stacker inputs).
        oof = cross_val_predict(model, features_train, y_transformed, cv=kf_in, n_jobs=-1)
        predictions_cv.append(oof)

        # Refit on all outer-training rows, then predict the held-out outer fold.
        model.fit(features_train, y_transformed)
        ptest = model.predict(features_test)
        predictions_test.append(ptest)

    # One column per base learner.
    predictions_cv = np.column_stack(predictions_cv)
    predictions_test = np.column_stack(predictions_test)

    # The stacker is trained on, and scored against, the log1p target.
    stacker = Ridge()
    stacker.fit(predictions_cv, np.log1p(y_train))
    error = rmsle_log_y(stacker, predictions_test, np.log1p(y_test))
    cv_mean.append(error)
    print('RMSLE Fold %d - RMSLE %.4f' % (fold, error))

print('RMSLE CV5 %.4f' % np.mean(cv_mean))

RMSLE Fold 0 - RMSLE 0.1248
RMSLE Fold 1 - RMSLE 0.1447
RMSLE Fold 2 - RMSLE 0.1257
RMSLE Fold 3 - RMSLE 0.1406
RMSLE Fold 4 - RMSLE 0.1086
RMSLE CV5 0.1289
