# Introdução

Neste notebook, procuro realizar uma implementação simples do método de stacking, que consiste no agrupamento de modelos de forma hierárquica. Especificamente, aplicarei o método ao conjunto House Prices. Para facilitar o trabalho, alguns passos (como a exclusão de colunas com muitos valores faltantes) foram previamente definidos em outros notebooks, disponíveis no meu perfil.

Testaremos algumas configurações de stacking:
* 2 camadas (modelos previsores base, modelo blender no topo) - Score na leaderboard: 0.13446
* 3 camadas (modelos previsores base, modelos blender na segunda camada e modelo blender no topo)

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import BaggingRegressor, VotingRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
%matplotlib inline

print("Bibliotecas carregadas!")

Bibliotecas carregadas!


In [2]:
import warnings
# Ignore every FutureWarning raised from here on (presumably to keep the cell
# outputs readable). NOTE(review): this also hides deprecation notices from the
# libraries used below -- confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

### Carregando os dados

In [3]:
# Directory holding the competition CSV files (relative path).
path = '../input/house-prices-advanced-regression-techniques'
train_csv, test_csv = (os.path.join(path, name) for name in ('train.csv', 'test.csv'))
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Report the shape of each loaded set.
print("Shape of sets:")
for label, frame in (("-- train: ", train), ("-- test: ", test)):
    print(label, frame.shape)

Shape of sets:
-- train:  (1460, 81)
-- test:  (1459, 80)


### Pré-processamento

In [4]:
# Missing-value count per training column. After reset_index the column names
# live in the 'index' column and the counts in the column labelled 0.
sum_nan_values = train.isna().sum().reset_index()
nan_counts = sum_nan_values[0]

# Columns with more than 258 missing training values are removed from both sets.
cols_to_drop = sum_nan_values.loc[nan_counts > 258, 'index']
for dropped in cols_to_drop.values:
    train.drop(dropped, axis=1, inplace=True)
    test.drop(dropped, axis=1, inplace=True)

# Record the dtype of each surviving column that still has missing values.
survivors_with_nan = [
    name for name in sum_nan_values.loc[nan_counts > 0, 'index'].values
    if name not in cols_to_drop.values
]
for name in survivors_with_nan:
    sum_nan_values.loc[sum_nan_values['index'] == name, 'dtype'] = train[name].dtype

# Keep only the rows describing columns that need imputation, then split them
# by dtype: object columns get a constant label, the rest get the column mean.
needs_imputation = (
    ~(sum_nan_values['dtype'].isna())
    & (nan_counts > 0)
    & ~(sum_nan_values['index'].isin(cols_to_drop))
)
cols_of_interest = sum_nan_values[needs_imputation]
impute_with_NA_cols = cols_of_interest.loc[cols_of_interest['dtype'] == 'object', 'index'].tolist()
impute_with_mean_cols = cols_of_interest.loc[cols_of_interest['dtype'] != 'object', 'index'].tolist()

# One pass per strategy, in the same order as before: constant first, then mean.
imputation_plan = (
    (impute_with_NA_cols, {'strategy': 'constant', 'fill_value': 'Not Available'}),
    (impute_with_mean_cols, {'strategy': 'mean'}),
)
for columns, imputer_kwargs in imputation_plan:
    for col in columns:
        imputer = SimpleImputer(**imputer_kwargs)
        column_2d = train[col].values.reshape(-1, 1)
        train.loc[:, col] = imputer.fit_transform(column_2d)

In [5]:
# Missing-value count per test column ('index' holds the column name, 0 the count).
test_nans = test.isna().sum().reset_index()

# Record the dtype of every test column that contains missing values.
for item in test_nans[test_nans[0] > 0]['index'].values:
    if item not in cols_to_drop.values:
        test_nans.loc[test_nans['index']==item, 'dtype'] = test[item].dtype

# Split the affected columns by dtype: object columns vs. everything else.
cols_of_interest = test_nans[(test_nans[0] > 0) & ~(test_nans['index'].isin(cols_to_drop))]
impute_with_NA_cols = cols_of_interest[cols_of_interest['dtype'] == 'object']['index'].values.tolist()
impute_with_mean_cols = cols_of_interest[cols_of_interest['dtype'] != 'object']['index'].values.tolist()

# Object columns: fill gaps with the same constant label used for the training set.
for col in impute_with_NA_cols:
    test[col] = test[col].fillna('Not Available')

# Numeric columns: fill gaps with the TRAINING mean. Statistics must be learned
# on the training data only and then applied to the test data; re-fitting the
# imputer on the test set would transform the two sets inconsistently.
for col in impute_with_mean_cols:
    test[col] = test[col].fillna(train[col].mean())

In [6]:
# Target: the sale price. Features: every other column except the row identifier.
y = train['SalePrice']
X = train.drop(columns=['SalePrice', 'Id'])

# Only the identifier is dropped from the test set.
X_test = test.drop(columns='Id')

# Stacking

Os modelos a serem utilizados na primeira camada serão árvores de decisão e support vector machines. Usarei os modelos tanto na versão básica (um modelo só, com parâmetros estabelecidos em outros notebooks) quanto em conjunto (utilizando bagging e boosting, duas técnicas de agrupamento de modelos). A estrutura da primeira configuração será:

* Camada 1: LinearSVR, SVR polinomial (baseline e fine tuned) e AdaBoost (cujo estimador base padrão é um DecisionTreeRegressor)
* Camada 2: GradientBoosting

In [7]:
# Reserve 20% of the rows as the hold-out set; the fixed seed makes the split repeatable.
X_train, X_hold, y_train, y_hold = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [8]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
is_object_column = [train[name].dtype == 'object' for name in X.columns.values]
categorical_cols = [name for name, flag in zip(X.columns.values, is_object_column) if flag]
numerical_cols = [name for name, flag in zip(X.columns.values, is_object_column) if not flag]

In [9]:
# Standardise the numerical columns and one-hot encode the categorical ones;
# categories not seen during fit are ignored at transform time.
numeric_step = ('scaler', StandardScaler(), numerical_cols)
categorical_step = ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
preprocessor = ColumnTransformer([numeric_step, categorical_step])

### Funções auxiliares

Definimos abaixo as funções para pré-processar os dados, criar os modelos da primeira camada e adicionar atributos ao conjunto de espera.

**AVISO: Algumas funções abaixo não foram otimizadas.**

In [10]:
# TODO: train the models in parallel using joblib.
def build_fit_models(training_data: "list | pd.DataFrame | np.ndarray",
                     target: "list | pd.Series | pd.DataFrame",
                     random_state: "int | None" = 42):
    """Create and fit the first-layer (base) regressors.

    The module-level ``preprocessor`` is fitted on ``training_data`` as a side
    effect, so later calls to ``preprocessor.transform`` reuse what was
    learned here.

    Parameters
    ----------
    training_data
        Raw feature rows handed to ``preprocessor.fit_transform``.
    target
        Target values aligned with ``training_data``.
    random_state
        Seed passed to the estimators that accept one (``LinearSVR`` and
        ``AdaBoostRegressor``) so repeated runs give the same models.
        Pass ``None`` to restore unseeded behaviour.

    Returns
    -------
    list
        The fitted models, in the order: linear SVR, polynomial SVR,
        tuned polynomial SVR, AdaBoost.
    """
    model_list = [
        LinearSVR(C=10, epsilon=0, random_state=random_state),
        SVR(kernel='poly', C=100, coef0=0.7, degree=5, epsilon=1.5),
        SVR(kernel='poly', C=200, degree=6, coef0=1.2, epsilon=2.3),
        AdaBoostRegressor(learning_rate=0.2, loss='linear', n_estimators=30,
                          random_state=random_state),
    ]

    # Fit the shared preprocessor once and train every model on the same matrix.
    features = preprocessor.fit_transform(training_data)

    for model in model_list:
        model.fit(features, target)
    return model_list

def build_blender(param_grid: dict):
    """Return an unfitted grid search over a GradientBoostingRegressor.

    The search uses 5-fold cross-validation, scores candidates with
    ``neg_mean_squared_log_error`` and keeps the training scores.

    Parameters
    ----------
    param_grid
        Hyper-parameter grid forwarded to ``GridSearchCV``.
    """
    search_options = {
        'cv': 5,
        'scoring': 'neg_mean_squared_log_error',
        'return_train_score': True,
    }
    return GridSearchCV(GradientBoostingRegressor(), param_grid, **search_options)
    
def build_hold_set(holdout_set: "list | pd.DataFrame | np.ndarray",
                   model_list: list) -> pd.DataFrame:
    """Preprocess a set and append each base model's predictions as new columns.

    The module-level ``preprocessor`` must already be fitted; it is only used
    to transform here.

    Parameters
    ----------
    holdout_set
        Raw feature rows handed to ``preprocessor.transform``.
    model_list
        Fitted first-layer models. Model ``i`` contributes the column
        ``"model_<i>"``.

    Returns
    -------
    pd.DataFrame
        The transformed features (columns ``"0"``, ``"1"``, ...) followed by
        one prediction column per model.
    """
    features = preprocessor.transform(holdout_set)
    # The transformer may return either a sparse matrix or a dense array;
    # normalise to a plain ndarray (``todense`` would give ``np.matrix`` and
    # does not exist on dense output).
    if hasattr(features, "toarray"):
        features = features.toarray()
    features = np.asarray(features)

    # Use string column names throughout: a frame mixing integer and string
    # column labels is rejected by recent scikit-learn estimators.
    feature_frame = pd.DataFrame(
        features, columns=[str(i) for i in range(features.shape[1])]
    )

    # Predict from the bare array so models fitted without feature names are
    # given input of the same kind they were trained on.
    prediction_frame = pd.DataFrame(
        {"model_" + str(i): model.predict(features)
         for i, model in enumerate(model_list)},
        index=feature_frame.index,
    )

    return pd.concat([feature_frame, prediction_frame], axis=1)

def predict(model_list, best_estimator, test_set):
    """Run the full stack on ``test_set`` and return the blender's predictions.

    ``test_set`` is first expanded with the first-layer predictions through
    ``build_hold_set``; the result is then passed to ``best_estimator.predict``.
    """
    return best_estimator.predict(build_hold_set(test_set, model_list))

## 1ª Configuração - Fit & Predict

In [11]:
# Hyper-parameter grid explored for the blender.
param_grid = dict(
    n_estimators=[150, 200, 250, 300],
    learning_rate=[0.01, 0.05, 0.1, 0.125],
    loss=['squared_error'],
)

In [12]:
# Build and train the first-layer models; they come back as a list.
# This call also fits the shared preprocessor, so it has to run before
# build_hold_set, which only transforms.
models_list = build_fit_models(X_train, y_train)

# Append the first-layer models' predictions to the hold-out set.
holdout_set = build_hold_set(X_hold, models_list)

# Build and train the blender model (last layer) on the hold-out set.
blender_model = build_blender(param_grid)
blender_model.fit(holdout_set, y_hold)

GridSearchCV(cv=5, estimator=GradientBoostingRegressor(),
             param_grid={'learning_rate': [0.01, 0.05, 0.1, 0.125],
                         'loss': ['squared_error'],
                         'n_estimators': [150, 200, 250, 300]},
             return_train_score=True, scoring='neg_mean_squared_log_error')

In [13]:
# Best blender parameters found by the grid search.
blender_model.best_params_

{'learning_rate': 0.1, 'loss': 'squared_error', 'n_estimators': 200}

In [14]:
# Best score reached by the best blender estimator (the search scores with
# 'neg_mean_squared_log_error', so values closer to zero are better).
blender_model.best_score_

-0.01683210033458083

In [15]:
# Generate the stacked predictions for the test set.
preds = predict(models_list, blender_model, X_test)
predictions = pd.DataFrame(preds)

# Pair each test Id with its predicted price and write the submission file.
submission_columns = {
    'Id': test['Id'],
    'SalePrice': predictions[0],
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('./Stacked_Submission.csv', index=False)
submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,130758.031094
1,1462,186535.298721
2,1463,180472.511806
3,1464,188662.866756
4,1465,177654.672149
