# Stacking

In [1]:
import pandas as pd

# Cargar con la librería pandas el dataset bank.csv y mostrarlo
bank_data = pd.read_csv("data/bank.csv", delimiter=";")
bank_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [2]:
# Eliminar la columna "y" del dataset para almacenar el resultado en X (predictores)
# Guardar la columna "y" en la variable y, y convertir los "yes" en 1s y los "no" en 0s.

X = bank_data.drop("y", axis=1)
y = bank_data["y"].map({"yes": 1, "no": 0})

# Mostrar la cardinalidad de X e y.

print(f"Cardinalidad de X: {X.shape}")
print(f"Cardinalidad de y: {y.shape}")

Cardinalidad de X: (4521, 16)
Cardinalidad de y: (4521,)


In [3]:
# Dividir los datos en train y test.
# Tamaño de train: 80%
# Tamaño de test: 20%
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [4]:
# Seleccionar las variables numéricas y las categóricas de los predictores (X_train)

num_features = X_train.select_dtypes(include=['int64', 'float64']).columns
cat_features = X_train.select_dtypes(include=['object']).columns

In [5]:
# Instanciar un transformador StandardScaler para las variables numéricas y un OneHotEncoder para las variables categóricas
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_transformer = StandardScaler()
cat_transformer = OneHotEncoder()

In [6]:
from sklearn.compose import ColumnTransformer

# Usar la clase ColumnTransformer para aplicar el num_transformer
# a las variables numéricas (num_features) y el cat_transformer a las variables categóricas (cat_features)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

In [7]:
# Crear un pipeline con las fases preprocessor y un stacking
# Crear una lista de tuplas de la forma (identificador, estimador)
# La listar debe contener los estimadores:
# - RandomForestClassifier
# - SVC
# - DecisionTreeClassifier
# Crear un stacking classifier con los estimadores anteriores y
# una LogisticRegression como meta-modelo
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators = [
    ('rf', RandomForestClassifier(random_state=1234)),
    ('svc', SVC(random_state=1234)),
    ('dt', DecisionTreeClassifier(random_state=1234))
]

stacking_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(), cv=5
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', stacking_clf)
])

In [8]:
# Crear una rejilla de hiperparámetros con los siguientes valores posible:
# Para RandomForest:
#      n_estimators: 10, 50
#      max_depth: None, 10
# Para SVC:
#      C: 0.1, 1.0
# Para LogisticRegression:
#      C: 0.1, 1.0, 10.0
param_grid = {
    'classifier__rf__n_estimators': [10, 50],
    'classifier__rf__max_depth': [None, 10],
    'classifier__svc__C': [0.1, 1.0],
    'classifier__final_estimator__C': [0.1, 1.0, 10.0]
}

param_grid = {
    'classifier__rf__n_estimators': [10],
    'classifier__rf__max_depth': [10],
    'classifier__svc__C': [0.1],
    'classifier__final_estimator__C': [0.1]
}

In [9]:
%%time

from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

CPU times: user 7.41 s, sys: 71.5 ms, total: 7.48 s
Wall time: 3.42 s


In [10]:
print(f"Mejores hiperparámetros encontrados: {grid_search.best_params_}")

Mejores hiperparámetros encontrados: {'classifier__final_estimator__C': 0.1, 'classifier__rf__max_depth': 10, 'classifier__rf__n_estimators': 10, 'classifier__svc__C': 0.1}


In [11]:
# Obtener las predicciones y el classificacion_report
from sklearn.metrics import classification_report

y_pred = grid_search.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94       807
           1       0.53      0.16      0.25        98

    accuracy                           0.89       905
   macro avg       0.72      0.57      0.60       905
weighted avg       0.87      0.89      0.87       905

