## BAGGING

**Bootstrap + Aggregating**

Julia Hernández Cárdenas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import random
from scipy.stats import t, norm
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error, log_loss, roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("Default.csv")
x_todo = df[['balance', 'income', 'student']]
x_todo = pd.get_dummies(x_todo, columns=['student'], drop_first=True)
x_todo = x_todo.astype(np.float64)
X = sm.add_constant(x_todo)
y = (df["default"] == 'Yes').astype(int)

In [3]:
logit = sm.Logit(y, X)
results = logit.fit()

y_pred_prob = results.predict(X)

auc_logit = roc_auc_score(y, y_pred_prob)

y_pred_class = (y_pred_prob >= 0.5).astype(int)

accuracy_logit = accuracy_score(y, y_pred_class)

print(f"AUC Logistic Regression: {auc_logit:.4f}")
print(f"Accuracy Logistic Regression: {accuracy_logit:.4f}")

Optimization terminated successfully.
         Current function value: 0.078577
         Iterations 10
AUC Logistic Regression: 0.9496
Accuracy Logistic Regression: 0.9732


In [7]:
columnas = x_todo.columns.tolist()

B = 500
predicciones_boot = []

n = len(df)

for b in range(B):
    cols_sel = np.random.choice(columnas, 2, replace=False)
    X_sel = sm.add_constant(x_todo[cols_sel])
    
    i = np.random.choice(n, n, replace=True)
    X_b = X_sel.iloc[i]
    y_b = y.iloc[i]

    modelo_b = sm.Logit(y_b, X_b).fit(disp=False)

y_pred_prob = modelo_b.predict(X_sel).values
predicciones_boot.append(y_pred_prob)

predicciones_boot = np.array(predicciones_boot)
promedio_pred = predicciones_boot.mean(axis=0)
y_pred_bagging = (promedio_pred >= 0.5).astype(int)
accuracy_bagging = accuracy_score(y, y_pred_bagging)
auc_bagging = roc_auc_score(y, promedio_pred)

print(f"Accuracy Bagging: {accuracy_bagging:.4f}")
print(f"AUC Bagging: {auc_bagging:.4f}")

Accuracy Bagging: 0.9735
AUC Bagging: 0.9495


In [8]:
comparacion = pd.DataFrame({
    "Accuracy Logit": [accuracy_logit],
    "ROC Logit": [auc_logit],
    "Accuracy Bagging": [accuracy_bagging],
    "ROC Bagging": [auc_bagging]
})
comparacion

Unnamed: 0,Accuracy Logit,ROC Logit,Accuracy Bagging,ROC Bagging
0,0.9732,0.949558,0.9735,0.949523


Ambos modelos tienen prácticamente el mismo desempeño, aunque usualmente el modelo Bagging debe dar un resultado más preciso que el modelo normal. Estos dos resultados siendo "iguales" podría significar que el Logit inicial ya estaba bien definido y aprovecha de manera favorable la información del dataset