# Bagging

In [42]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


## Cargamos datos

In [43]:
df_def = pd.read_csv('Default.csv')

In [44]:
df_def['default_num'] = (df_def['default'] == 'Yes').astype(int)
X_def = df_def[['balance']]
y_def = df_def['default_num']
X_def_sm = sm.add_constant(X_def)

In [45]:
model_def = sm.Logit(y_def, X_def_sm).fit(disp=0)
se_def_analitico = model_def.bse

## Utilizamos Bootstrapp para simular 500 remuestreos 

In [46]:
def bootstrap_regression(X, y, model_type='linear', n_iterations=500):
    n_samples = len(y)
    coefs = []
    intercepts = []
    
    for _ in range(n_iterations):
        X_res, y_res = resample(X, y, n_samples=n_samples)
        
        if model_type == 'linear':
            model = Ridge(alpha=0) 
            model.fit(X_res, y_res)
        elif model_type == 'logistic':
            model = LogisticRegression(penalty=None, solver='lbfgs') 
            model.fit(X_res, y_res)
            
        coefs.append(model.coef_[0] if model_type=='linear' else model.coef_[0][0])
        intercepts.append(model.intercept_ if model_type=='linear' else model.intercept_[0])
        
    return np.array(intercepts), np.array(coefs)

In [47]:
boot_b0_def, boot_b1_def = bootstrap_regression(X_def, y_def, 'logistic')

print("Default: Resultados 1000 iteraciones")
print(f"Media Beta_0 (Intercept): {boot_b0_def.mean():.4f}  Std Dev (SE): {boot_b0_def.std():.4f}")
print(f"Media Beta_1 (Balance):   {boot_b1_def.mean():.4f}  Std Dev (SE): {boot_b1_def.std():.4f}")

Default: Resultados 1000 iteraciones
Media Beta_0 (Intercept): -10.6856  Std Dev (SE): 0.3625
Media Beta_1 (Balance):   0.0055  Std Dev (SE): 0.0002


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

## Logistic Regression

In [49]:
log_reg = LogisticRegression(penalty=None, solver='lbfgs')
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
acc_log = accuracy_score(y_test, y_pred_log)
acc_log

0.9726666666666667

## Aggregating de 2

In [50]:
bagging_2 = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=2,
    random_state=42
)
bagging_2.fit(X_train, y_train)
y_pred_bag2 = bagging_2.predict(X_test)
acc_bag2 = accuracy_score(y_test, y_pred_bag2)
acc_bag2

0.9673333333333334

## Bootstrap de 5000

In [53]:
bagging_5000 = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=5000,
    random_state=42,
    n_jobs=-1 
)
bagging_5000.fit(X_train, y_train)
y_pred_bag5000 = bagging_5000.predict(X_test)
acc_bag5000 = accuracy_score(y_test, y_pred_bag5000)
acc_bag5000

0.9576666666666667

## Tabla comparativa

In [54]:
resultados = pd.DataFrame({
    'Modelo': ['Logistic Regression', 'Bagging Tree (n=2)', 'Bagging Tree (n=5000)'],
    'Accuracy': [acc_log, acc_bag2, acc_bag5000]
})

print(resultados)

                  Modelo  Accuracy
0    Logistic Regression  0.972667
1     Bagging Tree (n=2)  0.967333
2  Bagging Tree (n=5000)  0.957667


## Conclusiones 


Da mayor el accuracy de la regresion logistica, lo que puede significar que me equivoque en alguna cosa, o simplemtente porque la regresion logistica tiene  una estructura lineal simple