In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the German credit data and discard the 'Unnamed: 0' column in a single
# expression (presumably a row index written out when the CSV was saved —
# confirm against the file), then display the resulting frame.
german_credit = pd.read_csv('../data/german_credit.csv').drop(columns=['Unnamed: 0'])
german_credit


Unnamed: 0,Account_status,Duration,Credit_history,Purpose,Credit_amount,Savings_bonds,Present_employment_since,Installment_rate,Other_debtors_guarantors,Resident_since,...,Age,Other_installment_plans,Housing,Existing_credits,Job,People_maintenance_for,Telephone,Foreign_worker,Credit_risk,Gender
0,< 0 DM,6,critical account / other credits existing (not...,radio / television,1169,unknown / no savings account,>= 7 years,4,none,4,...,67,none,own,2,skilled employee / official,1,yes,yes,1,Female
1,0 < ... < 200 DM,48,existing credits paid back duly till now,radio / television,5951,< 100 DM,1 <= ... < 4 years,2,none,2,...,22,none,own,1,skilled employee / official,1,none,yes,0,Male
2,no checking account,12,critical account / other credits existing (not...,education,2096,< 100 DM,4 <= ... < 7 years,2,none,3,...,49,none,own,1,unskilled - resident,2,none,yes,1,Female
3,< 0 DM,42,existing credits paid back duly till now,furniture / equipment,7882,< 100 DM,4 <= ... < 7 years,2,guarantor,4,...,45,none,for free,1,skilled employee / official,2,none,yes,1,Female
4,< 0 DM,24,delay in paying off in the past,car (new),4870,< 100 DM,1 <= ... < 4 years,3,none,4,...,53,none,for free,2,skilled employee / official,2,none,yes,0,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking account,12,existing credits paid back duly till now,furniture / equipment,1736,< 100 DM,4 <= ... < 7 years,3,none,4,...,31,none,own,1,unskilled - resident,1,none,yes,1,Male
996,< 0 DM,30,existing credits paid back duly till now,car (used),3857,< 100 DM,1 <= ... < 4 years,4,none,4,...,40,none,own,1,management / self-employed / highly qualified ...,1,yes,yes,1,Female
997,no checking account,12,existing credits paid back duly till now,radio / television,804,< 100 DM,>= 7 years,4,none,4,...,38,none,own,1,skilled employee / official,1,none,yes,1,Female
998,< 0 DM,45,existing credits paid back duly till now,radio / television,1845,< 100 DM,1 <= ... < 4 years,4,none,4,...,23,none,for free,1,skilled employee / official,1,yes,yes,0,Female


In [3]:
# Expand every categorical column into indicator columns. drop_first=True omits
# the first level of each category so the remaining indicators are not redundant.
german_credit_encoded = pd.get_dummies(german_credit, drop_first=True)

# Target is the Credit_risk column; every other encoded column is a feature.
target_column = 'Credit_risk'
y = german_credit_encoded[target_column]
X = german_credit_encoded.drop(columns=target_column)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [4]:
# Manual bagging: fit several decision trees, each on its own bootstrap sample
# of the training data, and keep every tree's test-set probabilities so they
# can be averaged afterwards.

# Ensemble size, defined once so the array width and the loop bound cannot drift apart.
n_trees = 10

# One row per test observation, one column per tree.
preds = np.zeros((len(y_test), n_trees))

for i in range(n_trees):
    # Bootstrap sample: draw len(X_train) rows with replacement.
    # Seeding with the loop index gives each tree a different but repeatable sample.
    X_train_sampled, y_train_sampled = resample(
        X_train, y_train, replace=True, n_samples=len(X_train), random_state=i
    )

    # Seed the tree as well: scikit-learn permutes the features at every split,
    # so without random_state tied splits may be resolved differently on each
    # run and the AUCs below would not be reproducible.
    tree_model = DecisionTreeClassifier(random_state=i)
    tree_model.fit(X_train_sampled, y_train_sampled)

    # Column 1 of predict_proba is the probability of the second class in
    # tree_model.classes_ (label 1, given that Credit_risk takes the values 0 and 1).
    predictions_test = tree_model.predict_proba(X_test)[:, 1]

    # AUC of this single tree on the held-out test set.
    auc_score = roc_auc_score(y_test, predictions_test)
    print(f"AUC for iteration {i+1}: {auc_score}")

    # Store this tree's probabilities in its own column.
    preds[:, i] = predictions_test

# preds now holds the test-set probabilities from n_trees different trees.

AUC for iteration 1: 0.5970608339029392
AUC for iteration 2: 0.6632577948367422
AUC for iteration 3: 0.6434092223565908
AUC for iteration 4: 0.6035280508964719
AUC for iteration 5: 0.6429623008570378
AUC for iteration 6: 0.6080498448919501
AUC for iteration 7: 0.6030022608969977
AUC for iteration 8: 0.6159366948840633
AUC for iteration 9: 0.6207213838792786
AUC for iteration 10: 0.6474840948525159


In [5]:
# Ensemble prediction: average the per-tree probabilities for each test row
# (axis=1 collapses the tree dimension of preds, leaving one value per row).
preds_mean = preds.mean(axis=1)

# Score the averaged probabilities against the held-out labels.
final_auc = roc_auc_score(y_test, preds_mean)
print(f"Final AUC using averaged predictions: {final_auc}")

Final AUC using averaged predictions: 0.768336926231663
