In [32]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

In [17]:
# Load your dataset
df = pd.read_csv("german.data", sep=' ', names=[
    "Status", "Duration", "CreditHistory", "Purpose", "CreditAmount", "Savings", "EmploymentSince",
    "InstallmentRate", "PersonalStatusSex", "DebtorsGuarantors", "ResidenceSince", "Property",
    "Age", "OtherInstallmentPlans", "Housing", "ExistingCredits", "Job", "LiablePeople", "Telephone",
    "ForeignWorker", "Target"
])

df.head(5)


Unnamed: 0,Status,Duration,CreditHistory,Purpose,CreditAmount,Savings,EmploymentSince,InstallmentRate,PersonalStatusSex,DebtorsGuarantors,...,Property,Age,OtherInstallmentPlans,Housing,ExistingCredits,Job,LiablePeople,Telephone,ForeignWorker,Target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [18]:
df.columns

Index(['Status', 'Duration', 'CreditHistory', 'Purpose', 'CreditAmount',
       'Savings', 'EmploymentSince', 'InstallmentRate', 'PersonalStatusSex',
       'DebtorsGuarantors', 'ResidenceSince', 'Property', 'Age',
       'OtherInstallmentPlans', 'Housing', 'ExistingCredits', 'Job',
       'LiablePeople', 'Telephone', 'ForeignWorker', 'Target'],
      dtype='object')

In [19]:
# Rename

df.rename(columns={'Status': 'Status', 'Duration' : 'Duration', 'CreditHistory' : 'Credit History', 'Purpose' : 'Purpose', 'CreditAmount' : 'Credit Amount',
       'Savings' : 'Savings', 'EmploymentSince' : 'Employment Since', 'InstallmentRate' : 'Installment Rate', 'PersonalStatusSex' : 'Sex',
       'DebtorsGuarantors' : 'Doctor Guarantor', 'ResidenceSince' : 'Residence Since', 'Property' : 'Property', 'Age' : 'Age',
       'OtherInstallmentPlans' : 'Other Installment Plan', 'Housing' : 'Housing', 'ExistingCredits' : 'Existing Credit', 'Job' : 'Job',
       'LiablePeople' : 'Liable People', 'Telephone' : 'Telephone', 'ForeignWorker' : 'Foreign Worker', 'Target' : 'Target'}, inplace=True)

df.head(5)

Unnamed: 0,Status,Duration,Credit History,Purpose,Credit Amount,Savings,Employment Since,Installment Rate,Sex,Doctor Guarantor,...,Property,Age,Other Installment Plan,Housing,Existing Credit,Job,Liable People,Telephone,Foreign Worker,Target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [20]:
df["Target"].value_counts()


Target
1    700
2    300
Name: count, dtype: int64

In [21]:
# Target: 1 = good (1), 2 = bad (0)
df["Target"] = df["Target"].map({1: 1, 2: 0})

In [22]:
df["Target"].value_counts()

Target
1    700
0    300
Name: count, dtype: int64

In [23]:
# Encode categorical columns
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])
    
df. head(5)

Unnamed: 0,Status,Duration,Credit History,Purpose,Credit Amount,Savings,Employment Since,Installment Rate,Sex,Doctor Guarantor,...,Property,Age,Other Installment Plan,Housing,Existing Credit,Job,Liable People,Telephone,Foreign Worker,Target
0,0,6,4,4,1169,4,4,4,2,0,...,0,67,2,1,2,2,1,1,0,1
1,1,48,2,4,5951,0,2,2,1,0,...,0,22,2,1,1,2,1,0,0,0
2,3,12,4,7,2096,0,3,2,2,0,...,0,49,2,1,1,1,2,0,0,1
3,0,42,2,3,7882,0,3,2,2,2,...,1,45,2,2,1,2,2,0,0,1
4,0,24,3,0,4870,0,2,3,2,0,...,3,53,2,2,2,2,2,0,0,0


In [28]:
# Normalize numeric columns
scaler = StandardScaler()
numeric_cols = ['Duration', 'Credit Amount', 'Installment Rate', 'Residence Since', 'Age',
                'Existing Credit', 'Liable People']
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [38]:
# Train-test split
X = df.drop("Target", axis=1)
y = df["Target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Create models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
logreg_model = LogisticRegression(random_state=42)
xgb_model = XGBClassifier(random_state=42)

# Train models on the training data (assuming X_train and y_train are defined)
rf_model.fit(X_train, y_train)
logreg_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Get predicted probabilities for the positive class (class 1)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]  # Random Forest
y_proba_logreg = logreg_model.predict_proba(X_test)[:, 1]  # Logistic Regression
y_proba_xgb = xgb_model.predict_proba(X_test)[:, 1]  # XGBoost

# Calculate AUC for each model
rf_auc = roc_auc_score(y_test, y_proba_rf)
logreg_auc = roc_auc_score(y_test, y_proba_logreg)
xgb_auc = roc_auc_score(y_test, y_proba_xgb)

# Print AUC scores
print(f"Random Forest AUC: {rf_auc:.4f}")
print(f"Logistic Regression AUC: {logreg_auc:.4f}")
print(f"XGBoost AUC: {xgb_auc:.4f}")


Random Forest AUC: 0.8280
Logistic Regression AUC: 0.8161
XGBoost AUC: 0.8069


In [40]:
# Train models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
logreg = LogisticRegression(max_iter=1000)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
rf_model.fit(X_train, y_train)
logreg.fit(X_train, y_train)
xgb.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [41]:
# Predictions & Evaluation
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]
y_proba_logreg = logreg.predict_proba(X_test)[:, 1]
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

# AUC scores
print("RandomForest AUC:", roc_auc_score(y_test, y_proba_rf))
print("LogReg AUC:", roc_auc_score(y_test, y_proba_logreg))
print("XGBoost AUC:", roc_auc_score(y_test, y_proba_xgb))

# Confusion Matrices (using y_test for true labels)
print("Confusion Matrix (rf):")
print(confusion_matrix(y_test, rf_model.predict(X_test)))

print("Confusion Matrix (XGB):")
print(confusion_matrix(y_test, xgb.predict(X_test)))

# Classification Report for XGBoost
print("Classification Report (XGB):")
print(classification_report(y_test, xgb.predict(X_test)))


RandomForest AUC: 0.8279841327082582
LogReg AUC: 0.8160836639019111
XGBoost AUC: 0.8069479504748166
Confusion Matrix (rf):
[[ 31  28]
 [ 11 130]]
Confusion Matrix (XGB):
[[ 30  29]
 [ 17 124]]
Classification Report (XGB):
              precision    recall  f1-score   support

           0       0.64      0.51      0.57        59
           1       0.81      0.88      0.84       141

    accuracy                           0.77       200
   macro avg       0.72      0.69      0.70       200
weighted avg       0.76      0.77      0.76       200



In [42]:
# Save predictions for Power BI
X_test_copy = X_test.copy()
X_test_copy["Default_Prob_LogReg"] = y_proba_logreg
X_test_copy["Default_Prob_XGB"] = y_proba_xgb
X_test_copy["Actual"] = y_test.values
X_test_copy["Risk_Segment"] = pd.cut(y_proba_xgb, bins=[0, 0.3, 0.6, 1.0], labels=["Low", "Medium", "High"])
X_test_copy.to_csv("credit_risk_dashboard_data.csv", index=False)