# 🧙‍♂️ Loan Default Prediction for Student Loans at Hogwarts

This notebook explores factors that influence whether magical students default on their student loans. We’ll use logistic regression and gradient boosting to analyze predictors like house affiliation, grades, discipline, and loan size.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw loan records; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data/hogwarts_loan_data.csv")
# Preview the first rows as this cell's rich output.
df.head(n=5)

## 🔍 Exploratory Data Analysis

In [None]:
# Share of each outcome in the target column (fractions, not raw counts)
default_share = df['Defaulted'].value_counts(normalize=True)
print(default_share)

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns are excluded by the dtype filter.
numeric = df.select_dtypes(include='number')
corr_matrix = numeric.corr()
corr_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
corr_ax.set_title('Correlation Matrix')
plt.show()

# Loan size distribution for each default outcome, one box per value of `Defaulted`
box_ax = sns.boxplot(data=df, x='Defaulted', y='LoanAmount')
box_ax.set_title('Loan Amount vs. Default Status')
plt.show()

# Grade-average distribution, split by default status.
# Both groups are drawn with the same bin edges; binning each subset independently
# would give the two overlaid histograms different bar widths and make them
# impossible to compare bar-for-bar.
grade_bins = np.histogram_bin_edges(df['GradeAverage'].dropna(), bins='auto')
sns.histplot(df[df['Defaulted']==0]['GradeAverage'], bins=grade_bins, color='green', label='No Default', kde=True)
sns.histplot(df[df['Defaulted']==1]['GradeAverage'], bins=grade_bins, color='red', label='Defaulted', kde=True)
plt.legend()
plt.title('Grade Average Distribution')
plt.show()


## 🛠️ Preprocessing & Model Training

In [None]:
# One-hot encode every non-numeric column. `drop_first=True` drops one level per
# categorical so the dummy columns are not perfectly collinear.
# NOTE(review): this also expands any free-text / identifier columns the CSV may
# contain — confirm the column list and drop such columns before encoding.
df_encoded = pd.get_dummies(df, drop_first=True)

# Features are everything except the target; the target is the `Defaulted` column.
X = df_encoded.drop("Defaulted", axis=1)
y = df_encoded["Defaulted"]

from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed for reproducibility. `stratify=y` keeps the
# default / non-default ratio the same in both partitions, so the test metrics
# are not distorted by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Baseline linear classifier. max_iter=1000 gives the solver more iterations than
# scikit-learn's default of 100.
# NOTE(review): the features are fed in unscaled; if convergence warnings appear,
# consider standardising them first.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_preds = log_model.predict(X_test)

print("Logistic Regression Report:\n", classification_report(y_test, log_preds))

# confusion_matrix puts true classes on the rows and predicted classes on the
# columns; label the axes so the heatmap can be read without that knowledge.
cm_ax = sns.heatmap(confusion_matrix(y_test, log_preds), annot=True, fmt='d', cmap='Blues')
cm_ax.set_xlabel('Predicted label')
cm_ax.set_ylabel('True label')
cm_ax.set_title('Confusion Matrix (Logistic Regression)')
plt.show()


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees evaluated with log-loss.
# The former `use_label_encoder=False` argument is omitted: it is deprecated and
# ignored by current XGBoost releases, where it only produces a warning.
# (Earlier cells compare `Defaulted` against 0 and 1, so the target appears to be
# integer-coded already — TODO confirm.)
# random_state matches the seed used for the train/test split so reruns are repeatable.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

print("XGBoost Report:\n", classification_report(y_test, xgb_preds))


In [None]:
# Importance score of every encoded feature, indexed by column name.
importances = pd.Series(xgb_model.feature_importances_, index=X.columns)

# Keep the ten largest scores. A horizontal bar chart draws the first row at the
# bottom, so sort ascending to place the most important feature at the top.
top_importances = importances.nlargest(10).sort_values()
imp_ax = top_importances.plot(kind='barh')
imp_ax.set_xlabel('Importance')
imp_ax.set_title('Top Feature Importances (XGBoost)')
plt.show()
