In [None]:
# Load IPython's autoreload extension.
%load_ext autoreload
# NOTE(review): the mode-setting magic below is commented out — confirm whether
# automatic reloading of the local `src` modules is meant to be active.
# %autoreload 2

# Our own model

### We chose CatBoost as our black-box model because it is well suited to categorical data such as ours, where the task is to predict whether an individual will default (1) or not (0).

In [66]:
import pandas as pd
from pathlib import Path

In [None]:
# Read the project dataset from the Excel workbook.
path = Path("../Data/dataproject2024.xlsx")
df = pd.read_excel(path)

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the cell output.
df.info()

# Preview the first rows instead of rendering the whole frame.
df.head()

In [68]:
from sklearn.model_selection import train_test_split

# Columns excluded from the feature matrix (the target is among them).
non_feature_columns = ["ID", "Default (y)", "Pred_default (y_hat)", "PD", "Group"]

X = df.drop(columns=non_feature_columns)
y = df["Default (y)"]

# Hold out 15% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
from catboost import CatBoostClassifier

# Hyper-parameters of the black-box classifier, gathered in one place.
catboost_params = {
    "iterations": 20,
    "depth": 10,
    "learning_rate": 0.1,
    "loss_function": "Logloss",
    "verbose": False,
}

cat_model = CatBoostClassifier(**catboost_params)
cat_model.fit(X_train, y_train)

In [70]:
# Score the held-out rows: probabilities from column 1 of predict_proba
# (the "default = 1" class, assuming 0/1 labels), then the hard class labels.
test_probabilities = cat_model.predict_proba(X_test)
y_pred_prob = test_probabilities[:, 1]
y_pred_class = cat_model.predict(X_test)

In [None]:
# Gather the test-set predictions in a two-column frame:
# "y_hat" (integer class label) and "prob_default" (predicted probability).
df_pred = (
    pd.DataFrame([y_pred_class, y_pred_prob])
    .T
    .rename(columns={0: "y_hat", 1: "prob_default"})
    .astype({"y_hat": int})
)
df_pred

In [None]:
from sklearn.metrics import accuracy_score

# scikit-learn's signature is accuracy_score(y_true, y_pred).
acc_cat = accuracy_score(y_test, df_pred["y_hat"])

# Score the initial model on the same held-out rows as the CatBoost model so
# that the two accuracies are computed on identical samples and are comparable.
test_rows = df.loc[y_test.index]
acc_init_model = accuracy_score(test_rows["Default (y)"], test_rows["Pred_default (y_hat)"])

# ":.3f" (no space flag) avoids the doubled blank the original " .3f" produced.
print(f'The accuracy with a CatboostClassifier model is {acc_cat:.3f}, whereas the accuracy of the initial model was {acc_init_model:.3f}.')

In [73]:
# Make sure the target directory exists (e.g. on a fresh checkout) before
# CatBoost writes the serialized model into it.
Path("../assets").mkdir(parents=True, exist_ok=True)
cat_model.save_model("../assets/cat_boost_model")

### Saving the predictions for the whole dataframe to build surrogate models for it

In [None]:
# Predict on every row of X (rows used for training included).
y_pred_class_total = cat_model.predict(X)
y_pred_prob_total = cat_model.predict_proba(X)[:, 1]

# Two-column frame: "y_hat" (integer class label) and "pred_default"
# (probability taken from column 1 of predict_proba).
df_pred_total = (
    pd.DataFrame([y_pred_class_total, y_pred_prob_total])
    .T
    .rename(columns={0: "y_hat", 1: "pred_default"})
    .astype({"y_hat": int})
)
df_pred_total

In [None]:
# df_pred_total carries a fresh 0..n-1 index, while pd.concat(axis=1) aligns
# rows by index label. Give the predictions X's index first so the rows are
# paired by position even if X's index is not the default range (set_axis
# raises if the lengths differ instead of silently producing NaN rows).
df_total = pd.concat([X, df_pred_total.set_axis(X.index, axis=0)], axis=1)
df_total

In [76]:
# Export the features together with the CatBoost predictions
# (the index is written too, as to_csv does by default).
df_total.to_csv(path_or_buf="../Data/catboost_output.csv")

In [None]:
import sys

# Make the project root importable so the local `src` package resolves.
# The guard keeps the cell idempotent: re-running it no longer piles up
# duplicate ".." entries in sys.path.
if ".." not in sys.path:
    sys.path.append("..")

from src.shap import draw_shap_summary

# SHAP summary for the CatBoost model, computed on the training features.
draw_shap_summary(model=cat_model, data=X_train, model_name="cat_model")

In [None]:
from src.pdp_ice import draw_pdp_ice_graphs

# Features for which the PDP / ICE graphs are drawn.
pdp_ice_variables = ["Job tenure", "Homeowner"]

draw_pdp_ice_graphs(
    model=cat_model,
    data=X_train,
    model_name="cat_model",
    variables=pdp_ice_variables,
)

In [None]:
from src.part8 import xper_method

# Work on a 500-row subsample (presumably to keep the XPER computation
# tractable — confirm). The fixed random_state makes the subsample, and hence
# the XPER output, reproducible across kernel restarts.
df2 = df.sample(500, random_state=42).drop(
    columns=["ID", "Pred_default (y_hat)", "PD", "Group"]
)
X_filtered = df2.drop(columns="Default (y)")
y_filtered = df2["Default (y)"]

X_train_filt, X_test_filt, y_train_filt, y_test_filt = train_test_split(
    X_filtered, y_filtered, test_size=0.15, random_state=42
)

xper_method(
    model=cat_model,
    eval_metric="Accuracy",
    dataset=df2,
    target_col="Default (y)",
    X_train=X_train_filt,
    y_train=y_train_filt,
    X_test=X_test_filt,
    y_test=y_test_filt,
)