In [None]:
# Third-party dependencies for the churn-modelling notebook.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Seed NumPy's legacy global RNG so every np.random.* draw below is reproducible.
np.random.seed(42)

In [None]:
# Generate the synthetic customer-retention dataset and persist it to disk.
# Cell-local import keeps this cell runnable on its own.
from pathlib import Path

# Re-seed so re-running this cell alone regenerates exactly the same rows.
np.random.seed(42)
n = 18000

# NOTE: the order of the columns below is also the order of the random draws;
# reordering them would change the generated data.
data = pd.DataFrame({
    'tenure_months': np.random.gamma(5, 10, n),
    'avg_balance': np.random.normal(2500, 1200, n).clip(50),  # floor balances at 50
    'txn_freq': np.random.poisson(25, n),                     # not used in the logit below
    'service_contacts': np.random.poisson(1.5, n),
    'product_count': np.random.randint(1, 6, n),              # integers 1..5 (upper bound exclusive)
    'digital_usage_change': np.random.normal(0, 1, n),
    'balance_change': np.random.normal(0, 1, n)
})

# Linear score on the log-odds scale: only service_contacts raises churn risk,
# every other term lowers it.
logit = (
    -0.03 * data['tenure_months']
    -0.0003 * data['avg_balance']
    -0.15 * data['product_count']
    +0.4 * data['service_contacts']
    -0.6 * data['digital_usage_change']
    -0.5 * data['balance_change']
)
prob = 1 / (1 + np.exp(-logit))

# Bernoulli draw per customer; the 0.6 factor caps the churn probability at 0.6.
data['churn'] = (np.random.rand(n) < prob * 0.6).astype(int)

# Save the exact dataset used. Create the target directory first so the write
# does not fail when ../data/raw does not exist yet (e.g. on a fresh checkout).
raw_csv_path = Path('../data/raw/novabank_customer_retention_synthetic.csv')
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(raw_csv_path, index=False)


In [None]:
# Target is the binary churn flag; every remaining column is a predictor.
y = data['churn']
X = data.drop(columns='churn')

# shuffle=False keeps the original row order: the first 70% of rows are used
# for training and the final 30% are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

In [None]:
# Baseline: plain logistic regression (max_iter raised to 1000 iterations).
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard labels for the threshold-based metrics, positive-class scores for ROC AUC.
y_pred_lr = log_reg.predict(X_test)
y_prob_lr = log_reg.predict_proba(X_test)[:, 1]

# Label-based metrics share one call shape; ROC AUC is added last because it
# is computed from the probabilities instead of the hard labels.
baseline_metrics = {
    label: scorer(y_test, y_pred_lr)
    for label, scorer in (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
}
baseline_metrics['ROC_AUC'] = roc_auc_score(y_test, y_prob_lr)
baseline_metrics

In [None]:
# Challenger model: gradient-boosted trees trained on the same split as the baseline.
xgb = XGBClassifier(
    eval_metric='auc',
    learning_rate=0.05,
    n_estimators=250,
    max_depth=4,
    # Each tree is built on 90% of the rows and 90% of the columns.
    subsample=0.9,
    colsample_bytree=0.9,
).fit(X_train, y_train)

# Hard labels for the threshold-based metrics, positive-class scores for ROC AUC.
y_pred_xgb = xgb.predict(X_test)
y_prob_xgb = xgb.predict_proba(X_test)[:, 1]

# Same metric set and key order as the baseline so the two dicts compare directly.
improved_metrics = {
    label: scorer(y_test, y_pred_xgb)
    for label, scorer in (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
}
improved_metrics['ROC_AUC'] = roc_auc_score(y_test, y_prob_xgb)
improved_metrics

In [None]:
# Evaluate a "high-risk" operating point: customers whose predicted churn
# probability is at or above the cut-off are flagged.
threshold = 0.7
high_risk = (y_prob_xgb >= threshold).astype(int)

# The synthetic labels are drawn with probability `prob * 0.6`, so the true
# churn probability never exceeds 0.6 and this cut-off may flag few or no
# customers. With no predicted positives precision is undefined; passing
# zero_division=0 reports 0.0 explicitly instead of emitting an
# UndefinedMetricWarning (the returned value is unchanged).
precision_at_threshold = precision_score(y_test, high_risk, zero_division=0)
recall_at_threshold = recall_score(y_test, high_risk, zero_division=0)
precision_at_threshold, recall_at_threshold

In [None]:
# Attribute the boosted model's predictions on the held-out rows to individual features.
explainer = shap.TreeExplainer(xgb)
shap_values = explainer.shap_values(X_test)

# show=False leaves the figure open so its layout can be tightened before it renders.
shap.summary_plot(shap_values, X_test, show=False)
plt.gcf().tight_layout()