In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, PowerTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.combine import SMOTETomek
import shap

df = pd.read_csv('cleaned_data.csv')

# Step 1: Prepare data
X = df.drop(columns=['TARGET'])
y = df['TARGET']

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Impute missing values using median
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Step 4: Yeo-Johnson Transformation (only numeric columns)
numeric_cols = X_train_imputed.select_dtypes(include=[np.number]).columns
yeo = PowerTransformer(method='yeo-johnson', standardize=False)
X_train_imputed[numeric_cols] = yeo.fit_transform(X_train_imputed[numeric_cols])
X_test_imputed[numeric_cols] = yeo.transform(X_test_imputed[numeric_cols])

# Step 5: Min-Max Scaling
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train_imputed), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test_imputed), columns=X_test.columns)

# Step 6: SMOTE-Tomek
smote_tomek = SMOTETomek(random_state=42)
X_res, y_res = smote_tomek.fit_resample(X_train_scaled, y_train)

# Step 7: Train XGBoost for SHAP
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_res, y_res)

# Step 8: SHAP feature importance
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_res)
shap_importance = np.abs(shap_values.values).mean(axis=0)

shap_feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': shap_importance
}).sort_values(by='importance', ascending=False)

top_features = shap_feature_importance['feature'].head(30).tolist()

# Step 9: Subset to top SHAP features
X_train_shap = X_res[top_features]
X_test_shap = X_test_scaled[top_features]

# Step 10: GridSearchCV on XGBoost
params = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1],
    'scale_pos_weight': [1, 3, 5]
}

grid = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid=params,
    scoring='f1',
    cv=3,
    n_jobs=-1
)
grid.fit(X_train_shap, y_res)
final_model = grid.best_estimator_
y_pred_xgb = final_model.predict(X_test_shap)

print("🔷 XGBoost GridSearch Results")
print("Best Parameters:", grid.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print("Top 20 SHAP Features:\n", top_features)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


🔷 XGBoost GridSearch Results
Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'scale_pos_weight': 1, 'subsample': 0.8}
Accuracy: 0.9175731132435277
Confusion Matrix:
 [[57316  1145]
 [ 4258  2830]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95     58461
           1       0.71      0.40      0.51      7088

    accuracy                           0.92     65549
   macro avg       0.82      0.69      0.73     65549
weighted avg       0.91      0.92      0.91     65549

Top 20 SHAP Features:
 ['CRIFF_11', 'DEC_CRIFFCHNG1', 'TOTAL_CRIFF1', 'INCOME_BAND1', 'POP_CODE', 'TIMES_IRAC_SLIP', 'LAST_1_YR_RG2', 'PRI_OVERDUE_ACCTS1', 'PRI_ACTIVE_ACCTS1', 'CRIFF_22', 'LATEST_RG3_TENURE', 'NO_YRS_RG3', 'LAST_1_YR_RG1', 'LATEST_CR_DAYS', 'LAST_1_YR_RG3', 'LATEST_DR_DAYS', 'NO_LONS', 'CRIFF_33', 'AGREG_GROUP', 'ALL_LON_OUTS', 'max_consec_overspend', 'OLDEST_RESIDUAL_TENURE', 'CRIFF_55', 'CRIFF_66', '

In [2]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve

lgbm = LGBMClassifier(
    objective='binary',
    class_weight='balanced',      # Focus on minority
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1.0,
    reg_lambda=1.0,
    min_child_weight=10,
    random_state=42
)

lgbm.fit(X_train_shap, y_res)

# Predict and tune threshold
y_probs = lgbm.predict_proba(X_test_shap)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_probs)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-6)
best_thresh = thresholds[np.argmax(f1_scores)]
y_pred = (y_probs >= best_thresh).astype(int)

print("\n🔷 Balanced LightGBM Results")
print(f"Best Threshold: {best_thresh:.2f}, F1: {max(f1_scores):.4f}")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[LightGBM] [Info] Number of positive: 233811, number of negative: 233811
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087967 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7642
[LightGBM] [Info] Number of data points in the train set: 467622, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

🔷 Balanced LightGBM Results
Best Threshold: 0.32, F1: 0.6050
Accuracy: 0.913316755404354
Confusion Matrix:
 [[55516  2945]
 [ 2737  4351]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.95      0.95     58461
           1       0.60      0.61      0.60      7088

    accuracy                           0.91     65549
   macro avg       0.77      0.78      0.78     65549
weighted avg       0.91      0.91      0.91     65549

