# In [None]:
# 📘 Customer Churn Prediction – Final Production Pipeline
# ========================================================

# 1️⃣ IMPORT LIBRARIES
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import joblib
import optuna
import shap

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, recall_score
from xgboost import XGBClassifier

# ================================
# 2️⃣ LOAD DATA & CLEANING
# ================================
# Read the raw churn dataset (⚠️ adjust the relative path to your layout).
df = pd.read_csv("../data/Bank_Churn.csv")

# The identifier columns are not used for prediction, so remove them up front.
df = df.drop(columns=["CustomerId", "Surname"])

print("Data Loaded. Shape:", df.shape)
print(df.head())

# ================================
# 3️⃣ TRAIN-TEST SPLIT
# ================================
# Split BEFORE any encoder/scaler is fitted so that test rows never
# influence the fitted transformers (prevents data leakage).
X = df.drop("Exited", axis=1)
y = df["Exited"]

# stratify=y keeps the churn rate the same in both partitions. Without it,
# a random split of an imbalanced target can leave train and test with
# different class ratios, which skews both scale_pos_weight and the metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Class-imbalance weight handed to XGBoost as scale_pos_weight:
# (count of negatives / count of positives) in the training partition.
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Calculated scale_pos_weight: {pos_weight:.2f}")

# ================================
# 4️⃣ DEFINE THE PREPROCESSING PIPELINE
# ================================
# Column groups, keyed by the kind of transformation each group receives.
numerical_features = [
    'CreditScore',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'EstimatedSalary',
]
categorical_features = ['Geography', 'Gender']

# One ColumnTransformer applies every transformation in a single step:
#   * 'num' - standardise the numeric columns
#   * 'cat' - integer-encode the categorical columns
# Columns not listed in either group (per the original notes: 'HasCrCard'
# and 'IsActiveMember') are passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', OrdinalEncoder(), categorical_features),
    ],
    remainder='passthrough',
)

# ================================
# 5️⃣ OPTUNA OPTIMIZATION (Optional)
# ================================
# ℹ️ NOTE: The hyperparameter search in this section is optional and slow.
# The final model is trained with whatever `best_params` holds at the end of this section.

def objective(trial):
    """Optuna objective: cross-validated ROC AUC of one XGBoost configuration.

    The score is computed with cross-validation on the training partition
    only. The hold-out test set must not be used here: tuning against it
    would leak test information into the chosen hyperparameters and make the
    final evaluation optimistically biased.

    Args:
        trial: Optuna trial used to sample the tuned hyperparameters.

    Returns:
        Mean ROC AUC across 5 cross-validation folds of ``X_train``/``y_train``.
    """
    params = {
        # Hyperparameters sampled by Optuna for this trial.
        "n_estimators": trial.suggest_int("n_estimators", 300, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        # Fixed settings shared by every trial.
        "scale_pos_weight": pos_weight,
        "eval_metric": "auc",
        "random_state": 42,
        "n_jobs": -1,
    }

    # Temporary pipeline for this trial. cross_val_score clones it for every
    # fold, so the shared `preprocessor` object is never fitted in place and
    # each fold's scaler/encoder only sees that fold's training rows.
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', XGBClassifier(**params)),
    ])

    fold_scores = cross_val_score(
        model, X_train, y_train, cv=5, scoring="roc_auc"
    )
    return fold_scores.mean()

# Set to True to re-run the (slow) Optuna search. It is off by default so the
# script runs quickly using the pre-calculated parameters below.
RUN_OPTUNA = False

# ⚠️ Pre-calculated best parameters (taken from earlier runs) plus the fixed
# settings that are not part of the search space.
best_params = {
    'n_estimators': 455,
    'learning_rate': 0.12,
    'max_depth': 4,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'scale_pos_weight': pos_weight,  # up-weights the positive (churn) class
    'eval_metric': 'logloss',
    'random_state': 42,
    'n_jobs': -1
}

if RUN_OPTUNA:
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=20)
    # study.best_params contains only the values sampled through `trial`,
    # so overlay them on the defaults to keep scale_pos_weight, random_state
    # and the other fixed settings in the final model.
    best_params = {**best_params, **study.best_params}

# ================================
# 6️⃣ BUILD & TRAIN FINAL PIPELINE
# ================================
# Preprocessing and classifier live in one Pipeline so that the same
# transformations are applied at fit time and at prediction time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(**best_params))
])

print("🚀 Training Pipeline...")
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
pipeline.fit(X_train, y_train)
print(f"✅ Training Complete. Time: {time.perf_counter() - start_time:.2f}s")

# ================================
# 7️⃣ EVALUATION
# ================================
# Predicted probability of the positive class for every test row.
y_prob = pipeline.predict_proba(X_test)[:, 1]

# Turn probabilities into hard labels. 0.5 is the default cut-off; it can be
# tuned (e.g. with F2-score logic) if a different trade-off is wanted.
threshold = 0.5
y_pred = np.where(y_prob >= threshold, 1, 0)

print("\n--- Final Classification Report ---")
print(classification_report(y_test, y_pred))

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_title("Confusion Matrix")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
plt.show()

# ROC curve with the AUC in the legend and the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_test, y_prob)
auc = roc_auc_score(y_test, y_prob)
plt.plot(fpr, tpr, label=f"AUC = {auc:.2f}")
plt.plot([0, 1], [0, 1], '--')
plt.title("ROC Curve")
plt.legend()
plt.show()

# ================================
# 8️⃣ SHAP EXPLAINABILITY
# ================================
# SHAP is run on the classifier step alone, so it needs the *transformed*
# feature matrix (the output of the pipeline's preprocessor), not raw data.

print("Generating SHAP explanations...")

# 1. Transform X_test using the pipeline's fitted preprocessor.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_test_transformed = fitted_preprocessor.transform(X_test)

# 2. Take the output column order from the fitted transformer instead of
#    hard-coding it, so the labels stay correct if the column lists or the
#    passthrough columns change. Names are reported as "<step>__<column>";
#    strip the step prefix to recover the original column name.
feature_names = [
    name.split("__", 1)[-1]
    for name in fitted_preprocessor.get_feature_names_out()
]

# 3. Build the explainer on the trained tree model and compute SHAP values.
model_step = pipeline.named_steps['classifier']
explainer = shap.TreeExplainer(model_step)
shap_values = explainer.shap_values(X_test_transformed)

# 4. Plot. show=False defers rendering to the plt.show() call below.
shap.initjs()
plt.title("Feature Importance (SHAP)")
shap.summary_plot(shap_values, X_test_transformed, feature_names=feature_names, show=False)
plt.show()

# ================================
# 9️⃣ BUSINESS & FINANCIAL REPORT
# ================================
# Assumptions (hard-coded business inputs):
#   clv  - value retained per churner who is correctly targeted
#   cost - cost of one retention intervention
clv = 1200
cost = 50

# Unpack the 2x2 confusion matrix computed during evaluation.
tn, fp, fn, tp = cm.ravel()

# Every predicted churner (TP + FP) receives an intervention; only true
# positives count towards saved revenue. This treats each targeted churner
# as retained, so the figure is an upper bound.
saved_revenue = tp * clv
intervention_cost = (tp + fp) * cost
# Net return in currency units (saved revenue minus spend), not a ratio.
roi = saved_revenue - intervention_cost

# Markdown report. "Performance" is a parent bullet with two nested
# sub-bullets so that Recall and AUC render as a proper list.
report = f"""
# 📊 **Customer Churn Prediction – Final Report**

## 🔍 Executive Summary
- **Model:** XGBoost Pipeline (StandardScaler + OrdinalEncoder)
- **Performance:**
    - **Recall (Churners):** {recall_score(y_test, y_pred):.2%}
    - **AUC Score:** {auc:.2f}
- **Strategy:** The model uses `scale_pos_weight={pos_weight:.2f}` to prioritize detecting churners.

## 🧠 Why are they leaving? (SHAP Insights)
1. **Age:** Older customers are the highest risk group.
2. **Activity:** Inactive members (`IsActiveMember=0`) are likely to leave.
3. **Products:** Customers with 1 product churn; those with 2 stay.
4. **Balance:** High balance customers are leaving (Rate shopping?).

## 💰 Financial Impact
- **Targeted Customers:** {tp + fp}
- **Churners Saved:** {tp}
- **Projected ROI:** ${roi:,.2f}
"""
print(report)

# ================================
# 🔟 SAVE FOR PRODUCTION
# ================================
# Persist the WHOLE fitted pipeline (scaler + encoder + model) in one file so
# that inference applies exactly the same preprocessing as training.
joblib.dump(pipeline, "churn_pipeline.joblib")
print("✅ Pipeline saved as 'churn_pipeline.joblib'")
print("   Ready for deployment!")