In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    roc_auc_score, classification_report,
    precision_recall_curve
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Load the analytical base table.
# NOTE: this is a machine-specific absolute path; point it at your own copy of the CSV.
df = pd.read_csv(r"C:\Users\dell\OneDrive\Desktop\bank churn\resources\analytical_base_table.csv")  # replace with actual data path

# Target is the churn flag; every remaining column is kept as a candidate feature.
y = df['Exited']
X = df.drop(columns=['Exited'])

# Two stratified splits: hold out 20% as the test set, then hold out 20% of the
# remainder for validation (64% train / 16% validation / 20% test overall).
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,
    stratify=y_temp,
    random_state=42,
)

# Feature groups: columns to one-hot encode and columns to standardise
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts']
scale_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']


In [4]:
# Standardise the numeric features and one-hot encode the categorical ones.
# Columns listed in neither group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), scale_cols),
        (
            'cat',
            # categories unseen during fit are encoded as all-zero rows instead of raising
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            cat_cols,
        ),
    ]
)


In [5]:
# Variant 1: rebalance the training data with SMOTE after preprocessing,
# then fit a decision tree on the resampled data.
pipeline_smote = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('model', DecisionTreeClassifier(random_state=42)),
    ]
)

# Variant 2: no resampling; the tree compensates for the class imbalance
# through class_weight='balanced'.
pipeline_weight = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
    ]
)

In [19]:
# Tree hyper-parameters searched for the class-weight pipeline (3 x 3 x 2 = 18 candidates)
param_weight = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_leaf_nodes': [10, 20, 30],
    'model__min_samples_split': [2, 3, 4],
}

# The SMOTE pipeline searches the same grid and additionally toggles class
# weighting on the tree (36 candidates)
param_smote = {
    **param_weight,
    'model__class_weight': ['balanced', None],
}



In [20]:
# Both searches share the same protocol: 5-fold cross-validation scored by
# ROC-AUC, run on all available cores with per-fit progress logging.
search_settings = {'scoring': 'roc_auc', 'cv': 5, 'n_jobs': -1, 'verbose': 2}

grid_smote = GridSearchCV(pipeline_smote, param_smote, **search_settings)
grid_weight = GridSearchCV(pipeline_weight, param_weight, **search_settings)

In [21]:
# Run the SMOTE grid search on the training split only; fit() returns the
# search object, so the winning pipeline can be read straight off it.
best_model_smote = grid_smote.fit(X_train, y_train).best_estimator_
print("Best Params:", grid_smote.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Params: {'model__class_weight': 'balanced', 'model__criterion': 'entropy', 'model__max_leaf_nodes': 30, 'model__min_samples_split': 2}


In [24]:
# Mean cross-validated ROC-AUC of the best SMOTE-pipeline candidate
grid_smote.best_score_

0.8401917522916598

In [22]:
# Run the class-weight grid search on the training split only; fit() returns
# the search object, so the winning pipeline can be read straight off it.
best_model_weight = grid_weight.fit(X_train, y_train).best_estimator_
print("Best Params:", grid_weight.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Params: {'model__criterion': 'entropy', 'model__max_leaf_nodes': 20, 'model__min_samples_split': 2}


In [23]:
# Mean cross-validated ROC-AUC of the best class-weight-pipeline candidate
grid_weight.best_score_

0.8384093266876317

### SMOTE with class-weight balancing

In [26]:
# Churn probability (positive-class column) for each validation row, and the
# precision/recall trade-off across every candidate decision threshold.
prob_val_smote = best_model_smote.predict_proba(X_val)[:, 1]
precision_smote, recall_smote, thresholds_smote = precision_recall_curve(
    y_val,
    prob_val_smote,
)


In [27]:
# F1 at every point of the precision-recall curve; the 1e-8 term guards
# against 0/0 where precision and recall are both zero.
f1_score_smote = (2*precision_smote*recall_smote)/(precision_smote+recall_smote+1e-8)

# precision/recall carry one more entry than thresholds: the final
# (precision=1, recall=0) point has no associated threshold. Exclude it from
# the search so the chosen index is always valid for thresholds_smote.
best_idx_smote = np.argmax(f1_score_smote[:-1])

In [29]:
# Decision threshold at the F1-maximising point of the validation precision-recall curve
best_thres_smote = thresholds_smote[best_idx_smote]

print(f"Optimal Threshold (F1 max): {best_thres_smote:.4f}")

Optimal Threshold (F1 max): 0.5890


In [33]:
# Held-out evaluation of the SMOTE pipeline.
prob_test_smote = best_model_smote.predict_proba(X_test)[:, 1]

# ROC-AUC is computed on the raw probabilities and is threshold-independent.
auc_test_smote = roc_auc_score(y_test, prob_test_smote)
print(f"ROC-AUC on Test for Smote: {auc_test_smote:.4f}")

# Hard labels use the threshold tuned on the validation split, not 0.5.
y_test_pred_smote = (prob_test_smote >= best_thres_smote).astype(int)
print(classification_report(y_test, y_test_pred_smote))

ROC-AUC on Test for Smote: 0.8360
              precision    recall  f1-score   support

           0       0.91      0.86      0.88      1593
           1       0.55      0.65      0.59       407

    accuracy                           0.82      2000
   macro avg       0.73      0.76      0.74      2000
weighted avg       0.83      0.82      0.83      2000



## Class Weight

In [30]:
# Churn probability (positive-class column) for each validation row, and the
# precision/recall trade-off across every candidate decision threshold.
prob_val_weight = best_model_weight.predict_proba(X_val)[:, 1]
precision_weight, recall_weight, thresholds_weight = precision_recall_curve(y_val, prob_val_weight)

# F1 at every point of the curve; the 1e-8 term guards against 0/0 where
# precision and recall are both zero.
f1_score_weight = (2*precision_weight*recall_weight)/(precision_weight+recall_weight+1e-8)

# precision/recall carry one more entry than thresholds: the final
# (precision=1, recall=0) point has no associated threshold. Exclude it from
# the search so the chosen index is always valid for thresholds_weight.
best_idx_weight = np.argmax(f1_score_weight[:-1])
best_thres_weight = thresholds_weight[best_idx_weight]

print(f"Optimal Threshold (F1 max): {best_thres_weight:.4f}")

Optimal Threshold (F1 max): 0.5441


In [34]:
# Held-out evaluation of the class-weight pipeline.
prob_test_weight = best_model_weight.predict_proba(X_test)[:, 1]

# ROC-AUC is computed on the raw probabilities and is threshold-independent.
auc_test_weight = roc_auc_score(y_test, prob_test_weight)
print(f"ROC-AUC on Test for weight: {auc_test_weight:.4f}")

# Hard labels use the threshold tuned on the validation split, not 0.5.
y_test_pred_weight = (prob_test_weight >= best_thres_weight).astype(int)
print(classification_report(y_test, y_test_pred_weight))

ROC-AUC on Test for weight: 0.8426
              precision    recall  f1-score   support

           0       0.91      0.82      0.86      1593
           1       0.50      0.70      0.58       407

    accuracy                           0.80      2000
   macro avg       0.71      0.76      0.72      2000
weighted avg       0.83      0.80      0.81      2000



If churn is expensive → use class weights
The cost of losing a customer is usually high, so you’d rather catch more churners even if that means contacting a few non-churners unnecessarily.

Recall is more critical in such cases.

Class weights give 70% recall on churners on the test set (vs. 65% for SMOTE), at the cost of lower precision (0.50 vs. 0.55) → good for proactive retention strategies.

In [35]:
# Display the selected pipeline. No trailing comma: a trailing comma would
# turn the expression into a 1-tuple and show a tuple repr instead of the estimator.
best_model_weight


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num', StandardScaler(),
                                                   ['CreditScore', 'Age',
                                                    'Tenure', 'Balance',
                                                    'EstimatedSalary']),
                                                  ('cat',
                                                   OneHotEncoder(handle_unknown='ignore',
                                                                 sparse_output=False),
                                                   ['Geography', 'Gender',
                                                    'HasCrCard',
                                                    'IsActiveMember',
                                                    'NumOfProducts'])])),
                 ('model',
                  DecisionTreeClassifier(class_weight='balanced',
                                         criterion='entrop

In [37]:
# Full-precision value of the threshold selected on the validation split
best_thres_weight

0.5440979433241766

In [38]:
# Persist the tuned pipeline together with its decision threshold so that
# inference applies the same cut-off that was selected on the validation split.
MODEL_PATH = '../models/dtree.pkl'

model_package = {
    'model': best_model_weight,
    'threshold': best_thres_weight,
}

# dump() does not create missing directories, so make sure the target folder exists.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
dump(model_package, MODEL_PATH)

['../models/dtree.pkl']

In [41]:
# Show one random row of the loaded table (no seed is set, so the row differs between runs)
df.sample()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6996,623,Spain,Female,50,2,87116.71,1,1,1,104382.11,0
