In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, classification_report,
    precision_recall_curve
)
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE


In [2]:
df = pd.read_csv(r"C:\Users\dell\OneDrive\Desktop\bank churn\resources\analytical_base_table.csv")  # replace with actual data path
X = df.drop(columns=['Exited'])
y = df['Exited']

# Split into train+val and test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=42)

# Define columns
scale_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'EstimatedSalary']
cat_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember', 'NumOfProducts']


In [3]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), scale_cols),
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)
])


In [21]:


pipeline_weight = ImbPipeline([
    ('preprocessor',preprocessor),
    ('model',DecisionTreeClassifier(class_weight='balanced',random_state=42))
    
])

In [25]:


param_weight = {
    'model__max_leaf_nodes': [5,10,15], 
    'model__min_samples_split': [2, 3, 4],
    'model__criterion': ['gini', 'entropy']
}



In [26]:


grid_weight = GridSearchCV(
    estimator = pipeline_weight,
    param_grid = param_weight,
    scoring='recall',
    cv=5,n_jobs=-1,verbose=2
)

In [27]:
grid_weight.fit(X_train,y_train)
best_model_weight = grid_weight.best_estimator_
print("Best Params:", grid_weight.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Params: {'model__criterion': 'entropy', 'model__max_leaf_nodes': 15, 'model__min_samples_split': 2}


In [28]:
grid_weight.best_score_

0.7714854111405836

In [29]:
y_pred = grid_weight.predict(X_test)


In [31]:
y_pred_train = grid_weight.predict(X_train)

In [None]:
print(classification_report(y_pred_train,y_train))#training data 

              precision    recall  f1-score   support

           0       0.74      0.93      0.83      4055
           1       0.79      0.44      0.57      2345

    accuracy                           0.75      6400
   macro avg       0.77      0.69      0.70      6400
weighted avg       0.76      0.75      0.73      6400



In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test)) #test data 

              precision    recall  f1-score   support

           0       0.74      0.93      0.82      1268
           1       0.77      0.43      0.55       732

    accuracy                           0.75      2000
   macro avg       0.76      0.68      0.69      2000
weighted avg       0.75      0.75      0.72      2000



##Class Weight

In [33]:
prob_val_weight= best_model_weight.predict_proba(X_val)[:,1]
precision_weight,recall_weight,thresholds_weight = precision_recall_curve(y_val,prob_val_weight)
f1_score_weight= (2*precision_weight*recall_weight)/(precision_weight+recall_weight+1e-8)
best_idx_weight = np.argmax(f1_score_weight)
best_thres_weight = thresholds_weight[best_idx_weight]

print(f"Optimal Threshold (F1 max): {best_thres_weight:.4f}")

Optimal Threshold (F1 max): 0.5441


In [34]:
prob_test_weight= best_model_weight.predict_proba(X_test)[:,1]
y_test_pred_weight = (prob_test_weight >= best_thres_weight).astype(int)
print(f"ROC-AUC on Test for weight: {roc_auc_score(y_test, prob_test_weight):.4f}")
print(classification_report(y_test, y_test_pred_weight))

ROC-AUC on Test for weight: 0.8385
              precision    recall  f1-score   support

           0       0.93      0.74      0.82      1593
           1       0.43      0.77      0.55       407

    accuracy                           0.75      2000
   macro avg       0.68      0.76      0.69      2000
weighted avg       0.83      0.75      0.77      2000



If Churn is Expensive → Use Class Weights
Churn cost is usually high → you’d rather catch more churners even if you annoy a few non-churners.

Recall is more critical in such cases.

Class Weights give you 70% recall of churners → good for proactive retention strategies.

In [35]:
# Find threshold with recall >= 0.70 and highest possible precision
recall_cutoff = 0.75
valid_idxs = np.where(recall_weight >= recall_cutoff)[0]

# Of those, pick the one with highest precision
best_idx_recall = valid_idxs[np.argmax(precision_weight[valid_idxs])]
best_thresh_recall = thresholds_weight[best_idx_recall]

print(f"Threshold for Recall ≥ 75%: {best_thresh_recall:.4f}")


Threshold for Recall ≥ 75%: 0.5441


In [35]:
best_model_weight,


(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num', StandardScaler(),
                                                   ['CreditScore', 'Age',
                                                    'Tenure', 'Balance',
                                                    'EstimatedSalary']),
                                                  ('cat',
                                                   OneHotEncoder(handle_unknown='ignore',
                                                                 sparse_output=False),
                                                   ['Geography', 'Gender',
                                                    'HasCrCard',
                                                    'IsActiveMember',
                                                    'NumOfProducts'])])),
                 ('model',
                  DecisionTreeClassifier(class_weight='balanced',
                                         criterion='entrop

In [36]:
best_thres_weight

0.5440979433241766

In [37]:
from joblib import dump

model_package = {
    'model':best_model_weight,
    'threshold':best_thres_weight
}
dump(model_package,'../models/dtree.pkl')

['../models/dtree.pkl']

In [41]:
df.sample()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
6996,623,Spain,Female,50,2,87116.71,1,1,1,104382.11,0
