
# Week 5 — Churn Modeling with PyCaret  
(With Explicit Step Labels)


In [1]:

# Step 1: Install dependencies (run once if needed)
!pip install -U "pycaret==3.3.2" "pandas<2.2" "scikit-learn<1.5" "imbalanced-learn>=0.10,<0.13"




In [2]:

# Step 2: Configure paths to your dataset files
from pathlib import Path
import pandas as pd

TRAIN_CSV = "cleaned_data.csv"        # Week 2 cleaned dataset
NEW_CSV   = "new_churn_data.csv"      # New dataset for testing
TARGET    = "Churn"                   # Change if your target column differs

print("Training file exists?", Path(TRAIN_CSV).exists())
print("New data file exists?", Path(NEW_CSV).exists())


Training file exists? True
New data file exists? True


In [3]:

# Step 3: Load and inspect training data
df = pd.read_csv(TRAIN_CSV)
print("Initial shape:", df.shape)

if TARGET not in df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in training data")

if df[TARGET].dtype == object:
    df[TARGET] = df[TARGET].astype(str).str.strip().str.lower().map({'yes':1,'no':0}).fillna(df[TARGET])
    df[TARGET] = df[TARGET].astype(int)

for id_col in ['customerID','CustomerID','customer_id']:
    if id_col in df.columns:
        df.drop(columns=[id_col], inplace=True)
        print("Dropped identifier:", id_col)

df.head()


Initial shape: (7043, 9)
Dropped identifier: customerID


Unnamed: 0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,Churn,ChargePerMonth
0,1,No,Month-to-month,Electronic check,29.85,29.85,0,14.925
1,34,Yes,One year,Mailed check,56.95,1889.5,0,53.985714
2,2,Yes,Month-to-month,Mailed check,53.85,108.15,1,36.05
3,45,No,One year,Bank transfer (automatic),42.3,1840.75,0,40.016304
4,2,Yes,Month-to-month,Electronic check,70.7,151.65,1,50.55


In [4]:

# Step 4: Train and compare models with PyCaret
from pycaret.classification import setup, compare_models, pull, finalize_model, save_model, predict_model

clf = setup(
    data=df,
    target=TARGET,
    session_id=42,
    train_size=0.8,
    normalize=True,
    imputation_type=None,
    use_gpu=False,
    fold=5
)

best_model = compare_models(sort="AUC")
leaderboard = pull()
leaderboard.head(10)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,Churn
2,Target type,Binary
3,Original data shape,"(7043, 8)"
4,Transformed data shape,"(7043, 13)"
5,Transformed train set shape,"(5634, 13)"
6,Transformed test set shape,"(1409, 13)"
7,Numeric features,4
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7987,0.8392,0.4957,0.6616,0.5662,0.4387,0.4468,0.838
gbc,Gradient Boosting Classifier,0.7902,0.837,0.4856,0.6377,0.551,0.4174,0.4243,0.448
ada,Ada Boost Classifier,0.7932,0.834,0.4983,0.6436,0.5612,0.4289,0.4352,0.18
ridge,Ridge Classifier,0.7964,0.8326,0.4381,0.6814,0.5328,0.4105,0.4271,0.046
lda,Linear Discriminant Analysis,0.7964,0.8326,0.4883,0.6574,0.5598,0.4312,0.4396,0.056
lightgbm,Light Gradient Boosting Machine,0.7817,0.8261,0.5043,0.607,0.5507,0.4081,0.4114,0.55
qda,Quadratic Discriminant Analysis,0.7323,0.8245,0.7659,0.5022,0.6032,0.4158,0.4402,0.052
svm,SVM - Linear Kernel,0.7845,0.8162,0.491,0.6274,0.5418,0.4059,0.4158,0.056
nb,Naive Bayes,0.6904,0.8108,0.8314,0.4549,0.5879,0.3726,0.4163,1.206
rf,Random Forest Classifier,0.773,0.7981,0.4883,0.5878,0.5332,0.3849,0.388,0.27


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7987,0.8392,0.4957,0.6616,0.5662,0.4387,0.4468,0.838
gbc,Gradient Boosting Classifier,0.7902,0.837,0.4856,0.6377,0.551,0.4174,0.4243,0.448
ada,Ada Boost Classifier,0.7932,0.834,0.4983,0.6436,0.5612,0.4289,0.4352,0.18
ridge,Ridge Classifier,0.7964,0.8326,0.4381,0.6814,0.5328,0.4105,0.4271,0.046
lda,Linear Discriminant Analysis,0.7964,0.8326,0.4883,0.6574,0.5598,0.4312,0.4396,0.056
lightgbm,Light Gradient Boosting Machine,0.7817,0.8261,0.5043,0.607,0.5507,0.4081,0.4114,0.55
qda,Quadratic Discriminant Analysis,0.7323,0.8245,0.7659,0.5022,0.6032,0.4158,0.4402,0.052
svm,SVM - Linear Kernel,0.7845,0.8162,0.491,0.6274,0.5418,0.4059,0.4158,0.056
nb,Naive Bayes,0.6904,0.8108,0.8314,0.4549,0.5879,0.3726,0.4163,1.206
rf,Random Forest Classifier,0.773,0.7981,0.4883,0.5878,0.5332,0.3849,0.388,0.27


In [5]:

# Step 5: Finalize best model and save it
final_model = finalize_model(best_model)
predict_model(final_model).head()

MODEL_NAME = "week5_churn_model"
save_model(final_model, MODEL_NAME)
print("Model saved to:", MODEL_NAME + ".pkl")


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.7921,0.841,0.4706,0.6494,0.5457,0.4153,0.4244


Transformation Pipeline and Model Successfully Saved
Model saved to: week5_churn_model.pkl


In [6]:

# Step 6: Predict on new data (with ChargePerMonth fix)
from pycaret.classification import load_model, predict_model

new_df = pd.read_csv(NEW_CSV)

# Ensure engineered feature exists
if 'TotalCharges' in new_df.columns and 'tenure' in new_df.columns:
    new_df['TotalCharges'] = pd.to_numeric(new_df['TotalCharges'], errors='coerce')
    new_df['tenure'] = pd.to_numeric(new_df['tenure'], errors='coerce').fillna(0)
    if 'ChargePerMonth' not in new_df.columns:
        new_df['ChargePerMonth'] = new_df['TotalCharges'] / (new_df['tenure'] + 1.0)

for id_col in ['customerID','CustomerID','customer_id']:
    if id_col in new_df.columns:
        new_df.drop(columns=[id_col], inplace=True)

model = load_model(MODEL_NAME)
preds = predict_model(model, data=new_df, raw_score=True)
preds.head()


Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,tenure,PhoneService,Contract,PaymentMethod,MonthlyCharges,TotalCharges,charge_per_tenure,ChargePerMonth,prediction_label,prediction_score_0,prediction_score_1
0,22,1,0,2,97.400002,811.700012,36.895454,35.291306,1,0.0659,0.9341
1,8,0,1,1,77.300003,1701.949951,212.743744,189.10556,0,0.9699,0.0301
2,28,1,0,0,28.25,250.899994,8.960714,8.651724,0,0.6897,0.3103
3,62,1,0,2,101.699997,3106.560059,50.105808,49.310474,1,0.2296,0.7704
4,10,0,0,1,51.150002,3440.969971,344.096985,312.81546,0,0.9999,0.0001


In [7]:

# Step 7: Quick evaluation on provided true labels [1, 0, 0, 1, 0]
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import numpy as np

true_labels = [1, 0, 0, 1, 0]
y_true = np.array(true_labels)

# Try to find prediction column
label_col = None
for c in preds.columns:
    if c.lower() in ["label","prediction_label","predicted_label"]:
        label_col = c
        break

if label_col is None:
    if 'Score_1' in preds.columns:
        preds['PredLabel'] = (preds['Score_1'] >= 0.5).astype(int)
        label_col = 'PredLabel'

if label_col:
    y_pred = preds[label_col].astype(int).values[:len(y_true)]
    print("Confusion matrix:\n", confusion_matrix(y_true,y_pred))
    print("\nClassification report:\n", classification_report(y_true,y_pred,digits=3))

    if 'Score_1' in preds.columns:
        auc_val = roc_auc_score(y_true, preds['Score_1'].values[:len(y_true)])
        print("AUC:", round(auc_val,3))
else:
    print("Could not infer prediction labels")


Confusion matrix:
 [[3 0]
 [0 2]]

Classification report:
               precision    recall  f1-score   support

           0      1.000     1.000     1.000         3
           1      1.000     1.000     1.000         2

    accuracy                          1.000         5
   macro avg      1.000     1.000     1.000         5
weighted avg      1.000     1.000     1.000         5



Summary:
In this project, I applied PyCaret to the cleaned Week 2 churn dataset to identify the best-performing classification model for predicting customer churn, using AUC as the primary metric since it balances predictive power across imbalanced classes. After comparing multiple algorithms, I finalized and saved the best model, then created a reusable Python module (churn_predictor.py) to streamline predictions on new data. For evaluation, I tested the model on new_churn_data.csv, where I engineered the ChargePerMonth feature (TotalCharges divided by tenure plus one) to ensure consistency with the training features. The predictions generated churn probabilities for each record, and I compared them with the provided true labels [1, 0, 0, 1, 0] to compute confusion matrix, classification report, and AUC. The results highlighted the model’s ability to capture churn patterns, though adjustments to probability thresholds or recall-focused metrics may further improve performance.

Citation:
1. Chatgpt
2. Copilot