In [None]:
Problem Statement

Customer churn leads to significant revenue loss for businesses, making early identification of at-risk customers critical. Manual analysis is inefficient and does not scale well with large customer datasets.

Objective

To build an end-to-end, production-ready machine learning pipeline that predicts customer churn using automated preprocessing, model training, and hyperparameter tuning.

In [2]:
#Import Required libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import joblib

In [4]:
#Load the Telco Churn Dataset
df = pd.read_csv("Telco-Customer-Churn.csv")
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
#Target Variable
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})

In [8]:
#Drop ID column
df.drop("customerID", axis=1, inplace=True)

In [10]:
#Train-Test Split
X = df.drop("Churn", axis=1)
y = df["Churn"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocessing Pipeline (Core of This Task)

This ensures:

- Scaling only numeric features

- Encoding categorical features

- No data leakage

In [12]:
#Identify feature types
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

In [14]:
#Build ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
    ]
)

# Model Pipelines

In [16]:
#Logistic Regression Pipeline
lr_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", LogisticRegression(max_iter=1000))
    ]
)

In [18]:
#Random Forest Pipeline
rf_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", RandomForestClassifier(random_state=42))
    ]
)

# Hyperparameter Tuning with GridSearchCV

In [20]:
#Logistic Regression Grid
lr_param_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__solver": ["liblinear"]
}

In [22]:
lr_grid = GridSearchCV(
    lr_pipeline,
    lr_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

lr_grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step..._iter=1000))])
,param_grid,"{'model__C': [0.01, 0.1, ...], 'model__solver': ['liblinear']}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [24]:
#Random Forest Grid
rf_param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5]
}

In [26]:
rf_grid = GridSearchCV(
    rf_pipeline,
    rf_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)

rf_grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'model__max_depth': [None, 10, ...], 'model__min_samples_split': [2, 5], 'model__n_estimators': [100, 200]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


# Evaluate Best Model

In [32]:
#Print Best Scores & Parameters
print("Logistic Regression best F1:", lr_grid.best_score_)
print("Logistic Regression best params:", lr_grid.best_params_)

print("\nRandom Forest best F1:", rf_grid.best_score_)
print("Random Forest best params:", rf_grid.best_params_)

# Note: These F1 scores come from cross-validation, not the test set.

Logistic Regression best F1: 0.5927682070909045
Logistic Regression best params: {'model__C': 10, 'model__solver': 'liblinear'}

Random Forest best F1: 0.5472233161343687
Random Forest best params: {'model__max_depth': None, 'model__min_samples_split': 5, 'model__n_estimators': 100}


In [34]:
#Select the Best Model Automatically
if lr_grid.best_score_ > rf_grid.best_score_:
    best_model = lr_grid.best_estimator_
    best_model_name = "Logistic Regression"
else:
    best_model = rf_grid.best_estimator_
    best_model_name = "Random Forest"

In [36]:
#Evaluate the Chosen Best Model on Test Data
y_pred = best_model.predict(X_test)

print("Best Model:", best_model_name)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Test F1-score:", f1_score(y_test, y_pred))

Best Model: Logistic Regression
Test Accuracy: 0.7920511000709723
Test F1-score: 0.5820256776034237


Note: The final model was selected by comparing cross-validated F1-scores from GridSearchCV across multiple algorithms, and the best-performing pipeline was evaluated on the held-out test set.

# Export the Complete Pipeline (Production-Ready)
This file contains:

- Preprocessing

- Feature encoding

- Scaling

- Trained model

No extra steps needed at inference time.

In [38]:
joblib.dump(best_model, "customer_churn_pipeline.pkl")

['customer_churn_pipeline.pkl']

# Load & Use the Pipeline (Reusability Demo)

In [40]:
loaded_model = joblib.load("customer_churn_pipeline.pkl")

sample = X_test.iloc[[0]]
prediction = loaded_model.predict(sample)

print("Churn Prediction:", prediction[0])

Churn Prediction: 0
