Project Assignment: Build & Deploy a Customer Churn Prediction Model

1. Data Loading and Exploration

In [3]:
# Load the dataset (CSV format)

In [4]:
import numpy as np
import pandas as pd

customerChurn = pd.read_csv("customerChurn.csv")

print(customerChurn)

      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service             DSL             No  ...   
1        

In [5]:
# Explore the number of rows, column types, and any missing values

In [6]:
print(customerChurn.shape[0], '\n\n')

print(customerChurn.info, '\n\n')

print(customerChurn.isnull().sum(), '\n\n')

print(customerChurn.isnull().sum().sum())

7043 


<bound method DataFrame.info of       customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female              0     Yes         No       1   
1     5575-GNVDE    Male              0      No         No      34   
2     3668-QPYBK    Male              0      No         No       2   
3     7795-CFOCW    Male              0      No         No      45   
4     9237-HQITU  Female              0      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male              0     Yes        Yes      24   
7039  2234-XADUH  Female              0     Yes        Yes      72   
7040  4801-JZAZL  Female              0     Yes        Yes      11   
7041  8361-LTMKD    Male              1     Yes         No       4   
7042  3186-AJIEK    Male              0      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  ...  \
0              No  No phone service         

In [7]:
# Understand the balance of the target variable (Churn)

In [8]:
customerChurn['Churn'].value_counts(normalize=True)

Churn
No     0.73463
Yes    0.26537
Name: proportion, dtype: float64

2. Preprocessing

In [9]:
# Convert Churn to binary: Yes → 1, No → 0

In [10]:
customerChurn['Churn'] = customerChurn['Churn'].map({'Yes': 1, 'No': 0})

print(customerChurn['Churn'])

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64


In [11]:
# Handle missing or blank values in TotalCharges

In [12]:
customerChurn['TotalCharges'] = pd.to_numeric(customerChurn['TotalCharges'], errors='coerce')
customerChurn['TotalCharges'].fillna(customerChurn['TotalCharges'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customerChurn['TotalCharges'].fillna(customerChurn['TotalCharges'].median(), inplace=True)


In [13]:
# Encode categorical columns using Label Encoding or One-Hot Encoding

In [14]:
customerChurn.drop('customerID', axis=1, inplace=True)

customerChurn = pd.get_dummies(customerChurn, columns=customerChurn.select_dtypes(include='object').columns, drop_first=True)

In [15]:
# Normalize or scale MonthlyCharges and TotalCharges if needed

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

customerChurn[['MonthlyCharges', 'TotalCharges']] = scaler.fit_transform(customerChurn[['MonthlyCharges', 'TotalCharges']])

In [17]:
# Split the data into train and test sets (80:20 or 70:30)

In [18]:
from sklearn.model_selection import train_test_split

X = customerChurn.drop('Churn', axis=1)

y = customerChurn['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

3. Model Training

In [19]:
# Use Logistic Regression from scikit-learn

In [20]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)

In [21]:
# Train the model on the training set

In [22]:
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [23]:
# Evaluate it on the test set using: Accuracy, Precision, Recall, F1 Score and Confusion Matrix

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print(accuracy_score(y_test, y_pred), '\n\n')

print(precision_score(y_test, y_pred), '\n\n')

print(recall_score(y_test, y_pred), '\n\n')

print(f1_score(y_test, y_pred), '\n\n')

print(confusion_matrix(y_test, y_pred))

0.8211497515968772 


0.6850152905198776 


0.6005361930294906 


0.64 


[[933 103]
 [149 224]]


In [25]:
# Save your model using pickle or joblib

In [26]:
import pickle

with open('customer_churn_model.pkl', 'wb') as f:
    pickle.dump(model, f)