# 3.1  Churn Prediction Project
- Dataset Link: https://github.com/alexeygrigorev/mlbookcamp-code/tree/master/chapter-03-churn-prediction

# 3.2 Data Prep
- Import relevant libraries
- Download data and read with pandas - make sure to catch empty strings in columns
- Inspect Data
- Make column names and values look uniform
- Check if all column names read correctly
- Check for missingness 
- Check if the churn variable needs any preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
!wget 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv', na_values=['', ' '])
print(df.head(n=5))
print(df.shape)

--2025-10-27 05:16:09--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘WA_Fn-UseC_-Telco-Customer-Churn.csv.1’


2025-10-27 05:16:09 (82.9 MB/s) - ‘WA_Fn-UseC_-Telco-Customer-Churn.csv.1’ saved [977501/977501]

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No    

In [3]:
# Lower-case every column header so later cells can rely on a single spelling.
df.columns = df.columns.map(str.lower)
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [4]:
# Check how pandas typed each column after loading.
df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                object
dtype: object

In [5]:
# format values in string columns
string_cols = list(df.dtypes[df.dtypes=='object'].index)
string_cols

for col in string_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [6]:
# Count missing values per column.
df.isnull().sum()

customerid           0
gender               0
seniorcitizen        0
partner              0
dependents           0
tenure               0
phoneservice         0
multiplelines        0
internetservice      0
onlinesecurity       0
onlinebackup         0
deviceprotection     0
techsupport          0
streamingtv          0
streamingmovies      0
contract             0
paperlessbilling     0
paymentmethod        0
monthlycharges       0
totalcharges        11
churn                0
dtype: int64

In [7]:
# Fill the missing `totalcharges` values with the column median, then re-check missingness.
# NOTE(review): the median is computed on the full dataset, before the train/val/test
# split below, so held-out rows influence the fill value — confirm this is acceptable.
totalcharges_median = df['totalcharges'].median()
df['totalcharges'] = df['totalcharges'].fillna(totalcharges_median)
df.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [8]:
# Transpose the first rows so every column is visible as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [9]:
# Re-check the dtypes after cleaning.
# `totalcharges` is already float64 here: the CSV was read with `na_values=['', ' ']`,
# so its blank entries became NaN at load time and no separate
# `pd.to_numeric(df['totalcharges'], errors='coerce')` step is needed.
# NOTE(review): an earlier draft also flagged `seniorcitizen` (int64) as incorrectly
# coded without saying what it should be — decide whether it needs recoding.

df.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                object
dtype: object

In [10]:
# Encode the target as 0/1 (1 = churned, i.e. the original value 'yes').
# The guard makes the cell idempotent: once `churn` is numeric, comparing it with
# 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)
df['churn'].value_counts()  # imbalanced: far more 0s than 1s

churn
0    5174
1    1869
Name: count, dtype: int64

# 3.3  Setting up the Validation Framework
- Perform train/validation/test split with scikit learn

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
df_full_train, df_test = train_test_split(df, random_state=1, test_size=0.2)
(len(df_full_train), len(df_test))

(5634, 1409)

In [13]:
# Take 25% of the remaining 80% as validation, i.e. 20% of the whole dataset.
df_train, df_val = train_test_split(df_full_train, random_state=1, test_size=0.25)
(len(df_train), len(df_val))

(4225, 1409)

In [14]:
# Sizes of the three splits used from here on: train, validation, test.
len(df_train), len(df_val), len(df_test)


(4225, 1409, 1409)

In [15]:
# Give each split a fresh 0..n-1 index, discarding the shuffled original row labels.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [16]:
# Separate the target from the features: `pop` returns the `churn` column and
# removes it from the frame in one step, so the label cannot be used as a feature.
# This cell cannot be re-run on its own, because `churn` is gone after the first run.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

print(type(y_train))
print(type(y_val))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [17]:
# Peek at the training labels (0/1 array).
y_train


array([0, 0, 1, ..., 1, 0, 1], shape=(4225,))

In [18]:
# Peek at the validation labels (0/1 array).
y_val

array([0, 0, 0, ..., 0, 1, 1], shape=(1409,))

In [19]:
# Feature-frame shapes after the target column was removed (train, validation, test).
for split_frame in (df_train, df_val, df_test):
    print(split_frame.shape)

(4225, 20)
(1409, 20)
(1409, 20)


In [20]:
#train_test_split?

# 3.4 EDA
- Check Missing Values
- Look at target variable(Churn)
- Look at numerical and categorical variables

In [21]:
# EDA runs on the full training portion (train + validation), which still contains `churn`.
# Reset its index first so rows are labelled 0..n-1.
df_full_train = df_full_train.reset_index(drop=True)
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [22]:
# Confirm there are no missing values left in the full training data.
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [23]:
# Absolute class counts of the target.
df_full_train['churn'].value_counts()

churn
0    4113
1    1521
Name: count, dtype: int64

In [24]:
# Class proportions of the target; the share of 1s is the churn rate.
df_full_train['churn'].value_counts(normalize=True)

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [25]:
# Because `churn` is coded 0/1, its mean is the fraction of customers who churned.
global_churn_rate = df_full_train['churn'].mean()
round(global_churn_rate, 2)

np.float64(0.27)

In [26]:
# Feature groups used for EDA and modelling (`customerid` and `churn` are left out).
numerical = ['tenure', 'monthlycharges', 'totalcharges']

# `seniorcitizen` is stored as an int64 0/1 flag but is treated as a category here.
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
               'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
               'paymentmethod']

In [27]:
# Number of distinct values per categorical feature.
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# 3.5 Feature Importance: Churn Rate & Risk Ratio

- Churn rate
- Risk Ratio
- Mutual information - later

## Churn Rate
- Subtract group churn from global churn (global_churn - group_churn)
    - If result is more than zero(positive), then group is less likely to churn
    - If result is < 0, the group is more likely to churn

In [28]:
# Churn rate within one group: female customers.
female_mask = df_full_train['gender'] == 'female'
churn_female = df_full_train.loc[female_mask, 'churn'].mean()
float(churn_female)

0.27682403433476394

In [29]:
# Churn rate among male customers (rounded for display only).
male_mask = df_full_train['gender'] == 'male'
churn_male = df_full_train.loc[male_mask, 'churn'].mean()
float(round(churn_male, 2))


0.26

In [30]:
# Global churn rate, shown unrounded for comparison with the group rates.
float(global_churn_rate)

0.26996805111821087

In [31]:
# Churn rate among customers who have a partner.
partner_mask = df_full_train['partner'] == 'yes'
churn_partner = df_full_train.loc[partner_mask, 'churn'].mean()
float(round(churn_partner, 2))

0.21

In [32]:
# Churn rate among customers without a partner.
no_partner_mask = df_full_train['partner'] == 'no'
churn_no_partner = df_full_train.loc[no_partner_mask, 'churn'].mean()
float(round(churn_no_partner, 2))

0.33

In [33]:
# Churn rate among male senior citizens on a month-to-month contract.
segment_mask = (
    (df_full_train['gender'] == 'male')
    & (df_full_train['seniorcitizen'] == 1)
    & (df_full_train['contract'] == 'month-to-month')
)
churn_male_senior_citizens_monthly_contract = df_full_train.loc[segment_mask, 'churn'].mean()
float(churn_male_senior_citizens_monthly_contract)

# The churn rate for this subgroup is higher than the global churn rate,
# so these customers are more likely to churn.

0.5292307692307693

In [34]:
float(global_churn_rate - churn_female) # negative difference: females are slightly more likely to churn than average (weak effect)

-0.006855983216553063

In [35]:
float(global_churn_rate - churn_male) # positive difference: males are slightly less likely to churn than average (weak effect)

0.006754520462819769

## Risk Ratio
 - Divide group churn rate by global churn rate (group churn / global churn)
     - If result is > 1, then group is more likely to churn
     - If result is < 1, group is less likely to churn

In [36]:
float(churn_no_partner/ global_churn_rate)
# risk ratio > 1: customers without a partner are more likely to churn than the average customer


1.2216593879412643

In [37]:
float(churn_male_senior_citizens_monthly_contract / global_churn_rate)

# Male senior citizens on a month-to-month contract are roughly twice as likely to churn as the average customer.

1.9603459262630862

In [38]:
from IPython.display import display

# For every categorical feature, tabulate per-value churn rate and group size, plus
#   diff = global rate - group rate   (negative => group churns more than average)
#   risk = group rate / global rate   (> 1     => group churns more than average)
for cols in categorical:
    print(cols)
    df_group = df_full_train.groupby(cols)['churn'].agg(['mean', 'count'])
    df_group = df_group.assign(
        diff=global_churn_rate - df_group['mean'],
        risk=df_group['mean'] / global_churn_rate,
    )
    display(df_group)
    print('\n')

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,-0.006856,1.025396
male,0.263214,2838,0.006755,0.97498




seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,0.027698,0.897403
1,0.413377,912,-0.143409,1.531208




partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,-0.059841,1.221659
yes,0.205033,2702,0.064935,0.759472




dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,-0.043792,1.162212
yes,0.165666,1666,0.104302,0.613651




phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,0.028652,0.89387
yes,0.273049,5087,-0.003081,1.011412




multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,0.012561,0.953474
no_phone_service,0.241316,547,0.028652,0.89387
yes,0.290742,2387,-0.020773,1.076948




internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,0.077621,0.712482
fiber_optic,0.425171,2479,-0.155203,1.574895
no,0.077805,1221,0.192163,0.288201




onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,-0.150953,1.559152
no_internet_service,0.077805,1221,0.192163,0.288201
yes,0.153226,1612,0.116742,0.56757




onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,-0.134355,1.497672
no_internet_service,0.077805,1221,0.192163,0.288201
yes,0.217232,1915,0.052736,0.80466




deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,-0.125907,1.466379
no_internet_service,0.077805,1221,0.192163,0.288201
yes,0.230412,1940,0.039556,0.85348




techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,-0.148946,1.551717
no_internet_service,0.077805,1221,0.192163,0.288201
yes,0.159926,1632,0.110042,0.59239




streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,-0.072864,1.269897
no_internet_service,0.077805,1221,0.192163,0.288201
yes,0.302723,2167,-0.032755,1.121328




streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,-0.068938,1.255358
no_internet_service,0.077805,1221,0.192163,0.288201
yes,0.307273,2200,-0.037305,1.138182




contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,-0.161733,1.599082
one_year,0.120573,1186,0.149395,0.446621
two_year,0.028274,1344,0.241694,0.10473




paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,0.097897,0.637375
yes,0.338151,3321,-0.068183,1.25256




paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,0.101797,0.622928
credit_card_(automatic),0.164339,1217,0.10563,0.608733
electronic_check,0.45589,1893,-0.185922,1.688682
mailed_check,0.19387,1305,0.076098,0.718121






# 3.6

In [39]:
# Reminder of the categorical features that will be scored below.
categorical

['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

# 3.6 Feature Importance - Mutual Information
- Tells us how much we can learn about one variable if we know the value of another.
- Mutual Information between two clusterings - The Mutual Information is a measure of the similarity between two labels of the same data
- It's all about relative importance: the scores are compared across features rather than read as absolute values.

In [40]:
from sklearn.metrics import mutual_info_score
# mutual_info_score?

In [41]:
# Mutual information between churn and the senior-citizen flag.
mutual_info_score(df_full_train.churn, df_full_train.seniorcitizen)

0.009410216144208144

In [42]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

# Knowing a customer's contract type tells us a lot about churn (e.g. month-to-month contracts carry a higher churn risk).

0.0983203874041556

In [43]:
mutual_info_score(df_full_train.churn, df_full_train.gender)

# The value is very low: knowing a customer's gender tells us almost nothing about their churn status.

0.0001174846211139946

In [44]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and `df_full_train.churn`.

    Reads the notebook-level `df_full_train`, so `series` is expected to be one of
    its columns (it is applied column-wise via `DataFrame.apply` in the next cell).
    """
    churn_target = df_full_train.churn
    return mutual_info_score(series, churn_target)

In [45]:
# Score every categorical feature against churn and rank from most to least informative.
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending = False)

# Rule of thumb used in this notebook:
#   score > 0.05           -> strong relevance
#   0.01 <= score <= 0.05  -> moderate relevance
#   score < 0.01           -> weak relevance

# These are the kinds of signals ML models pick up during training.

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

# 3.7 Feature Importance - Correlation

In [46]:
df_full_train[numerical].corrwith(df_full_train.churn)
# Negative correlation for tenure: the longer customers stay with the company, the less they churn.

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.197006
dtype: float64

In [47]:
print(df_full_train[df_full_train.tenure <=2].churn.mean())
# Churn rate for customers with tenure <= 2 is about 60%.
# NOTE(review): tenure is presumably measured in months, not years — verify against the dataset description.

print(df_full_train[(df_full_train.tenure > 2) & (df_full_train.tenure <=12) ].churn.mean())

print(df_full_train[df_full_train.tenure > 12].churn.mean() )

# The churn rate decreases as tenure increases.

# TODO: do the same bucketed comparison for monthly charges.

0.5953420669577875
0.3994413407821229
0.17634908339788277


# 3.8 One-hot Encoding
- Use scikit-learn to encode categorical features

In [48]:
from sklearn.feature_extraction import DictVectorizer
#DictVectorizer is clever enough to identify numeric variables and does not process numeric variables. 

In [49]:
# Vectorizer that will learn the feature layout; sparse=False makes it return a dense array.
dv = DictVectorizer(sparse=False)

# One dict per training row, holding the categorical and numerical features.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [50]:
# Fit the vectorizer on the training dicts (it learns which features and categories exist)
# and, in the same call, transform them into the training feature matrix.

X_train = dv.fit_transform(train_dicts) # fit on train only, then encode train
X_train.shape

(4225, 45)

In [52]:
# Validation data: reuse the vectorizer fitted on the training data (transform only, no re-fit)
# to turn the validation dicts into a feature matrix with the same columns.
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records') 
X_val = dv.transform(val_dicts) 
X_val.shape

(1409, 45)

# 3.9 Logistic Regression