# Churn Model Building

In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
print(f'pandas=={pd.__version__}')
print(f'numpy=={np.__version__}')
print(f'sklearn=={sklearn.__version__}')

pandas==2.3.1
numpy==2.3.1
sklearn==1.7.0


In [3]:
# from sklearn.feature_extraction import DictVectorizer
# from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split               # --> data splitting
from sklearn.model_selection import KFold                           # --> create folds

from sklearn.feature_extraction import DictVectorizer             # --> handle categorical variables
from sklearn.linear_model import LogisticRegression               # --> logistic model
from sklearn.metrics import roc_auc_score                         # --> evaluate with auc_roc_score    

In [4]:
!wget 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv', na_values=['', ' '])

df.columns = df.columns.str.lower()

string_cols = list(df.dtypes[df.dtypes=='object'].index)

for col in string_cols:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Replace missingness with in total charges with median value
df['totalcharges'] = df['totalcharges'].fillna(df['totalcharges'].median())

df.churn = (df.churn == 'yes').astype(int)

print(df.isnull().sum())
    

--2025-11-03 19:22:54--  https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/refs/heads/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 977501 (955K) [text/plain]
Saving to: ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’


2025-11-03 19:22:54 (132 MB/s) - ‘WA_Fn-UseC_-Telco-Customer-Churn.csv’ saved [977501/977501]

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0

In [5]:
df_full_train, df_test = train_test_split(df, test_size=0.2,random_state=1)
df_full_train.shape, df_test.shape


((5634, 21), (1409, 21))

In [6]:
numerical = ['tenure', 'monthlycharges', 'totalcharges']

categorical = [
     'gender',
     'seniorcitizen',
     'partner',
     'dependents',
     'phoneservice',
     'multiplelines',
     'internetservice',
     'onlinesecurity',
     'onlinebackup',
     'deviceprotection',
     'techsupport',
     'streamingtv',
     'streamingmovies',
     'contract',
     'paperlessbilling',
     'paymentmethod',
]


In [7]:
# training pipeline - function that accepts X, y, C,
# get the dictvectorizer on train, fit the logistic model, returns dv, model

def train(df_train, y_train, C=0.5):
    train_dicts = df_train[numerical + categorical].to_dict(orient = 'records')

    dv = DictVectorizer(sparse = False)

    X_train = dv.fit_transform(train_dicts)

    model = LogisticRegression(C=0.5, max_iter=10000).fit(X_train, y_train)

    return dv, model

In [8]:
#prediction function

def predict(df_val, dv, model):
    val_dicts = df_val[numerical + categorical].to_dict(orient = 'records')
    X_val = dv.transform(val_dicts)

    y_pred = model.predict_proba(X_val)[:,1]

    return y_pred

In [9]:
C = 0.5
n_splits = 5

kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model = train(df_train, y_train, C=C) #--> the train function we wrote
    y_pred = predict(df_val, dv, model)       #--> the predict function we wrote

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

print('C = %s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))


C = 0.5 0.842 +- 0.007


In [10]:
dv, model = train(df_full_train, df_full_train.churn.values, C=0.5)
y_pred = predict(df_test, dv, model)
y_test = df_test.churn.values

auc = roc_auc_score(y_test, y_pred); auc

0.8583490417844801

# Save the training model

In [11]:
import pickle
output_file = f'model_C={C}.bin'
print(output_file)

with open(output_file, 'wb') as f_out:
    pickle.dump((dv, model), f_out)
    #do stuff

#do other stuff below
dv, model

model_C=0.5.bin


(DictVectorizer(sparse=False), LogisticRegression(C=0.5, max_iter=10000))

## Post Model Training 
## Load the model for prediction

In [12]:
import pickle

model_file = 'model_C=0.5.bin'

with open(model_file, 'rb') as f_in:
    dv, model = pickle.load(f_in)
    
dv, model

(DictVectorizer(sparse=False), LogisticRegression(C=0.5, max_iter=10000))

In [13]:
# df_test.shape

test_customer = {
    'gender': 'male',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'yes',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbiling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 29.85,
    'totalcharges': (1 * 29.85)
}

X = dv.transform([test_customer])
churn_proba = model.predict_proba(X)[0,1]
print(churn_proba)

if churn_proba >=0.5:
    print('send email with promo')
else:
    print('Do not send promo')

0.5624257721722079
send email with promo


# Option 2 - Efficient Pipeline 
## Fit a final model with pipelines - feed the parameters from the CV done prior

In [14]:
from sklearn.pipeline import make_pipeline

best_C = C

training_pipeline = make_pipeline(
    DictVectorizer(sparse = False),
    LogisticRegression(C=best_C, max_iter=10000)    #solver='liblinear'
)

y_train = df_full_train.churn.values
train_dict = df_full_train[categorical + numerical].to_dict(orient='records')
training_pipeline.fit(train_dict, y_train)

test_dict = df_test[Categorical + numerical].to_dict(orient = 'records')
y_pred_pipe = training_pipeline.predict_proba(test_dict)[:,1]
y_test = df_test.churn.values

auc = roc_auc_score(y_test, y_pred_pipe)
print('auc on held out test data is', auc)


# Save pipeline model
import pickle
output_file = f'pipeline_model_C={C}.bin'
print(output_file)
with open(output_file, 'wb') as f_out:
    pickle.dump((training_pipeline), f_out)

#load pipeline model
import pickle
model_file = 'pipeline_model_C=0.5.bin'
with open(model_file, 'rb') as f_in:
    training_pipeline = pickle.load(f_in)

training_pipeline

#predict a customer with pipeline
test_customer2 = {
    'gender': 'male',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'yes',
    'phoneservice': 'no',
    'multiplelines': 'no_phone_service',
    'internetservice': 'dsl',
    'onlinesecurity': 'no',
    'onlinebackup': 'yes',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'no',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbiling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 29.85,
    'totalcharges': (1 * 29.85)
}

churn_proba = training_pipeline.predict_proba(test_customer2)[0,1]
print(churn_proba)

if churn_proba >= 0.5:
    print('send email with promo')
else:
    print('Do not send promo')

pipeline_model_C=0.5.bin
0.5624257721722079
send email with promo


In [15]:
# convert to script
!jupyter nbconvert --to python modelling-churn.ipynb 

#convert and save as different name -->   !jupyter nbconvert --to python modelling-churn.ipynb --output training.py    

[NbConvertApp] Converting notebook modelling-churn.ipynb to python
[NbConvertApp] Writing 6646 bytes to modelling-churn.py


In [17]:
for c in categorical:
    info = df[c].value_counts()
    print(info)
    

gender
male      3555
female    3488
Name: count, dtype: int64
seniorcitizen
0    5901
1    1142
Name: count, dtype: int64
partner
no     3641
yes    3402
Name: count, dtype: int64
dependents
no     4933
yes    2110
Name: count, dtype: int64
phoneservice
yes    6361
no      682
Name: count, dtype: int64
multiplelines
no                  3390
yes                 2971
no_phone_service     682
Name: count, dtype: int64
internetservice
fiber_optic    3096
dsl            2421
no             1526
Name: count, dtype: int64
onlinesecurity
no                     3498
yes                    2019
no_internet_service    1526
Name: count, dtype: int64
onlinebackup
no                     3088
yes                    2429
no_internet_service    1526
Name: count, dtype: int64
deviceprotection
no                     3095
yes                    2422
no_internet_service    1526
Name: count, dtype: int64
techsupport
no                     3473
yes                    2044
no_internet_service    1526
Name: c

In [18]:
for n in numerical:
    info2 = df[n].describe()
    print(info2)

count    7043.000000
mean       32.371149
std        24.559481
min         0.000000
25%         9.000000
50%        29.000000
75%        55.000000
max        72.000000
Name: tenure, dtype: float64
count    7043.000000
mean       64.761692
std        30.090047
min        18.250000
25%        35.500000
50%        70.350000
75%        89.850000
max       118.750000
Name: monthlycharges, dtype: float64
count    7043.000000
mean     2281.916928
std      2265.270398
min        18.800000
25%       402.225000
50%      1397.475000
75%      3786.600000
max      8684.800000
Name: totalcharges, dtype: float64
