# Machine Learning for Classification

# 

# 3.1 Churn prediction :

In [1]:
# Libraries:

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv

In [3]:
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
# Initial cleaning

In [5]:
df.columns = df.columns.str.lower().str.replace(' ','_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ','_')

In [6]:
df.totalcharges

0         29.85
1        1889.5
2        108.15
3       1840.75
4        151.65
         ...   
7038     1990.5
7039     7362.9
7040     346.45
7041      306.6
7042     6844.5
Name: totalcharges, Length: 7043, dtype: object

In [7]:
#enabling coerce is important for parsing string:
tc = pd.to_numeric(df.totalcharges, errors='coerce')

In [8]:
df[tc.isnull()][['customerid','totalcharges']]

Unnamed: 0,customerid,totalcharges
488,4472-lvygi,_
753,3115-czmzd,_
936,5709-lvoeq,_
1082,4367-nuyao,_
1340,1371-dwpaz,_
3331,7644-omvmy,_
3826,3213-vvolg,_
4380,2520-sgtta,_
5218,2923-arzlg,_
6670,4075-wkniu,_


In [9]:
df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce')

In [10]:
df.totalcharges = df.totalcharges.fillna(0)

In [11]:
# Converting yes and no to binary format:

In [12]:
df.churn = (df.churn == 'yes').astype('int')

# 3.2 Setting up the validation framework

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_full_train, df_test = train_test_split(df,test_size=0.2,random_state=1)

In [15]:
len(df_full_train), len(df_test)

(5634, 1409)

In [16]:
# From full train we split again but we split on %20/%80 for the test size to make sure we split our main data %60 train,%20 validation,%20 test

In [17]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25,random_state=1)

In [18]:
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [19]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [20]:
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [21]:
del df_train['churn']
del df_val['churn']
del df_test['churn']

# 3.3 EDA

- Check missing values
- Look at the target variable(churn)
- Look at numerical and categorical variables

In [22]:
df_full_train = df_full_train.reset_index(drop=True)

In [23]:
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [24]:
# Let's check churn rate:

In [25]:
df_full_train.churn.value_counts(normalize=True)

0    0.730032
1    0.269968
Name: churn, dtype: float64

In [26]:
# We can also calculate the churn rate with mean of target variable:
global_churn_rate = df_full_train.churn.mean()
round(global_churn_rate,2)

0.27

In [27]:
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [28]:
list(df_full_train.dtypes[df_full_train.dtypes =='O'].index)

['customerid',
 'gender',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [29]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [30]:
numerical = ['tenure','monthlycharges','totalcharges']

In [31]:
categorical = [
 'gender',
 'partner',
    'seniorcitizen',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

In [32]:
df_full_train[categorical].nunique()

gender              2
partner             2
seniorcitizen       2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# Feature importance: Churn rate and risk ratio

- Churn rate
- Risk ratio
- Mutual information > later

# Churn rate

In [33]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [34]:
# Let's take a look how differ churn rate by gender:

In [35]:
churn_female = df_full_train[df_full_train.gender == 'female'].churn.mean()
churn_female

0.27682403433476394

In [36]:
churn_male = df_full_train[df_full_train.gender == 'male'].churn.mean()
churn_male

0.2632135306553911

In [37]:
global_churn_rate

0.26996805111821087

In [38]:
# Not much difference so let's look at another feature:

In [39]:
df_full_train.partner.value_counts()

no     2932
yes    2702
Name: partner, dtype: int64

In [40]:
churn_with_partner = df_full_train[df_full_train.partner == 'yes'].churn.mean()
churn_with_partner

0.20503330866025166

In [41]:
global_churn_rate - churn_with_partner

0.06493474245795922

In [42]:
churn_no_partner = df_full_train[df_full_train.partner == 'no'].churn.mean()
churn_no_partner

0.3298090040927694

In [43]:
global_churn_rate - churn_no_partner

-0.05984095297455855

In [44]:
# We can conclude that partner variable importance is more important gender variable in this dataset

# Risk ratio:

In [45]:
churn_no_partner / global_churn_rate

1.2216593879412643

In [46]:
churn_with_partner / global_churn_rate

0.7594724924338315

In [47]:
# To display something that in the loop:

from IPython.display import display

In [48]:
# Let' generalize it for other variables:


for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c).churn.agg(['mean','count'])
    df_group['diff'] = df_group['mean'] - global_churn_rate
    df_group['risk'] = df_group['mean'] / global_churn_rate
    display(df_group)
    #print()
    #print()

gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498


partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472


seniorcitizen


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208


dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651


phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412


multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948


internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201


onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757


onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466


deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348


techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239


streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328


streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182


contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473


paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256


paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121


# 

# Mutual information

-Mutual information concept from information theory, it tells us how much we can learn about one variable if we know the value of another


In [49]:
from sklearn.metrics import mutual_info_score

In [50]:
# It tells us how much we can learn with passing variable:

mutual_info_score(df_full_train.churn,df_full_train.contract)

0.0983203874041556

In [51]:
# We will try to do with every variable in the dataset:

def mutual_info_churn_score(series):
    return mutual_info_score(series,df_full_train.churn)

In [52]:
mi = df_full_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

# Correlation:
- Correlation coefficient

In [53]:
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [54]:
df_full_train[df_full_train.tenure <= 2].churn.mean()

0.5953420669577875

In [55]:
df_full_train[(df_full_train.tenure >= 2) & (df_full_train.tenure <= 12)].churn.mean()

0.41798107255520506

In [56]:
df_full_train[df_full_train.tenure >= 12].churn.mean()

0.17986881937436933

In [57]:
df_full_train[df_full_train.monthlycharges <= 20].churn.mean()

0.08795411089866156

In [58]:
df_full_train[(df_full_train.monthlycharges > 20) & (df_full_train.monthlycharges <= 50)].churn.mean()

0.18340943683409436

In [59]:
df_full_train[df_full_train.monthlycharges > 50].churn.mean()

0.32499341585462205

# One - hot encoding:

- Use Scikit-Learn to encode categorical features

In [60]:
from sklearn.feature_extraction import DictVectorizer

In [61]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [62]:
train_dicts[0]

{'gender': 'female',
 'partner': 'yes',
 'seniorcitizen': 0,
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 72,
 'monthlycharges': 115.5,
 'totalcharges': 8425.15}

In [63]:
dv = DictVectorizer(sparse=False)

In [64]:
dv.get_feature_names_out()

AttributeError: 'DictVectorizer' object has no attribute 'feature_names_'

In [None]:
#  Transform a sparse matrix which is a special way of encoding data when there are many zeros: ( We did ignore sparse above)

In [None]:
list(dv.transform(train_dicts[:5])[0])[0:5]

In [None]:
X_train = dv.fit_transform(train_dicts)

In [None]:
X_train.shape

In [None]:
# Same for validation set:
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [None]:
# We only use transform on validation set:
X_val = dv.transform(val_dicts)

In [None]:
X_val.shape

# Logistic Regression:

In [None]:
# Sigmoid function:

def sigmoid(z):
    return 1/ (1 + np.exp(-z))

In [None]:
z = np.linspace(-7, 5, 51)

In [None]:
sigmoid(z)

In [None]:
plt.plot(z,sigmoid(z))

In [None]:
def logistic_regression(xi):
    score = w0
    
    for j in range(len(w)):
        score = score + xi[j] * w[j]
        
        result = sigmoid(score)
        return result

# Training logistic regression with Scikit-Learn
- Train a model with Scikit-Learn
- Apply it to the validation dataset
- Calculate the accuracy

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression()
model.fit(X_train,y_train)

In [None]:
model.intercept_[0]

In [None]:
model.coef_[0].round(3)

In [None]:
# Hard predictions: We don't know probability yet: 
model.predict(X_train)

In [None]:
# Soft predictions: Not just a number it is a score

# First column is probability of not being churn and second one probability of bein churn which we interested in
model.predict_proba(X_train)[:,1]

In [None]:
y_pred = model.predict_proba(X_val)[:,1]

In [None]:
churn_decision = (y_pred >= 0.5)

In [None]:
churn_decision

In [None]:
y_val

In [None]:
churn_decision.astype(int)

In [None]:
(y_val == churn_decision).mean()

In [None]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = churn_decision.astype(int)
df_pred['actual'] = y_val

In [None]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [None]:
df_pred

In [None]:
df_pred.correct.mean()

# Model interpretation:

- Look at the coefficients
- Train a smaller model with fewer features

In [None]:
model.coef_[0].round(3)

In [None]:
dv.get_feature_names_out()

In [None]:
# Coefficients 

In [None]:
# Weight for each feature:
dict(zip(dv.get_feature_names_out(),model.coef_[0].round(3)))

In [None]:
# Subsampling data to evaluate the results based on 3 features:

In [None]:
small = ['contract','tenure','monthlycharges']

In [None]:
df_train[small].iloc[:10].to_dict(orient='records')

In [None]:
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [None]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [None]:
from warnings import c

In [None]:
dv_small.get_feature_names_out()

In [None]:
X_train_small = dv_small.transform(dicts_train_small)

In [None]:
model_small = LogisticRegression()
model_small.fit(X_train_small,y_train)

In [None]:
w0 = model_small.intercept_[0]
w0

In [None]:
w = model_small.coef_[0]
w.round(3)

In [None]:
dict(zip(dv_small.get_feature_names_out(),w.round(3)))

# Using the model

In [None]:
dicts_full_train = df_full_train[categorical + numerical].to_dict(orient='records')

In [None]:
dv = DictVectorizer(sparse=False)
X_full_train = dv.fit_transform(dicts_full_train)

In [None]:
y_full_train = df_full_train.churn.values

In [None]:
model = LogisticRegression()
model.fit(X_full_train,y_full_train)

In [None]:
X_full_train

In [None]:
dicts_test = df_test[categorical + numerical].to_dict(orient='records')

In [None]:
X_test = dv.transform(dicts_test) 

In [None]:
X_test[:,1]

In [None]:
y_pred = model.predict_proba(X_test)[:,1]

In [None]:
churn_decision = (y_pred >= 0.5)

In [None]:
(churn_decision == y_test).mean()

In [None]:
# Not much change!

In [None]:
# Using model on a sample

In [None]:
customer = dicts_test[10]
customer

In [None]:
X_small = dv.transform([customer])

In [None]:
model.predict_proba(X_small)[0,1]

In [None]:
y_test[10]