# Churn Prediction

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the raw customer table from a CSV file given by a path relative to the working directory.
df = pd.read_csv('telco_customer_churn.csv')

In [4]:
# Preview the first rows to see the available columns and how their values are formatted.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Transpose the preview so that every column becomes a row; the wide layout above
# elides the middle columns with "...".
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [6]:
# Column headers: lower-case, spaces turned into underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Names of every object-dtype (text-valued) column.
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

# Same normalisation for the values stored in each of those columns.
for c in categorical_columns:
    df[c] = (
        df[c]
        .str.lower()
        .str.replace(' ', '_')
    )

In [7]:
# errors='coerce' does not ignore bad values: anything that cannot be parsed as a number becomes NaN.
# `tc` is not read again in the cells shown here; the next cell assigns the same conversion to df.totalcharges.
tc = pd.to_numeric(df.totalcharges, errors = 'coerce')

In [8]:
# Convert the text column to numbers; unparseable entries turn into NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [9]:
# Replace the NaN values left by the numeric conversion with 0.
df['totalcharges'] = df['totalcharges'].fillna(0)

In [10]:
# Peek at the raw target values before encoding them.
df['churn'].head()

0     no
1     no
2    yes
3     no
4    yes
Name: churn, dtype: object

In [11]:
# Encode the target as 1 for 'yes' and 0 for everything else.
# Caution: running this cell a second time would set every label to 0,
# because the column no longer contains the string 'yes'.
df['churn'] = df['churn'].eq('yes').astype(int)

### Setting up Validation Framework

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so the split is reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [15]:
# Row counts of the two partitions produced by the first split.
len(df_full_train), len(df_test)

(5634, 1409)

In [16]:
# Split the full-train set into train and validation.
# test_size=0.25 is 25% of the 80% kept above, i.e. 20% of the whole dataset (the same share as the test set).

df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [17]:
# Row counts of the three final partitions: train, validation, test.
# (The previous version listed df_full_train and df_test twice and never showed train/val.)
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [18]:
# Give each partition a fresh 0..n-1 index; the shuffled original row labels are discarded.
df_train, df_test, df_val = (
    part.reset_index(drop=True) for part in (df_train, df_test, df_val)
)

In [19]:
# Row counts in the order test, validation, train.
len(df_test), len(df_val), len(df_train)

(1409, 1409, 4225)

In [20]:
# Pull the 0/1 target out of each partition as a plain NumPy array.
y_train, y_test, y_val = (
    part['churn'].values for part in (df_train, df_test, df_val)
)

In [21]:
# Remove the target from the feature frames so it cannot be fed to the model as an input.
# drop(..., errors='ignore') replaces `del` so the cell can be re-run safely:
# a second execution is a no-op instead of a KeyError on the already-removed column.
df_train = df_train.drop(columns=['churn'], errors='ignore')
df_val = df_val.drop(columns=['churn'], errors='ignore')
df_test = df_test.drop(columns=['churn'], errors='ignore')

### EDA

- Check Missing Values
- Look at Target Variable
- Look at Numerical and Categorical Variables

In [23]:
# Fresh 0..n-1 index for the full-train frame (it still contains the churn column and is used for EDA).
df_full_train = df_full_train.reset_index(drop=True)

In [24]:
# Count missing values per column.
df_full_train.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [25]:
# Share of each class in the target. This result is not displayed: a cell only renders its last expression.
df_full_train.churn.value_counts(normalize=True)
# The mean of a 0/1 target is the fraction of ones, i.e. the churn rate.
df_full_train.churn.mean()

0.26996805111821087

In [26]:
# Overall churn rate on the full-train set, shown rounded to two decimals.
global_churn_rate = df_full_train['churn'].mean()
round(global_churn_rate, 2)

0.27

In [27]:
# Column dtypes, used to separate numerical from categorical features.
df_full_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

In [28]:
# Numerical features. seniorcitizen is int64 as well, but it only takes two values
# and is listed with the categorical features instead.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [29]:
# All column names, as a reference for building the feature lists.
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [30]:
# Categorical features: every column except the id, the three numerical features and the target.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [31]:
# Number of distinct values in each categorical column.
df_full_train[categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature Importance: Churn rate and Risk ratio

#### Churn Rate

In [34]:
# Churn rate within a single group: female customers.

churn_female = df_full_train.loc[df_full_train['gender'] == 'female', 'churn'].mean()
churn_female

0.27682403433476394

In [35]:
# Churn rate among male customers.
churn_male = df_full_train.loc[df_full_train['gender'] == 'male', 'churn'].mean()
churn_male

0.2632135306553911

In [36]:
# Churn rate over the whole full-train set, the baseline the group rates are compared against.
global_churn = df_full_train['churn'].mean()
global_churn

0.26996805111821087

In [37]:
# Number of customers with and without a partner.
df_full_train.partner.value_counts()

partner
no     2932
yes    2702
Name: count, dtype: int64

In [38]:
# Churn rate among customers who have a partner.
churn_partner = df_full_train.loc[df_full_train['partner'] == 'yes', 'churn'].mean()
churn_partner

0.20503330866025166

In [39]:
# Churn rate among customers without a partner.
churn_no_partner = df_full_train.loc[df_full_train['partner'] == 'no', 'churn'].mean()
churn_no_partner

0.3298090040927694

In [40]:
# Difference between the global and the group churn rate; a negative value means
# the group (no partner) churns more than average.
global_churn - churn_no_partner

-0.05984095297455855

#### Risk Ratio

In [42]:
# Risk ratio: group churn rate divided by the global rate. Above 1 means the group churns more than average.
churn_no_partner / global_churn

1.2216593879412643

In [43]:
# Risk ratio for customers with a partner. Below 1 means the group churns less than average.
churn_partner / global_churn

0.7594724924338315

In [44]:
from IPython.display import display

In [45]:
# For every categorical feature, tabulate per value: churn rate (mean), group size (count),
# difference to the global churn rate (diff) and risk ratio (risk).
for c in categorical:
    df_group = df_full_train.groupby(c)['churn'].agg(['mean', 'count'])
    df_group = df_group.assign(
        diff=df_group['mean'] - global_churn,
        risk=df_group['mean'] / global_churn,
    )
    display(df_group)
    # Two blank lines between consecutive tables.
    print('\n')

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498






Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208






Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472






Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651






Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412






Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948






Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201






Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757






Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466






Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348






Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239






Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328






Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182






Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473






Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256






Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121






### Feature Importance: Mutual Information

Mutual information measures how much we learn about one variable when we know the value of another.
The higher the value, the higher the relative importance of the feature.

In [48]:
from sklearn.metrics import mutual_info_score

In [49]:
# Mutual information between the target and the contract type.
mutual_info_score(df_full_train.churn, df_full_train.contract)

0.0983203874041556

In [50]:
# Mutual information between gender and the target. The arguments are passed in the opposite
# order from the other calls; the score matches the gender entry of the ranking computed further below.
mutual_info_score(df_full_train.gender, df_full_train.churn)

0.0001174846211139946

In [51]:
# Mutual information between the target and the partner flag.
mutual_info_score(df_full_train.churn, df_full_train.partner)

0.009967689095399745

In [52]:
def mutual_info_churn_score(series):
    """Return the mutual information between `series` and the churn target.

    The target is read from the notebook-level `df_full_train`, so `series`
    is expected to be a column of that frame (or aligned with it row by row).
    """
    target = df_full_train['churn']
    return mutual_info_score(labels_true=series, labels_pred=target)

In [53]:
# Score every categorical column against the target (apply runs the function once per column).
mi = df_full_train[categorical].apply(mutual_info_churn_score)
# Highest mutual information first: most informative features at the top.
mi.sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

### Feature Importance: Correlation

Correlation measures the degree of linear relationship between two variables.

In [56]:
# Largest tenure value in the full-train set.
df_full_train.tenure.max()

72

In [57]:
# Absolute correlation of each numerical feature with the target: strength only, sign removed.
df_full_train[numerical].corrwith(df_full_train.churn).abs()

tenure            0.351885
monthlycharges    0.196805
totalcharges      0.196353
dtype: float64

In [58]:
# Signed correlation: a negative value means churn becomes less frequent as the feature grows.
df_full_train[numerical].corrwith(df_full_train.churn)

tenure           -0.351885
monthlycharges    0.196805
totalcharges     -0.196353
dtype: float64

In [59]:
# Churn rate of customers with a tenure of at most 2.
df_full_train.loc[df_full_train['tenure'] <= 2, 'churn'].mean()

0.5953420669577875

In [60]:
# Churn rate of customers with a tenure above 2 and up to 12.
df_full_train.loc[
    (df_full_train['tenure'] > 2) & (df_full_train['tenure'] <= 12),
    'churn',
].mean()

0.3994413407821229

In [61]:
# Churn rate of customers with a tenure above 12.
df_full_train.loc[df_full_train['tenure'] > 12, 'churn'].mean()

0.17634908339788277

## One-hot Encoding

In [63]:
from sklearn.feature_extraction import DictVectorizer

In [64]:
# Small example: the first ten training rows, two columns only, as a list of {column: value} dicts.
# `dicts` is not used again in the cells shown here.
dicts = df_train[['gender', 'contract']].iloc[:10].to_dict(orient='records')

In [65]:
# sparse=False makes the vectoriser return a dense NumPy array instead of a sparse matrix.
dv = DictVectorizer(sparse=False)

In [66]:
# One dict per training row, restricted to the selected features (customerid is left out).
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [67]:
# Learn the feature mapping from the training records and encode them in one step.
X_train = dv.fit_transform(train_dicts)
# Rows x encoded feature columns.
X_train.shape

(4225, 45)

In [68]:
# Same record format for the validation rows.
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

In [69]:
# transform only (no fit): the validation rows are encoded with the mapping learned from the training set.
X_val = dv.transform(val_dicts)

### Logistic Regression

Binary Classification
Linear vs Logistic

In [72]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)); squashes a real-valued score into the range 0..1.

    Uses np.exp, so `z` may be a scalar or a NumPy array (applied element-wise).
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [73]:
def logistic_regression(xi, weights=None, bias=None):
    """Return the logistic-regression probability for one feature vector.

    Computes ``sigmoid(bias + sum_j xi[j] * weights[j])``.

    Parameters
    ----------
    xi : sequence of numbers
        Feature values, indexed by position; must have at least
        ``len(weights)`` entries.
    weights : sequence of numbers, optional
        One coefficient per feature. When omitted, the notebook-level ``w``
        is used, which must already be defined when the function is called.
    bias : number, optional
        Intercept term. When omitted, the notebook-level ``w0`` is used.

    Returns
    -------
    The value of ``sigmoid`` applied to the linear score.
    """
    # Explicit arguments avoid relying on globals that are only assigned in a
    # much later cell; the fallbacks keep existing one-argument calls working.
    if weights is None:
        weights = w
    if bias is None:
        bias = w0

    score = bias

    for j in range(len(weights)):
        score = score + xi[j] * weights[j]

    result = sigmoid(score)
    return result

### Training Logistic Regression with SciKit-Learn

- Train model with Scikit-Learn
- Apply it to the validation dataset
- Calculate the accuracy

In [76]:
from sklearn.linear_model import LogisticRegression

In [77]:
# Logistic regression with default hyper-parameters, fitted on the encoded training set.
# NOTE(review): warnings are silenced globally at the top of the notebook, so a solver
# convergence warning, if one is raised here, would not be visible. Confirm the fit converged.
model = LogisticRegression()
model.fit(X_train, y_train)

In [144]:
# Bias term (w0) of the fitted model; [0] takes the first entry of the intercept array.
model.intercept_[0]

-0.10829812861544427

In [146]:
# Feature weights (w), one per encoded column, rounded for readability.
model.coef_[0].round(3)

array([ 0.472, -0.171, -0.407, -0.031, -0.076,  0.062, -0.09 , -0.079,
       -0.034, -0.073, -0.33 ,  0.313, -0.09 ,  0.004, -0.255,  0.141,
        0.006,  0.062, -0.09 , -0.079,  0.261, -0.09 , -0.278, -0.227,
        0.12 , -0.164,  0.057, -0.086, -0.031,  0.066, -0.057,  0.141,
       -0.248,  0.212, -0.118, -0.09 ,  0.1  , -0.069, -0.09 ,  0.052,
        0.208, -0.09 , -0.226, -0.07 ,  0.   ])

In [148]:
# Hard predictions: class labels (0 or 1) for the training rows.
model.predict(X_train)

array([0, 1, 1, ..., 1, 0, 1])

In [164]:
# Soft predictions: one probability per class; the left column is the negative class, the right the positive.
# The result of this first call is neither stored nor displayed (it is not the last expression of the cell).
model.predict_proba(X_train)
# Keep only the right-hand column, the churn probability, for the validation rows.
y_pred = model.predict_proba(X_val)[:,1]

In [166]:
# Boolean mask: True where the predicted churn probability reaches the 0.5 threshold.
churn_decision = (y_pred >= 0.5)

In [168]:
# Ids of the validation customers the model flags as likely to churn.
df_val.loc[churn_decision, 'customerid']

3       8433-wxgna
8       3440-jpscl
11      2637-fkfsy
12      7228-omtpn
19      6711-fldfb
           ...    
1397    5976-jcjrh
1398    2034-cgrhz
1399    5276-kqwhg
1407    6521-yytyi
1408    3049-solay
Name: customerid, Length: 313, dtype: object

In [174]:
# Accuracy: share of validation rows whose thresholded prediction equals the true label.
(y_val == churn_decision).mean()

0.8019872249822569

In [180]:
# Side-by-side table of predicted probability, thresholded prediction and true label.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': churn_decision.astype(int),
    'actual': y_val,
})

In [184]:
# Flag the rows where the prediction matches the true label.
df_pred['cor'] = df_pred['prediction'].eq(df_pred['actual'])

In [186]:
# Display the comparison table; the mean of the 'cor' column equals the accuracy computed above.
df_pred

Unnamed: 0,probability,prediction,actual,cor
0,0.009377,0,0,True
1,0.206365,0,0,True
2,0.213500,0,0,True
3,0.544175,1,1,True
4,0.214825,0,0,True
...,...,...,...,...
1404,0.314914,0,0,True
1405,0.040401,0,1,False
1406,0.140165,0,0,True
1407,0.797574,1,1,True


### Model Interpretation

- Look at the coefficients
- Train a smaller model with fewer features

In [207]:
# Pair every encoded feature name with its weight; zip walks both sequences in step.
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': 0.472,
 'contract=one_year': -0.171,
 'contract=two_year': -0.407,
 'dependents=no': -0.031,
 'dependents=yes': -0.076,
 'deviceprotection=no': 0.062,
 'deviceprotection=no_internet_service': -0.09,
 'deviceprotection=yes': -0.079,
 'gender=female': -0.034,
 'gender=male': -0.073,
 'internetservice=dsl': -0.33,
 'internetservice=fiber_optic': 0.313,
 'internetservice=no': -0.09,
 'monthlycharges': 0.004,
 'multiplelines=no': -0.255,
 'multiplelines=no_phone_service': 0.141,
 'multiplelines=yes': 0.006,
 'onlinebackup=no': 0.062,
 'onlinebackup=no_internet_service': -0.09,
 'onlinebackup=yes': -0.079,
 'onlinesecurity=no': 0.261,
 'onlinesecurity=no_internet_service': -0.09,
 'onlinesecurity=yes': -0.278,
 'paperlessbilling=no': -0.227,
 'paperlessbilling=yes': 0.12,
 'partner=no': -0.164,
 'partner=yes': 0.057,
 'paymentmethod=bank_transfer_(automatic)': -0.086,
 'paymentmethod=credit_card_(automatic)': -0.031,
 'paymentmethod=electronic_check': 0.066,
 'pay

In [209]:
# Reduced feature set for a model that is small enough to interpret by hand.
small = ['contract', 'tenure', 'monthlycharges']

In [213]:
# First ten training rows of the reduced feature set, in the record format the vectoriser consumes.
df_train[small].iloc[:10].to_dict(orient='records')

[{'contract': 'two_year', 'tenure': 72, 'monthlycharges': 115.5},
 {'contract': 'month-to-month', 'tenure': 10, 'monthlycharges': 95.25},
 {'contract': 'month-to-month', 'tenure': 5, 'monthlycharges': 75.55},
 {'contract': 'month-to-month', 'tenure': 5, 'monthlycharges': 80.85},
 {'contract': 'two_year', 'tenure': 18, 'monthlycharges': 20.1},
 {'contract': 'month-to-month', 'tenure': 4, 'monthlycharges': 30.5},
 {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 75.1},
 {'contract': 'month-to-month', 'tenure': 1, 'monthlycharges': 70.3},
 {'contract': 'two_year', 'tenure': 72, 'monthlycharges': 19.75},
 {'contract': 'month-to-month', 'tenure': 6, 'monthlycharges': 109.9}]

In [215]:
# Record lists for the reduced feature set: training and validation rows.
dicts_train_small = df_train[small].to_dict(orient='records')
dicts_val_small = df_val[small].to_dict(orient='records')

In [220]:
# Separate vectoriser for the small model, fitted on the training records only.
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [222]:
# Encoded column names, in the same order as the model weights inspected below.
dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

In [224]:
# Encode the training records with the small vectoriser.
X_train_small = dv_small.transform(dicts_train_small)

In [230]:
# Logistic regression on the reduced feature set, default hyper-parameters.
model_small = LogisticRegression()
model_small.fit(X=X_train_small, y=y_train)

In [244]:
# Bias (w0) and weight vector (w) of the small model, bound in a single statement and displayed.
w0, w = model_small.intercept_[0], model_small.coef_[0]
(w0, w)

(-2.477957595128277,
 array([ 0.9711394 , -0.02379507, -0.94828863,  0.02748534, -0.03619005]))

In [236]:
# Weights rounded to three decimals for reading.
w.round(3)

array([ 0.971, -0.024, -0.948,  0.027, -0.036])

In [240]:
# Pair every encoded feature name of the small model with its rounded weight.
dict(zip(dv_small.get_feature_names_out(), w.round(3)))

{'contract=month-to-month': 0.971,
 'contract=one_year': -0.024,
 'contract=two_year': -0.948,
 'monthlycharges': 0.027,
 'tenure': -0.036}

In [252]:
# Prediction by hand with the small model's coefficients typed in as literals:
#   -2.47         intercept (w0 is about -2.478; entered here as -2.47)
#   +0.97         weight of contract=month-to-month
#   50 * 0.027    monthlycharges of 50 times its weight
#   5 * (-0.036)  tenure of 5 times its weight
sigmoid(-2.47 + 0.97 + 50 * 0.027 + 5 * (-0.036))

0.41824062315816374

## Using the Model

In [255]:
# Full-train set (training + validation rows) as one dict per row, selected features only.
dicts_full_train = df_full_train[categorical+numerical].to_dict(orient='records') 

In [257]:
# Fresh vectoriser for one-hot encoding. This rebinds `dv`, so the instance fitted on
# df_train earlier is no longer reachable under that name.
dv = DictVectorizer(sparse=False)

In [259]:
# Learn the feature mapping from the full-train records and encode them in one step.
X_full_train = dv.fit_transform(dicts_full_train)

In [261]:
# Target vector for the full-train set (df_full_train still holds the churn column).
y_full_train = df_full_train['churn'].values

In [267]:
# Final model trained on training + validation rows. This rebinds `model`, replacing
# the model that was fitted on the training set alone.
model = LogisticRegression()
model.fit(X_full_train, y_full_train)

In [269]:
# Test rows in the same record format, selected features only.
dicts_test = df_test[categorical+numerical].to_dict(orient='records')

In [275]:
# Encode the test records with the mapping learned from the full-train set (transform only, no fit).
X_test = dv.transform(dicts_test)

In [277]:
# Churn probability for every test row: column 1 of predict_proba, from the final model.
# This overwrites the validation-set `y_pred` computed earlier.
y_pred = model.predict_proba(X_test)[:, 1]

In [307]:
# Threshold the probabilities at 0.5; this overwrites the validation-set `churn_decision`.
churn_decision = (y_pred >= 0.5)

In [309]:
# Accuracy on the test set: share of rows whose thresholded prediction equals the true label.
(churn_decision == y_test).mean()

0.8140525195173882

#### Predict a customer's churn

In [316]:
# Feature dict of the test row at position 10.

customer = dicts_test[10]
customer

{'gender': 'male',
 'seniorcitizen': 1,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'no',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'mailed_check',
 'tenure': 32,
 'monthlycharges': 93.95,
 'totalcharges': 2861.45}

In [320]:
# Encode the single customer; transform expects a list of records, hence the one-element list.
# Despite its name, `x_small` is built with the full vectoriser `dv`, not `dv_small`.
x_small = dv.transform([customer])

In [324]:
# One row, with as many columns as the full encoded feature set.
x_small.shape

(1, 45)

In [332]:
# Row 0, column 1: the churn probability predicted for this one customer.
model.predict_proba(x_small)[0,1]

0.49540666997820526

In [334]:
# True label of the same customer: position 10 in y_test corresponds to dicts_test[10],
# since both were derived from df_test in the same row order.
y_test[10]

0

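The predicted churn probability (about 0.495) is below the 0.5 threshold, so the model classifies this customer as not likely to churn. The actual value in the dataset is 0 (did not churn), so the prediction is correct for this individual, although a probability this close to the threshold makes it a borderline case.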