In [1]:
import pandas as pd
import numpy as np 

import seaborn as sns
from matplotlib import pyplot as plt 
%matplotlib inline 

In [2]:
!ls data

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
#df = pd.read_csv('data/telco-customer-churn-data.csv')
df = pd.read_csv("data/03-04-classification-telco-customer-churn-data.csv")
len(df)

7043

In [4]:
#df.head()

In [5]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [6]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
# "TotalCharges" column should be numeric data type but its type is "object".
# That is because some of its missing values are " "
# Let's verify that
df['TotalCharges'] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df[df['TotalCharges'].isnull()][["customerID", "TotalCharges"]]
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [8]:
# Make column names, string column values more conventional 

df.columns = df.columns.str.lower().str.replace(" ", "_")

string_columns = list(df.dtypes[df.dtypes == "object"].index)
for col in string_columns:
    df[col] = df[col].str.lower().str.replace(" ", "_")

In [9]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


In [10]:
# The target variable is "churn", which is categorical 
# Currently its values are "yes/no" and we need to convert that to binary 1/0 for convenience
df['churn'].value_counts()

no     5174
yes    1869
Name: churn, dtype: int64

In [11]:
df['churn'] = (df['churn'] == 'yes').astype(int)
df['churn'].value_counts()

0    5174
1    1869
Name: churn, dtype: int64

In [12]:
# Use library functions to do train-validate-test split

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [15]:
df_train, df_val = train_test_split(df_train_val, test_size=0.33, random_state=11)

In [16]:
f"{len(df_train)/len(df):.2f}, {len(df_val)/len(df):.2f}, {len(df_test)/len(df):.2f}"

'0.54, 0.26, 0.20'

In [17]:
y_train = df_train['churn'].values
y_val = df_val['churn'].values
y_test = df_test['churn'].values

In [18]:
del df_train['churn']
del df_val['churn']

### Looking at data before training a model

In [19]:
# Missing values
df_train_val.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [20]:
df_train_val['churn'].value_counts()

0    4113
1    1521
Name: churn, dtype: int64

In [21]:
nChurn = len(df_train_val[df_train_val['churn'] == 1])
f"Churn rate: {nChurn} / {len(df_train_val)} = {nChurn / len(df_train_val):.2f}"

'Churn rate: 1521 / 5634 = 0.27'

In [22]:
# Or: there is a quick way to compute this churn rate
# Churn Rate = ( (y[0]==1) + (y[1]==1) + ... + (y[n-1]==1) ) / n = sum(y[i] for i in 0 to n-1) / n = np.mean(y[i] for i in 0 to n-1)
# since y[i] is either 0 or 1
global_mean = np.mean(df_train_val['churn'])
global_mean

0.26996805111821087

In [23]:
# Separate categorical and numerical variables

In [24]:
print("Categorical columns:")
print(df_train.dtypes[df_train.dtypes == "object"].index)
print("Numerical columns:")
print(df_train.dtypes[df_train.dtypes != "object"].index)

Categorical columns:
Index(['customerid', 'gender', 'partner', 'dependents', 'phoneservice',
       'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
       'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
       'contract', 'paperlessbilling', 'paymentmethod'],
      dtype='object')
Numerical columns:
Index(['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges'], dtype='object')


In [25]:
df_train['seniorcitizen'].value_counts()

0    3167
1     607
Name: seniorcitizen, dtype: int64

In [26]:
categorical = ['seniorcitizen', 'gender', 'partner', 'dependents', 'phoneservice', 
    'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 
    'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']

numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [27]:
# number of distinc values for each categorical variable
df_train_val[categorical].nunique()

seniorcitizen       2
gender              2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

### Feature importance

#### Categorical and numerical variables are treated differently

#### Categorical variables

In [28]:
# "gender" may not be a determining feature
churn_rate_female = np.mean(df_train_val[df_train_val['gender'] == 'female']['churn'])
churn_rate_male = np.mean(df_train_val[df_train_val['gender'] == 'male']['churn'])
global_mean, churn_rate_female, churn_rate_male

(0.26996805111821087, 0.27682403433476394, 0.2632135306553911)

In [29]:
# "partner": having a partner or not seems important
churn_rate_partner_yes = np.mean(df_train_val[df_train_val['partner'] == 'yes']['churn'])
churn_rate_partner_no = np.mean(df_train_val[df_train_val['partner'] == 'no']['churn'])
global_mean, churn_rate_partner_yes, churn_rate_partner_no

(0.26996805111821087, 0.20503330866025166, 0.3298090040927694)

In [30]:
# Risk Ratio = Group Rate / Global Rate
## If Risk Ratio is close to 1: the group has the same level as the rest of the population (not outstandingly risky)
## If Risk Ratio < 1: the group has less risk (here, lower churn rate)
## If Risk Ratio > 1: the group has more risk (here, higher churn rate)

In [31]:
from IPython.display import display

In [32]:
df_train_val.groupby(by='gender')['churn'].agg(['mean'])

Unnamed: 0_level_0,mean
gender,Unnamed: 1_level_1
female,0.276824
male,0.263214


In [33]:
for col in categorical:
    df_group = df_train_val.groupby(by=col)['churn'].agg(['mean'])
    df_group['risk ratio'] = df_group['mean'] / global_mean
    display(df_group)
    print("\n---------------------------------------------------\n")

Unnamed: 0_level_0,mean,risk ratio
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.24227,0.897403
1,0.413377,1.531208



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.276824,1.025396
male,0.263214,0.97498



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
partner,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.329809,1.221659
yes,0.205033,0.759472



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.31376,1.162212
yes,0.165666,0.613651



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.241316,0.89387
yes,0.273049,1.011412



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.257407,0.953474
no_phone_service,0.241316,0.89387
yes,0.290742,1.076948



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1
dsl,0.192347,0.712482
fiber_optic,0.425171,1.574895
no,0.077805,0.288201



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.420921,1.559152
no_internet_service,0.077805,0.288201
yes,0.153226,0.56757



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.404323,1.497672
no_internet_service,0.077805,0.288201
yes,0.217232,0.80466



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.395875,1.466379
no_internet_service,0.077805,0.288201
yes,0.230412,0.85348



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.418914,1.551717
no_internet_service,0.077805,0.288201
yes,0.159926,0.59239



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.342832,1.269897
no_internet_service,0.077805,0.288201
yes,0.302723,1.121328



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.338906,1.255358
no_internet_service,0.077805,0.288201
yes,0.307273,1.138182



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
contract,Unnamed: 1_level_1,Unnamed: 2_level_1
month-to-month,0.431701,1.599082
one_year,0.120573,0.446621
two_year,0.028274,0.10473



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1
no,0.172071,0.637375
yes,0.338151,1.25256



---------------------------------------------------



Unnamed: 0_level_0,mean,risk ratio
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1
bank_transfer_(automatic),0.168171,0.622928
credit_card_(automatic),0.164339,0.608733
electronic_check,0.45589,1.688682
mailed_check,0.19387,0.718121



---------------------------------------------------



In [34]:
# More quantitative way: mutal information 
# It measures the degree of dependency between a categorical variable and the target variable

In [35]:
from sklearn.metrics import mutual_info_score

In [36]:
def get_mutual_info(series1, series2=df_train_val['churn'].values):
    return mutual_info_score(series1, series2)

In [37]:
df_mutual_info = df_train_val[categorical].apply(get_mutual_info)
df_mutual_info

seniorcitizen       0.009410
gender              0.000117
partner             0.009968
dependents          0.012346
phoneservice        0.000229
multiplelines       0.000857
internetservice     0.055868
onlinesecurity      0.063085
onlinebackup        0.046923
deviceprotection    0.043453
techsupport         0.061032
streamingtv         0.031853
streamingmovies     0.031581
contract            0.098320
paperlessbilling    0.017589
paymentmethod       0.043210
dtype: float64

In [38]:
type(df_mutual_info)

pandas.core.series.Series

In [39]:
df_mutual_info = df_mutual_info.sort_values(ascending=False).to_frame(name="Mutual Info")
df_mutual_info

Unnamed: 0,Mutual Info
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


In [40]:
# For numerical variables, use correlations to measure dependencies

In [41]:
df_train_val[numerical]

Unnamed: 0,tenure,monthlycharges,totalcharges
1814,12,19.70,258.35
5946,42,73.90,3160.55
3881,71,65.15,4681.75
2389,71,85.45,6300.85
3676,30,70.40,2044.75
...,...,...,...
905,9,100.50,918.60
5192,60,19.95,1189.90
3980,28,105.70,2979.50
235,2,54.40,114.10


In [42]:
df_train_val[numerical].corrwith(df_train_val['churn']).to_frame('correlation')

Unnamed: 0,correlation
tenure,-0.351885
monthlycharges,0.196805
totalcharges,-0.196353


In [43]:
def compute_pearson_correlation(arr1, arr2):
    mu1, mu2 = np.mean(arr1), np.mean(arr2)
    Arr1, Arr2 = arr1 - mu1, arr2 - mu2 
    corr = np.dot(Arr1, Arr2) / np.sqrt(np.dot(Arr1, Arr1) * np.dot(Arr2, Arr2))
    return corr 

arr1 = df_train_val.totalcharges.values
arr2 = df_train_val.churn.values
compute_pearson_correlation(arr1, arr2)

-0.19635337452472915

In [44]:
df_train_val.groupby(by='churn')[numerical].mean()

Unnamed: 0_level_0,tenure,monthlycharges,totalcharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37.531972,61.176477,2548.021627
1,18.070348,74.521203,1545.689415


### One-hot encoding of categorical variables

Using Sklearn's DictVectorizer

To use this method, need to convert the dataframe to a list of dictionaries

In [45]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
type(train_dict), len(train_dict)

(list, 3774)

In [46]:
train_dict[123]

{'seniorcitizen': 0,
 'gender': 'female',
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'no',
 'onlinesecurity': 'no_internet_service',
 'onlinebackup': 'no_internet_service',
 'deviceprotection': 'no_internet_service',
 'techsupport': 'no_internet_service',
 'streamingtv': 'no_internet_service',
 'streamingmovies': 'no_internet_service',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'bank_transfer_(automatic)',
 'tenure': 65,
 'monthlycharges': 25.3,
 'totalcharges': 1748.55}

In [47]:
from sklearn.feature_extraction import DictVectorizer

In [48]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [49]:
print("Num features after one-hot encoding:", len(dv.feature_names_))
print(dv.feature_names_)

Num features after one-hot encoding: 45
['contract=month-to-month', 'contract=one_year', 'contract=two_year', 'dependents=no', 'dependents=yes', 'deviceprotection=no', 'deviceprotection=no_internet_service', 'deviceprotection=yes', 'gender=female', 'gender=male', 'internetservice=dsl', 'internetservice=fiber_optic', 'internetservice=no', 'monthlycharges', 'multiplelines=no', 'multiplelines=no_phone_service', 'multiplelines=yes', 'onlinebackup=no', 'onlinebackup=no_internet_service', 'onlinebackup=yes', 'onlinesecurity=no', 'onlinesecurity=no_internet_service', 'onlinesecurity=yes', 'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no', 'partner=yes', 'paymentmethod=bank_transfer_(automatic)', 'paymentmethod=credit_card_(automatic)', 'paymentmethod=electronic_check', 'paymentmethod=mailed_check', 'phoneservice=no', 'phoneservice=yes', 'seniorcitizen', 'streamingmovies=no', 'streamingmovies=no_internet_service', 'streamingmovies=yes', 'streamingtv=no', 'streamingtv=no_internet_serv

In [50]:
X_train = dv.transform(train_dict)
X_train

array([[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        7.10000e+01, 6.04590e+03],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        6.00000e+01, 6.02900e+03],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        4.60000e+01, 2.06515e+03],
       ...,
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.00000e+00, 2.83000e+01],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.30000e+01, 4.70600e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        6.40000e+01, 5.32725e+03]])

In [51]:
X_train.shape, (len(train_dict), len(dv.feature_names_))

((3774, 45), (3774, 45))

In [52]:
pd.DataFrame(X_train, columns=dv.get_feature_names())

Unnamed: 0,contract=month-to-month,contract=one_year,contract=two_year,dependents=no,dependents=yes,deviceprotection=no,deviceprotection=no_internet_service,deviceprotection=yes,gender=female,gender=male,...,streamingmovies=no_internet_service,streamingmovies=yes,streamingtv=no,streamingtv=no_internet_service,streamingtv=yes,techsupport=no,techsupport=no_internet_service,techsupport=yes,tenure,totalcharges
0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,71.0,6045.90
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,60.0,6029.00
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,46.0,2065.15
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,69.15
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,20.0,1842.80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3769,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,63.0,6705.70
3770,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,142.35
3771,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,28.30
3772,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,23.0,470.60


### Train logistic regression classifier

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
model = LogisticRegression(solver="liblinear", random_state=1)
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [55]:
# Now use this model to predict on the validation set

In [56]:
val_dict = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(val_dict)

In [57]:
y_val_pred_proba = model.predict_proba(X_val)
y_val_pred_proba

array([[0.76509027, 0.23490973],
       [0.73113741, 0.26886259],
       [0.68054858, 0.31945142],
       ...,
       [0.94274812, 0.05725188],
       [0.38477022, 0.61522978],
       [0.93872741, 0.06127259]])

In [58]:
# Notice that in this binary classification case
# in any row, y_val_pred_proba[i,0] + y_val_pred_proba[i,1] == 1
# Thus it's enough to just look at one of the columns, say Column 1
# This is a "soft" prediction: a proba between 0 and 1, instead of hard 0 or 1
y_val_pred = y_val_pred_proba[:, 1]
y_val_pred

array([0.23490973, 0.26886259, 0.31945142, ..., 0.05725188, 0.61522978,
       0.06127259])

In [59]:
thresholds = np.arange(3, 9)/10

for theta in thresholds:
    acc = np.mean((y_val_pred >= theta) == y_val)
    print(f"thresh = {theta}\taccuracy = {acc:.2f}")

thresh = 0.3	accuracy = 0.75
thresh = 0.4	accuracy = 0.78
thresh = 0.5	accuracy = 0.80
thresh = 0.6	accuracy = 0.79
thresh = 0.7	accuracy = 0.77
thresh = 0.8	accuracy = 0.74


### Model interpretation

In [60]:
model.intercept_

array([-0.12198861])

In [61]:
model.coef_

array([[ 5.63355226e-01, -8.59043988e-02, -5.99439436e-01,
        -3.02756832e-02, -9.17129252e-02,  9.99309047e-02,
        -1.15870918e-01, -1.06048595e-01, -2.73675052e-02,
        -9.46211033e-02, -3.23344376e-01,  3.17226686e-01,
        -1.15870918e-01,  7.84091487e-04, -1.68101121e-01,
         1.27132037e-01, -8.10195250e-02,  1.35700257e-01,
        -1.15870918e-01, -1.41817947e-01,  2.57852270e-01,
        -1.15870918e-01, -2.63969960e-01, -2.12617969e-01,
         9.06293603e-02, -4.80147494e-02, -7.39738591e-02,
        -2.66750845e-02, -1.36242765e-01,  1.74740599e-01,
        -1.33811358e-01,  1.27132037e-01, -2.49120646e-01,
         2.97087806e-01, -8.48520028e-02, -1.15870918e-01,
         7.87343128e-02, -9.90683358e-02, -1.15870918e-01,
         9.29506458e-02,  1.78136319e-01, -1.15870918e-01,
        -1.84254009e-01, -6.94870404e-02,  4.47693584e-04]])

In [62]:
# To see which feature is associated with which weight
for feature, weight in dict(zip(dv.get_feature_names(), np.round(model.coef_[0], 3))).items():
    print(feature, ":", weight)

contract=month-to-month : 0.563
contract=one_year : -0.086
contract=two_year : -0.599
dependents=no : -0.03
dependents=yes : -0.092
deviceprotection=no : 0.1
deviceprotection=no_internet_service : -0.116
deviceprotection=yes : -0.106
gender=female : -0.027
gender=male : -0.095
internetservice=dsl : -0.323
internetservice=fiber_optic : 0.317
internetservice=no : -0.116
monthlycharges : 0.001
multiplelines=no : -0.168
multiplelines=no_phone_service : 0.127
multiplelines=yes : -0.081
onlinebackup=no : 0.136
onlinebackup=no_internet_service : -0.116
onlinebackup=yes : -0.142
onlinesecurity=no : 0.258
onlinesecurity=no_internet_service : -0.116
onlinesecurity=yes : -0.264
paperlessbilling=no : -0.213
paperlessbilling=yes : 0.091
partner=no : -0.048
partner=yes : -0.074
paymentmethod=bank_transfer_(automatic) : -0.027
paymentmethod=credit_card_(automatic) : -0.136
paymentmethod=electronic_check : 0.175
paymentmethod=mailed_check : -0.134
phoneservice=no : 0.127
phoneservice=yes : -0.249
seni

In [74]:
w0 = model.intercept_[0]
print(f'w0 = {w0:.4f}')
print(f"Sigmoid(w0) = {1/(1 + np.exp(-w0)):.4f}")

w0 = -0.1220
Sigmoid(w0) = 0.4695


Let's train a smaller model by using a (smaller) subset of features

In [66]:
smaller_feature_set = ['contract', 'tenure', 'totalcharges']

train_dict_small = df_train[smaller_feature_set].to_dict(orient='records')

dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_train_small = dv_small.transform(train_dict_small)

In [68]:
print(dv_small.get_feature_names())
print(X_train_small.shape)

['contract=month-to-month', 'contract=one_year', 'contract=two_year', 'tenure', 'totalcharges']
(3774, 5)


In [69]:
# train a smaller model
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_train_small, y_train)

LogisticRegression(random_state=1, solver='liblinear')

In [75]:
w0_small= model_small.intercept_[0]
print(f'w0_small = {w0_small:.4f}')
print(f"Sigmoid(w0_small) = {1/(1 + np.exp(-w0_small)):.4f}")

w0_small = -0.5772
Sigmoid(w0_small) = 0.3596


In [76]:
# If the coeff for a feature is positive, it means this feature contributes positively toward churn
# If the coeff for a feature is negative, it means this feature contributes negatively toward churn
# Also, the magnitude matters
# Here mag(contract=two_year) > mag(contract=one_year), both being negative, so it means
# having a two-year contract is a stronger indicator of non-churning than having a one-year contract
dict(zip(dv_small.get_feature_names(), model_small.coef_[0].round(3)))

{'contract=month-to-month': 0.866,
 'contract=one_year': -0.327,
 'contract=two_year': -1.117,
 'tenure': -0.094,
 'totalcharges': 0.001}

In [78]:
# Example 1: monthly contract, 12 months of tenure, paid $500 already 
-0.577 + 0.866*1 - 0.327*0 - 1.117*0 - 0.094*12 + 0.001*500

-0.3390000000000001

In [79]:
# Example 2: yearly contract, 24 months of tensure, paid $1000 already (even less likely to churn)
-0.577 + 0.866*0 - 0.327*1 - 1.117*0 - 0.094*24 + 0.001*1000

-2.16

### Using the model

In [80]:
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [83]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0][1]

0.07332300619542415

In [84]:
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7
}

In [85]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0][1]

0.832164848003211