In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this suppresses every warning for the rest of the session, not
# only cosmetic ones -- pandas deprecation warnings and scikit-learn convergence
# warnings raised by later cells will be hidden as well. Consider narrowing the
# filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the churn CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('Data/telco_churn.csv')
# Preview the first rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,Yes


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [4]:
# Column names, non-null counts and dtypes -- a quick check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [5]:
# Number of distinct values per column.
data.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
Churn                  2
dtype: int64

In [6]:
# Pairwise Pearson correlation between the numeric columns.
# The numeric columns are selected explicitly because, from pandas 2.0 on,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# on the string columns that are still present at this point in the notebook.
data.select_dtypes(include='number').corr()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
SeniorCitizen,1.0,0.016567,0.220173
tenure,0.016567,1.0,0.2479
MonthlyCharges,0.220173,0.2479,1.0


I will need to transform all the binary variables from Yes/No into 1/0. I will replace the 'No phone service' and 'No internet service' values with 'No' so that I can treat those columns as binary variables rather than multinomial. If my model does not perform well, I may revert to the multinomial encoding. Since there are only 2 genders listed, I will encode gender as a binary variable as well, replacing Female/Male with 1/0; the column keeps the name 'gender' in the datasets used below.

In [7]:
# Encode the raw frame without in-place mutation. Each step returns a new frame,
# so the cell can be re-run safely and does not depend on chained in-place
# updates (which do not modify the parent frame under pandas Copy-on-Write).
data = (
    data
    # collapse the "no service" levels into a plain 'No' so the add-on service
    # columns become binary
    .replace(['No internet service', 'No phone service'], 'No')
    # replace yes and no with 1 and 0
    .replace(['YES', 'Yes', 'NO', 'No'], [1, 1, 0, 0])
    # only 2 genders are listed, so encode the column as binary: Female = 1, Male = 0.
    # The column keeps the name 'gender' because the feature selections in
    # later cells refer to it by that name.
    .replace({'gender': {'Male': 0, 'Female': 1}})
    # convert the replaced object columns to integer dtype explicitly instead of
    # relying on replace() to downcast them implicitly
    .infer_objects()
)

# one hot encoding for the multi-level categorical variables
categorical_columns = ['InternetService', 'Contract', 'PaymentMethod']
temp = pd.get_dummies(data[categorical_columns])

# rejoin the indicator columns (they share data's index, so no realignment is
# needed) and drop the source columns that are now redundant
data1 = pd.concat([data, temp], axis=1).drop(columns=categorical_columns)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [9]:
def model_and_eval(my_training, my_testing):
    """Fit a logistic regression on a train/test split and print hold-out metrics.

    Despite the parameter names, ``my_training`` is passed to
    ``train_test_split`` as the feature table and ``my_testing`` as the labels;
    20% of the rows are held out (``test_size=0.2``, ``random_state=42``).

    Prints, in order: the confusion matrix of the held-out predictions, its
    four cells unpacked as tn/fp/fn/tp (this unpacking assumes a binary target),
    and the tuple returned by ``precision_recall_fscore_support`` with
    ``average='weighted'``. Returns nothing.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        my_training, my_testing, test_size=0.2, random_state=42
    )

    model = LogisticRegression(random_state=0)
    model.fit(features_train, labels_train)
    predictions = model.predict(features_test)

    # evaluation -- the matrix is computed once and reused for both printouts
    matrix = confusion_matrix(labels_test, predictions)
    print(matrix)
    tn, fp, fn, tp = matrix.ravel()
    print("tn:", tn, "fp:", fp, "fn:", fn, "tp:", tp)
    weighted_scores = precision_recall_fscore_support(
        labels_test, predictions, average='weighted'
    )
    print(weighted_scores)

In [10]:
# separate labels
# Target: the churn flag.
data_labels = data1['Churn']

# Full feature set: every column except the target and the customer identifier.
data_all_training = data1.drop(columns=['Churn', 'customerID'])

# Feature set restricted to the user demographics.
demographic_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure']
data_demographics_training = data1[demographic_columns]

# Feature set restricted to the subscribed services.
service_columns = [
    'PhoneService',
    'MultipleLines',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'MonthlyCharges',
    'InternetService_DSL',
    'InternetService_Fiber optic',
]
data_service_training = data1[service_columns]

In [11]:
model_and_eval(data_all_training, data_labels)

[[939  97]
 [156 217]]
tn: 939 fp: 97 fn: 156 tp: 217
(0.8134700956723727, 0.8204400283889283, 0.8152153621698797, None)


In [14]:
model_and_eval(data_demographics_training, data_labels)

[[998  38]
 [307  66]]
tn: 998 fp: 38 fn: 307 tp: 66
(0.7303005896371207, 0.7551454932576295, 0.7001715991465763, None)


In [13]:
model_and_eval(data_service_training, data_labels)

[[934 102]
 [212 161]]
tn: 934 fp: 102 fn: 212 tp: 161
(0.7613111733012357, 0.7771469127040455, 0.7634923130257347, None)


In [15]:
def interp_coef(my_training, my_testing):
    """Print the exponentiated logistic-regression coefficients, largest first.

    ``my_training`` is passed to ``train_test_split`` as the feature table and
    ``my_testing`` as the labels (``test_size=0.2``, ``random_state=42``). A
    ``LogisticRegression`` with default settings is fitted on the training
    part only.

    The printed column is labelled ``coefficient`` but holds ``exp(coef)``,
    i.e. values on the odds-ratio scale, indexed by feature name. Returns
    nothing.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        my_training, my_testing, test_size=0.2, random_state=42
    )

    model = LogisticRegression()
    model.fit(features_train, labels_train)

    # coef_[0] is the single row of coefficients; exponentiate it
    exponentiated = np.exp(model.coef_[0])

    table = pd.DataFrame({'coefficient': exponentiated}, index=features_test.columns)
    table = table.sort_values(by='coefficient', ascending=False)
    print(table)

In [16]:
interp_coef(data_all_training, data_labels)

                                         coefficient
Contract_Month-to-month                     1.659327
InternetService_Fiber optic                 1.560456
PaperlessBilling                            1.398548
StreamingMovies                             1.274889
MultipleLines                               1.273570
SeniorCitizen                               1.177045
StreamingTV                                 1.147158
PaymentMethod_Electronic check              1.143630
Partner                                     1.058915
gender                                      1.053909
MonthlyCharges                              1.008991
tenure                                      0.967646
DeviceProtection                            0.962899
Contract_One year                           0.874418
Dependents                                  0.841327
OnlineBackup                                0.831922
PaymentMethod_Bank transfer (automatic)     0.829027
PaymentMethod_Mailed check                  0.

In [18]:
interp_coef(data_demographics_training, data_labels)

               coefficient
SeniorCitizen     2.324255
Partner           1.168268
gender            1.054032
tenure            0.961724
Dependents        0.546091


In [19]:
interp_coef(data_service_training, data_labels)

                             coefficient
InternetService_DSL             2.211952
InternetService_Fiber optic     2.032288
PaperlessBilling                1.542743
MonthlyCharges                  1.043002
StreamingMovies                 0.719411
MultipleLines                   0.681694
StreamingTV                     0.649195
DeviceProtection                0.518160
OnlineBackup                    0.441851
TechSupport                     0.352020
OnlineSecurity                  0.318296
PhoneService                    0.309238


Since these values are odds ratios (exponentiated coefficients), I was careful to interpret them relative to each other and not as linearly scaled proportions: a value above 1 raises the odds of churn and a value below 1 lowers them. In the model that uses all of the available features as predictors, being on a month-to-month contract has the biggest effect on whether someone will terminate service. Being a fiber optic subscriber also seems to play a large role. I would want to look into this more to determine whether there are very few fiber optic subscribers, but it may be that people are unhappy with the service.