In [4]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Show up to 30 columns when a DataFrame is rendered, so wide frames are not truncated.
pd.options.display.max_columns = 30

# Logistic Regression

## Analyzing Churn

In [2]:
# Read the Telco customer-churn CSV (path is relative to the notebook's working directory).
churn_csv_path = "data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(churn_csv_path)

In [5]:
# Preview the first rows of the freshly loaded frame to inspect the raw column values.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# List the available column names before choosing which ones to engineer features from.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

### Create a new feature `no_of_services`

In [7]:
# Columns describing whether the customer subscribes to a given service.
services = [
    "PhoneService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

# Encode every service column as 1 for "Yes" and 0 for any other value.
# `isin(["Yes", 1])` also matches flags that this cell has already encoded, so
# re-running the cell keeps the existing 1s. The plain `== "Yes"` comparison would
# compare the encoded integers against a string on the second run and silently
# reset every service flag to 0.
df[services] = np.where(df[services].isin(["Yes", 1]), 1, 0)

In [9]:
# Row-wise total of the 0/1 service flags = number of services each customer has.
service_count = df[services].sum(axis="columns")
df = df.assign(no_of_services=service_count)

In [10]:
# Household indicators to turn into 0/1 flags.
to_bool = ["Partner", "Dependents"]

# 1 for "Yes", 0 for any other value. Matching an existing 1 as well as "Yes" makes
# the cell safe to re-run: with a plain `== "Yes"` comparison a second execution
# would compare the encoded integers against a string and zero out both columns.
df[to_bool] = np.where(df[to_bool].isin(["Yes", 1]), 1, 0)

In [11]:
# Preview the frame again to check the 0/1 encoded columns and the new `no_of_services` column.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,no_of_services
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,Yes,Electronic check,29.85,29.85,No,1
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,1,0,1,0,0,0,One year,No,Mailed check,56.95,1889.5,No,3
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,1,1,0,0,0,0,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,3
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,1,0,1,1,0,0,One year,No,Bank transfer (automatic),42.3,1840.75,No,3
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,0,0,0,0,0,0,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1


In [15]:
# One-hot encode `Contract`. `drop_first=True` drops the first level, leaving
# `Contract_One year` and `Contract_Two year`, which are then measured against
# the dropped level as the baseline.
# `dtype="uint8"` pins a numeric dummy dtype: pandas >= 2.0 returns bool dummies by
# default, and a design matrix mixing bool with int/float columns is converted to an
# object array, which statsmodels refuses to fit.
df = pd.get_dummies(
    df,
    columns=["Contract"],
    prefix="Contract",
    drop_first=True,
    dtype="uint8",
)

In [18]:
# Check column dtypes after encoding; the columns fed to the model must be numeric.
df.dtypes

customerID            object
gender                object
SeniorCitizen          int64
Partner                int64
Dependents             int64
tenure                 int64
PhoneService           int64
MultipleLines         object
InternetService       object
OnlineSecurity         int64
OnlineBackup           int64
DeviceProtection       int64
TechSupport            int64
StreamingTV            int64
StreamingMovies        int64
PaperlessBilling      object
PaymentMethod         object
MonthlyCharges       float64
TotalCharges          object
Churn                 object
no_of_services         int64
Contract_One year      uint8
Contract_Two year      uint8
dtype: object

In [19]:
# Predictor columns for the churn model (the intercept column is added separately).
variables = [
    "no_of_services",
    "MonthlyCharges",
    "Contract_One year",
    "Contract_Two year",
    "SeniorCitizen",
    "tenure",
]

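We model churn using logistic regression as a function of the variables below. The sign after each variable is the hypothesized sign of its coefficient (`+` means the variable is expected to raise the odds of churn, `-` to lower them):

- `no_of_services`: -
- `MonthlyCharges`: +
- `Contract_One year`: -
- `Contract_Two year`: - (coefficient is expected to be higher in magnitude than for `Contract_One year`)
- `SeniorCitizen`: -
- `tenure`: -

Note: the fitted model below estimates a positive coefficient for `SeniorCitizen`, contrary to the hypothesis above.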

In [20]:
# Add an explicit intercept column; `sm.Logit` does not add one on its own.
df = df.assign(const=1)

# Design matrix: intercept first, then the predictors in the order listed in `variables`.
design_columns = ["const", *variables]
X = df[design_columns]

# Binary target: 1 where the customer churned ("Yes"), 0 otherwise.
churned = df["Churn"] == "Yes"
y = np.where(churned, 1, 0)

In [21]:
# Build the logistic-regression model (y = churn flag, X = design matrix) and fit it.
churn_logit = sm.Logit(endog=y, exog=X)
lr = churn_logit.fit()

Optimization terminated successfully.
         Current function value: 0.428913
         Iterations 8


In [22]:
# Display the fitted model's summary table (coefficients, standard errors, z-scores, p-values, fit statistics).
lr.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,7043.0
Model:,Logit,Df Residuals:,7036.0
Method:,MLE,Df Model:,6.0
Date:,"Mon, 27 Jul 2020",Pseudo R-squ.:,0.2587
Time:,09:55:26,Log-Likelihood:,-3020.8
converged:,True,LL-Null:,-4075.1
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.6936,0.091,-18.573,0.000,-1.872,-1.515
no_of_services,-0.2774,0.033,-8.346,0.000,-0.343,-0.212
MonthlyCharges,0.0380,0.002,20.209,0.000,0.034,0.042
Contract_One year,-0.8073,0.104,-7.768,0.000,-1.011,-0.604
Contract_Two year,-1.6231,0.171,-9.517,0.000,-1.957,-1.289
SeniorCitizen,0.3851,0.081,4.733,0.000,0.226,0.545
tenure,-0.0324,0.002,-15.345,0.000,-0.037,-0.028
