In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Show up to 30 columns when rendering wide frames.
pd.options.display.max_columns = 30

In [2]:
# Load the Telco customer-churn extract; the path is relative to the working directory.
churn_csv = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(churn_csv)

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
# Column dtypes as read from the CSV. Note that TotalCharges is parsed as
# object (not float64), so it would need converting before it could be used
# as a numeric feature.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [10]:
# Create new features: number of services
# Each service column is recoded to 1 for "Yes" and 0 for any other value,
# so the flags can later be summed into a per-customer service count.
services = ["PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]

for service in services:
    # Only recode columns that still hold the raw strings. Without this guard,
    # re-running the cell compares the already-encoded 0/1 values with "Yes"
    # and silently resets every service flag to 0.
    if not pd.api.types.is_numeric_dtype(df[service]):
        df[service] = np.where(df[service] == "Yes", 1, 0)

In [20]:
# Count how many of the encoded service flags are set for each customer.
service_counts = df[services].sum(axis="columns")
df = df.assign(no_of_services=service_counts)

In [21]:
# Convert the Yes/No columns used by the model to 0/1 integers.
to_bool = ["Partner", "Dependents"]

for col in to_bool:
    # Skip columns that are already numeric: re-running the comparison against
    # "Yes" on 0/1 values would overwrite the whole column with 0.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = np.where(df[col] == "Yes", 1, 0)

## Contract

In [24]:
# Distribution of contract types across customers.
contract_types = df["Contract"]
contract_types.value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64

In [30]:
# One-hot encode Contract
# dtype=int keeps the indicators as 0/1 integers; pandas 2.x would otherwise
# return booleans, which do not combine cleanly with the numeric model inputs.
contract_dummies = pd.get_dummies(df["Contract"], prefix="Contract", dtype=int)

# Drop any indicators left over from a previous run before concatenating, so
# re-running the cell cannot create duplicate Contract_* columns.
df = pd.concat(
    [df.drop(columns=contract_dummies.columns, errors="ignore"), contract_dummies],
    axis=1,
)

## PaymentMethod

In [32]:
# Distribution of payment methods across customers.
payment_methods = df["PaymentMethod"]
payment_methods.value_counts()

Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: PaymentMethod, dtype: int64

In [36]:
# Flag customers on an automatic payment method (the bank-transfer and
# credit-card options carry "(automatic)" in their label).
# na=False makes a missing PaymentMethod count as non-automatic; without it
# the NaN returned by str.contains is truthy in np.where and would be coded 1.
# regex=False matches the word literally rather than as a pattern.
pays_automatically = df["PaymentMethod"].str.contains("automatic", regex=False, na=False)
df = df.assign(automatic_payment=np.where(pays_automatically, 1, 0))

In [37]:
# Check the engineered columns on the first five rows.
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,no_of_services,Contract_Month-to-month,Contract_One year,Contract_Two year,automatic_payment
0,7590-VHVEG,Female,0,1,0,1,0,No phone service,DSL,0,1,0,0,0,0,Month-to-month,Yes,Electronic check,29.85,29.85,No,1,1,0,0,0
1,5575-GNVDE,Male,0,0,0,34,1,No,DSL,1,0,1,0,0,0,One year,No,Mailed check,56.95,1889.5,No,3,0,1,0,0
2,3668-QPYBK,Male,0,0,0,2,1,No,DSL,1,1,0,0,0,0,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,3,1,0,0,0
3,7795-CFOCW,Male,0,0,0,45,0,No phone service,DSL,1,0,1,1,0,0,One year,No,Bank transfer (automatic),42.3,1840.75,No,3,0,1,0,1
4,9237-HQITU,Female,0,0,0,2,1,No,Fiber optic,0,0,0,0,0,0,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,1,0,0,0


In [38]:
# Predictors for the churn model, grouped by theme. "Contract_Two year" is
# deliberately left out of the contract indicators.
variables = (
    # service usage and billing
    ["no_of_services", "MonthlyCharges"]
    # contract type indicators
    + ["Contract_Month-to-month", "Contract_One year"]
    # payment behaviour
    + ["automatic_payment"]
    # customer demographics
    + ["SeniorCitizen", "Partner", "Dependents"]
)

## Logistic Regression

In [40]:
# Add the intercept column "const" used by the regression below. The keyword
# values are the library defaults, spelled out for the reader.
df = sm.add_constant(df, prepend=True, has_constant="skip")

In [None]:
# Preview the model inputs (intercept plus selected predictors) before fitting.
df[["const"] + variables].head()

In [43]:
# Design matrix: intercept plus the selected predictors, cast to float so that
# integer/boolean indicator columns reach statsmodels as one numeric array
# instead of an object-dtype array.
X = df[["const"] + variables].astype(float)
# Binary target: 1 if the customer churned, 0 otherwise.
y = np.where(df["Churn"] == "Yes", 1, 0)

# Fit the logistic regression of churn on X.
lr = sm.Logit(y, X).fit()

Optimization terminated successfully.
         Current function value: 0.441579
         Iterations 8


In [44]:
# Coefficient table with 95% confidence intervals (alpha=0.05 is the default).
lr.summary(alpha=0.05)

0,1,2,3
Dep. Variable:,y,No. Observations:,7043.0
Model:,Logit,Df Residuals:,7034.0
Method:,MLE,Df Model:,8.0
Date:,"Thu, 14 Nov 2019",Pseudo R-squ.:,0.2368
Time:,12:58:09,Log-Likelihood:,-3110.0
converged:,True,LL-Null:,-4075.1
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.9021,0.180,-21.681,0.000,-4.255,-3.549
no_of_services,-0.3763,0.032,-11.672,0.000,-0.440,-0.313
MonthlyCharges,0.0345,0.002,18.844,0.000,0.031,0.038
Contract_Month-to-month,2.4009,0.160,15.005,0.000,2.087,2.714
Contract_One year,1.1135,0.171,6.512,0.000,0.778,1.449
automatic_payment,-0.4890,0.069,-7.105,0.000,-0.624,-0.354
SeniorCitizen,0.2131,0.080,2.650,0.008,0.055,0.371
Partner,-0.2044,0.073,-2.808,0.005,-0.347,-0.062
Dependents,-0.1788,0.086,-2.073,0.038,-0.348,-0.010


In [45]:
# Sample mean of each predictor, for reference when reading the coefficients.
predictors = df.loc[:, variables]
predictors.mean()

no_of_services              2.941076
MonthlyCharges             64.761692
Contract_Month-to-month     0.550192
Contract_One year           0.209144
automatic_payment           0.435326
SeniorCitizen               0.162147
Partner                     0.483033
Dependents                  0.299588
dtype: float64

In [49]:
# In-sample predicted churn probability for every customer.
model_inputs = df[["const"] + variables]
predicted_probability = lr.predict(model_inputs)
df = df.assign(churn_prediction=predicted_probability)

In [50]:
# Observed churn label next to the model's predicted probability.
df.loc[:, ["Churn", "churn_prediction"]]

Unnamed: 0,Churn,churn_prediction
0,No,0.258960
1,No,0.124400
2,Yes,0.316251
3,No,0.049919
4,Yes,0.637212
...,...,...
7038,No,0.075718
7039,No,0.121379
7040,No,0.224648
7041,Yes,0.668129
