In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

%matplotlib inline

In [4]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# 1. Quick Cleaning

In [5]:
# Keep only the rows whose TotalCharges is not exactly a single space, then renumber the index.
df = df.loc[df["TotalCharges"].ne(" ")].reset_index(drop=True)

In [23]:
df.nunique()

customerID          7032
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
dtype: int64

In [28]:
df.nunique()[df.nunique() == 3]

MultipleLines       3
InternetService     3
OnlineSecurity      3
OnlineBackup        3
DeviceProtection    3
TechSupport         3
StreamingTV         3
StreamingMovies     3
Contract            3
dtype: int64

In [33]:
def get_cols_list(df, nunique, smaller_or_equal=True):
    """Return the labels of the columns of ``df`` selected by distinct-value count.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    nunique : int
        Threshold compared against the per-column result of ``df.nunique()``.
    smaller_or_equal : bool, default True
        If True, select columns with at most ``nunique`` distinct values.
        If False, select only columns with exactly ``nunique`` distinct values.

    Returns
    -------
    list
        Labels of the matching columns, in the column order of ``df``.
    """
    # Count distinct values once and reuse the result for both the mask and the lookup.
    counts = df.nunique()
    mask = counts <= nunique if smaller_or_equal else counts == nunique
    return counts[mask].index.tolist()

Check columns with 2 unique values

In [43]:
# pd.Series.value_counts is the supported spelling; the top-level pd.value_counts is deprecated.
df[get_cols_list(df, 2, False)].apply(pd.Series.value_counts)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,,5890.0,,,,,
1,,1142.0,,,,,
Female,3483.0,,,,,,
Male,3549.0,,,,,,
No,,,3639.0,4933.0,680.0,2864.0,5163.0
Yes,,,3393.0,2099.0,6352.0,4168.0,1869.0


In [44]:
to_bool = ["Partner", "Dependents", "PhoneService", "PaperlessBilling", "Churn"]

# Encode "Yes" as 1 and every other value as 0. Values that are already 1 are
# kept as 1, so re-running this cell does not wipe the encoded columns to zero.
df[to_bool] = df[to_bool].isin(["Yes", 1]).astype(int)

In [46]:
# Re-check the binary columns after encoding (pd.Series.value_counts replaces deprecated pd.value_counts).
df[get_cols_list(df, 2, False)].apply(pd.Series.value_counts)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,,5890.0,3639.0,4933.0,680.0,2864.0,5163.0
1,,1142.0,3393.0,2099.0,6352.0,4168.0,1869.0
Female,3483.0,,,,,,
Male,3549.0,,,,,,


In [49]:
# Replace the text "gender" column with a 0/1 "Female" indicator appended as the last column.
df = (
    df.assign(Female=(df["gender"] == "Female").astype(int))
      .drop(columns=["gender"])
)

In [50]:
# All two-valued columns should now be 0/1 (pd.Series.value_counts replaces deprecated pd.value_counts).
df[get_cols_list(df, 2, False)].apply(pd.Series.value_counts)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,PaperlessBilling,Churn,Female
0,5890,3639,4933,680,2864,5163,3549
1,1142,3393,2099,6352,4168,1869,3483


Check columns with 3 unique values

In [51]:
# Level counts for the three-valued columns (pd.Series.value_counts replaces deprecated pd.value_counts).
df[get_cols_list(df, 3, False)].apply(pd.Series.value_counts)

Unnamed: 0,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract
DSL,,2416.0,,,,,,,
Fiber optic,,3096.0,,,,,,,
Month-to-month,,,,,,,,,3875.0
No,3385.0,1520.0,3497.0,3087.0,3094.0,3472.0,2809.0,2781.0,
No internet service,,,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,
No phone service,680.0,,,,,,,,
One year,,,,,,,,,1472.0
Two year,,,,,,,,,1685.0
Yes,2967.0,,2015.0,2425.0,2418.0,2040.0,2703.0,2731.0,


In [53]:
# InternetService and Contract are set aside; their levels are not yes/no variants.
exclude = ["InternetService", "Contract"]
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
df[[col for col in get_cols_list(df, 3, False) if col not in exclude]].apply(pd.Series.value_counts)

Unnamed: 0,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies
No,3385.0,3497.0,3087.0,3094.0,3472.0,2809.0,2781.0
No internet service,,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0
No phone service,680.0,,,,,,
Yes,2967.0,2015.0,2425.0,2418.0,2040.0,2703.0,2731.0


In [54]:
# Apply the yes/no encoding to the remaining three-valued columns:
# "Yes" becomes 1, every other level becomes 0.
to_bool_again = [col for col in get_cols_list(df, 3, False) if col not in exclude]
df[to_bool_again] = (df[to_bool_again] == "Yes").astype(int)

In [56]:
# Every two-valued column after both encoding passes (pd.Series.value_counts replaces deprecated pd.value_counts).
df[[col for col in get_cols_list(df, 2, False) if col not in exclude]].apply(pd.Series.value_counts)

Unnamed: 0,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,Churn,Female
0,5890,3639,4933,680,4065,5017,4607,4614,4992,4329,4301,2864,5163,3549
1,1142,3393,2099,6352,2967,2015,2425,2418,2040,2703,2731,4168,1869,3483


In [57]:
df.dtypes

customerID           object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
Female                int64
dtype: object

In [58]:
# TotalCharges was read as text (dtype object); the single-space rows were removed
# in the cleaning cell above, so the column can now be cast to float.
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [87]:
df.dtypes

customerID           object
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService      object
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract             object
PaperlessBilling      int64
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
Female                int64
dtype: object

In [60]:
df.nunique()

customerID          7032
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          2
InternetService        3
OnlineSecurity         2
OnlineBackup           2
DeviceProtection       2
TechSupport            2
StreamingTV            2
StreamingMovies        2
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
Female                 2
dtype: int64

# 2. Create model evaluation function

In [111]:
def evaluate_model(df, scaler=StandardScaler, seed=666):
    """Train a logistic-regression baseline on ``df`` and return its test accuracy.

    Columns are bucketed by cardinality with ``get_cols_list``:

    * columns with at most 4 distinct values (except the target) are treated
      as categorical; those among them that do not have exactly 2 distinct
      values are one-hot encoded with ``drop_first=True``;
    * every other column except ``customerID`` and ``Churn`` is treated as
      numeric and rescaled.

    The frame is split 80/20 with ``random_state=seed``. The scaler is fit on
    the training rows only and then applied to both partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``Churn`` target column; ``customerID`` is ignored
        when present.
    scaler : callable, default StandardScaler
        Called with no arguments to build the scaler (e.g. a scaler class).
    seed : int, default 666
        Random state passed to ``train_test_split``.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out 20% of rows.
    """
    # assign columns
    id_col = ["customerID"]
    target_col = ["Churn"]
    bool_col = [col for col in get_cols_list(df, 2, False) if col not in target_col]
    cat_col = [col for col in get_cols_list(df, 4) if col not in target_col]
    num_col = [col for col in df.columns if col not in id_col + target_col + cat_col]

    # create dummies for the categorical columns that are not already binary
    df = pd.get_dummies(data=df,
                        columns=[col for col in cat_col if col not in bool_col],
                        drop_first=True)
    # the dummy columns replaced their source columns, so refresh the list
    cat_col = [col for col in get_cols_list(df, 4) if col not in target_col]

    # split df
    train, test = train_test_split(df, test_size=.2, random_state=seed)
    train_X = train[cat_col + num_col].reset_index(drop=True)
    train_Y = train[target_col].reset_index(drop=True)
    test_X = test[cat_col + num_col].reset_index(drop=True)
    test_Y = test[target_col].reset_index(drop=True)

    # Standardize. Skipped when there are no numeric columns, because a scaler
    # cannot be fit on a frame with zero columns.
    if num_col:
        fitted_scaler = scaler().fit(train_X[num_col])

        def rescale(features):
            """Swap the numeric columns of ``features`` for their scaled values."""
            scaled = pd.DataFrame(fitted_scaler.transform(features[num_col]),
                                  columns=num_col)
            # Both frames carry a fresh 0..n-1 index, so the index join is row-for-row.
            return features.drop(columns=num_col).merge(
                scaled, left_index=True, right_index=True, how="left")

        train_X = rescale(train_X)
        test_X = rescale(test_X)

    # train model
    lr = LogisticRegression(solver="liblinear")
    lr.fit(train_X, train_Y.values.ravel())

    # Predict on test set and compute accuracy
    pred = lr.predict(test_X)
    return accuracy_score(test_Y, pred)
    

In [112]:
evaluate_model(df)

0.8009950248756219

This is our baseline model. Every change we make from here on should improve accuracy compared to this baseline.

# 3. Create different inputs

## 3.1. InternetService as a bool

In [92]:
df["InternetService"].value_counts()

Fiber optic    3096
DSL            2416
No             1520
Name: InternetService, dtype: int64

In [93]:
# Collapse InternetService into a 0/1 flag: 0 for "No", 1 for any other level.
df_i = df.assign(InternetService=(df["InternetService"] != "No").astype(int))

In [94]:
evaluate_model(df_i)

0.7981520966595593

## 3.2. Previous model (reduced feature set)

In [103]:
services = ["PhoneService", "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies"]

variables = ["no_of_services",
             "MonthlyCharges",
             "automatic_payment",
             "SeniorCitizen",
             "Partner",
             "Dependents",
             "Churn"]

# Derive the two engineered features from df, then keep only the selected columns.
df_p = df.assign(
    # number of services per customer: row-wise sum of the 0/1 service flags
    no_of_services=df[services].sum(axis=1),
    # 1 when the payment method text contains "automatic", else 0
    automatic_payment=np.where(df["PaymentMethod"].str.contains("automatic"), 1, 0),
)[variables]

In [104]:
evaluate_model(df_p)

0.7775408670931059

## 3.3. Add no_of_services

In [106]:
# Full feature set plus the service count computed for df_p. df_p was built from
# a copy of df without reindexing, so the assignment aligns row-for-row on the index.
df_n = df.assign(no_of_services=df_p["no_of_services"])

In [107]:
evaluate_model(df_n)

0.8002842928216063

## 3.4. Use MinMaxScaler

In [110]:
from sklearn.preprocessing import MinMaxScaler

In [113]:
evaluate_model(df, scaler=MinMaxScaler)

0.8002842928216063

In [114]:
evaluate_model(df_n, scaler=MinMaxScaler)

0.8002842928216063

In [115]:
evaluate_model(df)

0.8009950248756219

---

In [117]:
df.nunique()

customerID          7032
SeniorCitizen          2
Partner                2
Dependents             2
tenure                72
PhoneService           2
MultipleLines          2
InternetService        3
OnlineSecurity         2
OnlineBackup           2
DeviceProtection       2
TechSupport            2
StreamingTV            2
StreamingMovies        2
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1584
TotalCharges        6530
Churn                  2
Female                 2
dtype: int64

# 4. Refactor evaluate_model function to take classification algorithm as input

In [118]:
def evaluate_model_new(df, classifier, scaler=StandardScaler, seed=666):
    """Train ``classifier`` on ``df`` and return its accuracy on a held-out test set.

    Columns are bucketed by cardinality with ``get_cols_list``:

    * columns with at most 4 distinct values (except the target) are treated
      as categorical; those among them that do not have exactly 2 distinct
      values are one-hot encoded with ``drop_first=True``;
    * every other column except ``customerID`` and ``Churn`` is treated as
      numeric and rescaled.

    The frame is split 80/20 with ``random_state=seed``. The scaler is fit on
    the training rows only and then applied to both partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``Churn`` target column; ``customerID`` is ignored
        when present.
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. A copy is fitted, so
        the instance passed in is not modified.
    scaler : callable, default StandardScaler
        Called with no arguments to build the scaler (e.g. a scaler class).
    seed : int, default 666
        Random state passed to ``train_test_split``. It does not seed
        ``classifier``; set the estimator's own ``random_state`` for that.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out 20% of rows.
    """
    # assign columns
    id_col = ["customerID"]
    target_col = ["Churn"]
    bool_col = [col for col in get_cols_list(df, 2, False) if col not in target_col]
    cat_col = [col for col in get_cols_list(df, 4) if col not in target_col]
    num_col = [col for col in df.columns if col not in id_col + target_col + cat_col]

    # create dummies for the categorical columns that are not already binary
    df = pd.get_dummies(data=df,
                        columns=[col for col in cat_col if col not in bool_col],
                        drop_first=True)
    # the dummy columns replaced their source columns, so refresh the list
    cat_col = [col for col in get_cols_list(df, 4) if col not in target_col]

    # split df
    train, test = train_test_split(df, test_size=.2, random_state=seed)
    train_X = train[cat_col + num_col].reset_index(drop=True)
    train_Y = train[target_col].reset_index(drop=True)
    test_X = test[cat_col + num_col].reset_index(drop=True)
    test_Y = test[target_col].reset_index(drop=True)

    # Standardize. Skipped when there are no numeric columns, because a scaler
    # cannot be fit on a frame with zero columns.
    if num_col:
        fitted_scaler = scaler().fit(train_X[num_col])

        def rescale(features):
            """Swap the numeric columns of ``features`` for their scaled values."""
            scaled = pd.DataFrame(fitted_scaler.transform(features[num_col]),
                                  columns=num_col)
            # Both frames carry a fresh 0..n-1 index, so the index join is row-for-row.
            return features.drop(columns=num_col).merge(
                scaled, left_index=True, right_index=True, how="left")

        train_X = rescale(train_X)
        test_X = rescale(test_X)

    # train model on a copy so the caller's estimator instance stays unfitted;
    # safe=False falls back to a deep copy for objects that are not sklearn estimators
    model = clone(classifier, safe=False)
    model.fit(train_X, train_Y.values.ravel())

    # Predict on test set and compute accuracy
    pred = model.predict(test_X)
    return accuracy_score(test_Y, pred)
    

In [119]:
evaluate_model_new(df,classifier=LogisticRegression(solver="liblinear"))

0.8009950248756219

## 4.1. Use decision tree classifier

In [120]:
from sklearn.tree import DecisionTreeClassifier

In [121]:
# Fix the tree's random_state so the score is reproducible (same value as the split seed default).
evaluate_model_new(df, DecisionTreeClassifier(random_state=666))

0.7313432835820896

In [122]:
# Fix the tree's random_state so the score is reproducible (same value as the split seed default).
evaluate_model_new(df_p, DecisionTreeClassifier(random_state=666))

0.7192608386638237

## 4.2. Use random forest

In [123]:
from sklearn.ensemble import RandomForestClassifier

In [125]:
# Fix the forest's random_state so the score is reproducible (same value as the split seed default).
evaluate_model_new(df, RandomForestClassifier(random_state=666))



0.7789623312011372