# Churn Prediction on Telco Dataset


Term 1 2020 - Instructor: Teerapong Leelanupab

Teaching Assistants:
1. Tiwipab Meephruek (Mil)
2. Jiratkul Wangsiripaisarn (Brooklyn)
3. Hataichanok Sakkara (Pond)

***

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load data

In [2]:
def load_data(data_dir='data/TelcoCustomerChurn'):
    """Read the train and test splits of the Telco churn data.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing ``train.csv`` and ``test.csv``. The default is
        the location this notebook originally hard-coded, so a bare
        ``load_data()`` call behaves exactly as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``, each parsed with ``pd.read_csv`` defaults.
    """
    # Function-scope import keeps this cell runnable on its own.
    from pathlib import Path

    base_dir = Path(data_dir)
    train_df = pd.read_csv(base_dir / 'train.csv')
    test_df = pd.read_csv(base_dir / 'test.csv')
    return train_df, test_df

In [3]:
# Load both splits once; the following cells inspect and reuse train_df / test_df.
train_df, test_df = load_data()

In [4]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0401-WDBXM,Male,0,Yes,Yes,72,Yes,Yes,Fiber optic,Yes,...,Yes,No,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.55,7542.25,No
1,6701-DHKWQ,Female,0,Yes,Yes,61,No,No phone service,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),51.35,3244.4,No
2,3315-IKYZQ,Male,0,Yes,Yes,28,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,One year,No,Mailed check,50.8,1386.8,No
3,0578-SKVMF,Female,0,Yes,Yes,22,Yes,No,Fiber optic,No,...,No,No,No,Yes,Month-to-month,No,Electronic check,83.3,1845.9,Yes
4,2498-XLDZR,Female,0,Yes,Yes,32,Yes,No,DSL,Yes,...,Yes,Yes,No,Yes,Two year,No,Mailed check,73.6,2316.85,No


In [5]:
# Summary statistics of the numeric columns; a count below the row total
# signals missing values in that column.
train_df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,5634.0,5634.0,5634.0,5629.0
mean,0.162052,32.624601,64.523571,2302.477314
std,0.368531,24.653462,30.301787,2287.079576
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,34.5375,399.6
50%,0.0,29.0,70.35,1410.25
75%,0.0,56.0,89.95,3848.0
max,1.0,72.0,118.75,8684.8


In [6]:
# Class balance of the training labels as relative frequencies.
train_df['Churn'].value_counts(normalize=True)

No     0.733759
Yes    0.266241
Name: Churn, dtype: float64

In [7]:
# Class balance of the test labels as relative frequencies.
test_df['Churn'].value_counts(normalize=True)

No     0.738112
Yes    0.261888
Name: Churn, dtype: float64

In [8]:
# List the available column names.
train_df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [9]:
# Count missing values per column in the training split.
train_df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        5
Churn               0
dtype: int64

In [10]:
# Count missing values per column in the test split.
test_df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        6
Churn               0
dtype: int64

In [11]:
# Peek at the raw TotalCharges values of the training split.
train_df['TotalCharges'].values

array([7542.25, 3244.4 , 1386.8 , ..., 5742.9 ,  617.85, 2231.05])

# 2. Clean data

In [12]:
def clean_data(df):
    """Return a copy of ``df`` with missing ``TotalCharges`` replaced by 0.

    The input frame is left untouched, so re-running a cell that calls this
    function never changes what earlier cells displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TotalCharges`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; only
        ``TotalCharges`` differs, with NaN filled by 0.

    Raises
    ------
    KeyError
        If ``df`` has no ``TotalCharges`` column.
    """
    # NOTE(review): 0 is presumably appropriate because the missing rows are
    # customers with no billing history yet (tenure 0) - confirm on the data.
    return df.assign(TotalCharges=df['TotalCharges'].fillna(0))

In [13]:
# Sanity check: total number of missing values in the cleaned training frame (expected 0).
clean_data(train_df).isna().sum().sum()

0

# 3. Extract features

In [14]:
# Column names again, as a reference for picking the feature columns below.
train_df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [15]:
# Column roles used by the feature-extraction helpers.
label_col = 'Churn'

# Continuous features, passed through unchanged.
num_feat_col = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Categorical features, handed to pd.get_dummies.
cat_feat_col = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

In [16]:
def split_data_label(df, label_col):
    """Separate the label column from the remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both features and the label.
    label_col : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(data, label)``: ``df`` without ``label_col``, and the
        ``label_col`` column itself. ``df`` is not modified.
    """
    target = df[label_col]
    features = df.drop(columns=label_col)
    return features, target

def preprocess_num_feat(df, num_feat_col):
    """Select the numeric feature columns, without any transformation.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    num_feat_col : list of str
        Names of the columns to keep.

    Returns
    -------
    pandas.DataFrame
        The requested columns, in the order given.
    """
    numeric_part = df[num_feat_col]
    return numeric_part

def preprocess_cat_feat(df, cat_feat_col):
    """One-hot encode the categorical feature columns with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    cat_feat_col : list of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        Indicator columns named ``<column>_<value>``. Columns that are
        already numeric are passed through unencoded.
    """
    # NOTE(review): the set of indicator columns depends on the values present
    # in ``df``, so two frames encoded separately may not share columns.
    return pd.get_dummies(df[cat_feat_col])

def extract_feat(df, num_feat_col, cat_feat_col):
    """Build the model's feature matrix from a data frame.

    The frame is cleaned with ``clean_data``, then the numeric and the
    encoded categorical features are placed side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the two lists.
    num_feat_col : list of str
        Numeric feature columns.
    cat_feat_col : list of str
        Categorical feature columns.

    Returns
    -------
    pandas.DataFrame
        Numeric columns first, followed by the encoded categorical columns.
    """
    cleaned = clean_data(df)
    parts = [
        preprocess_num_feat(cleaned, num_feat_col),
        preprocess_cat_feat(cleaned, cat_feat_col),
    ]
    return pd.concat(parts, axis=1)

In [17]:
# Separate features and labels; reuse label_col rather than repeating the literal.
train_data, train_label = split_data_label(train_df, label_col)
test_data, test_label = split_data_label(test_df, label_col)

In [18]:
# Build the feature matrices for both splits with the same column lists.
train_feat = extract_feat(train_data, num_feat_col, cat_feat_col)
test_feat = extract_feat(test_data, num_feat_col, cat_feat_col)

In [19]:
# Inspect the resulting training feature matrix.
train_feat

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,SeniorCitizen,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,72,105.55,7542.25,0,0,1,0,1,0,1,...,1,0,0,1,0,1,1,0,0,0
1,61,51.35,3244.40,0,1,0,0,1,0,1,...,0,0,0,1,1,0,0,1,0,0
2,28,50.80,1386.80,0,0,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
3,22,83.30,1845.90,0,1,0,0,1,0,1,...,1,1,0,0,1,0,0,0,1,0
4,32,73.60,2316.85,0,1,0,0,1,0,1,...,1,0,0,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,72,66.75,4760.30,0,1,0,0,1,0,1,...,0,0,0,1,1,0,0,1,0,0
5630,46,91.30,4126.35,1,1,0,1,0,1,0,...,0,0,1,0,0,1,1,0,0,0
5631,60,95.75,5742.90,0,0,1,0,1,1,0,...,0,1,0,0,0,1,0,0,1,0
5632,11,54.60,617.85,0,1,0,1,0,1,0,...,0,0,1,0,0,1,0,0,0,1


# 4. Train model 

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

In [21]:
def train_model(feat, label):
    """Fit a gradient-boosting classifier on the given features and labels.

    Parameters
    ----------
    feat : pandas.DataFrame
        Feature matrix.
    label : pandas.Series
        Target values, one per row of ``feat``.

    Returns
    -------
    GradientBoostingClassifier
        The fitted estimator; ``random_state=0`` fixes the seed so repeated
        runs give the same model.
    """
    classifier = GradientBoostingClassifier(random_state=0)
    # fit() returns the estimator itself.
    return classifier.fit(feat, label)

In [22]:
# Fit the classifier on the training features and labels.
model = train_model(train_feat, train_label)

In [23]:
# Show the fitted estimator and its hyperparameters.
model

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

# 5. Evaluation

In [24]:
def eval_acc(pred, act):
    """Compute classification accuracy: the fraction of matching predictions.

    Inputs are compared position by position, whatever their container type.
    Plain lists and pandas Series with different indexes are therefore
    handled the same way as numpy arrays.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    act : array-like
        Actual labels, same length as ``pred``.

    Returns
    -------
    float
        Share of positions where ``pred`` equals ``act``, in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    # Converting to arrays drops any pandas index, so the comparison is
    # strictly positional and never depends on index alignment.
    pred_arr = np.asarray(pred)
    act_arr = np.asarray(act)
    if pred_arr.shape != act_arr.shape:
        raise ValueError(
            'pred and act must have the same shape, got %s and %s'
            % (pred_arr.shape, act_arr.shape)
        )
    if act_arr.size == 0:
        raise ValueError('cannot compute accuracy on empty input')
    return float(np.mean(pred_arr == act_arr))

In [25]:
# Predict on the held-out test features and report the accuracy.
pred = model.predict(test_feat)
eval_acc(pred, test_label)

0.8005677785663591

# 6. Create pipeline

In [26]:
def pipeline(num_feat_col=None, cat_feat_col=None):
    """Run the whole experiment: load, clean, featurize, train and evaluate.

    Parameters
    ----------
    num_feat_col : list of str, optional
        Numeric feature columns. Defaults to tenure, MonthlyCharges and
        TotalCharges.
    cat_feat_col : list of str, optional
        Categorical feature columns. Defaults to the full set of 16
        categorical columns.

    Returns
    -------
    float
        Accuracy of the trained model on the test split.
    """
    # None sentinels avoid sharing mutable default lists between calls.
    if num_feat_col is None:
        num_feat_col = ['tenure', 'MonthlyCharges', 'TotalCharges']
    if cat_feat_col is None:
        cat_feat_col = [
            'gender', 'SeniorCitizen', 'Partner', 'Dependents',
            'PhoneService', 'MultipleLines', 'InternetService',
            'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
            'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
            'PaperlessBilling', 'PaymentMethod',
        ]

    # 1. Load data
    train_df, test_df = load_data()

    # 2. Clean data
    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    # 3. Extract feature
    train_data, train_label = split_data_label(train_df, 'Churn')
    test_data, test_label = split_data_label(test_df, 'Churn')

    # Featurize the label-free frames so the target can never leak into the
    # feature matrix, whatever column lists are passed in.
    train_feat = extract_feat(train_data, num_feat_col, cat_feat_col)
    test_feat = extract_feat(test_data, num_feat_col, cat_feat_col)

    # One-hot columns are derived per frame; align the test matrix to the
    # training columns so a category missing from one split cannot shift or
    # drop features (absent indicators become 0, unseen ones are discarded).
    test_feat = test_feat.reindex(columns=train_feat.columns, fill_value=0)

    # 4. Train model
    model = train_model(train_feat, train_label)

    # 5. Prediction
    pred = model.predict(test_feat)

    # 6. Evaluation
    acc = eval_acc(pred, test_label)

    return acc

In [27]:
# Run the end-to-end pipeline; the result should match the step-by-step accuracy above.
pipeline()

0.8005677785663591

In [28]:
def pipeline(num_feat_col=None, cat_feat_col=None):
    """Run the experiment end to end with a reduced categorical feature set.

    NOTE(review): this redefinition shadows the ``pipeline`` defined in an
    earlier cell. Only the default ``cat_feat_col`` differs.

    Parameters
    ----------
    num_feat_col : list of str, optional
        Numeric feature columns. Defaults to tenure, MonthlyCharges and
        TotalCharges.
    cat_feat_col : list of str, optional
        Categorical feature columns. Defaults to ``['gender']``.

    Returns
    -------
    float
        Accuracy of the trained model on the test split.
    """
    # None sentinels avoid sharing mutable default lists between calls.
    if num_feat_col is None:
        num_feat_col = ['tenure', 'MonthlyCharges', 'TotalCharges']
    if cat_feat_col is None:
        cat_feat_col = ['gender']

    # Load data
    train_df, test_df = load_data()

    # Clean data
    train_df = clean_data(train_df)
    test_df = clean_data(test_df)

    # Extract feature
    train_data, train_label = split_data_label(train_df, 'Churn')
    test_data, test_label = split_data_label(test_df, 'Churn')

    # Featurize the label-free frames so the target can never leak into the
    # feature matrix, whatever column lists are passed in.
    train_feat = extract_feat(train_data, num_feat_col, cat_feat_col)
    test_feat = extract_feat(test_data, num_feat_col, cat_feat_col)

    # One-hot columns are derived per frame; align the test matrix to the
    # training columns (absent indicators become 0, unseen ones are discarded).
    test_feat = test_feat.reindex(columns=train_feat.columns, fill_value=0)

    # Train model
    model = train_model(train_feat, train_label)

    # Prediction
    pred = model.predict(test_feat)

    # Evaluation
    acc = eval_acc(pred, test_label)

    return acc

pipeline()

0.7828246983676366