This notebook is intended to build a model for customer churn using two types of Classification (Logistic Regression and SVM).
The steps are as follows:
1. Uploading data
2. Checking for missing and inconsistency data
3. Converting the categorical variables to dummy variables
4. Splitting the data into the explanatories and response variable, where churn is the response (dependent variable)
5. Splitting the train data into train and test
6. Fitting the model to trainset (SVM or Logistic Regression)
7. Calculating the accuracy score
8. Use the model to predict churn in the test set

In [4]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [5]:
df = pd.read_csv('C:\\Users\\LENOVO\\Downloads\\Telegram Desktop\\train.csv')

In [6]:
#Checking the data type for each variable
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4250 non-null   object 
 1   account_length                 4250 non-null   int64  
 2   area_code                      4250 non-null   object 
 3   international_plan             4250 non-null   object 
 4   voice_mail_plan                4250 non-null   object 
 5   number_vmail_messages          4250 non-null   int64  
 6   total_day_minutes              4250 non-null   float64
 7   total_day_calls                4250 non-null   int64  
 8   total_day_charge               4250 non-null   float64
 9   total_eve_minutes              4250 non-null   float64
 10  total_eve_calls                4250 non-null   int64  
 11  total_eve_charge               4250 non-null   float64
 12  total_night_minutes            4250 non-null   f

In [7]:
#Checking for null values
df.isna().sum()

state                            0
account_length                   0
area_code                        0
international_plan               0
voice_mail_plan                  0
number_vmail_messages            0
total_day_minutes                0
total_day_calls                  0
total_day_charge                 0
total_eve_minutes                0
total_eve_calls                  0
total_eve_charge                 0
total_night_minutes              0
total_night_calls                0
total_night_charge               0
total_intl_minutes               0
total_intl_calls                 0
total_intl_charge                0
number_customer_service_calls    0
churn                            0
dtype: int64

No null values found

In [8]:
df.head()

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
1,NJ,137,area_code_415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
2,OH,84,area_code_408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
3,OK,75,area_code_415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no
4,MA,121,area_code_510,no,yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,no


In [9]:
#Statistical Descriptive for Numerical Categories
df.describe()

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
count,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0
mean,100.236235,7.631765,180.2596,99.907294,30.644682,200.173906,100.176471,17.015012,200.527882,99.839529,9.023892,10.256071,4.426353,2.769654,1.559059
std,39.698401,13.439882,54.012373,19.850817,9.182096,50.249518,19.908591,4.271212,50.353548,20.09322,2.265922,2.760102,2.463069,0.745204,1.311434
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,73.0,0.0,143.325,87.0,24.365,165.925,87.0,14.1025,167.225,86.0,7.5225,8.5,3.0,2.3,1.0
50%,100.0,0.0,180.45,100.0,30.68,200.7,100.0,17.06,200.45,100.0,9.02,10.3,4.0,2.78,1.0
75%,127.0,16.0,216.2,113.0,36.75,233.775,114.0,19.8675,234.7,113.0,10.56,12.0,6.0,3.24,2.0
max,243.0,52.0,351.5,165.0,59.76,359.3,170.0,30.54,395.0,175.0,17.77,20.0,20.0,5.4,9.0


In [10]:
#Summary for Categorical Variables
df.describe(include='O').T

Unnamed: 0,count,unique,top,freq
state,4250,51,WV,139
area_code,4250,3,area_code_415,2108
international_plan,4250,2,no,3854
voice_mail_plan,4250,2,no,3138
churn,4250,2,no,3652


In [11]:
df2 = df.copy()

In [12]:
#Untuk variabel international_plan, voicemail plan, serta churn hanya terdiri dari 2 kategori : Yes dan No
#Akan ditransformasi ke dummy variabel
cat_col = ['international_plan','voice_mail_plan','churn']
def getdummy(frame, col):
    for i in col:
        frame[i].replace({"yes":1, "no":0}, inplace = True)
    return frame

In [13]:
getdummy (df2,cat_col)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
0,OH,107,area_code_415,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,0
1,NJ,137,area_code_415,0,0,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,0
2,OH,84,area_code_408,1,0,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
3,OK,75,area_code_415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0
4,MA,121,area_code_510,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4245,MT,83,area_code_415,0,0,0,188.3,70,32.01,243.8,88,20.72,213.7,79,9.62,10.3,6,2.78,0,0
4246,WV,73,area_code_408,0,0,0,177.9,89,30.24,131.2,82,11.15,186.2,89,8.38,11.5,6,3.11,3,0
4247,NC,75,area_code_408,0,0,0,170.7,101,29.02,193.1,126,16.41,129.1,104,5.81,6.9,7,1.86,1,0
4248,HI,50,area_code_408,0,1,40,235.7,127,40.07,223.0,126,18.96,297.5,116,13.39,9.9,5,2.67,2,0


In [14]:
from sklearn import preprocessing

In [15]:
#Splitting the variables into predictor and response variables
x = df2.iloc[:,1:-1].values
y = df2.iloc[:,-1].values

In [16]:
le_area = preprocessing.LabelEncoder()
le_area.fit([ 'area_code_415', 'area_code_408', 'area_code_510'])
x[:,1]= le_area.transform(x[:,1])

In [17]:
from sklearn import preprocessing
x = preprocessing.StandardScaler().fit(x).transform(x.astype(float))
x[0:5]

array([[ 0.17039882,  0.00994348, -0.32054702,  1.67986382,  1.36685697,
        -0.34550964,  1.16344948, -0.3457877 , -0.09302489,  0.14184136,
        -0.09249324,  1.07000318,  0.15730891,  1.0708197 ,  1.24790134,
        -0.57916393,  1.24859124, -0.42634613],
       [ 0.92618569,  0.00994348, -0.32054702, -0.59528635, -0.56791143,
         1.16913636,  0.7100143 ,  1.1692952 , -1.57182002,  0.49348973,
        -1.57234104, -0.75332021,  0.2070828 , -0.75205254,  0.70437917,
         0.2329267 ,  0.69834168, -1.1889602 ],
       [-0.40903778, -1.39871658,  3.11966717, -0.59528635, -0.56791143,
         2.20605783, -1.45639825,  2.20621751, -2.7520697 , -0.61169087,
        -2.75247283, -0.07205667, -0.53952552, -0.07233747, -1.3247703 ,
         1.04501732, -1.32818716,  0.33626795],
       [-0.63577385,  0.00994348,  3.11966717, -0.59528635, -0.56791143,
        -0.25107572,  0.65963261, -0.25102695, -1.0324479 ,  1.09631552,
        -1.0314473 , -0.27067578,  1.05323889, -0.270

In [18]:
x_trainset, x_testset, y_trainset, y_testset = train_test_split(x, y, test_size=0.3, random_state=3)

In [19]:
from sklearn import linear_model
logmodel = linear_model.LogisticRegression(random_state=0, max_iter=100).fit(x_trainset,y_trainset)

In [20]:
y_hat = logmodel.predict(x_testset)
#Calculating accuracy score
from sklearn import metrics
print('Accuracy score using Logistic Regression is: ', metrics.accuracy_score(y_hat, y_testset))

Accuracy score using Logistic Regression is:  0.8776470588235294


Support Vector Machine

In [21]:
from sklearn import svm

In [22]:
from sklearn.metrics import confusion_matrix

In [23]:
#Support Vector Machine

In [24]:
svm_model = svm.SVC(kernel='rbf')
svm_model.fit(x_trainset, y_trainset) 

SVC()

In [25]:
y_svmpred = svm_model.predict(x_testset)

In [26]:
cnf_matrix = confusion_matrix(y_testset, y_svmpred)
cnf_matrix

array([[1098,   10],
       [  79,   88]], dtype=int64)

In [27]:
print('Accuracy score using SVM is: ',metrics.accuracy_score(y_svmpred,y_testset))

Accuracy score using SVM is:  0.9301960784313725


In [36]:
#LOADING THE REAL TEST SET
df_test = pd.read_csv('C:\\Users\\LENOVO\\Downloads\\Telegram Desktop\\test.csv') 

In [37]:
#Checking for Null Values
df_test.isna().sum()

id                               0
state                            0
account_length                   0
area_code                        0
international_plan               0
voice_mail_plan                  0
number_vmail_messages            0
total_day_minutes                0
total_day_calls                  0
total_day_charge                 0
total_eve_minutes                0
total_eve_calls                  0
total_eve_charge                 0
total_night_minutes              0
total_night_calls                0
total_night_charge               0
total_intl_minutes               0
total_intl_calls                 0
total_intl_charge                0
number_customer_service_calls    0
dtype: int64

In [38]:
#Transforming the categorical variables to dummy
getdummy (df_test,['international_plan','voice_mail_plan'])

Unnamed: 0,id,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,1,KS,128,area_code_415,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1
1,2,AL,118,area_code_510,1,0,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.70,0
2,3,IA,62,area_code_415,0,0,0,120.7,70,20.52,307.2,76,26.11,203.0,99,9.14,13.1,6,3.54,4
3,4,VT,93,area_code_510,0,0,0,190.7,114,32.42,218.2,111,18.55,129.6,121,5.83,8.1,3,2.19,3
4,5,NE,174,area_code_415,0,0,0,124.3,76,21.13,277.1,112,23.55,250.7,115,11.28,15.5,5,4.19,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,746,GA,130,area_code_415,0,0,0,119.4,99,20.30,226.3,97,19.24,202.7,111,9.12,11.3,7,3.05,0
746,747,WA,73,area_code_408,0,0,0,177.2,118,30.12,270.5,84,22.99,241.8,112,10.88,12.3,2,3.32,3
747,748,WV,152,area_code_415,0,0,0,184.2,90,31.31,256.8,73,21.83,213.6,113,9.61,14.7,2,3.97,3
748,749,DC,61,area_code_415,0,0,0,140.6,89,23.90,172.8,128,14.69,212.4,97,9.56,13.6,4,3.67,1


In [39]:
#Selecting thhe features as independent variables
x_finaltest = df_test.iloc[:,2:].values
le_area = preprocessing.LabelEncoder()
le_area.fit([ 'area_code_415', 'area_code_408', 'area_code_510'])
x_finaltest[:,1]= le_area.transform(x_finaltest[:,1])

In [40]:
from sklearn import preprocessing
x_finaltest= preprocessing.StandardScaler().fit(x_finaltest).transform(x_finaltest.astype(float))
x_finaltest[0:5]

array([[ 0.69606476, -0.03259835, -0.33825002,  1.59828107,  1.17223958,
         1.59039051,  0.47086951,  1.59068613, -0.11233349, -0.06578963,
        -0.11213984,  0.87540141, -0.48875703,  0.8747719 , -0.10624313,
        -0.61370155, -0.10697644, -0.49763904],
       [ 0.44400133,  1.40556411,  2.95639303, -0.62567218, -0.59901452,
         0.80689199, -0.13810097,  0.80706315,  0.33253137,  0.03754487,
         0.33228657,  0.08312202,  0.91951414,  0.08508483, -1.4427104 ,
         0.62582133, -1.44529723, -1.28173416],
       [-0.96755388, -0.03259835, -0.33825002, -0.62567218, -0.59901452,
        -1.12273146, -1.55903209, -1.12270516,  1.9931045 , -1.25413644,
         1.99268167,  0.06564527, -0.07149149,  0.06782391,  1.01349972,
         0.62582133,  1.01721302,  1.85464633],
       [-0.18615725,  1.40556411, -0.33825002, -0.62567218, -0.59901452,
         0.19249387,  0.67385967,  0.19254357,  0.28651086,  0.55421739,
         0.28716714, -1.35968089,  1.07598872, -1.360

In [41]:
#Predicting Customer Churn With Logistic and SVM
y_predict_log = logmodel.predict(x_finaltest)
y_predict_svm = svm_model.predict(x_finaltest)

In [45]:
test_result = pd.DataFrame ({'Churn_Log':y_predict_log, 'Churn_SVM':y_predict_svm}, columns=['Churn_Log','Churn_SVM'])

In [64]:
print('Total Customer Churn Predicted by Logistic Regression is: ', sum(test_result['Churn_Log']==1),' out of 750 customers')
print('Total Customer Churn Predicted by SVM is: ', sum(test_result['Churn_SVM']==1),' out of 750 customers')

Total Customer Churn Predicted by Logistic Regression is:  31  out of 750 customers
Total Customer Churn Predicted by SVM is:  63  out of 750 customers


In [59]:
df_test = pd.concat((df_test, test_result), axis=1)
df_test[0:5]

Unnamed: 0,id,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,Churn_Log,Churn_SVM
0,1,KS,128,area_code_415,0,1,25,265.1,110,45.07,...,16.78,244.7,91,11.01,10.0,3,2.7,1,0,0
1,2,AL,118,area_code_510,1,0,0,223.4,98,37.98,...,18.75,203.9,118,9.18,6.3,6,1.7,0,0,0
2,3,IA,62,area_code_415,0,0,0,120.7,70,20.52,...,26.11,203.0,99,9.14,13.1,6,3.54,4,0,0
3,4,VT,93,area_code_510,0,0,0,190.7,114,32.42,...,18.55,129.6,121,5.83,8.1,3,2.19,3,0,0
4,5,NE,174,area_code_415,0,0,0,124.3,76,21.13,...,23.55,250.7,115,11.28,15.5,5,4.19,3,0,0
