# Classification
# Telecom churn dataset

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns
sns.set()

## Data

In [2]:
df = pd.read_csv('telecom_churn.csv')

In [3]:
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   3333 non-null   object 
 1   Account length          3333 non-null   int64  
 2   Area code               3333 non-null   int64  
 3   International plan      3333 non-null   object 
 4   Voice mail plan         3333 non-null   object 
 5   Number vmail messages   3333 non-null   int64  
 6   Total day minutes       3333 non-null   float64
 7   Total day calls         3333 non-null   int64  
 8   Total day charge        3333 non-null   float64
 9   Total eve minutes       3333 non-null   float64
 10  Total eve calls         3333 non-null   int64  
 11  Total eve charge        3333 non-null   float64
 12  Total night minutes     3333 non-null   float64
 13  Total night calls       3333 non-null   int64  
 14  Total night charge      3333 non-null   

## Preprocessing

### Encoding

In [5]:
label_columns = ['International plan', 'Voice mail plan', 'Customer service calls',]
df_label_columns = df[label_columns]

In [6]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder(dtype = 'int32')
enc_label_columns = enc.fit_transform(df_label_columns).toarray()
enc.categories_

[array(['No', 'Yes'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)]

In [7]:
enc_label_columns.shape

(3333, 14)

In [8]:
all_columns = df.columns.drop(['State', 'Area code', 'Total day charge', 'Total eve charge', 
                               'Total night charge', 'Total intl charge', 'Churn'])
all_columns

Index(['Account length', 'International plan', 'Voice mail plan',
       'Number vmail messages', 'Total day minutes', 'Total day calls',
       'Total eve minutes', 'Total eve calls', 'Total night minutes',
       'Total night calls', 'Total intl minutes', 'Total intl calls',
       'Customer service calls'],
      dtype='object')

In [9]:
columns = list(set(all_columns) - set(label_columns))
columns

['Total night minutes',
 'Total day calls',
 'Total eve calls',
 'Total intl minutes',
 'Total day minutes',
 'Number vmail messages',
 'Account length',
 'Total eve minutes',
 'Total night calls',
 'Total intl calls']

In [10]:
X = df[columns].values
X = np.concatenate([X, enc_label_columns], axis=1)
X.shape

(3333, 24)

In [11]:
Y = df['Churn'].values
Y.shape

(3333,)

## Splitting

In [12]:
Rand = 1

In [13]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, Y, random_state=Rand)
print(Xtrain.shape, Xtest.shape)
print(ytrain.shape, ytest.shape)

(2499, 24) (834, 24)
(2499,) (834,)


## Modelling

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report

### LogisticRegression

In [15]:
lgr = LogisticRegression(n_jobs = -1)

In [16]:
lgr.fit(Xtrain, ytrain)

LogisticRegression(n_jobs=-1)

In [17]:
lgr.coef_

array([[ 1.44639818e-03, -6.52712313e-03, -7.09745890e-03,
         2.49861719e-02,  9.51487581e-03, -3.19911863e-02,
        -7.81745822e-04,  3.43995864e-03, -9.22901707e-03,
        -1.57212203e-01, -1.30913598e+00,  1.08651157e+00,
        -1.33264918e-01, -8.93594941e-02, -1.84620241e-01,
        -7.71034714e-01, -3.32804421e-01, -3.09904926e-01,
         7.32075649e-01,  4.15527944e-01,  1.21838838e-01,
         7.49712080e-02,  1.72635819e-02,  1.40626692e-02]])

In [18]:
lgr.intercept_

array([-0.22281347])

#### Validation

In [19]:
# Predict on train

ypred_train = lgr.predict(Xtrain)
ypred_train_proba = lgr.predict_proba(Xtrain)

In [20]:
# Predict on test

ypred = lgr.predict(Xtest)
ypred_proba = lgr.predict_proba(Xtest)

#### Metrics

In [21]:
# Accuracy on train
accuracy_score(ytrain, ypred_train)

0.8615446178471389

In [22]:
# Accuracy on test
accuracy_score(ytest, ypred)

0.8465227817745803

In [23]:
# confusion matrix
print(confusion_matrix(ytest, ypred))

[[688  24]
 [104  18]]


In [24]:
# classification report
target_names = ['retained', 'left']
print(classification_report(ytest, ypred, target_names=target_names))

              precision    recall  f1-score   support

    retained       0.87      0.97      0.91       712
        left       0.43      0.15      0.22       122

    accuracy                           0.85       834
   macro avg       0.65      0.56      0.57       834
weighted avg       0.80      0.85      0.81       834



In [25]:
# AUC
print("AUC on train =", roc_auc_score(ytrain, ypred_train_proba[:, 1]))
print("AUC on test =", roc_auc_score(ytest, ypred_proba[:, 1]))

AUC on train = 0.8071527225330324
AUC on test = 0.7514735678762203


In [26]:
# cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(lgr, Xtrain, ytrain, cv=5)
scores.mean()

0.8643462925851704

### RandomForest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
rfc = RandomForestClassifier(random_state = Rand, n_jobs = -1)

In [29]:
rfc.fit(Xtrain, ytrain)

RandomForestClassifier(n_jobs=-1, random_state=1)

In [30]:
print('train accuracy:', rfc.score(Xtrain, ytrain))
print('test accuracy:', rfc.score(Xtest, ytest))

train accuracy: 1.0
test accuracy: 0.9304556354916067


#### Validation

In [31]:
ypred_train = rfc.predict(Xtrain)
ypred_train_proba = rfc.predict_proba(Xtrain)

In [32]:
ypred = rfc.predict(Xtest)
ypred_proba = rfc.predict_proba(Xtest)

#### Metrics

In [33]:
# Accuracy on train
accuracy_score(ytrain, ypred_train)

1.0

In [34]:
# Accuracy on test
accuracy_score(ytest, ypred)

0.9304556354916067

In [35]:
# confusion matrix
print(confusion_matrix(ytest, ypred))

[[701  11]
 [ 47  75]]


In [36]:
# classification report
target_names = ['retained', 'left']
print(classification_report(ytest, ypred, target_names=target_names))

              precision    recall  f1-score   support

    retained       0.94      0.98      0.96       712
        left       0.87      0.61      0.72       122

    accuracy                           0.93       834
   macro avg       0.90      0.80      0.84       834
weighted avg       0.93      0.93      0.93       834



In [37]:
# AUC
print("AUC on train =", roc_auc_score(ytrain, ypred_train_proba[:, 1]))
print("AUC on test =", roc_auc_score(ytest, ypred_proba[:, 1]))

AUC on train = 1.0
AUC on test = 0.8901328513538405


In [38]:
# cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rfc, Xtrain, ytrain, cv=5)
scores.mean()

0.9299759519038077

### GradientBoosting

In [39]:
from sklearn.ensemble import GradientBoostingClassifier

In [40]:
gbc = GradientBoostingClassifier(random_state = Rand)

In [41]:
gbc.fit(Xtrain, ytrain)

GradientBoostingClassifier(random_state=1)

In [42]:
print('train accuracy:', gbc.score(Xtrain, ytrain))
print('test accuracy:', gbc.score(Xtest, ytest))

train accuracy: 0.9715886354541817
test accuracy: 0.9424460431654677


#### Validation

In [43]:
ypred_train = gbc.predict(Xtrain)
ypred_train_proba = gbc.predict_proba(Xtrain)

In [44]:
ypred = gbc.predict(Xtest)
ypred_proba = gbc.predict_proba(Xtest)

#### Metrics

In [45]:
# Accuracy on train
accuracy_score(ytrain, ypred_train)

0.9715886354541817

In [46]:
# Accuracy on test
accuracy_score(ytest, ypred)

0.9424460431654677

In [47]:
# confusion matrix
print(confusion_matrix(ytest, ypred))

[[701  11]
 [ 37  85]]


In [48]:
# classification report
target_names = ['retained', 'left']
print(classification_report(ytest, ypred, target_names=target_names))

              precision    recall  f1-score   support

    retained       0.95      0.98      0.97       712
        left       0.89      0.70      0.78       122

    accuracy                           0.94       834
   macro avg       0.92      0.84      0.87       834
weighted avg       0.94      0.94      0.94       834



In [49]:
# AUC
print("AUC on train =", roc_auc_score(ytrain, ypred_train_proba[:, 1]))
print("AUC on test =", roc_auc_score(ytest, ypred_proba[:, 1]))

AUC on train = 0.9632523470559121
AUC on test = 0.9044656013998895


In [50]:
# cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(gbc, Xtrain, ytrain, cv=5)
scores.mean()

0.9491815631262526