In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from sklearn.feature_selection import RFE

In [2]:
df = pd.read_csv("Telco-Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


### Exploring data

In [4]:
df['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [5]:
# Checking for missing values
df.isna().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null int64
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null int64
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null float64
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: float64(1), int64(2), obj

### Preprocessing data

In [7]:
# Dropping the customer ID column
df = df.drop('customerID', axis = 1)

In [8]:
# Converting TotalCharges to numeric
df[df['TotalCharges'] == " "] = 0
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'])

In [9]:
# Replacing values of Churn with 1 and 0
df['Churn'].replace(to_replace='Yes', value=1, inplace=True)
df['Churn'].replace(to_replace='No',  value=0, inplace=True)

### Creating a train test split

In [10]:
X = df.iloc[:, df.columns != 'Churn']
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.75)

### Converting categorical data to numeric

In [11]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

In [12]:
X_train.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_0,gender_Female,gender_Male,Partner_0,Partner_No,Partner_Yes,...,Contract_One year,Contract_Two year,PaperlessBilling_0,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_0,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
2061,0,7,48.7,340.25,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,1,0
6401,0,3,55.1,154.65,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
2743,0,67,111.05,7321.05,0,0,1,0,0,1,...,0,1,0,0,1,0,0,0,1,0
3424,0,10,89.5,863.1,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,1,0
4562,0,71,114.6,8100.25,0,1,0,0,0,1,...,0,1,0,1,0,0,1,0,0,0


### Creating a general model using all variables

In [13]:
model = LogisticRegression()
# training the model
model = model.fit(X_train, y_train)



### Predicting values and calculating accuracy

In [14]:
predictions = model.predict(X_test)
metrics.accuracy_score(predictions, y_test)

0.8052243043725156

### A general model that uses all the variables results in an 80% accuracy

### We now try to select specific features to train our model with

In [15]:
weights = pd.Series(model.coef_[0], index=X_test.columns.values)
weights.sort_values(ascending = True)

Contract_Two year                         -0.984412
InternetService_DSL                       -0.397889
OnlineSecurity_Yes                        -0.305938
PaperlessBilling_No                       -0.278758
MultipleLines_No                          -0.246709
StreamingMovies_No                        -0.235171
PhoneService_Yes                          -0.228185
PaymentMethod_Mailed check                -0.179265
StreamingMovies_0                         -0.123201
StreamingTV_0                             -0.123201
TechSupport_0                             -0.123201
DeviceProtection_0                        -0.123201
PaperlessBilling_0                        -0.123201
OnlineBackup_0                            -0.123201
PaymentMethod_0                           -0.123201
InternetService_0                         -0.123201
OnlineSecurity_0                          -0.123201
Contract_0                                -0.123201
MultipleLines_0                           -0.123201
PhoneService

### Using features with the top 10 magnitude weights

In [16]:
abs_weights = abs(weights)
top_10_features = abs_weights.sort_values(ascending = False)[0:10]

In [17]:
top_10_features

Contract_Two year              0.984412
Contract_Month-to-month        0.590013
InternetService_DSL            0.397889
InternetService_Fiber optic    0.373534
OnlineSecurity_Yes             0.305938
Contract_One year              0.291064
SeniorCitizen                  0.288011
OnlineSecurity_No              0.281583
PaperlessBilling_No            0.278758
MultipleLines_No               0.246709
dtype: float64

In [18]:
top_10_features_index = top_10_features.index

In [19]:
top_10_features_index.values

array(['Contract_Two year', 'Contract_Month-to-month',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'OnlineSecurity_Yes', 'Contract_One year', 'SeniorCitizen',
       'OnlineSecurity_No', 'PaperlessBilling_No', 'MultipleLines_No'],
      dtype=object)

In [20]:
top_10_X_train = X_train[top_10_features_index.values]
top_10_X_test = X_test[top_10_features_index.values]

### We now try to predict again using the new features

In [21]:
model = model.fit(top_10_X_train, y_train)
predictions_top_10 = model.predict(top_10_X_test)
metrics.accuracy_score(predictions_top_10, y_test)



0.7730456180200643

### The accuracy actually decreases when we use the top 10 features

### Let us try to use Recursive Feature Elimination (RFE) to select features

In [22]:
selector = RFE(model, n_features_to_select=10)
selector = selector.fit(X_train, y_train)



In [23]:
order = selector.ranking_

In [24]:
rfe_result = []

for i in order:
    rfe_result.append((i, X_train.columns[i]))

In [25]:
rfe_top_10 = rfe_result[0:10]
rfe_top_10 = [x[1] for x in rfe_top_10]

In [26]:
rfe_top_10

['OnlineSecurity_No',
 'StreamingTV_No',
 'StreamingMovies_Yes',
 'Contract_Two year',
 'OnlineSecurity_No internet service',
 'Contract_One year',
 'StreamingMovies_0',
 'OnlineSecurity_0',
 'StreamingMovies_No internet service',
 'Partner_No']

In [27]:
rfe_X_train = X_train[rfe_top_10]
rfe_X_test = X_test[rfe_top_10]

In [28]:
model = model.fit(rfe_X_train, y_train)
predictions_rfe = model.predict(rfe_X_test)
metrics.accuracy_score(predictions_rfe, y_test)



0.7499526784024229

### The accuracy also decreases when we use RFE

In [29]:
rfe_result

[(25, 'OnlineSecurity_No'),
 (41, 'StreamingTV_No'),
 (47, 'StreamingMovies_Yes'),
 (51, 'Contract_Two year'),
 (26, 'OnlineSecurity_No internet service'),
 (50, 'Contract_One year'),
 (44, 'StreamingMovies_0'),
 (24, 'OnlineSecurity_0'),
 (46, 'StreamingMovies_No internet service'),
 (8, 'Partner_No'),
 (19, 'MultipleLines_Yes'),
 (45, 'StreamingMovies_No'),
 (33, 'DeviceProtection_No'),
 (17, 'MultipleLines_No'),
 (39, 'TechSupport_Yes'),
 (1, 'tenure'),
 (6, 'gender_Male'),
 (30, 'OnlineBackup_No internet service'),
 (36, 'TechSupport_0'),
 (29, 'OnlineBackup_No'),
 (2, 'MonthlyCharges'),
 (1, 'tenure'),
 (7, 'Partner_0'),
 (15, 'PhoneService_Yes'),
 (21, 'InternetService_DSL'),
 (1, 'tenure'),
 (3, 'TotalCharges'),
 (4, 'gender_0'),
 (10, 'Dependents_0'),
 (37, 'TechSupport_No'),
 (27, 'OnlineSecurity_Yes'),
 (1, 'tenure'),
 (20, 'InternetService_0'),
 (49, 'Contract_Month-to-month'),
 (18, 'MultipleLines_No phone service'),
 (34, 'DeviceProtection_No internet service'),
 (16, 'Mul

In [None]:
## Make sure you are using RFE right! Watch videos to learn