In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import division

In [2]:
# define working directory
os.chdir('/Users/kksivva/Downloads/datamining')

In [3]:
sales = pd.read_csv('dataset.csv')

In [4]:
# top 5 observations
sales = sales.rename(columns = {"gender":"Gender", "customerID":"CustomerID","tenure":"Tenure"}) 

In [5]:
sales['Partner'] = sales['Partner'].map({
    "No":0,
    "Yes":1
}.get)
sales['Dependents'] = sales['Dependents'].map({
    "No":0,
    "Yes":1
}.get)
sales['PhoneService'] = sales['PhoneService'].map({
    "No":0,
    "Yes":1
}.get)
sales['PaperlessBilling'] = sales['PaperlessBilling'].map({
    "No":0,
    "Yes":1
}.get)
sales['Churn'] = sales['Churn'].map({
    "No":0,
    "Yes":1
}.get)
sales['Gender'] = sales['Gender'].map({
    "Male":0,
    "Female":1
}.get)

In [6]:
sales.head()

Unnamed: 0,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,StreamingService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,0,1,0,1,0,No phone service,DSL,No,Month-to-month,1,Electronic check,29.85,29.85,0
1,5575-GNVDE,0,0,0,0,34,1,No,DSL,No,One year,0,Mailed check,56.95,1889.5,0
2,3668-QPYBK,0,0,0,0,2,1,No,DSL,No,Month-to-month,1,Mailed check,53.85,108.15,1
3,7795-CFOCW,0,0,0,0,45,0,No phone service,DSL,No,One year,0,Bank transfer (automatic),42.3,1840.75,0
4,9237-HQITU,1,0,0,0,2,1,No,Fiber optic,No,Month-to-month,1,Electronic check,70.7,151.65,1


In [7]:
sales['MultipleLines'].unique()

array(['No phone service', 'No', 'Yes'], dtype=object)

In [8]:
sales['MultipleLines'].replace({
    "No":0,
    "Yes":1,
    "No phone service":2
}, inplace=True)

### One-hot Encoding

In [9]:
sales = pd.get_dummies(sales,columns=['InternetService','PaymentMethod','StreamingService','Contract'])

In [10]:
sales.head()

Unnamed: 0,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,PaperlessBilling,MonthlyCharges,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,StreamingService_No,StreamingService_No internet service,StreamingService_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,7590-VHVEG,1,0,1,0,1,0,2,1,29.85,...,0,0,1,0,1,0,0,1,0,0
1,5575-GNVDE,0,0,0,0,34,1,0,0,56.95,...,0,0,0,1,1,0,0,0,1,0
2,3668-QPYBK,0,0,0,0,2,1,0,1,53.85,...,0,0,0,1,1,0,0,1,0,0
3,7795-CFOCW,0,0,0,0,45,0,2,0,42.3,...,1,0,0,0,1,0,0,0,1,0
4,9237-HQITU,1,0,0,0,2,1,0,1,70.7,...,0,0,1,0,1,0,0,1,0,0


In [11]:
# Reorder columns
columns = [column for column in sales.columns if column != 'Churn']
columns = ['Churn'] + columns
sales = sales[columns]
sales.head()

Unnamed: 0,Churn,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,PaperlessBilling,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,StreamingService_No,StreamingService_No internet service,StreamingService_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0,7590-VHVEG,1,0,1,0,1,0,2,1,...,0,0,1,0,1,0,0,1,0,0
1,0,5575-GNVDE,0,0,0,0,34,1,0,0,...,0,0,0,1,1,0,0,0,1,0
2,1,3668-QPYBK,0,0,0,0,2,1,0,1,...,0,0,0,1,1,0,0,1,0,0
3,0,7795-CFOCW,0,0,0,0,45,0,2,0,...,1,0,0,0,1,0,0,0,1,0
4,1,9237-HQITU,1,0,0,0,2,1,0,1,...,0,0,1,0,1,0,0,1,0,0


In [12]:
#Normalize Monthly charges data
sales['MonthlyCharges'] = sales['MonthlyCharges'].apply(lambda v:(v - sales['MonthlyCharges'].min())/(sales['MonthlyCharges'].max() - sales['MonthlyCharges'].min()))

In [13]:
# Replace non-numeric values with 0
sales['TotalCharges'] = pd.to_numeric(sales['TotalCharges'], errors='coerce')

In [14]:
#Normalize Total charges data
sales['TotalCharges'] = sales['TotalCharges'].apply(lambda v:(v - sales['TotalCharges'].min())/(sales['TotalCharges'].max() - sales['TotalCharges'].min()))

In [15]:
# Replace emtpy cells with 0s for 'TotalCharges' column
sales.TotalCharges = sales[['TotalCharges']].infer_objects().fillna(0)

In [16]:
sales['Tenure'].unique()

array([ 1, 34,  2, 45,  8, 22, 10, 28, 62, 13, 16, 58, 49, 25, 69, 52, 71,
       21, 12, 30, 47, 72, 17, 27,  5, 46, 11, 70, 63, 43, 15, 60, 18, 66,
        9,  3, 31, 50, 64, 56,  7, 42, 35, 48, 29, 65, 38, 68, 32, 55, 37,
       36, 41,  6,  4, 33, 67, 23, 57, 61, 14, 20, 53, 40, 59, 24, 44, 19,
       54, 51, 26,  0, 39])

In [17]:
sales['Tenure'] = sales['Tenure'].apply(lambda v:(v - sales['Tenure'].min())/(sales['Tenure'].max() - sales['Tenure'].min()))

In [18]:
# Encode 'CustomerId' column to integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
sales['CustomerID'] = le.fit_transform(sales['CustomerID'])

In [19]:
sales.to_csv("preprocessed_dataset.csv", index=False)

In [20]:
new_df = pd.read_csv("preprocessed_dataset.csv")

In [21]:
# Checking missing values in the cleaned data
new_df.isnull().sum()

Churn                                      0
CustomerID                                 0
Gender                                     0
SeniorCitizen                              0
Partner                                    0
Dependents                                 0
Tenure                                     0
PhoneService                               0
MultipleLines                              0
PaperlessBilling                           0
MonthlyCharges                             0
TotalCharges                               0
InternetService_DSL                        0
InternetService_Fiber optic                0
InternetService_No                         0
PaymentMethod_Bank transfer (automatic)    0
PaymentMethod_Credit card (automatic)      0
PaymentMethod_Electronic check             0
PaymentMethod_Mailed check                 0
StreamingService_No                        0
StreamingService_No internet service       0
StreamingService_Yes                       0
Contract_M

In [22]:
new_df.head()

Unnamed: 0,Churn,CustomerID,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,PaperlessBilling,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,StreamingService_No,StreamingService_No internet service,StreamingService_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0,5375,1,0,1,0,0.013889,0,2,1,...,0,0,1,0,1,0,0,1,0,0
1,0,3962,0,0,0,0,0.472222,1,0,0,...,0,0,0,1,1,0,0,0,1,0
2,1,2564,0,0,0,0,0.027778,1,0,1,...,0,0,0,1,1,0,0,1,0,0
3,0,5535,0,0,0,0,0.625,0,2,0,...,1,0,0,0,1,0,0,0,1,0
4,1,6511,1,0,0,0,0.027778,1,0,1,...,0,0,1,0,1,0,0,1,0,0


In [23]:
# Separate target and dependent variables
y = new_df['Churn']
X = new_df.drop(['Churn'],axis=1)

In [24]:
# Importing train_test split to create validation set
from sklearn.model_selection import train_test_split

In [25]:
# Creating the training and validation set
X_train,X_valid,y_train,y_valid = train_test_split(X,y,random_state=101,stratify=y,test_size=0.20)

In [26]:
# distribution in training set
y_train.value_counts(normalize=True)

0    0.734647
1    0.265353
Name: Churn, dtype: float64

In [27]:
# distribution in validation set
y_valid.value_counts(normalize=True)

0    0.734564
1    0.265436
Name: Churn, dtype: float64

In [28]:
# importing decision tree classifier
from sklearn.tree import DecisionTreeClassifier

In [29]:
# Create a deicsion tree function
dt_model =  DecisionTreeClassifier(random_state=10)

In [30]:
dt_model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=10,
            splitter='best')

In [31]:
# Check the train score
dt_model.score(X_train,y_train)

1.0

In [32]:
dt_model.score(X_valid,y_valid)

0.7104329311568488

### Perform Grid Search to figure out the parameters to the model for improving score

In [39]:
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score

# Create a scalar object
sc = StandardScaler()

# Create a pca object[]
pca = decomposition.PCA()

decisionTree = tree.DecisionTreeClassifier()

# Create a pipeline of 3 steps. 
# First , standardize the data
pipe = Pipeline(steps=[('decisionTree', decisionTree)])

# Create a parameter space
n_components = list(range(1, X.shape[1]+1,1))

criterion = ['gini','entropy']
max_depth = [4,6,8,12,24]

parameters = dict(
                 decisionTree__criterion=criterion,
                 decisionTree__max_depth=max_depth)

clf = GridSearchCV(pipe,parameters)
clf.fit(X,y)

# View the best parameters
print( clf.best_estimator_.get_params()['decisionTree'])




DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [40]:
dt_model = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
dt_model.fit(X_train,y_train)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [41]:
dt_model.score(X_train,y_train)

0.7960596379126731

In [42]:
dt_model.score(X_valid,y_valid)

0.7735982966643009

#### The score increased to 77% from 70%