### Machine learning for marketing basics

### Investigate the data
You will now test your knowledge in practice. In this exercise, you will explore the key characteristics of the telecom churn dataset.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
names = "customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn".split(',')
len(names)

21

In [6]:
telco = pd.read_excel('Data/telco.csv')
telco

Unnamed: 0,"customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn"
0,"7590-VHVEG,Female,0,Yes,No,1,No,No phone servi..."
1,"5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Y..."
2,"3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,N..."
3,"7795-CFOCW,Male,0,No,No,45,No,No phone service..."
4,"9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic..."
...,...
7038,"6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,N..."
7039,"2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber o..."
7040,"4801-JZAZL,Female,0,Yes,Yes,11,No,No phone ser..."
7041,"8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic..."


In [7]:
telco = telco.iloc[:,0].str.split(',', expand=True)
telco.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
telco.columns = names
telco.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
telco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
customerID          7043 non-null object
gender              7043 non-null object
SeniorCitizen       7043 non-null object
Partner             7043 non-null object
Dependents          7043 non-null object
tenure              7043 non-null object
PhoneService        7043 non-null object
MultipleLines       7043 non-null object
InternetService     7043 non-null object
OnlineSecurity      7043 non-null object
OnlineBackup        7043 non-null object
DeviceProtection    7043 non-null object
TechSupport         7043 non-null object
StreamingTV         7043 non-null object
StreamingMovies     7043 non-null object
Contract            7043 non-null object
PaperlessBilling    7043 non-null object
PaymentMethod       7043 non-null object
MonthlyCharges      7043 non-null object
TotalCharges        7043 non-null object
Churn               7043 non-null object
dtypes: object(21)
memory usage:

In [10]:
telco_raw = telco.copy()

In [11]:
# Print the data types of telco_raw dataset
telco_raw.dtypes

customerID          object
gender              object
SeniorCitizen       object
Partner             object
Dependents          object
tenure              object
PhoneService        object
MultipleLines       object
InternetService     object
OnlineSecurity      object
OnlineBackup        object
DeviceProtection    object
TechSupport         object
StreamingTV         object
StreamingMovies     object
Contract            object
PaperlessBilling    object
PaymentMethod       object
MonthlyCharges      object
TotalCharges        object
Churn               object
dtype: object

In [12]:
# Print the header of telco_raw dataset
telco_raw.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [13]:
# Print the number of unique values in each telco_raw column
telco_raw.nunique()

customerID          7043
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64

### Separate numerical and categorical columns
you have explored the dataset characteristics and are ready to do some data pre-processing. You will now separate categorical and numerical variables from the telco_raw DataFrame with a customized categorical vs. numerical unique value count threshold.

In [15]:
# Store customerID and Churn column names
custid = ['customerID']
target = ['Churn']

In [16]:
# Store categorical column names
categorical = telco_raw.nunique()[telco_raw.nunique() < 5].keys().tolist()
categorical

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'Churn']

In [17]:
# Remove target from the list of categorical variables
categorical.remove(target[0])

In [18]:
categorical

['gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [19]:
# Store numerical column names
numerical = [x for x in telco_raw.columns if x not in custid + target + categorical]
numerical

['tenure', 'MonthlyCharges', 'TotalCharges']

### Encode categorical and scale numerical variables
In this final step, you will perform one-hot encoding on the categorical variables and then scale the numerical columns.

In [22]:
# Perform one-hot encoding to categorical variables 
telco_raw = pd.get_dummies(data = telco_raw, columns = categorical, drop_first=True)
telco_raw.head()

Unnamed: 0,customerID,tenure,MonthlyCharges,TotalCharges,Churn,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,1,29.85,29.85,No,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,5575-GNVDE,34,56.95,1889.5,No,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,3668-QPYBK,2,53.85,108.15,Yes,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
3,7795-CFOCW,45,42.3,1840.75,No,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,9237-HQITU,2,70.7,151.65,Yes,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0


In [23]:
telco_raw['tenure'] = telco_raw['tenure'].astype('float')

In [24]:
telco_raw['MonthlyCharges']= telco_raw['MonthlyCharges'].astype('float')

In [25]:
telco_raw['TotalCharges'] = telco_raw['TotalCharges'].astype(float)

ValueError: could not convert string to float: 

In [26]:
telco_raw['TotalCharges'] = pd.to_numeric(telco_raw['TotalCharges'])

ValueError: Unable to parse string " " at position 488

In [27]:
telco_raw['TotalCharges'].iloc[488]

' '

In [28]:
telco_raw['TotalCharges'] = telco_raw['TotalCharges'].replace(' ', '0', regex=True)

In [29]:
telco_raw['TotalCharges'] = telco_raw['TotalCharges'].astype(float)

In [30]:
telco_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 32 columns):
customerID                               7043 non-null object
tenure                                   7043 non-null float64
MonthlyCharges                           7043 non-null float64
TotalCharges                             7043 non-null float64
Churn                                    7043 non-null object
gender_Male                              7043 non-null uint8
SeniorCitizen_1                          7043 non-null uint8
Partner_Yes                              7043 non-null uint8
Dependents_Yes                           7043 non-null uint8
PhoneService_Yes                         7043 non-null uint8
MultipleLines_No phone service           7043 non-null uint8
MultipleLines_Yes                        7043 non-null uint8
InternetService_Fiber optic              7043 non-null uint8
InternetService_No                       7043 non-null uint8
OnlineSecurity_No internet serv

In [31]:

from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler instance
scaler = StandardScaler()

# Fit and transform the scaler on numerical columns
scaled_numerical = scaler.fit_transform(telco_raw[numerical])

# Build a DataFrame from scaled_numerical
scaled_numerical = pd.DataFrame(scaled_numerical, columns=numerical)
scaled_numerical.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,-1.277445,-1.160323,-0.992611
1,0.066327,-0.259629,-0.172165
2,-1.236724,-0.36266,-0.958066
3,0.514251,-0.746535,-0.193672
4,-1.236724,0.197365,-0.938874


### Bringing it all together

In [33]:
# Drop non-scaled numerical columns
telco_raw = telco_raw.drop(columns=numerical, axis=1)

# Merge the non-numerical with the scaled numerical data
telco = telco_raw.merge(right=scaled_numerical,
                        how='left',
                        left_index=True,
                        right_index=True
                        )

In [34]:
telco.drop(['customerID', 'Churn'], axis=1, inplace=True)
telco.head()

Unnamed: 0,gender_Male,SeniorCitizen_1,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No internet service,...,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure,MonthlyCharges,TotalCharges
0,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,1,0,-1.277445,-1.160323,-0.992611
1,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0.066327,-0.259629,-0.172165
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,1,-1.236724,-0.36266,-0.958066
3,1,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0.514251,-0.746535,-0.193672
4,0,0,0,0,1,0,0,1,0,0,...,0,0,0,1,0,1,0,-1.236724,0.197365,-0.938874


###Split data to training and testing¶

In [36]:
X = telco.values
Y = telco_raw.Churn.values

In [37]:
X.shape

(7043, 30)

In [38]:
Y.shape

(7043,)

In [39]:
from sklearn.model_selection import train_test_split

# Split X and Y into training and testing datasets
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.25)

# Ensure training dataset has only 75% of original X data
print(train_X.shape[0] / X.shape[0])

# Ensure testing dataset has only 25% of original X data
print(test_X.shape[0] / X.shape[0])

0.7499645037626012
0.25003549623739885


### Fit a decision tree
Now, you will take a stab at building a decision tree model. The decision tree is a list of machine-learned if-else rules that decide in the telecom churn case, whether customers will churn or not.

In [41]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [42]:
# Initialize the model with max_depth set at 5
mytree = DecisionTreeClassifier(max_depth = 5)

# Fit the model on the training data
treemodel = mytree.fit(train_X, train_Y)

# Predict values on the testing data
pred_Y = treemodel.predict(test_X)

# Measure model performance on testing data
accuracy_score(test_Y, pred_Y)

0.7796706416808632

### Predict churn with decision tree
Now you will build on the skills you acquired in the earlier exercise, and build a more complex decision tree with additional parameters to predict customer churn. Here you will run the decision tree classifier again on your training data, predict the churn rate on unseen (test) data, and assess model accuracy on both datasets.

In [44]:

# Initialize the Decision Tree
clf = DecisionTreeClassifier(max_depth = 7, 
               criterion = 'gini', 
               splitter  = 'best')

# Fit the model to the training data
clf = clf.fit(train_X, train_Y)

# Predict the values on test dataset
pred_Y = clf.predict(test_X)

# Print accuracy values
print("Training accuracy: ", np.round(clf.score(train_X, train_Y), 3)) 
print("Test accuracy: ", np.round(accuracy_score(test_Y, pred_Y), 3))

Training accuracy:  0.827
Test accuracy:  0.775


### Churn prediction fundamentals

In [47]:
set(telco_raw['Churn'])

{'No', 'Yes'}

In [48]:
telco_raw['Churn'].replace({'No':0, 'Yes':1}, inplace=True)

In [49]:
telco_raw['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

#### Exploring churn distribution

In [53]:
telco_raw.groupby(['Churn']).size() / telco_raw.shape[0] * 100

Churn
0    73.463013
1    26.536987
dtype: float64

#### Split to training and testing data

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X = telco
Y = telco_raw.Churn

In [66]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size = .25)

### Predict churn with logistic regression

In [67]:
from sklearn.linear_model import LogisticRegression

In [68]:
logreg = LogisticRegression()

In [70]:
logreg.fit(train_X, train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
train_X.shape

(5282, 30)

In [72]:
train_Y.shape

(5282,)

### Model performance metrics
Key metrics:
- Accuracy - The % of correctly predicted labels (both Churn and non Churn)
- Precision - The % of total model's positive class predictions (here - predicted as Churn) that were
- correctly classi

#### Measuring model accuracy

In [76]:
from sklearn.metrics import accuracy_score

pred_train_Y = logreg.predict(train_X)
pred_test_Y = logreg.predict(test_X)

train_accuracy = accuracy_score(train_Y, pred_train_Y)
test_accuracy = accuracy_score(test_Y, pred_test_Y)

print('Training accuracy:', round(train_accuracy,4))
print('Test accuracy:', round(test_accuracy, 4))

Training accuracy: 0.8101
Test accuracy: 0.7933


#### Measuring precision and recall

In [80]:
from sklearn.metrics import precision_score, recall_score

train_precision = round(precision_score(train_Y, pred_train_Y), 4)
test_precision = round(precision_score(test_Y, pred_test_Y), 4)

train_recall = round(recall_score(train_Y, pred_train_Y), 4)
test_recall = round(recall_score(test_Y, pred_test_Y), 4)

print('Training precision: {}, Training recall: {}'.format(train_precision, train_recall))
print('Test precision: {}, Test recall: {}'.format(train_recall, test_recall))

Training precision: 0.6626, Training recall: 0.5565
Test precision: 0.5565, Test recall: 0.5562


### L1 regularization and feature selection
- LogisticRegression from sklearn performs L2 regularization by default
- L1 regularization or also called LASSO can be called explicitly, and this approach performs feature
selection by shrinking some of the model coef

In [82]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
logreg.fit(train_X, train_Y)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

#### Tuning L1 regularization

In [85]:
C = [1, .5, .25, .1, .05, .025, .01, .005, .0025]

l1_metrics = np.zeros((len(C), 5))
l1_metrics[:,0] = C

for index in range(0, len(C)):
    logreg = LogisticRegression(penalty='l1', C=C[index], solver='liblinear')
    logreg.fit(train_X, train_Y)
    pred_test_Y = logreg.predict(test_X)
    l1_metrics[index,1] = np.count_nonzero(logreg.coef_)
    l1_metrics[index,2] = accuracy_score(test_Y, pred_test_Y)
    l1_metrics[index,3] = precision_score(test_Y, pred_test_Y)
    l1_metrics[index,4] = recall_score(test_Y, pred_test_Y)

col_names = ['C','Non-Zero Coeffs','Accuracy','Precision','Recall']

print(pd.DataFrame(l1_metrics, columns=col_names))

        C  Non-Zero Coeffs  Accuracy  Precision    Recall
0  1.0000             28.0  0.792164   0.649635  0.546012
1  0.5000             28.0  0.789892   0.646192  0.537832
2  0.2500             23.0  0.789892   0.646192  0.537832
3  0.1000             19.0  0.790460   0.649254  0.533742
4  0.0500             15.0  0.791028   0.656331  0.519427
5  0.0250             12.0  0.787621   0.659280  0.486708
6  0.0100              8.0  0.785917   0.686667  0.421268
7  0.0050              3.0  0.785349   0.736170  0.353783
8  0.0025              2.0  0.722317   0.000000  0.000000


  _warn_prf(average, modifier, msg_start, len(result))


* C= 0.025 seems a good option!

### Predict churn with decision trees

In [90]:
from sklearn.tree import DecisionTreeClassifier

In [91]:
mytree = DecisionTreeClassifier()

In [92]:
treemodel = mytree.fit(train_X, train_Y)

#### Measuring model accuracy

In [93]:
from sklearn.metrics import accuracy_score

pred_train_Y = mytree.predict(train_X)
pred_test_Y = mytree.predict(test_X)

train_accuracy = accuracy_score(train_Y, pred_train_Y)
test_accuracy = accuracy_score(test_Y, pred_test_Y)

print('Training accuracy:', round(train_accuracy,4))
print('Test accuracy:', round(test_accuracy, 4))

Training accuracy: 0.9983
Test accuracy: 0.7263


#### Measuring precision and recall

In [97]:
from sklearn.metrics import precision_score, recall_score

train_precision = round(precision_score(train_Y, pred_train_Y), 4)
test_precision = round(precision_score(test_Y, pred_test_Y), 4)

train_recall = round(recall_score(train_Y, pred_train_Y), 4)
test_recall = round(recall_score(test_Y, pred_test_Y), 4)

print('Training precision: {}, Training recall: {}'.format(train_precision, train_recall))
print('Test precision: {}, Test recall: {}'.format(train_recall, test_recall))

Training precision: 0.9993, Training recall: 0.9942
Test precision: 0.9942, Test recall: 0.4949


### Tree depth parameter tuning

In [99]:
depth_list = list(range(2,15))
depth_tuning = np.zeros((len(depth_list), 4))
depth_tuning[:,0] = depth_list

for index in range(len(depth_list)):
    mytree = DecisionTreeClassifier(max_depth=depth_list[index])
    mytree.fit(train_X, train_Y)
    pred_test_Y = mytree.predict(test_X)
    depth_tuning[index,1] = accuracy_score(test_Y, pred_test_Y)
    depth_tuning[index,2] = precision_score(test_Y, pred_test_Y)
    depth_tuning[index,3] = recall_score(test_Y, pred_test_Y)

col_names = ['Max_Depth','Accuracy','Precision','Recall']
print(pd.DataFrame(depth_tuning, columns=col_names))

    Max_Depth  Accuracy  Precision    Recall
0         2.0  0.786485   0.690236  0.419223
1         3.0  0.786485   0.690236  0.419223
2         4.0  0.775696   0.598739  0.582822
3         5.0  0.777967   0.613426  0.541922
4         6.0  0.771721   0.624642  0.445808
5         7.0  0.768313   0.599509  0.498978
6         8.0  0.763203   0.583721  0.513292
7         9.0  0.760363   0.565815  0.588957
8        10.0  0.752413   0.553971  0.556237
9        11.0  0.741056   0.534447  0.523517
10       12.0  0.735378   0.524416  0.505112
11       13.0  0.729699   0.513292  0.513292
12       14.0  0.725724   0.506224  0.498978


* 5 Max.Depth seems a good option!

### Identify and interpret churn drivers

In [110]:
import os
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

In [111]:
from sklearn import tree
import graphviz

exported = tree.export_graphviz(
                                decision_tree=mytree,
                                out_file=None,
                                feature_names=train_X.columns,
                                precision=1,
                                class_names=['Not churn','Churn'],
                                filled = True)
graph = graphviz.Source(exported)
display(graph)

ExecutableNotFound: failed to execute ['dot.bat', '-Tsvg'], make sure the Graphviz executables are on your systems' PATH

<graphviz.files.Source at 0x15918e771c8>

In [115]:
logreg = LogisticRegression(penalty='l1', C=0.025, solver='liblinear')

logreg.fit(train_X, train_Y)

LogisticRegression(C=0.025, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [116]:
logreg.coef_

array([[ 0.        ,  0.        ,  0.        , -0.05885293, -0.79692471,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        -0.4753234 ,  0.        , -0.07079394,  0.        , -0.04228168,
         0.        , -0.46758826,  0.        ,  0.        ,  0.        ,
         0.        , -0.47356833, -0.72561486,  0.00212208,  0.        ,
         0.26648295,  0.        , -0.8464971 ,  0.90644643,  0.        ]])

#### Transforming logistic regression coef

In [120]:
coefficients = pd.concat([pd.DataFrame(train_X.columns),
                        pd.DataFrame(np.transpose(logreg.coef_))],
                        axis = 1)

coefficients.columns = ['Feature', 'Coefficient']
coefficients['Exp_Coefficient'] = np.exp(coefficients['Coefficient'])
coefficients = coefficients[coefficients['Coefficient']!=0]

print(coefficients.sort_values(by=['Coefficient']))

                           Feature  Coefficient  Exp_Coefficient
27                          tenure    -0.846497         0.428915
4                 PhoneService_Yes    -0.796925         0.450713
22               Contract_Two year    -0.725615         0.484027
10              OnlineSecurity_Yes    -0.475323         0.621684
21               Contract_One year    -0.473568         0.622776
16                 TechSupport_Yes    -0.467588         0.626511
12                OnlineBackup_Yes    -0.070794         0.931654
3                   Dependents_Yes    -0.058853         0.942845
14            DeviceProtection_Yes    -0.042282         0.958600
23            PaperlessBilling_Yes     0.002122         1.002124
25  PaymentMethod_Electronic check     0.266483         1.305365
28                  MonthlyCharges     0.906446         2.475510


* "tenure" seems to be the mostt important attribute

Source: Datacamp