# Import libraries

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

# Load data

Dataset source (Kaggle, Telco Customer Churn): https://www.kaggle.com/blastchar/telco-customer-churn#WA_Fn-UseC_-Telco-Customer-Churn.csv

In [2]:
# Load the Telco customer churn data; a comma is already pandas' default separator.
csv_path = 'Data/telco_customer_churn.csv'
df = pd.read_csv(csv_path)

In [3]:
# List the column names to see which fields the CSV provides.
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [4]:
# (rows, columns) of the loaded frame.
df.shape

(7032, 21)

# Data preprocessing / Feature engineering

#### Select variables

### Initial look at the data

In [5]:
# Show the first rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Preview the candidate features: every column except the customer identifier
# and the 'Churn' target label. This is equivalent to explicitly selecting the
# 19 remaining columns by name.
excluded_columns = ['customerID', 'Churn']
df.drop(columns=excluded_columns).head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65


In [7]:
# Working feature frame: drop the identifier and the target so only predictors remain.
df_new = df.drop(['customerID', 'Churn'], axis='columns')

In [8]:
# Sanity check: two columns fewer than df, same number of rows.
df_new.shape

(7032, 19)

#### One-hot encoding

In [9]:
# Columns to expand into indicator columns. SeniorCitizen is stored as a 0/1
# integer in the CSV but is treated as a category like the others.
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                       'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                       'Contract', 'PaperlessBilling', 'PaymentMethod']

# One-hot encode: each listed column is replaced by one '<column>_<category>'
# indicator per category (get_dummies removes the original columns itself).
#
# dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version. pandas >= 2.0 defaults to bool, which would turn `df_new.values`
# into an object array (bool mixed with float columns) that Keras cannot train on.
#
# NOTE: this cell overwrites df_new, so re-running it without first re-running
# the cell that creates df_new raises a KeyError (the source columns are gone).
df_new = pd.get_dummies(df_new, columns=categorical_columns, dtype=np.uint8)

In [10]:
# Column count after encoding: each categorical column became one column per category.
df_new.shape

(7032, 46)

In [11]:
# Inspect the generated column names (pattern: <column>_<category>).
df_new.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Female',
       'gender_Male', 'SeniorCitizen_0', 'SeniorCitizen_1', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
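       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')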

In [12]:
# Show the first encoded rows (head() returns 5 rows by default).
df_new.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_0,SeniorCitizen_1,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,1,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,34,56.95,1889.5,0,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,2,53.85,108.15,0,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,45,42.3,1840.75,0,1,1,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,2,70.7,151.65,1,0,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


# Split into train and test set

In [13]:
# Feature matrix, one row per customer, columns in df_new.columns order.
# The explicit float cast guarantees a purely numeric array even when the
# indicator columns are bool (the pandas >= 2.0 default for get_dummies); without
# it, mixing bool and float columns yields an object array.
X = df_new.values.astype(np.float64)

# Target vector: the raw 'Yes'/'No' churn labels from the original frame.
y = df['Churn'].values

In [14]:
# Peek at the feature matrix (NumPy abbreviates large arrays).
X

array([[1.0000e+00, 2.9850e+01, 2.9850e+01, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [3.4000e+01, 5.6950e+01, 1.8895e+03, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [2.0000e+00, 5.3850e+01, 1.0815e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       ...,
       [1.1000e+01, 2.9600e+01, 3.4645e+02, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [4.0000e+00, 7.4400e+01, 3.0660e+02, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [6.6000e+01, 1.0565e+02, 6.8445e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

In [15]:
# Raw target values: still 'Yes'/'No' strings at this point.
y

array(['No', 'No', 'Yes', ..., 'No', 'Yes', 'No'], dtype=object)

In [16]:
# Encode the target as floats: 'Yes' -> 1.0, anything else -> 0.0.
# The labels are read from the DataFrame column rather than from `y` itself so
# the cell can be re-run safely; comparing the already-encoded float array with
# 'Yes' would match nothing and silently wipe the labels.
y = np.where(df['Churn'].values == 'Yes', 1.0, 0.0)

In [17]:
# Confirm the target is now numeric (1.0 where Churn was 'Yes', otherwise 0.0).
y

array([0., 0., 1., ..., 0., 1., 0.])

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y, so the churn rate may differ
# slightly between the train and test sets.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [19]:
# Report the sizes of the full, training and test feature matrices, in that order.
for matrix in (X, X_train, X_test):
    print(matrix.shape)

(7032, 46)
(5625, 46)
(1407, 46)


# Build and train model

In [20]:
from keras import backend, Sequential, regularizers, optimizers, models
from keras.layers import InputLayer, Dense

Using TensorFlow backend.


In [21]:
# Clear the TensorFlow session so that re-running the cells below builds the
# model in a fresh graph instead of on top of state left by earlier runs.
backend.clear_session()

W0624 12:19:52.159466 4519531968 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:89: The name tf.reset_default_graph is deprecated. Please use tf.compat.v1.reset_default_graph instead.

W0624 12:19:52.160421 4519531968 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:92: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0624 12:19:52.172303 4519531968 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:96: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [22]:
# Build model: one hidden layer of 32 tanh units followed by a single sigmoid
# output unit. Kernel and bias of both layers carry an L2 penalty of 0.01.
def l2_penalty():
    """Return a fresh L2 regularizer with factor 0.01."""
    return regularizers.l2(0.01)


hidden_layer = Dense(
    32,
    input_shape=(X_train.shape[1],),  # one input per feature column
    activation='tanh',
    kernel_regularizer=l2_penalty(),
    bias_regularizer=l2_penalty()
)

output_layer = Dense(
    1,
    activation='sigmoid',
    kernel_regularizer=l2_penalty(),
    bias_regularizer=l2_penalty()
)

# Passing the layers as a list adds them in order, exactly like successive add() calls.
model = Sequential([hidden_layer, output_layer])

W0624 12:19:52.177304 4519531968 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:508: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0624 12:19:52.178962 4519531968 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3837: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.



In [23]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 32)                1504      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,537
Trainable params: 1,537
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Define loss function: binary cross-entropy, matching the single sigmoid
# output and the 0.0/1.0 target encoding.
loss = 'binary_crossentropy'

In [25]:
# Define metrics: binary accuracy, i.e. the share of predictions that equal the
# 0/1 label after rounding the sigmoid output.
metrics = ['binary_accuracy']

In [26]:
# Define optimizer: Adam with an initial learning rate of 1e-3.
# `decay` shrinks the learning rate as training progresses (in this Keras
# generation, per parameter update) -- presumably lr / (1 + decay * iterations);
# confirm against the installed Keras version.
# NOTE(review): newer Keras releases rename `lr` to `learning_rate`.
optimizer = optimizers.Adam(lr=1e-3, decay=1e-2)

In [27]:
# Compile model: attach the loss, optimizer and metrics chosen in the cells above.
# sample_weight_mode=None keeps the default sample weighting.
compile_options = dict(
    loss=loss,
    optimizer=optimizer,
    sample_weight_mode=None,
    metrics=metrics,
)
model.compile(**compile_options)

W0624 12:19:52.242517 4519531968 deprecation_wrapper.py:119] From /anaconda3/lib/python3.7/site-packages/keras/optimizers.py:757: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0624 12:19:52.250383 4519531968 deprecation.py:323] From /anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [28]:
# Train model: 10 epochs of mini-batches of 128 rows, with 20% of the training
# rows held out for validation after each epoch.
# NOTE(review): tenure, MonthlyCharges and TotalCharges are fed in unscaled;
# standardising them (using training-set statistics) would likely help the
# tanh layer -- any hand-built prediction input would need the same scaling.
training_options = dict(
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    verbose=1,
)
model.fit(X_train, y_train, **training_options)

W0624 12:19:52.412720 4519531968 variables.py:2445] Variable *= will be deprecated. Use `var.assign(var * other)` if you want assignment to the variable value or `x = x * y` if you want a new python Tensor object.


Train on 4500 samples, validate on 1125 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13650f198>

# Make predictions

In [29]:
# Re-display the feature column order; hand-built model inputs must follow it.
df_new.columns

Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Female',
       'gender_Male', 'SeniorCitizen_0', 'SeniorCitizen_1', 'Partner_No',
       'Partner_Yes', 'Dependents_No', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
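       'Contract_Two year', 'PaperlessBilling_No', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')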

In [30]:
# One encoded test row, for reference when building an input by hand.
X_test[0]

array([ 1.  , 20.65, 20.65,  0.  ,  1.  ,  1.  ,  0.  ,  1.  ,  0.  ,
        1.  ,  0.  ,  0.  ,  1.  ,  1.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,
        0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ,
        0.  ])

In [31]:
# Hand-built sample customer. Values are laid out in the same order as
# df_new.columns: three numeric features, then one indicator per category.
numeric_features = [2.0, 24.4, 1548.65]  # tenure, MonthlyCharges, TotalCharges

one_hot_features = [
    0.0, 1.0,            # gender: Female, Male
    0.0, 1.0,            # SeniorCitizen: 0, 1
    0.0, 1.0,            # Partner: No, Yes
    0.0, 1.0,            # Dependents: No, Yes
    0.0, 1.0,            # PhoneService: No, Yes
    0.0, 0.0, 1.0,       # MultipleLines: No, No phone service, Yes
    0.0, 0.0, 1.0,       # InternetService: DSL, Fiber optic, No
    0.0, 1.0, 0.0,       # OnlineSecurity: No, No internet service, Yes
    0.0, 1.0, 0.0,       # OnlineBackup: No, No internet service, Yes
    0.0, 1.0, 0.0,       # DeviceProtection: No, No internet service, Yes
    0.0, 1.0, 0.0,       # TechSupport: No, No internet service, Yes
    0.0, 1.0, 0.0,       # StreamingTV: No, No internet service, Yes
    0.0, 1.0, 0.0,       # StreamingMovies: No, No internet service, Yes
    0.0, 0.0, 1.0,       # Contract: Month-to-month, One year, Two year
    0.0, 1.0,            # PaperlessBilling: No, Yes
    1.0, 0.0, 0.0, 0.0,  # PaymentMethod: Bank transfer, Credit card, Electronic check, Mailed check
]

test_input = np.array(numeric_features + one_hot_features)

In [32]:
# Add the leading batch axis the model expects: (46,) -> (1, 46).
# reshape(1, -1) is used instead of expand_dims so that re-running this cell
# keeps the shape at (1, 46) rather than stacking on another axis each time.
test_input = test_input.reshape(1, -1)

In [33]:
# Confirm the input is a batch of one row with one value per feature column.
test_input.shape

(1, 46)

In [34]:
# Score the hand-built customer. The sigmoid output is the model's estimated
# probability of the positive class (Churn == 'Yes', encoded as 1.0).
prediction = model.predict(test_input)
print(prediction)

[[0.19752231]]


In [35]:
# Predicted churn probabilities for every row of the held-out test set.
y_pred = model.predict(X_test)

In [36]:
# Raw model outputs: one sigmoid value per test row.
y_pred

array([[0.42626423],
       [0.16768366],
       [0.40606162],
       ...,
       [0.51296186],
       [0.140202  ],
       [0.38158485]], dtype=float32)

# Performance metrics

In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [38]:
# Compare the first 12 true labels with the corresponding predicted probabilities.
first_rows = slice(None, 12)
print(y_test[first_rows])
print(y_pred.squeeze()[first_rows])

[0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0.]
[0.42626423 0.16768366 0.40606162 0.501161   0.7760742  0.4160865
 0.23187393 0.7712338  0.32724756 0.15849373 0.20164996 0.09523353]


In [39]:
def to_labels(is_positive):
    """Map a boolean mask to 'Yes' (True) / 'No' (False) string labels."""
    return np.where(is_positive, 'Yes', 'No')


# Convert the numeric targets and the predicted probabilities (cut at 0.5) into
# 'Yes'/'No' labels for the metric functions below.
# NOTE: both names are overwritten in place, so this cell must not be re-run
# without first re-running the split and prediction cells.
y_test = to_labels(y_test == 1.0)
y_pred = to_labels(y_pred > 0.5)

In [40]:
# Compare the first 12 true labels with the corresponding predicted labels.
first_rows = slice(None, 12)
print(y_test[first_rows])
print(y_pred.squeeze()[first_rows])

['No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No']
['No' 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No']


In [41]:
# Confusion matrix: rows are the true labels, columns the predicted labels, both
# in sorted label order ('No', 'Yes'). With 'Yes' as the positive class the
# layout is [[TN, FP], [FN, TP]].
confusion_matrix(y_test, y_pred)

array([[981,  57],
       [249, 120]])

In [42]:
# Score the label predictions; 'Yes' (churn) is the positive class for
# precision and recall.
positive_label = 'Yes'
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=positive_label)
recall = recall_score(y_test, y_pred, pos_label=positive_label)

# Print each score with four significant digits.
scores = (
    ('Accuracy', acc),
    ('Precision', precision),
    ('Recall/Sensitivity', recall),
)
for score_name, score_value in scores:
    print(f'{score_name}: {score_value:.4}')

Accuracy: 0.7825
Precision: 0.678
Recall/Sensitivity: 0.3252


# Øvelser - Fine tune model

- Prøv at træne modellen med forskellige hyperparametre, og se hvordan det påvirker performance metrics.
- Prøv at fjerne kolonner i træningsdata og træn modellen igen. Se hvordan det påvirker performance metrics.
- Prøv at tilføje flere forskellige lag til det neurale netværk. Se hvordan det påvirker performance metrics.
- Prøv at bruge en anden optimizer til træningen af det neurale netværk. Se hvordan det påvirker performance metrics.
- Prøv at lade modellen træne i længere tid ved at sætte ```epochs``` op. Se hvordan det påvirker performance metrics.
- Test forskellige kombinationer af de ovenstående punkter, og se hvor god performance man kan få.

# Exercise - Fine tune model

- Try training the model with different hyperparameters, and see how it affects the performance metrics.
- Try removing columns from the training data and retrain the model. See how it affects the performance metrics.
- Try adding more layers and different layers to the neural network. See how it affects the performance metrics.
- Try using another optimizer for training the neural network. See how it affects the performance metrics.
- Try letting the model train for more time by increasing the number of ```epochs```. See how it affects the performance metrics.
- Test different combinations of the methods in the previous bullets, and see how high performance you can get.