In [1]:
# Import the needed modules.
import numpy as np
import pandas as pd
import sklearn as skl
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
import tensorflow as tf




# PART 1: Get and Clean the Telecom Customer Churn Data

In [2]:
# Read the CSV file into a Pandas DataFrame.
telecom_df = pd.read_csv("Resources/WA_Fn-UseC_-Telco-Customer-Churn.csv") 

# Review the DataFrame.
telecom_df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Drop the non-useful 'customerID' column.
telecom_df = telecom_df.drop(["customerID"],axis=1)
telecom_df.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [4]:
# Check data types of columns.
# Object type columns will need to be converted to numeric type.
telecom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [5]:
# Replace values in 'Churn' column from "No" and "Yes" to "0" and "1"
# This will turn 'Churn' into an int64 data type
telecom_cleaned = telecom_df.replace({'Churn': {'No': 0, 'Yes': 1}})

In [6]:
# Replace empty space string cells with 0 in 'TotalCharges'.
# Preparation to turn this column into numeric data type
telecom_cleaned = telecom_cleaned.replace({'TotalCharges': {' ': 0}})

In [7]:
# Change data type of 'TotalCharges' to Float.
telecom_cleaned['TotalCharges'] = telecom_cleaned['TotalCharges'].apply(pd.to_numeric)

In [8]:
# Check data type of columns to varify changes from cleaning.
telecom_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [9]:
# Use 'get_dummies' method to transform categorical columns into boolean columns that can be fed into deep learning model.
telecom_dummies_df = pd.get_dummies(telecom_cleaned)
telecom_dummies_df.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,True,False,False,True,True,...,False,True,False,False,False,True,False,False,True,False
1,0,34,56.95,1889.5,0,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,True
2,0,2,53.85,108.15,1,False,True,True,False,True,...,False,True,False,False,False,True,False,False,False,True
3,0,45,42.3,1840.75,0,False,True,True,False,True,...,False,False,True,False,True,False,True,False,False,False
4,0,2,70.7,151.65,1,True,False,True,False,True,...,False,True,False,False,False,True,False,False,True,False


In [32]:
telecom_dummies_df.to_csv('Resources/telecom_churn_cleaned.csv')

# PART 2: Split Data into Variables and Features and Train and Test Sets

In [10]:
# Separate the data into dependent and independent variables.
# Separate the y variable, the dependent variable.
y = telecom_dummies_df.Churn

# Separate the X variable, the independent variables.
X = telecom_dummies_df.drop(columns="Churn")

In [11]:
# Review the first five data points of y variable Series.
y[:5]

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [12]:
# Review the first five rows of X variable DataFrame.
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,True,False,False,True,True,False,...,False,True,False,False,False,True,False,False,True,False
1,0,34,56.95,1889.5,False,True,True,False,True,False,...,False,False,True,False,True,False,False,False,False,True
2,0,2,53.85,108.15,False,True,True,False,True,False,...,False,True,False,False,False,True,False,False,False,True
3,0,45,42.3,1840.75,False,True,True,False,True,False,...,False,False,True,False,True,False,True,False,False,False
4,0,2,70.7,151.65,True,False,True,False,True,False,...,False,True,False,False,False,True,False,False,True,False


In [13]:
# Check the balance of our target values.
y.value_counts()

Churn
0    5174
1    1869
Name: count, dtype: int64

In [14]:
# Import the train_test_learn module.
from sklearn.model_selection import train_test_split

# Split the data using train_test_split.
# Split data into train and test set using default 75/25 split.
# Assign a random_state of 42 to the function so we can consistently reproduce results.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
# Import StandardScaler 
from sklearn.preprocessing import StandardScaler

# Create StandardScaler instances.
scaler = StandardScaler()

# Fit the StandardScaler.
telecom_scaler = scaler.fit(X_train)

# Scale the data.
X_train_scaled = telecom_scaler.transform(X_train)
X_test_scaled = telecom_scaler.transform(X_test)

# PART 3: Create Tuner To Find Optimal Neural Network Construction for Neural Network Model

In [16]:
# Create a method that creates a new Sequential model with hyperparameter options.
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function works best for hidden layers.
    activation = hp.Choice('activation',['relu','tanh'])
    
    # Allow kerastuner to decide number of neurons in input layer.
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=30,
        step=5), activation=activation, input_dim=45))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers.
    for i in range(hp.Int('num_layers', 1, 5)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=30,
            step=5),
            activation=activation))
    
    # Use 'sigmoid' activation as predetermined for output layer
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the tuner model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [17]:
# Import the kerastuner library
import keras_tuner as kt

# Instantiate tuner to run through 100 epochs to search for neural network model that is most accurate.
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=100,
    hyperband_iterations=2)





In [19]:
# Run the kerastuner and search for best hyperparameters for neural network model.
tuner.search(X_train_scaled,y_train,epochs=100,validation_data=(X_test_scaled,y_test))

In [20]:
# Get top 3 model hyperparameters and print the values to use to construct.
top_hyper = tuner.get_best_hyperparameters(3)
for param in top_hyper:
    print(param.values)

{'activation': 'relu', 'first_units': 1, 'num_layers': 3, 'units_0': 21, 'units_1': 6, 'units_2': 11, 'units_3': 11, 'units_4': 26, 'tuner/epochs': 100, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}
{'activation': 'relu', 'first_units': 1, 'num_layers': 3, 'units_0': 26, 'units_1': 6, 'units_2': 16, 'units_3': 26, 'units_4': 21, 'tuner/epochs': 100, 'tuner/initial_epoch': 34, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0461'}
{'activation': 'tanh', 'first_units': 1, 'num_layers': 4, 'units_0': 21, 'units_1': 21, 'units_2': 1, 'units_3': 26, 'units_4': 21, 'tuner/epochs': 100, 'tuner/initial_epoch': 34, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0239'}


In [22]:
# Evaluate the accuracy of top 3 models against the test dataset.
top_model = tuner.get_best_models(3)
for model in top_model:
    model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test,verbose=2)
    print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

56/56 - 0s - loss: 0.4068 - accuracy: 0.8194 - 172ms/epoch - 3ms/step
Loss: 0.4068242609500885, Accuracy: 0.8194207549095154
56/56 - 0s - loss: 0.4196 - accuracy: 0.8183 - 169ms/epoch - 3ms/step
Loss: 0.4195992350578308, Accuracy: 0.8182850480079651
56/56 - 0s - loss: 0.4071 - accuracy: 0.8177 - 156ms/epoch - 3ms/step
Loss: 0.4071190059185028, Accuracy: 0.8177171945571899


In [23]:
# Display the best model hyperparameters.
# We will be using these below for our deep learning model.
first_hyper = tuner.get_best_hyperparameters(2)[0]
first_hyper.values

{'activation': 'relu',
 'first_units': 1,
 'num_layers': 3,
 'units_0': 21,
 'units_1': 6,
 'units_2': 11,
 'units_3': 11,
 'units_4': 26,
 'tuner/epochs': 100,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

# PART 4: Create and Validate Deep Leaning Model Based on Recommendations from Tuner

In [25]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features_number = 45 # Determined by number of columns in X dataframe
input_layer = 1 # Determined by number of 'num_layers' output of tuner
hidden_layer0 = 21 # Determined by 'units_0' output of tuner
hidden_layer1 = 6 # Determined by 'units_1' output of tuner
hidden_layer2 = 11 # Determined by 'units_2' output of tuner

nn = tf.keras.models.Sequential()

# Input layer
nn.add(
    tf.keras.layers.Dense(units=input_layer, input_dim=input_features_number, activation="tanh") # use 'tanh' for input layer
)

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_layer0, activation="tanh")) # use 'tanh' for hidden layer

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_layer1, activation="tanh")) # use 'tanh' for hidden layer

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_layer2, activation="tanh")) # use 'tanh' for hidden layer

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid")) # use 'sigmoid' for output layer

# Check the structure of the model.
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 1)                 46        
                                                                 
 dense_10 (Dense)            (None, 21)                42        
                                                                 
 dense_11 (Dense)            (None, 6)                 132       
                                                                 
 dense_12 (Dense)            (None, 11)                77        
                                                                 
 dense_13 (Dense)            (None, 1)                 12        
                                                                 
Total params: 309 (1.21 KB)
Trainable params: 309 (1.21 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
# Compile the model.
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [27]:
# Train the model.
# Run model through 100 epochs.
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [28]:
# Evaluate the model for accuracy using the test data.
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

56/56 - 0s - loss: 0.4098 - accuracy: 0.8098 - 115ms/epoch - 2ms/step
Loss: 0.4098491370677948, Accuracy: 0.8097671866416931


In [30]:
# Export neural network model to keras file
nn.save("Resources/Telecom_Churn_Model.keras")