In [None]:
import numpy as np #for statistics and linear algebra computation
import pandas as pd #for data processing and reading in data
import matplotlib.pyplot as plt #for plotting data  and visuals
import seaborn as sb #for more visualizations

from scipy import stats #statistical tools
from sklearn.model_selection import train_test_split #splitting data into training and test sets

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report #for computing confusion matrix

#import keras library to implement neural networks
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#import optimizers to train data
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import binary_crossentropy

In [None]:
# Load the raw dataset from "dataset.csv" (path relative to the notebook's working
# directory) into a DataFrame; every later cell derives its inputs from `data`.
data = pd.read_csv("dataset.csv")#read dataset into variable
# data.head(7)  #uncomment to preview the first 7 rows

In [None]:
# plt.figure(figsize=(20,20))
#sb.scatterplot(x='Balance', y='Exited', data=data) #display scatter plot to help visualize relationship b/w inputs and output
#sb.scatterplot(x='EstimatedSalary', y='Exited', data=data)
#sb.barplot(x='Age', y='Exited', data=data) #display bar graph

In [None]:
# sb.distplot(data['Tenure']) #display distribution plot
#sb.distplot(data['EstimatedSalary'])

In [None]:
# Pairwise correlation between the columns of the raw dataset, shown as a heatmap.
# The frame still contains text columns at this point (e.g. Surname, Geography,
# Gender), and DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the
# correlation is restricted to the numeric/boolean columns explicitly.
correlation = data.select_dtypes(include=["number", "bool"]).corr()
plt.figure(figsize=(10,10))
sb.heatmap(correlation)
plt.show()

In [None]:
# Build the input matrix: remove the columns that are not used as model inputs,
# including the target column 'Exited' itself.
x = data.drop(
    columns=[
        'CustomerId',
        'HasCrCard',
        'EstimatedSalary',
        'Surname',
        'Balance',
        'Exited',  # the output variable must not be part of the inputs
    ]
)

# Handy inspection snippets (kept disabled):
# print(x.shape)                                # dimensions of the frame
# print(x.columns)                              # column names
# print(x.Geography)                            # values of a single column
# print(x[['Geography', 'Age']])                # several columns at once (note the double brackets)
# print(x.loc[15])                              # one specific row
# print(x.loc[0:2])                             # a range of rows
# print(x[['Age', 'Gender']].loc[[1, 3]])       # specific rows of specific columns

# Encode the country names as integers: Spain = 1, France = 2, Germany = 3.
x['Geography'] = x['Geography'].replace(
    to_replace=['Spain', 'France', 'Germany'],
    value=[1, 2, 3],
)

# Encode gender as integers: Female = 0, Male = 1.
x['Gender'] = x['Gender'].replace(to_replace=['Female', 'Male'], value=[0, 1])

x  # display the inputs selected as correlated with the target 'Exited'

In [None]:
# Target vector: the 'Exited' column of the raw dataset, kept apart from the inputs in x.
y = data['Exited']
y #display the output values separated from the dataset

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# A fixed integer random_state makes the split identical on every run.
xTrain, xTest, yTrain, yTest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=57,
)
# xTrain

In [None]:
# Features have very different value ranges, so rescale them before training to
# keep large-valued columns from dominating.
scale = MinMaxScaler()
# Learn each feature's min/max from the TRAINING data only and scale it.
xTrain_scaled = scale.fit_transform(xTrain)
# Apply the training-set min/max to the test data. Refitting the scaler here would
# scale train and test differently and leak test-set statistics into evaluation.
# Test values outside the training range may therefore fall slightly outside [0, 1].
xTest_scaled = scale.transform(xTest)

In [None]:
# ANN model 1: fully connected network for the binary 'Exited' target.
ANN1 = keras.Sequential([
    # Input layer. The per-sample shape must be a tuple; a bare int (shape=7)
    # is rejected by Keras 3 with "Cannot convert '7' to a shape".
    keras.Input(shape=(7,)),
    layers.Dense(11, name="layer1"),  # no activation: weighted sum of the input values
    layers.Dense(10, activation="relu", name="layer2"),  # activation applied from here on
    layers.Dense(5, activation="relu", name="layer3"),
    layers.Dense(4, activation="relu", name="layer4"),
    layers.Dense(1, activation="sigmoid", name="layer5"),  # single sigmoid output unit
])
print(len(ANN1.layers))  # number of layers in the model

In [None]:
# Print Keras' summary of the ANN1 architecture built in the previous cell.
ANN1.summary()

In [None]:
#F1-score callback
#F1-score = 2pr/(p+r)
#https://keras.io/guides/writing_your_own_callbacks/, reference site
class callbackValues(keras.callbacks.Callback):
    """Print the training and validation F1 score at the end of every epoch.

    The scores are computed from the precision and recall values Keras reports
    in ``logs``. A score is printed only when both of its inputs are present,
    so the callback is harmless when the model is fitted without validation
    data or without precision/recall metrics.
    """

    @staticmethod
    def _metric(logs, prefix):
        """Return the value of the first ``logs`` key starting with ``prefix``, else None."""
        for name, value in logs.items():
            if name.startswith(prefix):
                return value
        return None

    @staticmethod
    def _f1(precision, recall):
        """Return 2pr/(p+r), or 0.0 when p + r is not positive (avoids division by zero)."""
        denom = precision + recall
        if denom <= 0.0:
            return 0.0
        return 2 * precision * recall / denom

    def on_epoch_end(self, epoch, logs=None): #epoch level method called at end of each epoch
        """Compute and print the F1 scores from the metrics logged for this epoch.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Mapping of metric name to value for the epoch; may be None.
        """
        logs = logs or {}
        # Look the metrics up by name rather than by position in the dict. Prefix
        # matching is used because metric names may carry a numeric suffix
        # (e.g. "precision_1") when several metric instances are created.
        for label, prefix in (("Training F1 score: ", ""), ("Validation F1 score: ", "val_")):
            precision = self._metric(logs, prefix + "precision")
            recall = self._metric(logs, prefix + "recall")
            if precision is None or recall is None:
                continue  # metric not logged (e.g. no validation data was supplied)
            print(label, self._f1(precision, recall))

In [None]:
# Train ANN1 on the scaled training set and validate on the scaled test set after
# every epoch. Keras tracks accuracy, precision and recall; the callbackValues
# callback prints the F1 score derived from them.
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
ANN1.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=metrics,
)
ANN1.fit(
    x=xTrain_scaled,
    y=yTrain,
    validation_data=(xTest_scaled, yTest),
    batch_size=55,
    epochs=100,
    shuffle=False,
    verbose=2,
    callbacks=[callbackValues()],
)

In [None]:
#Evaluating the model on the test dataset separately is not required here: fit() already validates on it every epoch via the validation_data argument. The call below is an alternative.
#results = ANN1.evaluate(xTest_scaled, yTest, batch_size=20)

In [None]:
# ANN model 2: deeper variant with a sigmoid hidden layer before the output.
ANN2 = keras.Sequential([
    # Input layer. The per-sample shape must be a tuple; a bare int (shape=7)
    # is rejected by Keras 3 with "Cannot convert '7' to a shape".
    keras.Input(shape=(7,)),
    layers.Dense(11, name="layer1"),  # no activation: weighted sum of the input values
    layers.Dense(11, activation="relu", name="layer2"),  # activation applied from here on
    layers.Dense(9, activation="relu", name="layer3"),
    layers.Dense(5, activation="relu", name="layer4"),
    layers.Dense(3, activation="sigmoid", name="layer5"),
    layers.Dense(1, activation="sigmoid", name="layer6"),  # single sigmoid output unit
])
ANN2.summary()  # print summary of ANN2 structure

In [None]:
# Train ANN2 on the scaled training set, validating on the scaled test set after
# every epoch; callbackValues prints the F1 score for each epoch.
# NOTE(review): epochs=100000 is far larger than for the other models (100-1100)
# - confirm this is intentional before running.
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
ANN2.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=metrics,
)
ANN2.fit(
    x=xTrain_scaled,
    y=yTrain,
    validation_data=(xTest_scaled, yTest),
    batch_size=100,
    epochs=100000,
    shuffle=False,
    verbose=2,
    callbacks=[callbackValues()],
)

In [None]:
# ANN model 3: wider variant that widens to 20 units and then narrows to the output.
ANN3 = keras.Sequential([
    # Input layer. The per-sample shape must be a tuple; a bare int (shape=7)
    # is rejected by Keras 3 with "Cannot convert '7' to a shape".
    keras.Input(shape=(7,)),
    layers.Dense(15, name="layer1"),  # no activation: weighted sum of the input values
    layers.Dense(20, activation="relu", name="layer2"),  # activation applied from here on
    layers.Dense(10, activation="relu", name="layer3"),
    layers.Dense(5, activation="relu", name="layer4"),
    layers.Dense(2, activation="relu", name="layer5"),
    layers.Dense(1, activation="sigmoid", name="layer6"),  # single sigmoid output unit
])
ANN3.summary()  # print summary of ANN3 structure

In [None]:
# Train ANN3 on the scaled training set, validating on the scaled test set after
# every epoch; callbackValues prints the F1 score for each epoch.
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
ANN3.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=metrics,
)
ANN3.fit(
    x=xTrain_scaled,
    y=yTrain,
    validation_data=(xTest_scaled, yTest),
    batch_size=150,
    epochs=1100,
    shuffle=False,
    verbose=2,
    callbacks=[callbackValues()],
)

In [None]:
# ANN model 4: the shallowest variant, with three layers before the output.
ANN4 = keras.Sequential([
    # Input layer. The per-sample shape must be a tuple; a bare int (shape=7)
    # is rejected by Keras 3 with "Cannot convert '7' to a shape".
    keras.Input(shape=(7,)),
    layers.Dense(14, name="layer1"),  # no activation: weighted sum of the input values
    layers.Dense(7, activation="relu", name="layer2"),  # activation applied from here on
    layers.Dense(3, activation="relu", name="layer3"),
    layers.Dense(1, activation="sigmoid", name="layer4"),  # single sigmoid output unit
])
ANN4.summary()  # print summary of ANN4 structure

In [None]:
# Train ANN4 on the scaled training set, validating on the scaled test set after
# every epoch; callbackValues prints the F1 score for each epoch.
metrics = ['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
ANN4.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=metrics,
)
ANN4.fit(
    x=xTrain_scaled,
    y=yTrain,
    validation_data=(xTest_scaled, yTest),
    batch_size=300,
    epochs=1000,
    shuffle=False,
    verbose=2,
    callbacks=[callbackValues()],
)