In [1]:
#use CSV logger to records logs from fit function
#CSV logger is part of callbacks so you must call it in the fit function as a callback
#site: https://keras.io/api/callbacks/csv_logger/

import numpy as np #for statistics and linear algebra computation
import pandas as pd #for data processing and reading in data

from scipy import stats #statistical tools
from sklearn.model_selection import train_test_split #splitting data into training and test sets

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report #for computing confusion matrix

#import keras library to implement neural networks
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#import optimizers to train data
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import binary_crossentropy

In [2]:
data = pd.read_csv("dataset.csv")#read dataset into variable

x = data.drop('CustomerId', axis=1)
x = x.drop('HasCrCard', axis=1)
x = x.drop('EstimatedSalary', axis=1)
x = x.drop('Surname', axis=1)
x = x.drop('Balance', axis=1)
x = x.drop('Exited', axis=1) #drop output variable you want to find from x data

#convert geography values to int values
#1 = Spain, 2 = France, 3 = Germany
x['Geography'] = x['Geography'].replace(['Spain', 'France', 'Germany'], [1, 2, 3]) #replace a value with a new value
        
#convert gender values to int values
#Female = 0, Male = 1
x['Gender'] = x['Gender'].replace(['Female', 'Male'], [0, 1])

# x #get input values that have a correlation with target output 'Exited'

y = data['Exited']
# y #separate output values from dataset

In [3]:
#split dataset into training and testing set (80-20 split)
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=34) #set random state to an integer for identical splits

#need to scale data so that there aren't any outrageous values or distortions because of varying ranges of values
scale = MinMaxScaler()
#scale training samples
xTrain_scaled = scale.fit_transform(xTrain)
xTest_scaled = scale.fit_transform(xTest)

In [4]:
#design F1-score function 
#F1-score = 2pr/(p+r)
#https://keras.io/guides/writing_your_own_callbacks/, reference site
class callbackValues(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None): #epoch level method called at end of each epoch
        keys = list(logs.keys()) #obtain what metrics were calculated and obtained each epoch
        #print("Info obtained at end of epoch ", keys)
        #here calculate the f1-score and print it out
        train_precision = logs[keys[2]]
        test_precision = logs[keys[6]]
        train_recall = logs[keys[3]]
        test_recall = logs[keys[7]]
        train_denom = train_precision + train_recall #calculate f1 score for training data at each epoch
        test_denom = test_precision + test_recall #calculate f1 score for test data at each epoch
        #below is an error check to prevent division by zero
        if (train_denom <= 0.0):
            train_denom = 1
        if (test_denom <= 0.0):
            test_denom = 1
        
        train_f1 = 2 * train_precision * train_recall / (train_denom)
        test_f1 = 2 * test_precision * test_recall / (test_denom)
        print("Training F1 score: ", train_f1)
        print("Validation F1 score: ", test_f1)

In [5]:
#although ANN 2 was the best in terms of accuracy, I decided to use ANN 3 (second best) because it 
    #obtains close to the accuracy obtained in ANN 2 in less time
ANN3 = keras.Sequential([
    keras.Input(shape=7), #add input layer here
    layers.Dense(15, name="layer1"), #at this point it should have the weighted sum of the values from the input layer
    layers.Dense(20, activation="relu", name="layer2"), #apply activation function
    layers.Dense(10, activation="relu", name="layer3"),
    layers.Dense(5, activation="relu", name="layer4"),
    layers.Dense(2, activation="relu", name="layer5"),
    layers.Dense(1, activation="sigmoid", name="layer6")
])

In [6]:
#train and validate model for ANN 3
metrics=['accuracy', keras.metrics.Precision(), keras.metrics.Recall()]
ANN3.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=metrics)
ANN3.fit(x=xTrain_scaled, y=yTrain, validation_data=(xTest_scaled, yTest), batch_size=150, epochs=1100, shuffle=False,verbose=2, callbacks=[callbackValues()])

Epoch 1/1100
48/48 - 5s - loss: 0.6916 - accuracy: 0.4418 - precision: 0.1643 - recall: 0.4175 - val_loss: 0.6869 - val_accuracy: 0.6278 - val_precision: 0.1416 - val_recall: 0.1850
Training F1 score:  0.23578627775947847
Validation F1 score:  0.1604010006133059
Epoch 2/1100
48/48 - 0s - loss: 0.6823 - accuracy: 0.7343 - precision: 0.1285 - recall: 0.0498 - val_loss: 0.6754 - val_accuracy: 0.8083 - val_precision: 1.0000 - val_recall: 0.0029
Training F1 score:  0.0718098026961648
Validation F1 score:  0.005763688632349697
Epoch 3/1100
48/48 - 0s - loss: 0.6702 - accuracy: 0.7937 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 0.6611 - val_accuracy: 0.8078 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Training F1 score:  0.0
Validation F1 score:  0.0
Epoch 4/1100
48/48 - 0s - loss: 0.6542 - accuracy: 0.7937 - precision: 0.0000e+00 - recall: 0.0000e+00 - val_loss: 0.6419 - val_accuracy: 0.8078 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00
Training F1 score:  0.0
Val

<tensorflow.python.keras.callbacks.History at 0x1e97bc50310>

In [7]:
#predict output values from judge.csv file and output predictions to a file named judge-pred.csv with the customerID and Exited value
judge = pd.read_csv("judge.csv")

judgeID = judge['CustomerId']
# print(judgeID[0:10])
judge = judge.drop('CustomerId', axis=1)
judge = judge.drop('HasCrCard', axis=1)
judge = judge.drop('EstimatedSalary', axis=1)
judge = judge.drop('Surname', axis=1)
judge = judge.drop('Balance', axis=1)

judge['Geography'] = judge['Geography'].replace(['Spain', 'France', 'Germany'], [1, 2, 3]) 
judge['Gender'] = judge['Gender'].replace(['Female', 'Male'], [0, 1])

#scale judge data
judge_scaled = scale.fit_transform(judge)

# csv_logger = keras.callbacks.CSVLogger('judge-pred.csv')
yPredict = (ANN3.predict(judge_scaled) > 0.5).astype("int32")
# yPredict = ANN3.predict(judge_scaled) #gets predictions in probabilities
yPredict

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
    

In [8]:
#save predictions to file
outfile = open('judge-pred.csv', 'w')
outfile.write("CustomerID, Exited\n")
for i in range(len(yPredict)):
    outfile.write(str(judgeID[i]))
    outfile.write(", ")
    outfile.write(str(yPredict[i][0]))
    outfile.write("\n")
outfile.close()