In [12]:
import numpy as np
import tensorflow as tf
from sklearn import preprocessing
# I used the sklearn preprocessing library to standardize the data more easily.

raw_csv_data = np.loadtxt('Audiobooks_data.csv',delimiter=',')

unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]
# Except for the first column (customer IDs that bear no useful information) and the last column (targets), the inputs are all columns in the csv.

In [13]:
shuffled_indices = np.arange(unscaled_inputs_all.shape[0])
np.random.shuffle(shuffled_indices)
# Because it was actually arranged by date, I shuffled the indices of the data so that it is not biased when I feed it into the model.
# I also batched the data so I want it to be as randomly spread out as possible

unscaled_inputs_all = unscaled_inputs_all[shuffled_indices]
targets_all = targets_all[shuffled_indices]
# Then I shuffled the inputs and targets using the shuffled indices. 

In [14]:
num_one_targets = int(np.sum(targets_all))
# To count how many targets are 1 (customers that converted)

zero_targets_counter = 0
# Here I set a counter for targets that are 0 (customers that did not convert)


indices_to_remove = []

for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)
# I removed some input/target pairs to create a balanced dataset
# Count the number of targets that are 0. 
# Once there are as many 0s as 1s, I marked entries where the target is 0.
# I will remove these marked entries/indices below.
            
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)
# I created two new variables, one that will contain the inputs, and one that will contain the targets.

In [15]:
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)
# I took advantage of the preprocessing capability of sklearn here.

In [16]:
shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]
# I shuffled the preprocessed data, inputs, and targets to prepare them for the machine learning process.

In [17]:
samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count
# Here I counted the samples in each subset and aimed for an 80-10-10 distribution of training, validation, and testing.

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]
# These are the variables I created that recorded the inputs and targets for training.

validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]
# These are the variables I created that recorded the inputs and targets for validation.

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]
# These are the variables I created that recorded the inputs and targets for testing.

print(np.sum(train_targets), train_samples_count, np.sum(train_targets) / train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets) / validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets) / test_samples_count)
# Here I wanted to make sure that the training, validation, and testing data are balanced like my dataset (for targets 0 and 1 are 50-50).
# I printed the number of targets that are 1s, the total number of samples, and the proportion for training, validation, and test.

1783.0 3579 0.4981838502374965
218.0 447 0.48769574944071586
236.0 448 0.5267857142857143


In [18]:
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs, targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)
# I saved the three datasets in 3 .npz files in a coherent manner.

In [19]:
npz = np.load('Audiobooks_data_train.npz')

train_inputs = npz['inputs'].astype(np.float)
train_targets = npz['targets'].astype(np.int)
# I wanted to make sure that all my inputs are floats.
# I wanted my targets to be integers so I can smoothly one-hot encode them with sparse_categorical_crossentropy.

npz = np.load('Audiobooks_data_validation.npz')
validation_inputs, validation_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
# This is the validation data in a temporary variable

npz = np.load('Audiobooks_data_test.npz')
test_inputs, test_targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
# This is the test data in a temporary variable

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  This is separate from the ipykernel package so we can avoid doing imports until
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  after removing the cwd from sys.path.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if __name__ == '__main__':
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if __name__ == '__main__':
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  del sys.path[0]
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  del sys.path[0]


In [20]:
# Training the model

input_size = 10
output_size = 2
hidden_layer_size = 50
    
model = tf.keras.Sequential([
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    # 1st hidden layer
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'), 
    # 2nd hidden layer
    tf.keras.layers.Dense(output_size, activation='softmax') # output layer
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

batch_size = 100
max_epochs = 100

early_stopping = tf.keras.callbacks.EarlyStopping(patience=3)
# Here I set an early stopping mechanism with a patience=3 to be a bit tolerant against random validation loss increases.

model.fit(train_inputs,
          train_targets,
          batch_size=batch_size, 
          epochs=max_epochs, 
          callbacks=[early_stopping],
          # callbacks are functions called when a task is completed to check if the validation loss is increasing.
          validation_data=(validation_inputs, validation_targets),
          verbose = 2 
          # to get enough information about the training process
          )  

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 1s - loss: 0.6220 - accuracy: 0.6513 - val_loss: 0.5651 - val_accuracy: 0.7114
Epoch 2/100
3579/3579 - 0s - loss: 0.5031 - accuracy: 0.7625 - val_loss: 0.4817 - val_accuracy: 0.7562
Epoch 3/100
3579/3579 - 0s - loss: 0.4429 - accuracy: 0.7776 - val_loss: 0.4442 - val_accuracy: 0.7718
Epoch 4/100
3579/3579 - 0s - loss: 0.4133 - accuracy: 0.7882 - val_loss: 0.4301 - val_accuracy: 0.7763
Epoch 5/100
3579/3579 - 0s - loss: 0.3966 - accuracy: 0.7960 - val_loss: 0.4203 - val_accuracy: 0.7718
Epoch 6/100
3579/3579 - 0s - loss: 0.3863 - accuracy: 0.8022 - val_loss: 0.4191 - val_accuracy: 0.7450
Epoch 7/100
3579/3579 - 0s - loss: 0.3759 - accuracy: 0.8025 - val_loss: 0.4072 - val_accuracy: 0.7919
Epoch 8/100
3579/3579 - 0s - loss: 0.3688 - accuracy: 0.8069 - val_loss: 0.4054 - val_accuracy: 0.7875
Epoch 9/100
3579/3579 - 0s - loss: 0.3653 - accuracy: 0.8097 - val_loss: 0.4026 - val_accuracy: 0.7718
Epoch 10/100
3579/3579 - 0

<tensorflow.python.keras.callbacks.History at 0x7faddd43be50>

In [21]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [22]:
print('\nTest loss: {0:.2f}. Test accuracy: {1:.2f}%'.format(test_loss, test_accuracy*100.))


Test loss: 0.37. Test accuracy: 79.69%
