# 1. Preprocessing the data

In [121]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

In [122]:
raw_csv_data = np.loadtxt('/Users/jackcook/Documents/Course notes/Data Sources/business-case-books.csv',
                         delimiter=',')

# Exclude ID column & target
unscaled_inputs_raw = raw_csv_data[:,1:-1]

targets_raw = raw_csv_data[:,-1]

### Pre-shuffle data

This ensures that we aren't picking any rows more favourably over another.

In [123]:
# arange returns evenly spaced values in a given interval
shuffled_indices = np.arange(unscaled_inputs_raw.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = unscaled_inputs_raw[shuffled_indices]
shuffled_targets = targets_raw[shuffled_indices]

### Balance data

In [124]:
num_one_targets = shuffled_targets.sum()

# Count zeroes and record positions
zero_targets_count = 0
indices_to_remove = []

for i in range(shuffled_inputs.shape[0]):
    if shuffled_targets[i] == 0:
        zero_targets_count += 1
        # Note indices to remove
        if zero_targets_count > num_one_targets:
            indices_to_remove.append(i)
            
def delete_data(array):
    return np.delete(array, indices_to_remove, axis=0)
            
unscaled_inputs_balanced = delete_data(shuffled_inputs)
targets_balanced = delete_data(shuffled_targets)

print(unscaled_inputs_balanced.shape, targets_balanced.shape)
print(np.sum(targets_balanced))

(4474, 10) (4474,)
2237.0


### Re-shuffle data

In [125]:
# arange returns evenly spaced values in a given interval
reshuffled_indices = np.arange(unscaled_inputs_balanced.shape[0])
np.random.shuffle(reshuffled_indices)

reshuffled_bal_inputs = unscaled_inputs_balanced[reshuffled_indices]
reshuffled_bal_targets = targets_balanced[reshuffled_indices]

### Standardise inputs 

In [126]:
scaled_inputs = preprocessing.scale(reshuffled_bal_inputs)

print(scaled_inputs.shape, np.sum(reshuffled_bal_targets))

(4474, 10) 2237.0


### Train, validation, and test split

In [127]:
samples_count = scaled_inputs.shape[0]

# 80 : 10 : 10 split
train_samples_count = int(samples_count*0.8)
val_samples_count = int(samples_count*0.1)
test_samples_count = samples_count - train_samples_count - val_samples_count

train_inputs = scaled_inputs[:train_samples_count]
val_inputs = scaled_inputs[train_samples_count : - test_samples_count]
test_inputs = scaled_inputs[- test_samples_count:]

print('Samples count: {}\nTrain count: {}\nVal count: {}\nTest count: {}\n'.format(
    samples_count,
    train_inputs.shape[0],
    val_inputs.shape[0],
    test_inputs.shape[0]))

# Then split the targets in the same way
train_targets = reshuffled_bal_targets[:train_samples_count]
val_targets = reshuffled_bal_targets[train_samples_count : - test_samples_count]
test_targets = reshuffled_bal_targets[- test_samples_count:]

Samples count: 4474
Train count: 3579
Val count: 447
Test count: 448



Check that the datasets are balanced

In [128]:
print('Samples % 1s: {}\nTrain % 1s: {}\nVal % 1s: {}\nTest % 1s: {}\n'.format(
    round(100 * np.sum(targets_balanced)/samples_count),
    round(100 * np.sum(train_targets)/train_samples_count),
    round(100 * np.sum(val_targets)/val_samples_count),
    round(100 * np.sum(test_targets)/test_samples_count)))

Samples % 1s: 50.0
Train % 1s: 50.0
Val % 1s: 49.0
Test % 1s: 52.0



### Save as .csv 

This is just an exercise as this helps up practise loading numpy npz data.

In [129]:
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_val', inputs=val_inputs, targets=val_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

# 2. Creating the model

In [137]:
npz_train = np.load('Audiobooks_data_train.npz')

train_inputs = npz_train['inputs'].astype(np.float)
train_targets = npz_train['targets'].astype(np.int)

npz_val = np.load('Audiobooks_data_val.npz')

val_inputs = npz_val['inputs'].astype(np.float)
val_targets = npz_val['targets'].astype(np.int)

npz_test = np.load('Audiobooks_data_test.npz')

test_inputs = npz_test['inputs'].astype(np.float)
test_targets = npz_test['targets'].astype(np.int)

### Set up model

In [144]:
input_size = 10
output_size = 2 # Purchase or no purchase
hidden_layer_size = 150

model = tf.keras.Sequential([
    # We don't need to flatten our inputs this time as it's already the right shape
    
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(hidden_layer_size, activation='relu'),
    tf.keras.layers.Dense(output_size, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy', # Applies one-hot encoding to targets
    metrics=['accuracy']
)

batch_size = 10
max_epochs = 100

# By running for all 100 epochs we risk overfitting
# so, we should introduce early stopping
# Callbacks are utilities called at certain points during training

# Stops training when validation loss doesn't improve
# Patience allows n number of epochs where val loss increases before stopping
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2)

model.fit(train_inputs,
          train_targets, 
          batch_size=batch_size,
          epochs=max_epochs,
          callbacks=[early_stopping],
          validation_data=(val_inputs, val_targets),
          verbose=2)

Train on 3579 samples, validate on 447 samples
Epoch 1/100
3579/3579 - 1s - loss: 0.4351 - accuracy: 0.7720 - val_loss: 0.3724 - val_accuracy: 0.8076
Epoch 2/100
3579/3579 - 0s - loss: 0.3805 - accuracy: 0.7952 - val_loss: 0.3873 - val_accuracy: 0.8076
Epoch 3/100
3579/3579 - 0s - loss: 0.3740 - accuracy: 0.7974 - val_loss: 0.3679 - val_accuracy: 0.7852
Epoch 4/100
3579/3579 - 0s - loss: 0.3660 - accuracy: 0.7966 - val_loss: 0.3889 - val_accuracy: 0.7629
Epoch 5/100
3579/3579 - 0s - loss: 0.3693 - accuracy: 0.7977 - val_loss: 0.3513 - val_accuracy: 0.8345
Epoch 6/100
3579/3579 - 0s - loss: 0.3667 - accuracy: 0.7958 - val_loss: 0.3575 - val_accuracy: 0.8098
Epoch 7/100
3579/3579 - 1s - loss: 0.3553 - accuracy: 0.8061 - val_loss: 0.3524 - val_accuracy: 0.8210


<tensorflow.python.keras.callbacks.History at 0x142ef7a58>

# 3. Test the model

In [145]:
test_loss, test_accuracy = model.evaluate(test_inputs, test_targets)



In [147]:
print('Test loss: {0: .2f}\nTest accuracy: {1: .2f}%'.format(test_loss,test_accuracy))

Test loss:  0.37
Test accuracy:  0.81%
