In [1]:
import numpy as np
from sklearn import preprocessing 
import tensorflow as tf

## Extract the data from the CSV file

In [2]:
# Read the comma-separated file into a single 2-D numeric array.
raw_csv_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# Column layout used below: the first column is skipped, the last column holds
# the targets, and every column in between is a model input.
# NOTE(review): column 0 is presumably a customer ID -- confirm against the data.
targets_all = raw_csv_data[:, -1]
unscaled_inputs_all = raw_csv_data[:, 1:-1]

## Balance the dataset

In [3]:
"""
We check for the number of targets that bought again (i.e. the 1s). We have to balance the data, so we have
the same amount of people that bought the books again, and people that did not buy the books again.
"""

# Number of samples whose target is 1. Summing only counts the 1s if the
# targets are strictly 0/1 -- TODO confirm against the data.
num_one_targets = int(np.sum(targets_all))

# Row positions of every 0-target sample, in their original order. This
# replaces the element-by-element Python loop with a single vectorized lookup.
zero_target_indices = np.flatnonzero(targets_all == 0)
zero_targets_counter = int(zero_target_indices.size)

# Keep the first `num_one_targets` zeros and mark every later zero for removal.
# If the zeros do not outnumber the ones, the slice is empty and nothing is
# removed (the data is then left as it is).
ind_to_remove = zero_target_indices[num_one_targets:]

# Drop the surplus rows from inputs and targets with the same index set so the
# two arrays stay aligned row by row.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, ind_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, ind_to_remove, axis=0)

## Standardize the inputs

In [4]:
# Standardize the balanced inputs; with its default arguments,
# preprocessing.scale centers each column to zero mean and scales it to unit
# variance.
# NOTE(review): the scaling statistics are computed on the whole balanced
# dataset, before the train/validation/test split further down, so validation
# and test rows influence how the training rows are scaled. Consider fitting
# the scaler on the training split only and reusing it for the other splits.
scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

## Shuffle the data

In [5]:
# Shuffle the rows with a locally seeded generator so that the shuffled order,
# and therefore the train/validation/test split and the saved files, is the
# same on every Restart Kernel -> Run All. Change SHUFFLE_SEED to draw a
# different (but still reproducible) order.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# A random permutation of the row indices 0 .. n_samples - 1.
shuffled_indices = shuffle_rng.permutation(scaled_inputs.shape[0])

# Apply the same permutation to inputs and targets to keep them aligned.
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

## Split the data into train, validation and test

In [6]:
samples_count = shuffled_inputs.shape[0]

# 80% of the data is used for training, 10% for validation and the remainder
# (about 10%) for testing. The test count is defined as "whatever is left" so
# that rounding in the two int() conversions never drops a sample.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Slice boundaries: [0, train_end) is train, [train_end, validation_end) is
# validation and [validation_end, end) is test.
train_end = train_samples_count
validation_end = train_end + validation_samples_count

train_inputs = shuffled_inputs[:train_end]
train_targets = shuffled_targets[:train_end]

validation_inputs = shuffled_inputs[train_end:validation_end]
validation_targets = shuffled_targets[train_end:validation_end]

test_inputs = shuffled_inputs[validation_end:]
test_targets = shuffled_targets[validation_end:]

# The three splits must partition the data: every sample lands in exactly one.
assert (
    train_inputs.shape[0] + validation_inputs.shape[0] + test_inputs.shape[0]
    == samples_count
), "train/validation/test splits do not cover the dataset exactly"

# Balance check: for each split print the sum of its targets (the number of 1s
# when targets are 0/1), the split size, and their ratio. After balancing and
# shuffling the ratio is expected to be close to 0.5 in every split.
for split_targets, split_samples_count in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    split_ones = np.sum(split_targets)
    print(split_ones, split_samples_count, split_ones / split_samples_count)

1771.0 3579 0.4948309583682593
235.0 447 0.5257270693512305
231.0 448 0.515625


## Save the datasets

In [7]:
# Persist each split to its own .npz archive (np.savez appends the ".npz"
# extension to the given name). Every archive stores two arrays under the
# keys "inputs" and "targets".
splits_to_save = {
    'Audiobooks_data_train': (train_inputs, train_targets),
    'Audiobooks_data_validation': (validation_inputs, validation_targets),
    'Audiobooks_data_test': (test_inputs, test_targets),
}

for archive_name, (split_inputs, split_targets) in splits_to_save.items():
    np.savez(archive_name, inputs=split_inputs, targets=split_targets)