In [147]:
import numpy as np
from sklearn import preprocessing

In [148]:
raw_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

In [149]:
unscaled_inputs = raw_data[:, 1:-1]
targets = raw_data[:, -1]

In [150]:
# Balancing

ones_count = int(np.sum(targets))
indices_to_remove = []
zeros_count = 0

for i in range(targets.shape[0]):
    if targets[i] == 0:
        zeros_count += 1
        if zeros_count > ones_count:
            indices_to_remove.append(i)

unscaled_inputs_bal = np.delete(unscaled_inputs, indices_to_remove, axis=0)
targets_bal = np.delete(targets, indices_to_remove, axis=0)

In [151]:
onehot_encoded_targets = []
for i in targets_bal:
    t = [1,0] if i == 0 else [0,1]
    onehot_encoded_targets.append(t)
onhot_encoded_targets = np.array(onehot_encoded_targets)

In [152]:
# Standardizing

scaled_inputs_bal = preprocessing.scale(unscaled_inputs_bal)

In [153]:
# Shuffle data

shuffle_indices = np.arange(targets_bal.shape[0])
np.random.shuffle(shuffle_indices)

inputs = scaled_inputs_bal[shuffle_indices]
targets = onhot_encoded_targets[shuffle_indices]

In [154]:
# Train, Test, Validation

sample_count = targets.shape[0]

train_sample_count = int(0.9 * sample_count)
val_sample_count = int(0.05 * sample_count)
test_sample_count = int(0.05 * sample_count)

train_sample_inputs = inputs[0:train_sample_count]
train_sample_targets = targets[0:train_sample_count]

val_sample_inputs = inputs[train_sample_count:train_sample_count+val_sample_count]
val_sample_targets = targets[train_sample_count:train_sample_count+val_sample_count]

test_sample_inputs = inputs[train_sample_count+val_sample_count:]
test_sample_targets = targets[train_sample_count+val_sample_count:]

In [155]:
# Saving

np.savez("train_data", inputs=train_sample_inputs, targets=train_sample_targets)
np.savez("val_data", inputs=val_sample_inputs, targets=val_sample_targets)
np.savez("test_data", inputs=test_sample_inputs, targets=test_sample_targets)

In [156]:
print(np.sum(train_sample_targets) / train_sample_targets.shape[0])
print(np.sum(val_sample_targets) / val_sample_targets.shape[0])
print(np.sum(test_sample_targets) / test_sample_targets.shape[0])

1.0
1.0
1.0
