# Model selection

Tested models:
- k-Nearest Neighbours,
- Random Forest,
- Decision Tree,
- Artificial Neural Network,
- k-Means.

Each model was also tested on a rebalanced dataset containing an even or near-even number of sessions with and without a purchase: 40% purchase / 60% non-purchase, the opposite split (60% purchase / 40% non-purchase), and an even 50/50 split.

The best evaluation score was achieved by the Random Forest trained on a dataset balanced to a 6:4 ratio (60% non-purchase, 40% purchase sessions): 90% correct predictions for non-purchase sessions and 84.4% for sessions with a purchase. The second-best model was the artificial neural network, which uses a custom generator to pull balanced data directly from the entire dataset in each epoch. It achieved 91.9% correct predictions for sessions without a purchase and 71.4% for sessions with a purchase; after lowering dropout and limiting training time, these became 94.2% and 68.3% respectively.




# Random Forest

In [2]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import utils

# Train and score a Random Forest on three independently shuffled splits.
x, y = utils.load_data()
for k in range(3):
    x, y = shuffle(x, y)
    x_train, x_eva, y_train, y_eva = train_test_split(x, y, test_size=0.1)
    # Only the training part is rebalanced; the 10% evaluation part is left as split.
    x_train, y_train = utils.balance_data(x_train, y_train, ratio=(0.6, 0.4))
    clf = RandomForestClassifier(random_state=0)
    clf.fit(x_train, y_train)

    predictions = clf.predict(x_eva)
    eva_score = [0, 0]  # correct predictions, indexed by class
    eva_len = [0, 0]  # evaluation samples, indexed by true class
    for predicted_row, expected_row in zip(predictions, y_eva):
        predicted_class = np.argmax(predicted_row)
        expected_class = np.argmax(expected_row)
        eva_len[0 if expected_class == 0 else 1] += 1
        if predicted_class == expected_class:
            eva_score[predicted_class] += 1
    class0_rate = eva_score[0] / eva_len[0]
    class1_rate = eva_score[1] / eva_len[1]
    print("Correct predictions (10% evaluation data): " + str(class0_rate) + ", " + str(class1_rate))

Correct predictions (10% evaluation data): 0.8933962264150943, 0.8497109826589595
Correct predictions (10% evaluation data): 0.8806262230919765, 0.8151658767772512
Correct predictions (10% evaluation data): 0.8740384615384615, 0.8290155440414507





# Artificial Neural Network with generator

In [None]:
import tensorflow as tf
import numpy as np
from datetime import datetime
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import utils

# Hyperparameters handed to BalancedGenerator further down.
batch_size = 16
ratio = 0.5
# Load the dataset, shuffle it and hold out 10% of it for evaluation.
x, y = utils.load_data()
x, y = shuffle(x, y)
x_train, x_eva, y_train, y_eva = train_test_split(x, y, test_size=0.1)
# presumably splits the training data into one (x, y) pair per class —
# TODO confirm against utils.separate_data
x1, x2, y1, y2 = utils.separate_data(x_train, y_train)

# defining model
# Fully connected network; the input shape is taken from the first sample.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=x[0].shape),
    tf.keras.layers.Dense(units=32, activation="relu"),
    tf.keras.layers.Dense(units=16, activation="relu"),
    tf.keras.layers.Dense(units=8, activation="tanh"),
    tf.keras.layers.Dense(units=8, activation="relu"),
    tf.keras.layers.Dense(units=8, activation="tanh"),
    # Dropout (rate 0.1) is applied before and after the last hidden layer.
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(units=8, activation="relu"),
    tf.keras.layers.Dropout(0.1),
    # Two-unit softmax output.
    tf.keras.layers.Dense(units=2, activation="softmax")
])
# Adam with its default settings.
opt = tf.keras.optimizers.Adam()
# NOTE(review): mean squared error is used on a softmax output; categorical
# cross-entropy is the more common pairing — confirm 'mse' is deliberate.
model.compile(optimizer=opt, loss='mse', metrics=['accuracy'])


def lr_rate_schedule(epochs, lr):
    """Return the learning rate to use for an epoch.

    Args:
        epochs: Index of the current epoch.
        lr: Learning rate passed in by the caller.

    Returns:
        ``lr`` unchanged while ``epochs`` is below 1000, otherwise
        ``lr * 0.1``.
    """
    if epochs < 1000:
        return lr
    # NOTE: Keras' LearningRateScheduler passes the optimizer's *current*
    # rate as ``lr``, so from this point on the 0.1 factor compounds on
    # every epoch rather than being applied once.
    return lr * 0.1


class BalancedGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that builds each batch from two separate sample sets.

    Batch ``idx`` is the ``idx``-th window of ``int(batch_size * ratio)``
    samples from the first set (``x1``/``y1``) followed by the same window
    from the second set (``x2``/``y2``).

    NOTE(review): once ``idx`` runs past the end of the shorter set its slice
    is empty, so later batches contain samples from the longer set only.
    The same ``ratio`` is also applied to both sets, so a batch holds
    ``2 * int(batch_size * ratio)`` samples. Confirm both are intended.
    """

    def __init__(self, x1, x2, y1, y2, batch_size, ratio, **kwargs):
        """Store the two sample sets and the batching parameters.

        Args:
            x1: Inputs of the first set (sliceable sequence).
            x2: Inputs of the second set (sliceable sequence).
            y1: Targets matching ``x1``.
            y2: Targets matching ``x2``.
            batch_size: Nominal batch size; also used to compute ``__len__``.
            ratio: Fraction of ``batch_size`` drawn from each set per batch.
            **kwargs: Forwarded to the Keras base-class constructor.
        """
        # Keras 3 warns when a Sequence/PyDataset subclass skips the base
        # constructor.
        super().__init__(**kwargs)
        self.x1 = x1
        self.x2 = x2
        self.y1 = y1
        self.y2 = y2
        self.batch_size = batch_size
        self.ratio = ratio

    def __len__(self):
        """Return the number of batches per epoch as a plain ``int``."""
        total_samples = len(self.x1) + len(self.x2)
        # int() instead of .astype(np.int): the np.int alias no longer exists
        # in current NumPy releases.
        return int(np.ceil(total_samples / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch ``idx`` as an ``(inputs, targets)`` pair of arrays."""
        per_set = int(self.batch_size * self.ratio)
        window = slice(idx * per_set, (idx + 1) * per_set)
        # list(...) + list(...) builds a new list, so the stored sets are
        # never mutated and array-backed inputs work as well as lists.
        batch_x = list(self.x1[window]) + list(self.x2[window])
        batch_y = list(self.y1[window]) + list(self.y2[window])
        return np.array(batch_x), np.array(batch_y)


# Callback that asks lr_rate_schedule for the learning rate during training.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_rate_schedule)  # variable lr
# Timestamped directory so each run writes its logs to a separate folder.
log_dir = "./logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")  # training graphs
# NOTE: created here but not passed to model.fit in the visible code (it is
# commented out in the callbacks list of the training loop).
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# Batch source for training, built from the two sample sets prepared earlier.
training_batch_generator = BalancedGenerator(x1, x2, y1, y2, batch_size, ratio)

# Train in five rounds of 50 epochs and report per-class accuracy on the
# evaluation split after every round (control overfit).
for k in range(5):
    # Append tensorboard_callback to the callbacks list to record training graphs.
    model.fit(training_batch_generator, epochs=50, verbose=0, callbacks=[lr_callback])

    # model evaluation
    predictions = model.predict(np.asarray(x_eva))
    eva_score = [0, 0]  # correct predictions, indexed by class
    eva_len = [0, 0]  # evaluation samples, indexed by true class
    for predicted_row, expected_row in zip(predictions, y_eva):
        predicted_class = np.argmax(predicted_row)
        expected_class = np.argmax(expected_row)
        eva_len[0 if expected_class == 0 else 1] += 1
        if predicted_class == expected_class:
            eva_score[predicted_class] += 1
    class0_rate = eva_score[0] / eva_len[0]
    class1_rate = eva_score[1] / eva_len[1]
    print("Correct predictions (10% evaluation data): " + str(class0_rate) + ", " + str(class1_rate))