In [2]:
import os
import sys
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

import tensorflow as tf
from tensorflow import keras
tf.keras.backend.set_floatx('float64')

In [4]:
# The MovieLens 100k files are looked up in <parent of the working dir>/datasets/ml-100k/.
curdir = os.getcwd()
prevdir = os.path.dirname(curdir)
folder = prevdir + '/datasets/ml-100k/'

# u.data is read as a tab-separated file without a header row; the timestamp
# column is discarded immediately after loading.
rating_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    folder + 'u.data',
    sep = '\t',
    names = rating_cols,
    encoding = 'latin-1',
).drop('timestamp', axis = 1)

In [3]:
# Drop every movie that has fewer than `min_rated` ratings.
min_rated = 3
value_counts = ratings['movie_id'].value_counts()
remove_movies = value_counts[value_counts < min_rated].index
ratings = ratings[~ratings['movie_id'].isin(remove_movies)]

Helpful Stack Overflow discussion on splitting data into training and test sets for recommender systems: https://stackoverflow.com/questions/43129764/splitting-data-set-into-training-and-testing-sets-on-recommender-systems

### Preparing the dataset

In [4]:
from sklearn.model_selection import train_test_split

def split_dataset(ratings, test_size = 0.2, random_state = 42):
    """Split a ratings table into train / validation / test frames.

    The split guarantees that every movie and every user appears in each of
    the three frames:

    1. Ratings are sorted by ``movie_id``. For each movie the first rating
       goes to train, the second to validation and the third to test.
    2. All remaining ratings are divided between train and test with
       ``train_test_split`` (validation receives none of them).
    3. For every user still missing from a frame, one of that user's ratings
       is drawn at random from the full table and added to the frame.

    NOTE(review): step 3 samples from the *full* table, so the added row may
    also be present in another frame. This matches the previous behaviour.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain ``user_id`` and ``movie_id`` columns.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` for step 2.
    random_state : int or None, default 42
        Seed for ``train_test_split`` and for the per-user sampling in step 3
        (the sampling was previously unseeded). ``None`` disables seeding.

    Returns
    -------
    (X_train, X_valid, X_test) : tuple of pandas.DataFrame
        Each frame has a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If any movie has fewer than three ratings; step 1 needs three per
        movie. The earlier row-by-row loop only asserted this for the movie of
        every third row and could silently misalign the split.
    """
    ratings = ratings.sort_values(by = 'movie_id').reset_index(drop = True)

    movie_counts = ratings['movie_id'].value_counts()
    too_few = movie_counts[movie_counts < 3]
    if len(too_few) > 0:
        raise ValueError(
            f'{len(too_few)} movie(s) have fewer than 3 ratings; '
            'filter them out before calling split_dataset'
        )

    # Position of each rating inside its movie group (0, 1, 2, ...), in the
    # sorted order. This replaces the row-by-row DataFrame.append loop, which
    # was quadratic and relied on an API removed in pandas 2.0.
    rank_in_movie = ratings.groupby('movie_id').cumcount()
    X_train = ratings.loc[rank_in_movie == 0]
    X_valid = ratings.loc[rank_in_movie == 1]
    X_test = ratings.loc[rank_in_movie == 2]
    rem_ratings = ratings.loc[rank_in_movie >= 3]

    if len(rem_ratings) >= 2:
        train, test = train_test_split(rem_ratings,
                                       test_size = test_size,
                                       random_state = random_state)
    else:
        # Too few leftovers to split: give them all to train.
        train, test = rem_ratings, rem_ratings.iloc[0:0]

    X_train = pd.concat([X_train, train], ignore_index = True)
    X_test = pd.concat([X_test, test], ignore_index = True)
    X_valid = X_valid.reset_index(drop = True)

    all_users = set(ratings['user_id'].unique())
    rng = np.random.RandomState(random_state)

    def _with_every_user(split):
        # Add one randomly chosen rating for each user absent from `split`.
        missing = sorted(all_users.difference(split['user_id'].unique()))
        if not missing:
            return split
        extra_rows = [
            ratings.loc[ratings['user_id'] == user_id].sample(random_state = rng)
            for user_id in missing
        ]
        return pd.concat([split] + extra_rows, ignore_index = True)

    # Same order as before: test, then validation, then train.
    X_test = _with_every_user(X_test)
    X_valid = _with_every_user(X_valid)
    X_train = _with_every_user(X_train)

    return X_train, X_valid, X_test


In [8]:
# Create the splits for the full (filtered) dataset. Without this call,
# X_train / X_valid / X_test only exist if they are left over from an earlier
# kernel session, so "Restart & Run All" stops here with a NameError.
X_train, X_valid, X_test = split_dataset(ratings)

# Dense user x movie rating matrices; cells without a rating are NaN.
matrix = X_train.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id').to_numpy()
matrix_valid = X_valid.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id').to_numpy()

# Rows/columns of both matrices are fed to the same model, so the shapes must agree.
assert matrix.shape == matrix_valid.shape, (matrix.shape, matrix_valid.shape)

In [12]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def score(cf_model):
    """Return the RMSE of ``cf_model`` on the notebook-level ``X_test`` frame.

    ``cf_model`` is called as ``cf_model(user_id, movie_id)`` once per test
    row and must return the predicted rating for that pair.
    """
    y_pred = np.array([
        cf_model(user, movie)
        for user, movie in zip(X_test['user_id'], X_test['movie_id'])
    ])
    y_true = np.array(X_test['rating'])
    return rmse(y_true, y_pred)

In [86]:
def print_statusbar(iteration, total, losses_list, metrics=None):
    """Print a one-line progress report that overwrites itself in place.

    Every object in ``losses_list`` and ``metrics`` must expose ``.name`` and
    ``.result()``. The line starts with a carriage return so successive calls
    overwrite each other; a newline is emitted only once ``iteration`` has
    reached ``total``.
    """
    tracked = losses_list + (metrics or [])
    summary = " - ".join("{}: {:.4f}".format(m.name, m.result()) for m in tracked)
    if iteration < total:
        line_end = ""
    else:
        line_end = "\n"
    print("\r{}/{} - ".format(iteration, total) + summary, end = line_end)

In [14]:
def random_batch(matrix, filled_idx_tuples, batch_size = None):
    """Assemble a batch of (user row, item column, rating) training samples.

    This is deliberately NOT a ``tf.function``: the sampling uses NumPy's
    random generator, which would run only once at trace time and bake the
    same "random" batch into the graph.

    Parameters
    ----------
    matrix : 2-D array-like
        Rating matrix indexed as ``matrix[user_idx, item_idx]``.
    filled_idx_tuples : sequence of (user_idx, item_idx)
        Positions in ``matrix`` that may be sampled.
    batch_size : int or None
        Number of positions to draw uniformly *with replacement*. ``None``
        returns every position exactly once, in the given order.

    Returns
    -------
    ((user_batch, item_batch), ratings_batch)
        For ``k`` samples: ``user_batch`` is ``(k, n_cols)`` (the sampled
        users' rows), ``item_batch`` is ``(k, n_rows)`` (the sampled items'
        columns) and ``ratings_batch`` is ``(k, 1)``. The shapes are always
        2-D, including for ``k == 1`` (previously 1-D arrays and a scalar) and
        for ``k == 0`` (previously ``None`` values).
    """
    matrix = np.asarray(matrix)
    n_filled = len(filled_idx_tuples)

    if batch_size is None:
        chosen = range(n_filled)
    else:
        chosen = np.random.randint(n_filled, size = batch_size)

    # Only the selected pairs are converted, so the cost is O(batch), not O(len(filled_idx_tuples)).
    pairs = np.array([filled_idx_tuples[i] for i in chosen], dtype = np.intp).reshape(-1, 2)
    user_idx = pairs[:, 0]
    item_idx = pairs[:, 1]

    # Fancy indexing replaces the per-sample np.vstack, which re-copied the
    # whole batch on every iteration.
    user_batch = matrix[user_idx, :]
    item_batch = matrix[:, item_idx].T
    ratings_batch = matrix[user_idx, item_idx].reshape(-1, 1)

    return ((user_batch, item_batch), ratings_batch)

In [None]:
# (row, column) coordinates of every observed (non-NaN) validation rating.
idx_valid = np.nonzero(~np.isnan(matrix_valid))
filled_idx_tuples_valid = list(zip(idx_valid[0], idx_valid[1]))
# batch_size is left at its default (None), so the whole validation set is returned as one batch.
validation_set = random_batch(matrix_valid, filled_idx_tuples_valid)

### Model

In [17]:
class GMFLayer(keras.layers.Layer):
    """Element-wise product of a user vector and an item vector.

    The layer has no weights. It is called on a pair ``[user_vec, item_vec]``
    and returns a tensor with the shape of ``user_vec``.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Check that both inputs have the same feature size.

        Raises ``ValueError`` on a mismatch. An explicit exception is used
        instead of ``assert`` so the check survives ``python -O``.
        """
        user_shape, item_shape = input_shape
        if user_shape[-1] != item_shape[-1]:
            raise ValueError(
                'GMFLayer needs user and item vectors of equal size, got '
                f'{user_shape[-1]} and {item_shape[-1]}'
            )
        super().build(input_shape)

    def call(self, inputs):
        user_vec, item_vec = inputs
        elem_product = tf.math.multiply(user_vec, item_vec)
        return elem_product

    def compute_output_shape(self, batch_input_shape):
        # The product keeps the shape of the first (user) input.
        return batch_input_shape[0]

In [161]:
@tf.function
def sigmoid_1_5(z):
    """Sigmoid rescaled so that its output lies in the open interval (1, 5)."""
    squashed = tf.math.sigmoid(z)
    return 1 + (4 * squashed)

In [129]:
from tensorflow.keras.layers import Dense, LeakyReLU

class DenseBlock(keras.layers.Layer):
    """A stack of ``n_layers`` Dense layers, each followed by a LeakyReLU.

    Every Dense layer has ``n_units`` units, no built-in activation and the
    given ``regularizer`` on its kernel; ``leaky_alpha`` is the LeakyReLU slope.
    """

    def __init__(self, n_layers, n_units, regularizer = None, leaky_alpha = 0.3, **kwargs):
        super().__init__(**kwargs)
        stack = []
        for _ in range(n_layers):
            stack.append(Dense(n_units, kernel_regularizer = regularizer))
            stack.append(LeakyReLU(alpha = leaky_alpha))
        self.hidden = stack

    def call(self, inputs):
        # Feed the input through the Dense/LeakyReLU pairs in order.
        out = inputs
        for transform in self.hidden:
            out = transform(out)
        return out
    

In [130]:
from tensorflow.keras.layers import Dense, Embedding, LeakyReLU

class NeuMFModel(keras.models.Model):
    """Two-branch rating model: an element-wise-product (GMF) branch plus an MLP branch.

    ``call`` takes a pair ``(user_input, item_input)``. Each input is projected
    twice by a Dense + LeakyReLU pair, once per branch:

    * GMF branch: element-wise product of the two projections (``GMFLayer``).
    * MLP branch: concatenation of the two projections fed to a ``DenseBlock``.

    The branch outputs are concatenated and mapped to a single value by a
    Dense(1) layer with the ``sigmoid_1_5`` activation.

    Parameters
    ----------
    n_dense_block : int
        Number of Dense/LeakyReLU pairs in the MLP block.
    n_units : int
        Width of every projection and hidden layer.
    regul : keras regularizer or None
        Kernel regularizer applied to every Dense layer.
    leaky_apha : float
        LeakyReLU slope. The misspelling is kept because existing callers pass
        this keyword.
    leaky_alpha : float or None
        Correctly spelled alias; when given it takes precedence over
        ``leaky_apha``.

    The slope was previously read from a *global* variable named
    ``leaky_alpha`` (the parameter was ignored), and the MLP block always used
    its own default of 0.3. Both now follow the constructor argument.
    """

    def __init__(self, n_dense_block = 4, n_units = 30, regul = None, leaky_apha = 0.3,
                 leaky_alpha = None, **kwargs):

        super().__init__(**kwargs)
        self.n_units = n_units
        slope = leaky_apha if leaky_alpha is None else leaky_alpha

        # Projections for the GMF branch.
        self.mf_user_vec = Dense(n_units, kernel_regularizer = regul, name = 'mf_user_vec')
        self.leaky_mf_user_vec = LeakyReLU(alpha = slope)

        self.mf_item_vec = Dense(n_units, kernel_regularizer = regul, name = 'mf_item_vec')
        self.leaky_mf_item_vec = LeakyReLU(alpha = slope)

        # Projections for the MLP branch.
        self.mlp_user_vec = Dense(n_units, kernel_regularizer = regul, name = 'mlp_user_vec')
        self.leaky_mlp_user_vec = LeakyReLU(alpha = slope)

        self.mlp_item_vec = Dense(n_units, kernel_regularizer = regul, name = 'mlp_item_vec')
        self.leaky_mlp_item_vec = LeakyReLU(alpha = slope)

        self.concat_mlp_layer = keras.layers.Concatenate(name = 'concat_mlp_layer')

        self.gmf_layer = GMFLayer(name = 'gmf_layer')
        self.mlp_block = DenseBlock(n_dense_block, n_units, regularizer = regul,
                                    leaky_alpha = slope, name = 'mlp_block')

        # Fusion of both branches and the rating head.
        self.neu_mf_layer = keras.layers.Concatenate(name = 'neu_mf_layer')
        self.rating_output = Dense(1, activation = sigmoid_1_5,
                                   kernel_regularizer = regul, name = 'output')

    def call(self, inputs):
        user_input, item_input = inputs

        mf_user_vec_act = self.leaky_mf_user_vec(self.mf_user_vec(user_input))
        mf_item_vec_act = self.leaky_mf_item_vec(self.mf_item_vec(item_input))

        mlp_user_vec_act = self.leaky_mlp_user_vec(self.mlp_user_vec(user_input))
        mlp_item_vec_act = self.leaky_mlp_item_vec(self.mlp_item_vec(item_input))

        concat_mlp_layer = self.concat_mlp_layer([mlp_user_vec_act, mlp_item_vec_act])
        mlp_block = self.mlp_block(concat_mlp_layer)
        gmf_layer = self.gmf_layer([mf_user_vec_act, mf_item_vec_act])

        neu_mf_layer = self.neu_mf_layer([gmf_layer, mlp_block])
        rating_output = self.rating_output(neu_mf_layer)
        return rating_output

In [131]:
def training_loop(model, matrix, validation_set, optimizer, n_epochs, batch_size = 64):
    """Train ``model`` on the observed entries of ``matrix`` with a custom loop.

    The loop runs eagerly. It was previously wrapped in ``tf.function``, but
    the body depends on NumPy calls, Python ``print`` and Python-side random
    sampling, none of which behave correctly inside a traced graph.

    Parameters
    ----------
    model : keras model
        Called as ``model((user_batch, item_batch))``.
    matrix : 2-D NumPy array
        Training rating matrix; NaN marks a missing rating. The NaNs are
        replaced by 0 before rows/columns are fed to the model.
    validation_set : ((user_rows, item_cols), ratings)
        In the format returned by ``random_batch``.
    optimizer : keras optimizer
    n_epochs : int
    batch_size : int, default 64
        Number of observed ratings sampled (with replacement) per step.

    Each epoch runs ``len(observed) // batch_size`` steps, but at least one:
    with fewer observed ratings than ``batch_size`` the loop used to run zero
    steps and trained nothing. After every step the whole validation set is
    evaluated; the reported "validation mse" is the running mean over the
    steps of the current epoch.

    Raises
    ------
    ValueError
        If ``matrix`` contains no observed (non-NaN) entry.
    """
    observed = np.where(~ np.isnan(matrix))
    filled_idx_tuples = list(zip(*observed))
    n_samples = len(filled_idx_tuples)
    if n_samples == 0:
        raise ValueError('matrix has no observed (non-NaN) ratings to train on')
    matrix = np.nan_to_num(matrix)

    (user_valid_set, movie_valid_set), y_valid = validation_set
    user_valid_set = np.nan_to_num(user_valid_set)
    movie_valid_set = np.nan_to_num(movie_valid_set)

    n_steps = max(1, n_samples // batch_size)
    loss_func = keras.losses.MeanSquaredError()
    valid_mse = keras.metrics.MeanSquaredError(name = 'validation mse')
    mean_loss = keras.metrics.Mean(name = 'train mse')
    mean_total_loss = keras.metrics.Mean(name = 'train mse + reg')
    train_trackers = [mean_loss, mean_total_loss]

    for epoch in range(1, n_epochs + 1):
        print("Epoch {}/{}".format(epoch, n_epochs))
        for step in range(1, n_steps + 1):
            input_batch, output_batch = random_batch(matrix, filled_idx_tuples, batch_size)
            with tf.GradientTape() as tape:
                y_pred = model(input_batch)
                loss = tf.reduce_mean(loss_func(output_batch, y_pred))
                # model.losses holds the regularisation penalties of the layers.
                total_loss = tf.add_n([loss] + model.losses)

            gradients = tape.gradient(total_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))

            mean_loss(loss)
            mean_total_loss(total_loss)

            y_pred_valid = model((user_valid_set, movie_valid_set))
            valid_mse(y_valid, y_pred_valid)

            # The completed line is printed once after the loop, so skip an
            # in-loop update that would already terminate the line.
            seen = step * batch_size
            if seen < n_samples:
                print_statusbar(seen, n_samples, train_trackers, [valid_mse])

        print_statusbar(n_samples, n_samples, train_trackers, [valid_mse])
        for metric in train_trackers + [valid_mse]:
            metric.reset_states()



In [None]:
# Hyper-parameters: LeakyReLU slope, L2 kernel penalty and Adam learning rate.
leaky_alpha = 0.4
reg = keras.regularizers.l2(0.1)
opt = keras.optimizers.Adam(0.0001)

# `leaky_apha` (sic) is the keyword as spelled in the model's signature.
model = NeuMFModel(
    n_dense_block = 5,
    n_units = 30,
    regul = reg,
    leaky_apha = leaky_alpha,
    name = 'neu_mf_model',
)

training_loop(model, matrix, validation_set, opt, n_epochs = 10)

### Debugging the model

#### Try to overfit using a small dataset

In [133]:
# Sample a fraction `perc` of the users and of the movies, without replacement.
perc = 0.001
users = ratings['user_id'].unique()
movies = ratings['movie_id'].unique()
users_sub = np.random.choice(users, size = int(len(users) * perc), replace = False)
movies_sub = np.random.choice(movies, size = int(len(movies) * perc), replace = False)

# A rating is kept when its user OR its movie was sampled.
filter_ = ratings['user_id'].isin(users_sub) | ratings['movie_id'].isin(movies_sub)
ratings_sub = ratings[filter_]

# Drop movies with fewer than `min_rated` ratings left in the subset
# (split_dataset expects more than two ratings per movie).
min_rated = 3
value_counts = ratings_sub['movie_id'].value_counts()
remove_movies = value_counts[value_counts < min_rated].index
ratings_sub = ratings_sub[~ratings_sub['movie_id'].isin(remove_movies)]

In [134]:
# Split the sub-sampled ratings into train / validation / test frames.
X_train_sub, X_valid_sub, X_test_sub = split_dataset(ratings_sub)

0 of 190 1 of 190 2 of 190 

In [135]:
# Dense user x movie rating matrices; cells without a rating are NaN.
matrix_sub = X_train_sub.pivot_table(
    index = 'user_id', columns = 'movie_id', values = 'rating'
).to_numpy()
matrix_valid_sub = X_valid_sub.pivot_table(
    index = 'user_id', columns = 'movie_id', values = 'rating'
).to_numpy()

In [136]:
# (row, column) coordinates of every observed (non-NaN) validation rating.
idx_valid_sub = np.nonzero(~np.isnan(matrix_valid_sub))
filled_idx_tuples_valid_sub = list(zip(idx_valid_sub[0], idx_valid_sub[1]))
# batch_size is left at its default (None), so the whole validation set is returned as one batch.
validation_set_sub = random_batch(matrix_valid_sub, filled_idx_tuples_valid_sub)

In [None]:
# Hyper-parameters: LeakyReLU slope, L2 kernel penalty and Adam learning rate.
leaky_alpha = 0.4
reg = keras.regularizers.l2(0.1)
opt = keras.optimizers.Adam(0.0001)

# `leaky_apha` (sic) is the keyword as spelled in the model's signature.
model = NeuMFModel(
    n_dense_block = 5,
    n_units = 30,
    regul = reg,
    leaky_apha = leaky_alpha,
    name = 'neu_mf_model',
)

# Many epochs on the tiny subset: the aim is to check that the model can overfit it.
training_loop(model, matrix_sub, validation_set_sub, opt, n_epochs = 1020)

# Result: we are able to overfit a small dataset

#### Check loss from random inputs 

In [143]:
# Size of the synthetic rating matrices and the fraction of cells blanked out in each.
n_users = 250
n_movies = 300
train_perc_nan = 0.99
valid_perc_nan = 0.999

# Uniform noise in [0, 5); a random subset of cells is then overwritten with NaN.
matrix_rdm = np.random.rand(n_users, n_movies) * 5
matrix_rdm.flat[
    np.random.choice(matrix_rdm.size, int(n_movies * n_users * train_perc_nan), replace = False)
] = np.nan

matrix_valid_rdm = np.random.rand(n_users, n_movies) * 5
matrix_valid_rdm.flat[
    np.random.choice(matrix_valid_rdm.size, int(n_movies * n_users * valid_perc_nan), replace = False)
] = np.nan

In [144]:
# (row, column) coordinates of every non-NaN cell of the random validation matrix.
idx_valid_rdm = np.nonzero(~np.isnan(matrix_valid_rdm))
filled_idx_tuples_valid_rdm = list(zip(idx_valid_rdm[0], idx_valid_rdm[1]))
# batch_size is left at its default (None), so the whole validation set is returned as one batch.
validation_set_rdm = random_batch(matrix_valid_rdm, filled_idx_tuples_valid_rdm)

In [146]:
# Hyper-parameters: LeakyReLU slope, L2 kernel penalty and Adam learning rate.
leaky_alpha = 0.4
reg = keras.regularizers.l2(0.1)
opt = keras.optimizers.Adam(0.0001)

# `leaky_apha` (sic) is the keyword as spelled in the model's signature.
model = NeuMFModel(
    n_dense_block = 5,
    n_units = 30,
    regul = reg,
    leaky_apha = leaky_alpha,
    name = 'neu_mf_model',
)

training_loop(model, matrix_rdm, validation_set_rdm, opt, n_epochs = 20)
# Result: dataset is informative since loss of random noise is high

Epoch 1/20
750/750 - train mse: 2.3224 - train mse + reg: 39.6613 - validation mse: 2.2497
Epoch 2/20
750/750 - train mse: 2.5132 - train mse + reg: 39.1978 - validation mse: 2.2424
Epoch 3/20
750/750 - train mse: 2.3061 - train mse + reg: 38.3481 - validation mse: 2.2350
Epoch 4/20
750/750 - train mse: 2.3594 - train mse + reg: 37.7692 - validation mse: 2.2285
Epoch 5/20
750/750 - train mse: 2.3485 - train mse + reg: 37.1371 - validation mse: 2.2219
Epoch 6/20
750/750 - train mse: 2.2244 - train mse + reg: 36.4030 - validation mse: 2.2152
Epoch 7/20
750/750 - train mse: 2.3235 - train mse + reg: 35.9028 - validation mse: 2.2085
Epoch 8/20
750/750 - train mse: 2.3679 - train mse + reg: 35.3589 - validation mse: 2.2014
Epoch 9/20
750/750 - train mse: 2.2578 - train mse + reg: 34.6712 - validation mse: 2.1945
Epoch 10/20
750/750 - train mse: 2.1991 - train mse + reg: 34.0455 - validation mse: 2.1876
Epoch 11/20
750/750 - train mse: 2.3279 - train mse + reg: 33.6170 - validation mse: 2.18

### Try simpler NN models

#### Weighted combination of element-wise product

In [152]:
# Sample a fraction `perc` of the users and of the movies, without replacement.
perc = 1
users = ratings['user_id'].unique()
movies = ratings['movie_id'].unique()
users_sub = np.random.choice(users, size = int(len(users) * perc), replace = False)
movies_sub = np.random.choice(movies, size = int(len(movies) * perc), replace = False)

# A rating is kept when its user OR its movie was sampled.
filter_ = ratings['user_id'].isin(users_sub) | ratings['movie_id'].isin(movies_sub)
ratings_sub = ratings[filter_]

# Drop movies with fewer than `min_rated` ratings left in the subset
# (split_dataset expects more than two ratings per movie).
min_rated = 3
value_counts = ratings_sub['movie_id'].value_counts()
remove_movies = value_counts[value_counts < min_rated].index
ratings_sub = ratings_sub[~ratings_sub['movie_id'].isin(remove_movies)]

In [153]:
# Split, then build the dense user x movie matrices (missing ratings are NaN).
X_train_sub, X_valid_sub, X_test_sub = split_dataset(ratings_sub)

matrix_sub = X_train_sub.pivot_table(
    index = 'user_id', columns = 'movie_id', values = 'rating'
).to_numpy()
matrix_valid_sub = X_valid_sub.pivot_table(
    index = 'user_id', columns = 'movie_id', values = 'rating'
).to_numpy()

# Whole validation set as a single batch (batch_size defaults to None).
idx_valid_sub = np.nonzero(~np.isnan(matrix_valid_sub))
filled_idx_tuples_valid_sub = list(zip(idx_valid_sub[0], idx_valid_sub[1]))
validation_set_sub = random_batch(matrix_valid_sub, filled_idx_tuples_valid_sub)

99721 of 99722                                                                                                                                                             

In [154]:
from tensorflow.keras.layers import Dense, Embedding, LeakyReLU

class NeuCFModel(keras.models.Model):
    """Rating model built from a weighted element-wise product only.

    ``call`` takes a pair ``(user_input, item_input)``. Each input is
    projected by a Dense + LeakyReLU pair, the two projections are multiplied
    element-wise (``GMFLayer``) and a Dense(1) layer with the ``sigmoid_1_5``
    activation turns the product into a single value.

    Parameters
    ----------
    n_dense_block : int
        Accepted so that the signature matches the other models in this
        notebook; this model does not use it.
    n_units : int
        Width of the two projections.
    regul : keras regularizer or None
        Kernel regularizer applied to every Dense layer.
    leaky_apha : float
        LeakyReLU slope. The misspelling is kept because existing callers pass
        this keyword.
    leaky_alpha : float or None
        Correctly spelled alias; when given it takes precedence over
        ``leaky_apha``.

    The slope was previously read from a *global* variable named
    ``leaky_alpha`` and the parameter was ignored; it now follows the
    constructor argument.
    """

    def __init__(self, n_dense_block = 4, n_units = 30, regul = None, leaky_apha = 0.3,
                 leaky_alpha = None, **kwargs):

        super().__init__(**kwargs)
        self.n_units = n_units
        slope = leaky_apha if leaky_alpha is None else leaky_alpha

        self.mf_user_vec = Dense(n_units, kernel_regularizer = regul, name = 'mf_user_vec')
        self.leaky_mf_user_vec = LeakyReLU(alpha = slope)

        self.mf_item_vec = Dense(n_units, kernel_regularizer = regul, name = 'mf_item_vec')
        self.leaky_mf_item_vec = LeakyReLU(alpha = slope)

        self.gmf_layer = GMFLayer(name = 'gmf_layer')
        self.rating_output = Dense(1, activation = sigmoid_1_5, kernel_regularizer = regul, name = 'output')

    def call(self, inputs):
        user_input, item_input = inputs

        mf_user_vec_act = self.leaky_mf_user_vec(self.mf_user_vec(user_input))
        mf_item_vec_act = self.leaky_mf_item_vec(self.mf_item_vec(item_input))

        gmf_layer = self.gmf_layer([mf_user_vec_act, mf_item_vec_act])
        rating_output = self.rating_output(gmf_layer)
        return rating_output

In [None]:
# Hyper-parameters: LeakyReLU slope, L2 kernel penalty and Adam learning rate.
leaky_alpha = 0.4
reg = keras.regularizers.l2(0.1)
opt = keras.optimizers.Adam(0.0001)

# `leaky_apha` (sic) is the keyword as spelled in the model's signature.
model = NeuCFModel(
    n_dense_block = 5,
    n_units = 30,
    regul = reg,
    leaky_apha = leaky_alpha,
    name = 'neu_cf_model',
)

training_loop(model, matrix_sub, validation_set_sub, opt, n_epochs = 20)
# Result: effective at learning

#### Only MLP

In [293]:
# Sample a fraction `perc` of the users and of the movies, without replacement.
perc = 1
users = ratings['user_id'].unique()
movies = ratings['movie_id'].unique()
users_sub = np.random.choice(users, size = int(len(users) * perc), replace = False)
movies_sub = np.random.choice(movies, size = int(len(movies) * perc), replace = False)

# A rating is kept when its user OR its movie was sampled.
filter_ = ratings['user_id'].isin(users_sub) | ratings['movie_id'].isin(movies_sub)
ratings_sub = ratings[filter_]

# Drop movies with fewer than `min_rated` ratings left in the subset
# (split_dataset expects more than two ratings per movie).
min_rated = 3
value_counts = ratings_sub['movie_id'].value_counts()
remove_movies = value_counts[value_counts < min_rated].index
ratings_sub = ratings_sub[~ratings_sub['movie_id'].isin(remove_movies)]

In [294]:
# Split, then build the dense user x movie matrices (missing ratings are NaN).
X_train_sub, X_valid_sub, X_test_sub = split_dataset(ratings_sub)

matrix_sub = X_train_sub.pivot_table(
    index = 'user_id', columns = 'movie_id', values = 'rating'
).to_numpy()
matrix_valid_sub = X_valid_sub.pivot_table(
    index = 'user_id', columns = 'movie_id', values = 'rating'
).to_numpy()

# Whole validation set as a single batch (batch_size defaults to None).
idx_valid_sub = np.nonzero(~np.isnan(matrix_valid_sub))
filled_idx_tuples_valid_sub = list(zip(idx_valid_sub[0], idx_valid_sub[1]))
validation_set_sub = random_batch(matrix_valid_sub, filled_idx_tuples_valid_sub)

1814 of 1814       

In [156]:
from tensorflow.keras.layers import Dense, LeakyReLU

# NOTE(review): this repeats the DenseBlock definition that appears earlier in
# the notebook; running this cell shadows the earlier class with an identical one.
class DenseBlock(keras.layers.Layer):
    """A stack of ``n_layers`` Dense layers, each followed by a LeakyReLU.

    Every Dense layer has ``n_units`` units, no built-in activation and the
    given ``regularizer`` on its kernel; ``leaky_alpha`` is the LeakyReLU slope.
    """

    def __init__(self, n_layers, n_units, regularizer = None, leaky_alpha = 0.3, **kwargs):
        super().__init__(**kwargs)
        stack = []
        for _ in range(n_layers):
            stack.append(Dense(n_units, kernel_regularizer = regularizer))
            stack.append(LeakyReLU(alpha = leaky_alpha))
        self.hidden = stack

    def call(self, inputs):
        # Feed the input through the Dense/LeakyReLU pairs in order.
        out = inputs
        for transform in self.hidden:
            out = transform(out)
        return out
    

In [157]:
from tensorflow.keras.layers import Dense, Embedding, LeakyReLU

class EncoderMLP(keras.models.Model):
    """MLP-only rating model.

    ``call`` takes a pair ``(user_input, item_input)``. Each input is
    projected by a Dense + LeakyReLU pair, the projections are concatenated
    and passed through a ``DenseBlock``, and a Dense(1) layer with the
    ``sigmoid_1_5`` activation produces a single value.

    Parameters
    ----------
    n_dense_block : int
        Number of Dense/LeakyReLU pairs in the MLP block.
    n_units : int
        Width of every projection and hidden layer.
    regul : keras regularizer or None
        Kernel regularizer applied to every Dense layer.
    leaky_apha : float
        LeakyReLU slope. The misspelling is kept because existing callers pass
        this keyword.
    leaky_alpha : float or None
        Correctly spelled alias; when given it takes precedence over
        ``leaky_apha``.

    Changes from the previous version: the slope was read from a *global*
    variable named ``leaky_alpha`` (the parameter was ignored) and was not
    forwarded to the MLP block, which always used 0.3; a ``gmf_layer``
    attribute was created but never used in ``call`` and has been removed.
    """

    def __init__(self, n_dense_block = 4, n_units = 30, regul = None, leaky_apha = 0.3,
                 leaky_alpha = None, **kwargs):

        super().__init__(**kwargs)
        self.n_units = n_units
        slope = leaky_apha if leaky_alpha is None else leaky_alpha

        self.mlp_user_vec = Dense(n_units, kernel_regularizer = regul, name = 'mlp_user_vec')
        self.leaky_mlp_user_vec = LeakyReLU(alpha = slope)

        self.mlp_item_vec = Dense(n_units, kernel_regularizer = regul, name = 'mlp_item_vec')
        self.leaky_mlp_item_vec = LeakyReLU(alpha = slope)

        self.concat_mlp_layer = keras.layers.Concatenate(name = 'concat_mlp_layer')

        self.mlp_block = DenseBlock(n_dense_block, n_units, regularizer = regul,
                                    leaky_alpha = slope, name = 'mlp_block')
        self.rating_output = Dense(1, activation = sigmoid_1_5, kernel_regularizer = regul, name = 'output')

    def call(self, inputs):

        user_input, item_input = inputs

        mlp_user_vec_act = self.leaky_mlp_user_vec(self.mlp_user_vec(user_input))
        mlp_item_vec_act = self.leaky_mlp_item_vec(self.mlp_item_vec(item_input))

        concat_mlp_layer = self.concat_mlp_layer([mlp_user_vec_act, mlp_item_vec_act])
        mlp_block = self.mlp_block(concat_mlp_layer)
        rating_output = self.rating_output(mlp_block)
        return rating_output

In [None]:
# Hyper-parameters: LeakyReLU slope, L2 kernel penalty and Adam learning rate.
leaky_alpha = 0.4
reg = keras.regularizers.l2(0.1)
opt = keras.optimizers.Adam(0.0001)

# This section evaluates the MLP-only model, so EncoderMLP must be built here.
# The cell previously instantiated NeuCFModel, which re-ran the element-wise
# product experiment and left EncoderMLP untrained.
# `leaky_apha` (sic) is the keyword as spelled in the model's signature.
model = EncoderMLP(name = 'enc_model', n_dense_block = 5, n_units = 30,
                   regul = reg, leaky_apha = leaky_alpha)

training_loop(model, matrix_sub, validation_set_sub, optimizer = opt, n_epochs = 20)
# NOTE(review): the "effective at learning" result recorded here was obtained
# with NeuCFModel; re-run this cell to confirm it for EncoderMLP.