In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Concatenate, Dense, Dot, Dropout, Embedding, Input, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, ModelCheckpoint
from sklearn.metrics import mean_squared_error
import random
import os

# Seed every random number generator used below so results can be replicated.
# A single constant keeps the four calls in agreement.
SEED = 0

# NOTE: Python reads PYTHONHASHSEED when the interpreter starts, so assigning it
# here does not change string hashing in the already-running kernel; it is only
# inherited by child processes. Export it before launching Jupyter if hash
# ordering matters for reproducibility.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)         # Python's built-in RNG
np.random.seed(SEED)      # NumPy's legacy global RNG
tf.random.set_seed(SEED)  # TensorFlow's global seed

# Root Mean Squared Error (RMSE)

We need a reliable way to evaluate the performance of recommendation algorithms. RMSE is one of the most popular metrics for estimating how good a recommendation algorithm is. Since RMSE measures prediction error, the smaller the error a model achieves, the better its performance, and vice versa.

$$RMSE=\sqrt{\frac{1}{N}\sum_{i=1}^{N}(\hat{y}_i - y_i)^2}$$

$N$: The number of samples

$\hat{y}_i$: The predicted answer of sample $i$

$y_i$: The ground truth answer of sample $i$

In [None]:
def rmse(pred, actual):
    '''
    Root mean squared error over the entries whose ground-truth rating is non-zero.

    params:
        pred <np.array>: an array containing all predicted ratings; any
            array-like holding the same number of elements as `actual`
            (for example an (n, 1) column of predictions)
        actual <np.array>: an array containing all ground truth ratings; a
            value of zero marks an entry that is excluded from the error

    return:
        a scalar whose value is the rmse

    raises:
        ValueError: if the inputs differ in size, or if no ground-truth
            rating is non-zero (the error would be undefined)
    '''
    # Flatten both inputs first so an (n, 1) array lines up with an (n,) array
    # regardless of which of the two arguments carries the extra axis.
    pred = np.asarray(pred, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if pred.size != actual.size:
        raise ValueError(
            f"pred and actual must hold the same number of ratings, "
            f"got {pred.size} and {actual.size}")

    # Ignore ratings with value zero.
    rated = actual != 0
    if not rated.any():
        raise ValueError("actual contains no non-zero ratings to evaluate")

    errors = pred[rated] - actual[rated]
    return np.sqrt(np.mean(errors ** 2))

# Neural Collaborative Filtering (NCF) Model Implementation

Here we implement two instantiations of the NCF model.

The first instantiation computes the recommendation score (e.g., a rating) for a user–item pair as the dot product of their embeddings, which is equivalent to the matrix factorization model for recommendation.

The second instantiation concatenates the user's and item's embeddings, then feeds the concatenated vector into an MLP to calculate the recommendation score. Using an MLP gives the model the flexibility and non-linearity needed to learn the interaction between user and item latent features effectively.

In [None]:
def build_ncf_model(n_users, n_items, embed_size, dropout=0.15, output_layer='dot'):
    '''
    Build an NCF model that scores a (user, item) pair either with the dot
    product of their embeddings or with an MLP over the concatenated embeddings.

    params:
        n_users <int>: The number of user embedding vectors
        n_items <int>: The number of item embedding vectors
        embed_size <int>: The dimension of each embedding vector
        dropout <float>: Dropout rate in [0, 1] applied after each hidden layer
            of the MLP; only used when output_layer is 'mlp'. Numeric strings
            such as '0.15' are converted with float().
        output_layer <str>: Indicates the instantiation of NCF to use, available options are either 'dot' or 'mlp'

    return:
        a keras Model object for the constructed ncf model

    raises:
        NotImplementedError: if output_layer is neither 'dot' nor 'mlp'
        ValueError: if output_layer is 'mlp' and dropout is not a number in [0, 1]
    '''
    # Validate the arguments before any layer is created so a bad call fails
    # immediately instead of after part of the graph has been built.
    if output_layer not in ('dot', 'mlp'):
        raise NotImplementedError(
            f"output_layer must be 'dot' or 'mlp', got {output_layer!r}")

    if output_layer == 'mlp':
        # The rate is only consumed by the MLP branch, so it is only checked there.
        dropout = float(dropout)
        if not 0.0 <= dropout <= 1.0:
            raise ValueError(f"dropout must be in [0, 1], got {dropout}")

    # Get the users and items input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Get the embeddings of users and items; each lookup yields one vector per
    # sample, which Reshape turns into a flat (embed_size,) vector.
    user_emb = Embedding(output_dim=embed_size, input_dim=n_users, input_length=1)(user_input)
    user_emb = Reshape((embed_size,))(user_emb)
    item_emb = Embedding(output_dim=embed_size, input_dim=n_items, input_length=1)(item_input)
    item_emb = Reshape((embed_size,))(item_emb)

    if output_layer == 'dot':
        # Compute the dot product of users' and items' embeddings as the model output
        model_output = Dot(axes=1)([user_emb, item_emb])
    else:
        # Concatenate the users' and items' embeddings as the input of MLP
        mlp_input = Concatenate()([user_emb, item_emb])

        # First fully-connected layer
        dense_1 = Dense(64, activation='relu', kernel_regularizer='l2')(mlp_input)
        dense_1_dp = Dropout(dropout)(dense_1)

        # Second fully-connected layer
        dense_2 = Dense(32, activation='relu', kernel_regularizer='l2')(dense_1_dp)
        dense_2_dp = Dropout(dropout)(dense_2)

        # Final fully-connected layer to compute model output
        model_output = Dense(1)(dense_2_dp)

    model = Model(inputs=[user_input, item_input], outputs=model_output)
    return model

In [None]:
def build_ncf_combine_model(n_users, n_items, embed_size, output_layer='dot', dropout=0.0):
    '''
    Build an NCF model that combines both instantiations: the dot product of
    the user and item embeddings is concatenated with the output of an MLP
    over the same embeddings, and a final dense unit produces the score.

    params:
        n_users <int>: The number of user embedding vectors
        n_items <int>: The number of item embedding vectors
        embed_size <int>: The dimension of each embedding vector
        output_layer <str>: Not used by this function; both the dot-product and
            the MLP branch are always built. Kept so existing calls still work.
        dropout <float>: Dropout rate in [0, 1] applied after each hidden layer
            of the MLP branch. Defaults to 0.0, i.e. no dropout.

    return:
        a keras Model object for the constructed ncf model

    raises:
        ValueError: if dropout is not a number in [0, 1]
    '''
    # Check the rate before any layer is created so a bad call fails immediately.
    dropout = float(dropout)
    if not 0.0 <= dropout <= 1.0:
        raise ValueError(f"dropout must be in [0, 1], got {dropout}")

    # Get the users and items input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Get the embeddings of users and items; both branches below share them.
    user_emb = Embedding(output_dim=embed_size, input_dim=n_users, input_length=1)(user_input)
    user_emb = Reshape((embed_size,))(user_emb)
    item_emb = Embedding(output_dim=embed_size, input_dim=n_items, input_length=1)(item_input)
    item_emb = Reshape((embed_size,))(item_emb)

    # Branch 1: dot product of the two embeddings
    dot_output = Dot(axes=1)([user_emb, item_emb])

    # Branch 2: MLP over the concatenated embeddings
    mlp_input = Concatenate()([user_emb, item_emb])

    # First fully-connected layer
    dense_1 = Dense(64, activation='relu', kernel_regularizer='l2')(mlp_input)
    dense_1_dp = Dropout(dropout)(dense_1)

    # Second fully-connected layer
    dense_2 = Dense(32, activation='relu', kernel_regularizer='l2')(dense_1_dp)
    dense_2_dp = Dropout(dropout)(dense_2)

    # Fuse the two branches and map them to a single score.
    # NOTE(review): ReLU on the output unit clamps predictions at zero and
    # passes no gradient while the pre-activation is negative; confirm this is
    # intended for rating regression.
    combined = Concatenate()([dot_output, dense_2_dp])
    combined_output = Dense(1, activation='relu')(combined)

    model = Model(inputs=[user_input, item_input], outputs=combined_output)
    return model

# Ratings Prediction

### Loading training and validation rating table

In [None]:
# Rating tables for training and validation.
tr_df = pd.read_csv("data/review.csv")
val_df = pd.read_csv("data/validation.csv")

# Fail fast, with a readable message, if either table lacks a column that the
# cells below read (ReviewerID, ProductID, Star).
for _path, _frame in (("data/review.csv", tr_df), ("data/validation.csv", val_df)):
    _missing = {"ReviewerID", "ProductID", "Star"} - set(_frame.columns)
    assert not _missing, f"{_path} is missing columns: {sorted(_missing)}"

### Building two dictionaries to map original user ids and item ids into corresponding indices in respective embedding matrices

In [None]:
# Get the unique set of all user ids and set of all business ids in train set
user_set = list(tr_df.ReviewerID.unique())
business_set = list(tr_df.ProductID.unique())

# Build user vocabulary: training ids are numbered from 1 upwards
user_vocab = {user_id: idx for idx, user_id in enumerate(user_set, start=1)}

# Reserve the first row of the embedding matrix for users unseen in the training set
user_vocab['unk'] = 0
# Size the embedding from the id count, not len(user_vocab): if 'unk' happened
# to be a real training id, the assignment above would overwrite that entry
# rather than add one, and the matrix would be one row too small for the
# largest index handed out.
n_users = len(user_set) + 1

# Build business vocabulary in the same way
business_vocab = {business_id: idx for idx, business_id in enumerate(business_set, start=1)}

# Reserve the first row of the embedding matrix for businesses unseen in the training set
business_vocab['unk'] = 0
n_items = len(business_set) + 1

### Replacing the original user and item ids in train and validation set with indices in embedding matrices

In [None]:
# Translate raw ids into embedding-row indices: user ids go through the
# user_vocab dictionary and business ids through the business_vocab dictionary.

# Training ids are looked up directly; a missing key raises KeyError.
tr_users = tr_df['ReviewerID'].apply(lambda user_id: user_vocab[user_id]).values
tr_items = tr_df['ProductID'].apply(lambda item_id: business_vocab[item_id]).values

# Validation ids that are absent from a vocabulary fall back to row 0.
val_users = val_df['ReviewerID'].apply(lambda user_id: user_vocab.get(user_id, 0)).values
val_items = val_df['ProductID'].apply(lambda item_id: business_vocab.get(item_id, 0)).values

### Retrieving ratings in the training and validation set

In [None]:
# Regression targets: the Star column of each table as a plain array.
tr_ratings, val_ratings = tr_df['Star'].values, val_df['Star'].values

### Building the NCF model defined above

In [None]:
# MLP instantiation with 16-dimensional embeddings and dropout disabled.
model = build_ncf_model(
    n_users,
    n_items,
    embed_size=16,
    dropout=0.0,
    output_layer='mlp',
)
# Alternative: the combined dot-product + MLP model.
# model = build_ncf_combine_model(n_users, n_items, embed_size=16, output_layer='mlp')

### Training the model using the Adam optimizer and mean squared error loss

In [None]:
# Mean squared error objective, optimised with Adam.
model.compile(loss='mse', optimizer='adam')

# Two passes over the training ratings; the inputs are the parallel arrays of
# user indices and item indices.
history = model.fit(x=[tr_users, tr_items], y=tr_ratings, epochs=2, verbose=1)

# Persist the trained model; the evaluation and prediction cells reload it from this file.
model.save('model.keras')

### Evaluating the model on train and validation sets using RMSE

In [None]:
# Reload the model written by the training cell and score both splits.
model = tf.keras.models.load_model('model.keras')

# Error on the data the model was fitted to
y_pred = model.predict([tr_users, tr_items])
train_rmse = rmse(y_pred, tr_ratings)
print("Train set RMSE: ", train_rmse)

# Error on the held-out validation ratings
y_pred = model.predict([val_users, val_items])
val_rmse = rmse(y_pred, val_ratings)
print("Validation set RMSE: ", val_rmse)

In [None]:
# Save Test prediction

# Pairs to score; ids absent from a vocabulary map to row 0.
test_df = pd.read_csv("data/prediction.csv")
test_users = test_df['ReviewerID'].apply(lambda user_id: user_vocab.get(user_id, 0)).values
test_items = test_df['ProductID'].apply(lambda item_id: business_vocab.get(item_id, 0)).values

model = tf.keras.models.load_model('model.keras')

# Store the predicted rating in the Star column and write the table out.
y_pred = model.predict([test_users, test_items])
test_df['Star'] = y_pred
test_df.to_csv('prediction.csv', index=False)

In [None]:
# Save Validation prediction

model = tf.keras.models.load_model('model.keras')

y_pred = model.predict([val_users, val_items])

# Write the predictions through a copy of the table. Assigning into val_df
# itself would replace its ground-truth Star column, so re-running the cell
# that extracts val_ratings afterwards would silently pick up predictions.
val_pred_df = val_df.assign(Star=y_pred.ravel())  # ravel: one value per row
val_pred_df.to_csv('validation_prediction.csv', index=False)

In [None]:
# Score a previously exported model. No cell in this notebook writes
# 'model_submission.h5', so it has to be supplied separately; the guard keeps
# a fresh Restart & Run All from failing when the file is absent.
if os.path.exists('model_submission.h5'):
    model = tf.keras.models.load_model('model_submission.h5')
    y_pred = model.predict([tr_users, tr_items])
    print("Train set RMSE: ", rmse(y_pred, tr_ratings))
    y_pred = model.predict([val_users, val_items])
    print("Validation set RMSE: ", rmse(y_pred, val_ratings))
else:
    print("Skipping submission-model evaluation: 'model_submission.h5' not found.")

### Parameter-Tuning on MLP

In [None]:
# Grid of MLP hyper-parameters to try.
embed_sizes = [16, 50, 64, 128]
dropouts = [0.0, 0.15, 0.3]

# Every (embedding size, dropout) pair, embedding size varying slowest, so the
# run index i matches the checkpoint file names model0.h5, model1.h5, ...
param_grid = [(embed, dp) for embed in embed_sizes for dp in dropouts]

for i, (embed, dp) in enumerate(param_grid):
    # Release the Keras global state left by the previous run so memory does
    # not keep growing as one model after another is built in this loop.
    tf.keras.backend.clear_session()

    model = build_ncf_model(n_users, n_items, embed_size=embed, dropout=dp, output_layer='mlp')
    model.compile(optimizer='adam', loss='mse')

    # The checkpoint callback writes this run's model to models/model{i}.h5.
    history = model.fit(
            [tr_users, tr_items],
            tr_ratings,
            epochs=2,
            verbose=0,
            callbacks=[ModelCheckpoint(f'models/model{i}.h5')])

    # Report the validation error for this configuration.
    y_pred = model.predict([val_users, val_items])
    print(f"model{i}.h5, Embed Size: {embed}, Dropout: {dp}, Validation set RMSE: ", rmse(y_pred, val_ratings))