Neural Collaborative Filtering (https://arxiv.org/pdf/1708.05031.pdf)
- combine generalized matrix factorization, which learns linear interaction, with multi-layer perceptron, which is capable of modelling more complex & non-linear interactions
- GMF and MLP use separate embeddings for user & item, allowing them to learn independently
- NCF, like vanilla matrix factorization, only takes in user-item interaction data and does not consider auxiliary features of the users and items
- and since there's only interaction data, this model is not ideal for cold start users & items

<img src="model_illustration.png" style="width:70%">

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix

In [10]:
class NCF(nn.Module):
    def __init__(self, mlp_layers, n_features, gmf_embed_dim, mlp_embed_dim, dropout=0.2):
        super().__init__()
        # gmf
        self.gmf_embedding = nn.Embedding(n_features, gmf_embed_dim)
        
        # mlp
        self.mlp_embedding = nn.Embedding(n_features, mlp_embed_dim)
        # mlp input is the concatenation of user & item embedding
        self.mlp_input_dim = 2*mlp_embed_dim
        mlp_input_dim = self.mlp_input_dim
        layers = []
        for layer_dim in mlp_layers:
            layers.append(nn.Linear(mlp_input_dim, layer_dim))
            layers.append(nn.BatchNorm1d(layer_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            mlp_input_dim = layer_dim
        self.mlp = nn.Sequential(*layers)
        
        # final fully connected layer
        self.fc = nn.Linear(gmf_embed_dim+mlp_input_dim, 1)
        
    def forward(self, x):
        # x = [item_id, user_id]
    
        # gmf
        gmf_embedding = self.gmf_embedding(x)
        item_gmf_embedding = gmf_embedding[: ,0]
        user_gmf_embedding = gmf_embedding[: ,1]
        # element wise multiplication
        gmf_out = item_gmf_embedding * user_gmf_embedding
        
        # mlp 
        mlp_embedding = self.mlp_embedding(x)
        concat_mlp_embedding = mlp_embedding.view(-1,self.mlp_input_dim)
        mlp_out = self.mlp(concat_mlp_embedding)
        
        # concat gmf & mlp output
        out = torch.cat([gmf_out, mlp_out], dim=1)
        out = self.fc(out)
        return torch.sigmoid(out)
    
    def predict(self, x):
        self.eval()
        with torch.no_grad():
            return self.forward(x) 
        

In [12]:
def train(model, dataloader, epochs=20, lr=0.001):
    device = (
        torch.device("cuda:0") if torch.cuda.is_available(
        ) else torch.device("cpu")
    )
    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.BCELoss()
    training_history = []
    for epoch in range(epochs):
        epoch_loss = 0
        for x, y in dataloader:
            y_pred = model.forward(x)
            loss = criterion(y_pred, y)
            epoch_loss += loss
            model.zero_grad()
            loss.backward()
            optimizer.step()
        epoch_loss /= len(dataloader)
        training_history.append(epoch_loss)
        if epoch%10 == 0:
            print(f"Epoch {epoch}: {epoch_loss:.4f}")
    return model, training_history

# Data Preparation
- X is an array of feature indices: [[item_id, user_id]]
- y is just a (n_sample, 1) array of the ground truth

In [13]:
import sys
sys.path.append('..')
import utils

In [14]:
rating, item, user = utils.get_movielens()

In [17]:
X = rating[['item_id', 'user_id']]
X['user_id'] = X['user_id'] + rating['item_id'].max() + 1 # offset
X = X.to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['user_id'] = X['user_id'] + rating['item_id'].max() + 1 # offset


In [23]:
# convert rating to 1/0
threshold = 3
y = np.where(rating['rating'].to_numpy()>=threshold, 1, 0).reshape(-1, 1)

Train test split

Here, for simplicity, we are only using a random split, with 80% as the train set, and 20% as the test set. In practice, the splitting maybe done by user, e.g. 80/20 split of a user's rating/interaction history.

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [25]:
dataset = data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train).float())
train_dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

# Model

In [26]:
model = NCF(
    mlp_layers=[32, 16, 8], 
    n_features=X.max()+1, 
    gmf_embed_dim=30, 
    mlp_embed_dim=30, 
    dropout=0.2
)

In [27]:
model, history = train(model=model,
                       dataloader=train_dataloader,
                       epochs=100,
                       lr=0.001)


Epoch 0: 0.5329
Epoch 10: 0.3415
Epoch 20: 0.2368
Epoch 30: 0.1483
Epoch 40: 0.0851
Epoch 50: 0.0473
Epoch 60: 0.0262
Epoch 70: 0.0150
Epoch 80: 0.0074
Epoch 90: 0.0043


In [28]:
y_pred_soft = model.predict(torch.from_numpy(X_test))
y_pred = np.where(y_pred_soft.numpy() > 0.5, 1, 0)

acc = accuracy_score(y_pred, y_test)
auc = roc_auc_score(y_test, y_pred_soft)
f1 = f1_score(y_test, y_pred)
cf_mat = confusion_matrix(y_test, y_pred)

In [29]:
print(f"Accuracy: {acc}")
print(f"AUC: {auc}")
print(f"F1 Score: {f1}")


Accuracy: 0.7627
AUC: 0.6320061250614306
F1 Score: 0.8562515144172522


In [30]:
cf_mat


array([[ 1119,  2349],
       [ 2397, 14135]])