In [1]:
import torch

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
from torch import nn, optim, Tensor

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn import LGConv

In [3]:
import pandas as pd

df = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='latin-1')
users = pd.read_csv('BX-Users.csv', sep=';', encoding='latin-1')
books = pd.read_csv('BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines="skip")
authors = pd.read_csv('BX-authors.csv')

# Preprocessing
df = df.loc[df['ISBN'].isin(books['ISBN'].unique()) & df['User-ID'].isin(users['User-ID'].unique())]


# Keep the 100k highest ratings
df = df[df['Book-Rating'] >= 8].iloc[:100000]

  books = pd.read_csv('BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines="skip")


In [4]:
df = pd.merge(df, authors[['ISBN', 'Book-Author-ID']], on='ISBN', how='left')


In [5]:
# Create mappings
# user_mapping = {userid: i for i, userid in enumerate(sampled_df['User-ID'].unique())}
# item_mapping = {isbn: i for i, isbn in enumerate(sampled_df['ISBN'].unique())}
user_mapping = {userid: i for i, userid in enumerate(df['User-ID'].unique())}
item_mapping = {isbn: i for i, isbn in enumerate(df['ISBN'].unique())}
author_mapping = {authorid: i for i, authorid in enumerate(df['Book-Author-ID'].unique())}

# Count users and items
num_users = len(user_mapping)
num_items = len(item_mapping)
num_authors = len(author_mapping)
num_total = num_users + num_items + num_authors

In [6]:
# Build the adjacency matrix based on user ratings
user_ids = torch.LongTensor([user_mapping[i] for i in df['User-ID']])
item_ids = torch.LongTensor([item_mapping[i] for i in df['ISBN']])
author_ids = torch.LongTensor([author_mapping[i] for i in df['Book-Author-ID']])

# edge_index = torch.stack((user_ids, item_ids, author_ids))
# Assuming user_ids, item_ids, and author_ids are defined as in your snippet

user_item_edges = torch.stack((user_ids, item_ids))  # 2xN tensor for user-item edges
item_author_edges = torch.stack((item_ids, author_ids))  # 2xM tensor for item-author edges

edge_index = torch.cat((user_item_edges, item_author_edges), dim=1)  # 2x(N+M) tensor for all edges

In [7]:
train_index, test_index = train_test_split(range(len(df)), test_size=0.2, random_state=0)
val_index, test_index = train_test_split(test_index, test_size=0.5, random_state=0)


train_edge_index = edge_index[:, train_index]
val_edge_index = edge_index[:, val_index]
test_edge_index = edge_index[:, test_index]

In [8]:
def sample_mini_batch(edge_index):
    # Generate BATCH_SIZE random indices
    index = np.random.choice(range(edge_index.shape[1]), size=BATCH_SIZE)

    # Generate negative sample indices
    # It should still return user-positive_item-negative_item triples for user-item interactions
    edge_index_neg = structured_negative_sampling(edge_index)
    edge_index_neg = torch.stack(edge_index_neg, dim=0)

    user_index = edge_index_neg[0, index]  # Users involved in the sampled interactions
    pos_item_index = edge_index_neg[1, index]  # Positive items for the sampled interactions
    neg_item_index = edge_index_neg[2, index]  # Negative items for the sampled interactions

    return user_index, pos_item_index, neg_item_index


In [9]:
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, num_authors, num_layers=4, dim_h=64):
        super().__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.num_authors = num_authors
        self.num_layers = num_layers
        self.emb_users = nn.Embedding(num_embeddings=self.num_users, embedding_dim=dim_h)
        self.emb_items = nn.Embedding(num_embeddings=self.num_items, embedding_dim=dim_h)
        self.emb_authors = nn.Embedding(num_embeddings=self.num_authors, embedding_dim=dim_h)  # New embedding layer for authors

        self.convs = nn.ModuleList(LGConv() for _ in range(num_layers))

        nn.init.normal_(self.emb_users.weight, std=0.01)
        nn.init.normal_(self.emb_items.weight, std=0.01)
        nn.init.normal_(self.emb_authors.weight, std=0.01)  # Initialize author embeddings

    def forward(self, edge_index):
        emb = torch.cat([self.emb_users.weight, self.emb_items.weight, self.emb_authors.weight])  # Include author embeddings
        embs = [emb]

        for conv in self.convs:
            emb = conv(x=emb, edge_index=edge_index)
            embs.append(emb)

        emb_final = 1/(self.num_layers+1) * torch.mean(torch.stack(embs, dim=1), dim=1)

        emb_users_final, emb_items_final, emb_authors_final = torch.split(emb_final, [self.num_users, self.num_items, self.num_authors])  # Split embeddings for users, items, and authors

        return emb_users_final, self.emb_users.weight, emb_items_final, self.emb_items.weight, emb_authors_final, self.emb_authors.weight  # Return author embeddings as well


In [10]:
def bpr_loss(emb_users_final, emb_users, emb_pos_items_final, emb_pos_items, emb_neg_items_final, emb_neg_items, margin=1.0, pos_weights=None):
    reg_loss = LAMBDA * (emb_users.norm().pow(2) +
                         emb_pos_items.norm().pow(2) +
                         emb_neg_items.norm().pow(2))

    pos_ratings = torch.mul(emb_users_final, emb_pos_items_final).sum(dim=-1)
    neg_ratings = torch.mul(emb_users_final, emb_neg_items_final).sum(dim=-1)

    # Introduce a margin
    basic_loss = pos_ratings - neg_ratings - margin

    # Apply weighting for positive samples if provided
    if pos_weights is not None:
        weighted_loss = basic_loss * pos_weights
    else:
        weighted_loss = basic_loss

    bpr_loss = torch.mean(torch.nn.functional.softplus(weighted_loss))

    return -bpr_loss + reg_loss

In [11]:
def get_user_items(edge_index):
    user_items = dict()
    for i in range(edge_index.shape[1]):
        user = edge_index[0][i].item()
        item = edge_index[1][i].item()
        if user not in user_items:
            user_items[user] = []
        user_items[user].append(item)
    return user_items

In [12]:
def compute_recall_at_k(items_ground_truth, items_predicted):
    num_correct_pred = np.sum(items_predicted, axis=1)
    num_total_pred = np.array([len(items_ground_truth[i]) for i in range(len(items_ground_truth))])

    recall = np.mean(num_correct_pred / num_total_pred)

    return recall

In [13]:
def compute_ndcg_at_k(items_ground_truth, items_predicted):
    test_matrix = np.zeros((len(items_predicted), K))

    for i, items in enumerate(items_ground_truth):
        length = min(len(items), K)
        test_matrix[i, :length] = 1

    max_r = test_matrix
    idcg = np.sum(max_r * 1. / np.log2(np.arange(2, K + 2)), axis=1)
    dcg = items_predicted * (1. / np.log2(np.arange(2, K + 2)))
    dcg = np.sum(dcg, axis=1)
    idcg[idcg == 0.] = 1.
    ndcg = dcg / idcg
    ndcg[np.isnan(ndcg)] = 0.

    return np.mean(ndcg)

In [14]:
# wrapper function to get evaluation metrics
def get_metrics(model, edge_index, exclude_edge_indices):

    ratings = torch.matmul(model.emb_users.weight, model.emb_items.weight.T)

    for exclude_edge_index in exclude_edge_indices:
        user_pos_items = get_user_items(exclude_edge_index)
        exclude_users = []
        exclude_items = []
        for user, items in user_pos_items.items():
            exclude_users.extend([user] * len(items))
            exclude_items.extend(items)
        ratings[exclude_users, exclude_items] = -1024

    # get the top k recommended items for each user
    _, top_K_items = torch.topk(ratings, k=K)

    # get all unique users in evaluated split
    users = edge_index[0].unique()

    test_user_pos_items = get_user_items(edge_index)

    # convert test user pos items dictionary into a list
    test_user_pos_items_list = [test_user_pos_items[user.item()] for user in users]

    # determine the correctness of topk predictions
    items_predicted = []
    for user in users:
        ground_truth_items = test_user_pos_items[user.item()]
        label = list(map(lambda x: x in ground_truth_items, top_K_items[user]))
        items_predicted.append(label)

    recall = compute_recall_at_k(test_user_pos_items_list, items_predicted)
    ndcg = compute_ndcg_at_k(test_user_pos_items_list, items_predicted)

    return recall, ndcg

In [15]:
# wrapper function to evaluate model
def test(model, edge_index, exclude_edge_indices):
    # Using '_' for unused author embeddings
    emb_users_final, emb_users, emb_items_final, emb_items, _, _ = model.forward(edge_index)
    user_indices, pos_item_indices, neg_item_indices = structured_negative_sampling(edge_index, contains_neg_self_loops=False)

    emb_users_final, emb_users = emb_users_final[user_indices], emb_users[user_indices]
    emb_pos_items_final, emb_pos_items = emb_items_final[pos_item_indices], emb_items[pos_item_indices]
    emb_neg_items_final, emb_neg_items = emb_items_final[neg_item_indices], emb_items[neg_item_indices]


    loss = bpr_loss(emb_users_final, emb_users, emb_pos_items_final, emb_pos_items, emb_neg_items_final, emb_neg_items).item()

    recall, ndcg = get_metrics(model, edge_index, exclude_edge_indices)

    return loss, recall, ndcg

In [17]:
K = 20
LAMBDA = 1e-6
BATCH_SIZE = 1024
EPOCHS = 51

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LightGCN(num_users, num_items, num_authors)
model = model.to(device)
edge_index = edge_index.to(device)
train_edge_index = train_edge_index.to(device)
val_edge_index = val_edge_index.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
%%time

n_batch = int(len(train_index)/BATCH_SIZE)

for epoch in range(EPOCHS):
    model.train()

    for _ in range(n_batch):
        optimizer.zero_grad()

        # Use train_edge_index for the forward pass during training
        emb_users_final, emb_users, emb_items_final, emb_items, emb_authors_final, emb_authors = model.forward(train_edge_index)

        user_indices, pos_item_indices, neg_item_indices = sample_mini_batch(train_edge_index)

        emb_users_final, emb_users = emb_users_final[user_indices], emb_users[user_indices]
        emb_pos_items_final, emb_pos_items = emb_items_final[pos_item_indices], emb_items[pos_item_indices]
        emb_neg_items_final, emb_neg_items = emb_items_final[neg_item_indices], emb_items[neg_item_indices]

        train_loss = bpr_loss(emb_users_final, emb_users, emb_pos_items_final, emb_pos_items, emb_neg_items_final, emb_neg_items)

        train_loss.backward()
        optimizer.step()

    if epoch % 5 == 0:
        model.eval()
        val_loss, recall, ndcg = test(model, val_edge_index, [train_edge_index])
        print(f"Epoch {epoch} | Train loss: {train_loss.item():.5f} | Val loss: {val_loss:.5f} | Val recall@{K}: {recall:.5f} | Val ndcg@{K}: {ndcg:.5f}")

Epoch 0 | Train loss: -0.31333 | Val loss: -0.31293 | Val recall@20: 0.00653 | Val ndcg@20: 0.00408
Epoch 5 | Train loss: -0.32512 | Val loss: -0.27301 | Val recall@20: 0.01489 | Val ndcg@20: 0.00747
Epoch 10 | Train loss: -0.49774 | Val loss: -0.07128 | Val recall@20: 0.01460 | Val ndcg@20: 0.00775
Epoch 15 | Train loss: -0.99398 | Val loss: 0.34665 | Val recall@20: 0.01760 | Val ndcg@20: 0.00855
Epoch 20 | Train loss: -1.82348 | Val loss: 0.91118 | Val recall@20: 0.01835 | Val ndcg@20: 0.00903
Epoch 25 | Train loss: -2.48324 | Val loss: 1.58961 | Val recall@20: 0.01807 | Val ndcg@20: 0.00879
Epoch 30 | Train loss: -3.53394 | Val loss: 2.36438 | Val recall@20: 0.01765 | Val ndcg@20: 0.00869
Epoch 35 | Train loss: -5.42285 | Val loss: 3.23769 | Val recall@20: 0.01757 | Val ndcg@20: 0.00891
Epoch 40 | Train loss: -6.87920 | Val loss: 4.21863 | Val recall@20: 0.01765 | Val ndcg@20: 0.00905
Epoch 45 | Train loss: -8.71951 | Val loss: 5.28722 | Val recall@20: 0.01765 | Val ndcg@20: 0.00905

In [20]:
test_loss, test_recall, test_ndcg = test(model, test_edge_index.to(device), [train_edge_index, val_edge_index])

print(f"Test loss: {test_loss:.5f} | Test recall@{K}: {test_recall:.5f} | Test ndcg@{K}: {test_ndcg:.5f}")

Test loss: 6.37681 | Test recall@20: 0.01689 | Test ndcg@20: 0.00827
