In [1]:
# Setup cell for Google Colab: uncomment the lines below to install the dependencies

## %%capture
# !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://pytorch-geometric.com/whl/torch-2.2.1+cu121.html
# !pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-2.2.0+cu121.html
# !pip install faiss-gpu

In [1]:
import numpy as np
import pandas as pd
import torch

from torch_geometric import EdgeIndex
from torch_geometric.utils import degree
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.nn import MIPSKNNIndex
from torch_geometric.metrics import LinkPredMAP, LinkPredPrecision, LinkPredRecall
from torch_geometric.nn.models.lightgcn import BPRLoss

from tqdm import tqdm
import os
os.environ['PYDEVD_DISABLE_FILE_VALIDATION']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from utils.model import GNN

# Device for the model and the mini-batches: the GPU when PyTorch can see one,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Device for the MIPS k-NN index and its queries; kept separate from `device`.
# 'cuda' for Colab, 'cpu' for my Windows laptop
faiss_device = torch.device('cpu')

In [2]:
from utils.data_transformation import data_to_heterograph

# Build the heterogeneous user/movie graph from the pre-encoded dataset.
# The two `*_rev_mapping` dicts are used further down to translate graph
# indices back to the ids found in the CSV tables.
graph_bundle = data_to_heterograph('encoded_data.npz')
data, users_rev_mapping, movies_rev_mapping = graph_bundle
data

HeteroData(
  movie={ x=[15008, 202] },
  user={ x=[744288, 14] },
  (user, watched, movie)={
    edge_index=[2, 4424477],
    time=[4424477],
  },
  (movie, rev_watched, user)={
    edge_index=[2, 4424477],
    time=[4424477],
  }
)

In [3]:
# Density of the (sparse) user-movie interaction matrix: observed interactions
# divided by the number of possible (user, movie) pairs. Computed from the
# graph itself so the figure stays correct if the dataset changes.
data['user', 'movie'].num_edges / (data['user'].num_nodes * data['movie'].num_nodes)

0.0003960938540619029

## Train/test split

In [15]:
# --- Split hyper-parameters -------------------------------------------------
train_ratio = 0.8       # share of the ('user', 'movie') edges used for training
watch_threshold = 5     # min. number of training edges a user needs to be evaluated

# --- Raw interaction data ---------------------------------------------------
watched = data['user', 'movie']
edges = watched.edge_index
time = watched.time
train_size = int(train_ratio * watched.num_edges)
num_users = data['user'].num_nodes
num_movies = data['movie'].num_nodes

# The first `train_size` edges form the training set, the remainder the test set.
# NOTE(review): this is a temporal split only if the edges are stored in
# chronological order - confirm in `data_to_heterograph`.
train_edge_slice = edges[:, :train_size]
test_edge_slice = edges[:, train_size:]

# Timestamp of the first test edge minus one: the seed time used when
# sampling neighbourhoods for inference.
split_time = time[train_size] - 1

# --- Loaders ----------------------------------------------------------------
# Settings shared by all three loaders.
loader_kwargs = {
    'data': data,
    'batch_size': 1024,
    'num_neighbors': [5, 5, 5],
    'time_attr': 'time',
    'temporal_strategy': 'last',
    'num_workers': 4,
}

# Each training edge is seeded with its own timestamp minus one
# (presumably so the sampled neighbourhood cannot contain the edge itself - TODO confirm).
train_loader = LinkNeighborLoader(
    edge_label_index=(('user', 'movie'), train_edge_slice),
    edge_label_time=time[:train_size] - 1,
    neg_sampling={'mode': 'binary', 'amount': 1},
    shuffle=True,
    **loader_kwargs)

user_loader = NeighborLoader(
    input_nodes='user',
    input_time=split_time.repeat(num_users),
    **loader_kwargs)

movie_loader = NeighborLoader(
    input_nodes='movie',
    input_time=split_time.repeat(num_movies),
    **loader_kwargs)

# --- Edge sets used for evaluation ------------------------------------------
sparse_size = (num_users, num_movies)


def _sorted_by_user(edge_slice):
    """Wrap `edge_slice` in an EdgeIndex on `device`, sorted by row (user)."""
    index = EdgeIndex(edge_slice.contiguous().to(device), sparse_size=sparse_size)
    return index.sort_by('row').values


train_edges = _sorted_by_user(train_edge_slice)
test_edges = _sorted_by_user(test_edge_slice)

# Only users with at least `watch_threshold` training interactions are evaluated.
is_test_node = degree(train_edges[0], num_nodes=num_users) >= watch_threshold

# Drop the edges of every other user from both edge sets.
test_edges = test_edges[:, is_test_node[test_edges[0]]]
train_edges = train_edges[:, is_test_node[train_edges[0]]]

## GNN

In [3]:
# Model: 3 layers, 64 hidden channels, placed on the training device.
gnn_model = GNN(num_layers=3, hidden_channels=64)
gnn_model = gnn_model.to(device)

# Adam with weight decay over all model parameters.
optimizer = torch.optim.Adam(
    gnn_model.parameters(),
    lr=0.005,
    weight_decay=1e-4,
)

# Bayesian Personalized Ranking loss over positive/negative edge scores.
bpr_loss = BPRLoss()

In [None]:
from utils.data_transformation import sparse_batch_narrow

def train():
    """Run one training epoch over `train_loader`.

    Returns:
        The BPR loss averaged over the seed edges seen during the epoch
        (each batch loss is weighted by its number of seed edges).
    """
    gnn_model.train()
    loss_sum = 0
    seen = 0
    for batch in tqdm(train_loader):
        batch = batch.to(device)
        edge_store = batch['user', 'movie']
        # Number of seed edges in this batch (one `input_id` entry each).
        num_seed = len(edge_store.input_id)

        optimizer.zero_grad()
        scores = gnn_model(batch.x_dict, batch.edge_index_dict,
                           edge_store.edge_label_index)

        # The first `num_seed` scores are passed as positives and the rest as
        # negatives (the loader draws one negative per seed edge).
        loss = bpr_loss(scores[:num_seed], scores[num_seed:])
        loss.backward()
        optimizer.step()

        loss_sum += float(loss) * num_seed
        seen += num_seed

    return loss_sum / seen

@torch.no_grad()
def test(test_edges: EdgeIndex, train_edges: EdgeIndex, k: int, top_count: int = None):
    """Evaluate top-`k` movie recommendations for the test users.

    Movie embeddings are indexed with a MIPS k-NN index; for every batch of
    `user_loader` the test users are embedded, their top-`k` movies are
    retrieved (excluding movies they interacted with in `train_edges`) and the
    ranking metrics are updated against `test_edges`.

    Args:
        test_edges: Held-out (user, movie) edges used as ground truth.
        train_edges: Training (user, movie) edges, excluded from the results.
        k: Number of recommendations per user.
        top_count: If given, only the first `top_count` movies are candidates;
            all other movies are collapsed into one extra bucket with index
            `top_count` and an all-zero embedding. `None` uses every movie.

    Returns:
        Tuple `(MAP@k, Precision@k, Recall@k)` as Python floats.
    """
    gnn_model.eval()
    movie_embs = torch.cat(gnn_model.get_movies_embeddings(movie_loader, device), dim=0)

    if top_count is not None:
        # Keep the first `top_count` movies and append one zero row that stands
        # for "any other movie" (index `top_count`).
        other_bucket = torch.zeros((1, movie_embs.size(1)), device=device)
        movie_embs = torch.cat([movie_embs[:top_count], other_bucket], dim=0)

    mipsknn = MIPSKNNIndex(movie_embs.to(faiss_device))
    metrics = LinkPredMAP(k), LinkPredPrecision(k), LinkPredRecall(k)
    users_inferred = 0
    for batch in user_loader:
        batch = batch.to(device)
        batch_size = batch['user'].batch_size
        batch_user_embs = gnn_model.encoder(batch.x_dict, batch.edge_index_dict)['user'][:batch_size]

        # `user_loader` is not shuffled, so the users of this batch are
        # `users_inferred .. users_inferred + batch_size - 1`.
        batch_mask = is_test_node[users_inferred:users_inferred + batch_size]
        batch_test_user_embs = batch_user_embs[batch_mask].to(faiss_device)

        # The predictions are handed to the metrics on the CPU, so the ground
        # truth must live there too; otherwise the two arguments sit on
        # different devices whenever `device` is CUDA (unless
        # `sparse_batch_narrow` already moves them - TODO confirm).
        batch_test_edges = sparse_batch_narrow(test_edges, users_inferred, batch_size).cpu()
        batch_train_edges = sparse_batch_narrow(train_edges, users_inferred, batch_size).to(faiss_device)

        if top_count is not None:
            # NOTE(review): `sparse_batch_narrow` may return views of the
            # caller's edge tensors; clone before the in-place write so the
            # shared `test_edges` / `train_edges` are never modified.
            batch_test_edges = batch_test_edges.clone()
            batch_train_edges = batch_train_edges.clone()
            # Collapse every movie outside the candidate set into the bucket.
            batch_test_edges[1, batch_test_edges[1] >= top_count] = top_count
            batch_train_edges[1, batch_train_edges[1] >= top_count] = top_count

        top_indices_mat = mipsknn.search(batch_test_user_embs, k, exclude_links=batch_train_edges)[1]
        for metric in metrics:
            metric.update(top_indices_mat.cpu(), batch_test_edges)

        users_inferred += batch_size

    return tuple(float(metric.compute()) for metric in metrics)

# Train and evaluate after every epoch; interrupting the kernel stops the run
# early while keeping the metrics collected so far in `metrics_list`.
k = 20            # cut-off for MAP@k / Precision@k / Recall@k
num_epochs = 29   # epochs are numbered 1..num_epochs
metrics_list = []
try:
    for epoch_num in range(1, num_epochs + 1):
        loss = train()
        print(f'Train: Epoch №{epoch_num:02d}, Loss: {loss:.4f}')
        # Pass `k` itself so the evaluated cut-off always matches the one printed.
        map_score, precision, recall = test(test_edges, train_edges, k=k, top_count=2000)
        print('Test@%d, MAP: %.4f, Precision: %.4f, Recall: %.4f' % (k, map_score, precision, recall))
        metrics_list.append([loss, map_score, precision, recall])

except KeyboardInterrupt:
    print('--KeyboardInterrupt--')

In [137]:
# Persist the model's state dict (not the module object itself).
torch.save(obj=gnn_model.state_dict(), f='./gnn_state.pt')

## Example: making recommendations with the graph neural network

In [3]:
%%capture
gnn_model = GNN(num_layers=3, hidden_channels=64)
gnn_model.load_state_dict(torch.load('./gnn_state.pt'))
gnn_model.to(device)

In [4]:
# Raw tables: interactions, item (movie) metadata and user profiles.
df_inter = pd.read_csv('processed_data/interactions.csv')
df_items = pd.read_csv('processed_data/items.csv')
df_users = pd.read_csv('processed_data/users.csv')


def _inverted(mapping):
    """Return a dict with the keys and values of `mapping` swapped."""
    return {value: key for key, value in mapping.items()}


# Inverses of the reverse mappings returned together with the graph.
users_mapping = _inverted(users_rev_mapping)
movies_mapping = _inverted(movies_rev_mapping)
# item_id -> index label of the matching row in `df_items`.
movies_id_to_df_idx = dict(zip(df_items['item_id'], df_items.index))

In [57]:
# Example user whose recommendations are inspected below.
user_id = 297478
# Their interactions, enriched with the item metadata
# (`merge` joins on the columns the two frames have in common).
user_history = df_inter.loc[df_inter['user_id'] == user_id]
user_history.merge(df_items)

Unnamed: 0,user_id,item_id,last_watch_dt,watched_pct,content_type,title,genres,age_rating,keywords,views
0,297478,4024,2021-03-13,100.0,film,"Я, робот","боевики, драмы, фантастика, триллеры",12,"самоубийство, искусственный интеллект, человек...",2458
1,297478,16361,2021-03-13,97.0,film,Doom: Аннигиляция,"боевики, ужасы, фантастика, триллеры",18,"планета Марс, ад, космос, демон, по мотивам ви...",6794
2,297478,4475,2021-03-13,92.0,film,Тачки,"спорт, мультфильм, комедии",6,"автомобильная гонка, трасса 66, Porsche, выход...",4033
3,297478,281,2021-03-13,100.0,film,Человек-муравей и Оса,"боевики, фантастика, приключения, комедии",12,"насекомое, муравей, сокращение, продолжение, с...",2020
4,297478,10958,2021-03-13,25.0,film,И грянул гром,"приключения, фантастика, триллеры, боевики, ужасы",16,"смерть, путешествие во времени, романтика, кло...",420
5,297478,6470,2021-03-13,100.0,film,Роботы,"фантастика, мультфильм, приключения, комедии",6,"изобретатель, деловой человек, робот, CGI-аним...",929
6,297478,4880,2021-06-21,9.0,series,Афера,комедии,18,"Афера, Аферисты, Карантин, Пандемия, Карантин ...",43337


In [58]:
# Profile row of the example user.
df_users.loc[df_users['user_id'] == user_id]

Unnamed: 0,user_id,age,income,sex,kids_flg
263449,297478,age_25_34,income_40_60,М,1


In [59]:
# Graph index of the example user, as a one-element batch.
users_idx = torch.tensor([users_mapping[user_id]])
with torch.no_grad():
    recs = gnn_model.recommend(
        users_idx, data,
        top_count=None, k=20,
        device=device, faiss_device=faiss_device)

# Translate the recommended graph indices -> item ids -> rows of `df_items`.
to_movie_id = np.vectorize(movies_rev_mapping.get)
to_df_row = np.vectorize(movies_id_to_df_idx.get)
rec_ids = to_movie_id(recs.squeeze())
rec_df_idx = to_df_row(rec_ids)
df_items.iloc[rec_df_idx].reset_index(drop=True)

EdgeIndex([[263449, 263449, 263449, 263449, 263449, 263449, 263449],
           [   313,     74,    150,    387,   1778,    902,      6]],
          sparse_size=(744288, 15008), nnz=7)


Unnamed: 0,item_id,content_type,title,genres,age_rating,keywords,views
0,16166,film,Зверополис,"приключения, мультфильм, детективы, комедии",6,"аллегория, лев, бегемот, лиса, слон, овца, бел...",11471
1,10440,series,Хрустальный,"триллеры, детективы",18,"хруст, хрусталь, хруста, хрус, полицейский, пе...",133553
2,4151,series,Секреты семейной жизни,комедии,18,"брызги крови, кровь, жестокое обращение с живо...",66206
3,16270,film,Тайна Коко,"мультфильм, фэнтези, приключения",12,"Мексика, гитара, музыкант, скелет, музыка, заг...",6201
4,13243,film,Головоломка,"фантастика, мультфильм, комедии",6,"мечта, мультфильм, воображаемый друг, начальна...",4528
5,12988,film,Гномео и Джульетта,"мелодрамы, мультфильм, приключения, комедии",0,"сад, запретная любовь, поцелуй, садовый гном, ...",1826
6,13915,film,Вперёд,"для детей, приключения, семейное, фэнтези, ком...",6,"эльфы, мир фантазий, эльф, главный герой подро...",6276
7,14942,film,История игрушек: Большой побег,"мультфильм, фэнтези, комедии",6,"заложник, колледж, игрушка, побег, детский сад...",2525
8,13218,film,Принцесса Эмми,"семейное, мультфильм",6,"2019, германия, бельгия, соединенное королевст...",886
9,7582,film,Холодное сердце II,"фэнтези, мультфильм, музыкальные",6,"королева, магия, королевство, плотина, дух, же...",5814
