# test-gnn-recommendation

- 参考
    - https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8
    - https://medium.com/arangodb/integrate-arangodb-with-pytorch-geometric-to-build-recommendation-systems-dd69db688465
- サンプルデータ: [The Movies Dataset](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)
    - `credits.csv`
    - `keywords.csv`
    - `links.csv`
    - `links_small.csv`
    - `movies_metadata.csv`
    - `ratings.csv`
    - `ratings_small.csv`

In [1]:
import ast
import itertools

import numpy as np
import pandas as pd
import torch
import torch_geometric.transforms as T
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData
from torch_geometric.transforms import RandomLinkSplit, ToUndirected

pd.options.display.max_columns = 100

In [2]:
# データの読み込み
LINKS_PATH = "./data/links.csv"
METADATA_PATH = "./data/movies_metadata.csv"
RATINGS_PATH = "./data/ratings.csv"

df_links = pd.read_csv(LINKS_PATH, dtype=str)
df_metadata = pd.read_csv(METADATA_PATH, dtype=str)
df_ratings = pd.read_csv(RATINGS_PATH, dtype=str)

df_metadata = df_metadata.drop([19730, 29503, 35587])
df_metadata["title"] = df_metadata["title"].fillna("")

In [3]:
# GPUが使用可能な場合GPUを使用
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device.type

'cuda'

In [4]:
edge_index = torch.from_numpy(
    df_ratings[["userId", "movieId"]].astype(int).values.T
).clone()
edge_label = torch.from_numpy(
    (df_ratings["rating"].astype(float) * 2)
    .astype(int)
    .values  # 0.5単位なので、2倍して10段階評価に変更
).clone()

In [5]:
# 映画タイトル畳み込み
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
title_embeddings = model.encode(
    df_metadata["title"].values,
    show_progress_bar=True,
    convert_to_tensor=True,
    device=device,
)
print("Title Embeddings shape:", title_embeddings.shape)

# ジャンルのエンコード
def genre_list_to_genre_names(genre_list):
    genre_list = ast.literal_eval(genre_list)
    if len(genre_list) == 0:
        return []
    genre_names = []
    for genre_dict in genre_list:
        genre_names.append(genre_dict["name"])
    return genre_names


genres = df_metadata["genres"].apply(genre_list_to_genre_names).values
unique_genres = set(itertools.chain(*genres))
mapping = {g: i for i, g in enumerate(unique_genres)}
encoded_genres = torch.zeros(len(genres), len(mapping))
for i, genre in enumerate(genres):
    for g in genre:
        encoded_genres[i, mapping[g]] = 1
encoded_genres = encoded_genres.to(device)
print("Encoded Genres shape:", encoded_genres.shape)

# 映画の特徴量を結合
movie_x = torch.cat((title_embeddings, encoded_genres), dim=-1)
print("Shape of the concatenated features:", movie_x.shape)

Batches:   0%|          | 0/1421 [00:00<?, ?it/s]

Title Embeddings shape: torch.Size([45463, 384])
Encoded Genres shape: torch.Size([45463, 20])
Shape of the concatenated features: torch.Size([45463, 404])


In [6]:
# グラフを作成
data = HeteroData()
l = len(set(df_ratings["userId"].values))
data[
    "user"
].x = torch.sparse_coo_tensor(  # 密行列だと300GBくらいメモリを要求された。ほんとはtorch.eye(l, layout=torch.torch.sparse_coo)で書きたいが、beta版しかない。
    [[t for t in range(l)], [t for t in range(l)]], np.ones(l), (l, l)
)
data["movie"].x = movie_x
data["user", "rates", "movie"].edge_index = edge_index
data["user", "rates", "movie"].edge_label = edge_label
data = ToUndirected()(data)  # 映画->ユーザーのエッジも作成
del data["movie", "rev_rates", "user"].edge_label  # 映画->ユーザーのエッジラベルを削除
data = data.to(device)
data

HeteroData(
  [1muser[0m={ x=[270896, 270896] },
  [1mmovie[0m={ x=[45463, 404] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 26024289],
    edge_label=[26024289]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 26024289] }
)

In [7]:
# エッジを学習・検証・テストに分割
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[("user", "rates", "movie")],
    rev_edge_types=[("movie", "rev_rates", "user")],
)(data)

In [8]:
# レートに偏りがあるため、重みづけする。
def weighted_mse_loss(pred, target, weight=None):
    weight = 1.0 if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

weight = torch.bincount(train_data["user", "movie"].edge_label)
weight = weight.max() / weight
weight

tensor([    inf, 17.2904,  8.2972, 17.3159,  3.9705,  5.5711,  1.3312,  2.2462,
         1.0000,  3.2264,  1.8362], device='cuda:0')