# test-gnn-recommendation

- 参考
    - https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8
    - https://medium.com/arangodb/integrate-arangodb-with-pytorch-geometric-to-build-recommendation-systems-dd69db688465
- サンプルデータ: [The Movies Dataset](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)
    - `credits.csv`
    - `keywords.csv`
    - `links.csv`
    - `links_small.csv`
    - `movies_metadata.csv`
    - `ratings.csv`
    - `ratings_small.csv`

In [1]:
import ast
import itertools

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData

# Let pandas show up to 100 columns when displaying a DataFrame.
pd.set_option("display.max_columns", 100)

In [2]:
# Load the input tables.
LINKS_PATH = "./data/links_small.csv"
METADATA_PATH = "./data/movies_metadata.csv"
RATINGS_PATH = "./data/ratings_small.csv"

# Every column of every table is read as a string (dtype=str).
df_links, df_metadata, df_ratings = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (LINKS_PATH, METADATA_PATH, RATINGS_PATH)
)

# Remove three metadata rows by index label, then replace missing titles
# with an empty string.
# NOTE(review): presumably these three rows are malformed in the CSV — confirm.
df_metadata = df_metadata.drop(index=[19730, 29503, 35587])
df_metadata = df_metadata.assign(title=df_metadata["title"].fillna(""))

In [3]:
# Prefer the first CUDA GPU; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device.type

'cuda'

In [4]:
# Build the user -> movie rating edges from the ratings table.
# After the transpose, row 0 of `src` holds the userId of every rating and
# row 1 the matching movieId, i.e. a (2, num_ratings) edge list.
src = df_ratings[["userId", "movieId"]].astype(int).values.T
dst = df_ratings["movieId"].astype(int).values
ratings = df_ratings["rating"].astype(float).values

# Node ids must stay integral: PyTorch Geometric expects `edge_index` to be a
# torch.long tensor, and a float tensor cannot be used for indexing.
edge_index = torch.from_numpy(src.astype(np.int64)).clone()
# The rating of each edge is kept as a float32 label.
edge_label = torch.from_numpy(ratings.astype(np.float32)).clone()
# NOTE(review): the ids are the raw userId/movieId values from the CSV, not
# positions in a node feature matrix — confirm they are remapped to contiguous
# node indices before `edge_index` is attached to a graph.

In [5]:
# Embed the movie titles with the "all-MiniLM-L6-v2" SentenceTransformer model.
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
movie_titles = df_metadata["title"].values
title_embeddings = model.encode(
    movie_titles, show_progress_bar=True, convert_to_tensor=True, device=device
)
print("Title Embeddings shape:", title_embeddings.shape)

# Genre encoding
def genre_list_to_genre_names(genre_list):
    """Return the genre names contained in a serialized genre list.

    Args:
        genre_list: String holding a Python literal, a sequence of dicts that
            each carry a "name" key, e.g. "[{'id': 18, 'name': 'Drama'}]".

    Returns:
        A list with the "name" value of every entry, in their original order.
        The list is empty when the literal has no entries.
    """
    return [entry["name"] for entry in ast.literal_eval(genre_list)]

# Multi-hot encode the genres of every movie: one row per movie, one column
# per distinct genre name.
genres = df_metadata["genres"].apply(genre_list_to_genre_names).values
unique_genres = set(itertools.chain.from_iterable(genres))
# Assign columns in sorted-name order. Iterating the set directly can yield a
# different order in every interpreter run (string hashing is randomized),
# which would silently permute the feature columns between sessions.
mapping = {g: i for i, g in enumerate(sorted(unique_genres))}
encoded_genres = torch.zeros(len(genres), len(mapping))
for i, genre in enumerate(genres):
    for g in genre:
        encoded_genres[i, mapping[g]] = 1
encoded_genres = encoded_genres.to(device)
print("Encoded Genres shape:", encoded_genres.shape)

# Concatenate title embeddings and genre flags along the last dimension to get
# the movie feature matrix.
movie_x = torch.cat((title_embeddings, encoded_genres), dim=-1)
print("Shape of the concatenated features:", movie_x.shape)

Batches:   0%|          | 0/1421 [00:00<?, ?it/s]

Title Embeddings shape: torch.Size([45463, 384])
Encoded Genres shape: torch.Size([45463, 20])
Shape of the concatenated features: torch.Size([45463, 404])
