# test-gnn-recommendation

- 参考
    - https://towardsdatascience.com/hands-on-graph-neural-networks-with-pytorch-pytorch-geometric-359487e221a8
    - https://medium.com/arangodb/integrate-arangodb-with-pytorch-geometric-to-build-recommendation-systems-dd69db688465
- サンプルデータ: [The Movies Dataset](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)
    - `credits.csv`
    - `keywords.csv`
    - `links.csv`
    - `links_small.csv`
    - `movies_metadata.csv`
    - `ratings.csv`
    - `ratings_small.csv`

In [1]:
import ast
import itertools

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from sentence_transformers import SentenceTransformer
from torch.nn import Linear
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected
from tqdm.notebook import tqdm

pd.options.display.max_columns = 100

In [2]:
# データの読み込み
LINKS_PATH = "./data/links_small.csv"
METADATA_PATH = "./data/movies_metadata.csv"
RATINGS_PATH = "./data/ratings_small.csv"

df_links = pd.read_csv(LINKS_PATH, dtype=str)
df_metadata = pd.read_csv(METADATA_PATH, dtype=str)
df_ratings = pd.read_csv(RATINGS_PATH, dtype=str)

df_metadata = df_metadata.drop([19730, 29503, 35587])
df_metadata["title"] = df_metadata["title"].fillna("")
df_ratings = pd.merge(df_ratings, df_links, on="movieId", how="inner")

In [3]:
# GPUが使用可能な場合GPUを使用
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device.type

'cuda'

In [4]:
def node_mappings(df, id_col):
    mapping = {index: i for i, index in enumerate(df.set_index(id_col).index.unique())}

    return mapping


user_mappings = node_mappings(df_ratings, "userId")
movie_mappings = node_mappings(df_ratings, "tmdbId")
df_ratings["uid"] = df_ratings["userId"].apply(lambda x: user_mappings[x])
df_ratings["mid"] = df_ratings["tmdbId"].apply(lambda x: movie_mappings[x])
edge_index = torch.from_numpy(df_ratings[["uid", "mid"]].astype(int).values.T).clone()
edge_label = torch.from_numpy(
    (df_ratings["rating"].astype(float) * 2)
    .astype(int)
    .values  # 0.5単位なので、2倍して10段階評価に変更
).clone()

In [5]:
# 映画タイトル畳み込み
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
title_embeddings = model.encode(
    df_metadata["title"].values,
    show_progress_bar=True,
    convert_to_tensor=True,
    device=device,
)
print("Title Embeddings shape:", title_embeddings.shape)

# ジャンルのエンコード
def genre_list_to_genre_names(genre_list):
    genre_list = ast.literal_eval(genre_list)
    if len(genre_list) == 0:
        return []
    genre_names = []
    for genre_dict in genre_list:
        genre_names.append(genre_dict["name"])
    return genre_names


genres = df_metadata["genres"].apply(genre_list_to_genre_names).values
unique_genres = set(itertools.chain(*genres))
mapping = {g: i for i, g in enumerate(unique_genres)}
encoded_genres = torch.zeros(len(genres), len(mapping))
for i, genre in enumerate(genres):
    for g in genre:
        encoded_genres[i, mapping[g]] = 1
encoded_genres = encoded_genres.to(device)
print("Encoded Genres shape:", encoded_genres.shape)

# 映画の特徴量を結合
movie_x = torch.cat((title_embeddings, encoded_genres), dim=-1)
print("Shape of the concatenated features:", movie_x.shape)

Batches:   0%|          | 0/1421 [00:00<?, ?it/s]

Title Embeddings shape: torch.Size([45463, 384])
Encoded Genres shape: torch.Size([45463, 20])
Shape of the concatenated features: torch.Size([45463, 404])


In [6]:
# グラフを作成
data = HeteroData()
l = len(set(df_ratings["userId"].values))
data["user"].x = torch.eye(l)  # 必ずしもone-hotでなくても良い？
data["movie"].x = movie_x
data["user", "rates", "movie"].edge_index = edge_index
data["user", "rates", "movie"].edge_label = edge_label
data = ToUndirected()(data)  # 映画->ユーザーのエッジも作成
del data["movie", "rev_rates", "user"].edge_label  # 映画->ユーザーのエッジラベルを削除
data = data.to(device)
data

HeteroData(
  [1muser[0m={ x=[671, 671] },
  [1mmovie[0m={ x=[45463, 404] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100004],
    edge_label=[100004]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100004] }
)

In [7]:
# エッジを学習・検証・テストに分割
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[("user", "rates", "movie")],
    rev_edge_types=[("movie", "rev_rates", "user")],
)(data)

In [8]:
# レートに偏りがあるため、重みづけする。
def weighted_mse_loss(pred, target, weight=None):
    weight = 1.0 if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()


weight = torch.bincount(train_data["user", "movie"].edge_label)
weight = weight.max() / weight
weight

tensor([    inf, 27.0492,  8.6061, 17.1419,  3.9699,  6.4866,  1.4489,  2.7331,
         1.0000,  3.7203,  1.9047], device='cuda:0')

In [9]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        # these convolutions have been replicated to match the number of edge types
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        # concat user and movie embeddings
        z = torch.cat([z_dict["user"][row], z_dict["movie"][col]], dim=-1)
        # concatenated embeddings passed to linear layer
        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr="sum")
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        # z_dict contains dictionary of movie and user embeddings returned from GraphSage
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)


model = Model(hidden_channels=32).to(device)

# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [10]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(
        train_data.x_dict,
        train_data.edge_index_dict,
        train_data["user", "movie"].edge_label_index,
    )
    target = train_data["user", "movie"].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [11]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(
        data.x_dict, data.edge_index_dict, data["user", "movie"].edge_label_index
    )
    pred = pred.clamp(min=0, max=5)
    target = data["user", "movie"].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [12]:
for epoch in range(1, 300):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(
        f"Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, "
        f"Val: {val_rmse:.4f}, Test: {test_rmse:.4f}"
    )

Epoch: 001, Loss: 111.3587, Train: 7.2567, Val: 7.2217, Test: 7.2398
Epoch: 002, Loss: 106.2987, Train: 7.0330, Val: 6.9990, Test: 7.0168
Epoch: 003, Loss: 99.2290, Train: 6.5970, Val: 6.5658, Test: 6.5831
Epoch: 004, Loss: 86.3066, Train: 5.8117, Val: 5.7860, Test: 5.8022
Epoch: 005, Loss: 65.8885, Train: 4.4905, Val: 4.4753, Test: 4.4880
Epoch: 006, Loss: 40.0015, Train: 2.9782, Val: 2.9693, Test: 2.9746
Epoch: 007, Loss: 24.0261, Train: 2.9757, Val: 2.9552, Test: 2.9621
Epoch: 008, Loss: 50.8880, Train: 2.9757, Val: 2.9552, Test: 2.9621
Epoch: 009, Loss: 43.8176, Train: 2.9757, Val: 2.9556, Test: 2.9625
Epoch: 010, Loss: 27.8706, Train: 2.9873, Val: 2.9827, Test: 2.9873
Epoch: 011, Loss: 23.7607, Train: 3.5117, Val: 3.5066, Test: 3.5146
Epoch: 012, Loss: 27.8819, Train: 3.9742, Val: 3.9647, Test: 3.9754
Epoch: 013, Loss: 32.7350, Train: 4.1526, Val: 4.1418, Test: 4.1533
Epoch: 014, Loss: 34.9970, Train: 4.0973, Val: 4.0874, Test: 4.0987
Epoch: 015, Loss: 34.2388, Train: 3.8479, Val:

In [13]:
total_users = len(user_mappings)
total_movies = len(movie_mappings)
movie_recs = []
for user_id in tqdm(range(0, total_users)):
    user_row = torch.tensor([user_id] * total_movies)
    all_movie_ids = torch.arange(total_movies)
    edge_label_index = torch.stack([user_row, all_movie_ids], dim=0)
    pred = model(data.x_dict, data.edge_index_dict, edge_label_index)
    pred = pred.clamp(min=0, max=10)
    # we will only select movies for the user where the predicting rating is =10
    rec_movie_ids = (pred == 10).nonzero(as_tuple=True)
    top_ten_recs = [rec_movies for rec_movies in rec_movie_ids[0].tolist()[:10]]
    movie_recs.append({"user": user_id, "rec_movies": top_ten_recs})
movie_recs

  0%|          | 0/671 [00:00<?, ?it/s]

[{'user': 0,
  'rec_movies': [1977, 2117, 2228, 3053, 3386, 3560, 4275, 4638, 4734, 5226]},
 {'user': 1,
  'rec_movies': [246, 1623, 1977, 2117, 2225, 2228, 3053, 3386, 3479, 3483]},
 {'user': 2, 'rec_movies': [24, 99, 246, 500, 617, 665, 691, 710, 771, 779]},
 {'user': 3,
  'rec_movies': [99, 246, 500, 617, 665, 691, 710, 1085, 1091, 1170]},
 {'user': 4,
  'rec_movies': [99, 246, 500, 617, 665, 691, 710, 1085, 1091, 1170]},
 {'user': 5, 'rec_movies': [4, 24, 49, 99, 157, 158, 160, 177, 180, 186]},
 {'user': 6,
  'rec_movies': [246, 500, 1623, 1977, 1987, 2026, 2117, 2225, 2228, 3053]},
 {'user': 7,
  'rec_movies': [1977, 2117, 2225, 2228, 3053, 3386, 3560, 4275, 4638, 4734]},
 {'user': 8,
  'rec_movies': [246, 1623, 1977, 1987, 2117, 2225, 2228, 3053, 3386, 3479]},
 {'user': 9, 'rec_movies': [4, 6, 7, 14, 19, 24, 40, 46, 49, 55]},
 {'user': 10,
  'rec_movies': [1977, 2117, 2225, 2228, 3053, 3386, 3479, 3483, 3485, 3520]},
 {'user': 11,
  'rec_movies': [1977, 2117, 2225, 2228, 3053, 33

In [15]:
df_metadata[df_metadata["id"] == list(movie_mappings.keys())[1977]]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
4276,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",,10728,tt0016847,de,Faust – Eine deutsche Volkssage,God and Satan war over earth; to settle things...,4.021486,/qVm0rfvoT72pabRR87AE36kxmR0.jpg,"[{'name': 'Universum Film (UFA)', 'id': 12372}]","[{'iso_3166_1': 'DE', 'name': 'Germany'}]",1926-10-13,0,116.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}]",Released,The Voice of the Tempter,Faust,False,7.8,66
