In [1]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, DataLoader, TensorDataset)
import tqdm

In [2]:
import pandas as pd
# 訓練データとテストデータを分ける
from sklearn import model_selection
df = pd.read_csv("ml-20m/ratings.csv")
# Xは(userId, movieId)のペア
X = df[["userId", "movieId"]].values
Y = df[["rating"]].values

# 訓練データとテストデータを9:1に分割
train_X, test_X, train_y, test_y = model_selection.train_test_split(X, Y, test_size=0.1)

# XはIDで整数なのでint64、Yは実数値なのでfloat32のTensorに変換する
train_dataset = TensorDataset(torch.tensor(train_X, dtype=torch.int64), torch.tensor(train_y, dtype=torch.float32))
test_dataset = TensorDataset(torch.tensor(test_X, dtype=torch.int64), torch.tensor(test_y, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=1024, num_workers=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1024, num_workers=4)

In [3]:
"""
行列因子分解
"""

class MatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.user_emb = nn.Embedding(max_user, k, 0)
        self.item_emb = nn.Embedding(max_item, k, 0)

    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        
        # user_feature*item_featureは(batch_size, k)次元なのでkについてsumを取ると、それぞれのサンプルの内積になる。
        out = torch.sum(user_feature * item_feature, 1)
        # [0, 5]の範囲に収まるように変換
        out = nn.functional.sigmoid(out) * 5
        return out

In [4]:
"""
ユーザーや商品の個数
"""
max_user, max_item = X.max(0)
# np.int64型をPythonの標準のintにキャスト
max_user = int(max_user)
max_item = int(max_item)
net = MatrixFactorization(max_user + 1, max_item + 1)

In [5]:
"""
評価関数の作成
"""

def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    ys = []
    ypreds = []
    for x, y in loader:
        x = x.to(device)
        ys.append(y)
        with torch.no_grad():
            ypred = net(x).to("cpu").view(-1)
        ypreds.append(ypred)
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score.item()

In [7]:
"""
訓練部分の作成
"""
from statistics import mean

net.to("cuda:0")
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        o = net(x)
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    test_score = eval_net(net, test_loader, device="cuda:0")
    print(epoch, mean(loss_log), test_score, flush=True)

100%|██████████| 17579/17579 [02:23<00:00, 122.92it/s]


0 0.8906274459096065 0.712262749671936


100%|██████████| 17579/17579 [02:28<00:00, 118.04it/s]


1 0.8405928744907109 0.7014431357383728


100%|██████████| 17579/17579 [02:30<00:00, 116.92it/s]


2 0.8163615460281615 0.6969767808914185


100%|██████████| 17579/17579 [02:30<00:00, 116.83it/s]


3 0.8030463051515212 0.6957675814628601


100%|██████████| 17579/17579 [02:31<00:00, 115.99it/s]


4 0.7949830328070728 0.6930658221244812


In [8]:
# 訓練したモデルをCPUに移す
net.to("cpu")
# ユーザー1の映画10の評価を計算したい
query = (1, 10)
# int64のtensorに変換し、batchの次元を付加
query = torch.tensor(query, dtype=torch.int64).view(1, -1)
# netに渡す
net(query)

tensor([ 3.7181])

In [9]:
# ユーザー1の上位5本の映画をピックアップ
query = torch.stack([
    torch.zeros(max_item).fill_(1),
    torch.arange(1, max_item + 1)
], 1).long()
# scoresは上位k本のスコア
# indicesは上位k本の位置、すなわちmovieId
scores, indices = torch.topk(net(query), 5)

In [10]:
print(scores)
print(indices)

tensor([ 5.0000,  5.0000,  5.0000,  5.0000,  5.0000])
tensor([ 1.0661e+05,  7.3386e+04,  1.2585e+05,  8.7156e+04,  7.2894e+04])
