## [Surprise & PyTorch] Matrix Factorization

SurpriseとPyTorchで行列分解（Matrix Factorization）を試してみます。

推薦システムを開発していると、最初のベースラインとして、行列分解（Matrix Factorization）がモデルとして採用されることが多くあります。

### github
- jupyter notebook形式のファイルは[こちら](https://github.com/hiroshi0530/wa-src/blob/master/rec/gr/mf/surprise_nb.ipynb)

### google colaboratory
- google colaboratory で実行する場合は[こちら](https://colab.research.google.com/github/hiroshi0530/wa-src/blob/master/rec/gr/mf/surprise_nb.ipynb)

### 実行環境

In [2]:
!sw_vers

ProductName:	macOS
ProductVersion:	11.6.7
BuildVersion:	20G630


In [3]:
!python -V

Python 3.8.13


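行列分解では、観測済みの評価値の集合 $\mathcal{D}$ に対して、ユーザー $x$ の潜在ベクトル $\mathbf{u}_{x}$ とアイテム $y$ の潜在ベクトル $\mathbf{v}_{y}$ の内積で評価値 $r_{x y}$ を近似します。最小化する目的関数は、二乗誤差に L2 正則化項（係数 $\lambda$）を加えた次の式です。

$$
f(\mathbf{U}, \mathbf{V})=\frac{1}{2} \sum_{(x, y) \in \mathcal{D}}\left(r_{x y}-\mathbf{u}_{x}^{\top} \mathbf{v}_{y}\right)^{2}+\frac{\lambda}{2}\left(\|\mathbf{U}\|_{F}^{2}+\|\mathbf{V}\|_{F}^{2}\right)
$$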

確率的勾配降下法（SGD）では、観測値 $(x, y) \in \mathcal{D}$ ごとに、学習率 $\eta$ を用いて次のように更新します。

$$
\begin{aligned}
&\mathbf{u}_{x} \leftarrow \mathbf{u}_{x}-\eta\left\{-\left(r_{x y}-\mathbf{u}_{x}^{\top} \mathbf{v}_{y}\right) \mathbf{v}_{y}+\lambda \mathbf{u}_{x}\right\} \\
&\mathbf{v}_{y} \leftarrow \mathbf{v}_{y}-\eta\left\{-\left(r_{x y}-\mathbf{u}_{x}^{\top} \mathbf{v}_{y}\right) \mathbf{u}_{x}+\lambda \mathbf{v}_{y}\right\}
\end{aligned}
$$

基本的なライブラリをインポートし、そのバージョンを確認しておきます。
学習にはPyTorchを利用し、ネットワーク関係にはnetworkxを利用します。

In [4]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

import networkx as nx

import surprise
import matplotlib

from surprise import SVD, Reader
from surprise import Dataset as surprise_dataset

# Report the versions of the key libraries so the results can be reproduced.
# (The label for networkx was previously misspelled as "networkdx".)
print('matplotlib  : {}'.format(matplotlib.__version__))
print('networkx    : {}'.format(nx.__version__))
print('numpy       : {}'.format(np.__version__))
print('torch       : {}'.format(torch.__version__))
print('surprise    : {}'.format(surprise.__version__))

matplotlib  : 3.5.1
networkdx   : 2.7.1
numpy       : 1.22.3
torch       : 1.12.0
surprise    : 1.1.1


In [5]:
import random

seed = 123

def init_seed(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global generator, and PyTorch (CPU generator plus the CUDA
            seeding calls).

    Returns:
        None.
    """
    # Python and NumPy generators.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    # PyTorch generators, in the same order as before: CPU, then CUDA.
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

init_seed(seed)

In [6]:
# Toy interaction log: one (user_id, item_id, rating) row per observed pair,
# with every rating equal to 1.
df = pd.DataFrame(
    data=[
        (0, 0, 1),
        (0, 1, 1),
        (0, 4, 1),
        (1, 0, 1),
        (1, 1, 1),
        (1, 3, 1),
        (2, 1, 1),
        (2, 2, 1),
    ],
    columns=['user_id', 'item_id', 'rating'],
)

df

Unnamed: 0,user_id,item_id,rating
0,0,0,1
1,0,1,1
2,0,4,1
3,1,0,1
4,1,1,1
5,1,3,1
6,2,1,1
7,2,2,1


In [7]:
# Wrap the toy DataFrame in a Surprise trainset that uses every row for training.
reader = Reader(rating_scale=(1, 2))
train = surprise_dataset.load_from_df(
    df[["user_id", "item_id", "rating"]], reader
).build_full_trainset()

# Hyperparameters. n_epochs and lr_all are also read by the PyTorch training
# cell later in this notebook.
n_factors = 4
n_epochs = 300
lr_all = 5e-2
biased = False

# Fit Surprise's SVD with biased=False, i.e. a plain factorization without bias terms.
mf = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, biased=biased)
mf.fit(train)

display(df)

# Predict the (user, item) pairs that are absent from the training data.
testset = train.build_anti_testset()
predictions = mf.test(testset)
predictions  # NOTE(review): bare expression in the middle of a cell; it displays nothing

# Learned user factors (pu), item factors (qi) and the user/item biases (bu, bi).
print(mf.pu)
print(mf.qi)
print(mf.bu)
print(mf.bi)

# Reconstruct the full user x item score matrix from the learned factors.
torch.matmul(torch.tensor(mf.pu), torch.tensor(mf.qi).T)

# testset
#
# def get_top_n(predictions, n=10):
#     '''
#     予測セットに基いて各ユーザにトップN件のレコメンデーションを返す。
#     '''
#
#     # まず各ユーザに予測値をマップする。
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))
#
#     # そして各ユーザに対して予測値をソートして最も高いk個を返す。
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]
#
#     return top_n
#
# top_n = get_top_n(predictions, n=10)
# print(top_n)
#
# # 各ユーザにレコメンドされるアイテムを表示する。
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])
#

Unnamed: 0,user_id,item_id,rating
0,0,0,1
1,0,1,1
2,0,4,1
3,1,0,1
4,1,1,1
5,1,3,1
6,2,1,1
7,2,2,1


[[ 0.3965334   0.82364193 -0.30476351 -0.37181569]
 [ 0.26612714  0.70783758 -0.4151934  -0.57362266]
 [ 0.66737401  0.3382212  -0.26079618 -0.70329754]]
[[ 0.32961261  0.70313976 -0.35658768 -0.4312383 ]
 [ 0.50253984  0.61136693 -0.2852638  -0.51425626]
 [ 0.34504869  0.81659032 -0.26245357 -0.23947281]
 [ 0.09845349  0.62906891 -0.39950072 -0.59367298]
 [ 0.56518677  0.12663786 -0.21856059 -0.71453742]]
[0. 0. 0.]
[0. 0. 0. 0. 0.]


tensor([[0.9789, 0.9810, 0.9784, 0.8997, 0.6607],
        [0.9808, 0.9799, 0.9162, 0.9779, 0.7407],
        [0.8541, 0.9782, 0.7433, 0.8002, 0.9796]], dtype=torch.float64)

In [57]:
n_test_users = 3
n_test_items = 5

n_users = 1000
n_items = 1000

from tokenize import Double
from scipy import sparse
from scipy.sparse import coo_matrix


class TestDataset(torch.nn.Module):
    """Toy user x item interaction data stored in COO (row, col, value) form.

    Calling an instance returns the data as a ``scipy.sparse.coo_matrix``.
    Iterating over an instance yields one ``(value, (row, col))`` item per
    stored interaction, where each element is a 0-dim tensor.

    Args:
        shape: Optional ``(n_users, n_items)`` shape of the matrix. When
            omitted, the notebook-level ``(n_test_users, n_test_items)`` is
            used, as before.
    """

    def __init__(self, shape=None):
        # nn.Module keeps its bookkeeping (_parameters, _modules, ...) in
        # attributes created by Module.__init__. Without this call, methods
        # such as parameters() or repr() raise AttributeError.
        super().__init__()

        self.row = torch.tensor([0, 0, 0, 1, 1, 1, 2, 2])
        self.col = torch.tensor([0, 1, 4, 0, 1, 3, 1, 2])
        self.data = torch.tensor([1, 1, 1, 1, 1, 1, 1, 1], dtype=torch.float)

        self.shape = (n_test_users, n_test_items) if shape is None else tuple(shape)

    def _to_coo(self):
        """Build the sparse matrix once, in one place, for all public accessors."""
        # Hand SciPy plain NumPy arrays instead of relying on implicit
        # tensor-to-array conversion.
        values = self.data.detach().cpu().numpy()
        rows = self.row.detach().cpu().numpy()
        cols = self.col.detach().cpu().numpy()
        return coo_matrix((values, (rows, cols)), shape=self.shape)

    def __call__(self):
        """Return the interactions as a ``scipy.sparse.coo_matrix``."""
        return self._to_coo()

    def __str__(self):
        """Return a printable dense view of the interaction matrix."""
        return 'array : \n{}'.format(self.to_numpy())

    def to_numpy(self):
        """Return the interactions as a dense ``numpy.ndarray`` of ``self.shape``."""
        return self._to_coo().toarray()

    def __iter__(self):
        """Yield ``(value, (row, col))`` for every stored interaction."""
        for d, r, c in zip(self.data, self.row, self.col):
            yield (d, (r, c))


TestDataset().to_numpy()


array([[1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0.]], dtype=float32)

In [58]:
class MF(torch.nn.Module):
    """Matrix factorization model: score(user, item) = <user vector, item vector>.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Dimension of the latent vectors.
        config: Unused; kept so existing callers keep working.
        dataset: Unused; kept so existing callers keep working.
    """

    def __init__(self, n_users, n_items, n_factors=20, config=None, dataset=None):
        super().__init__()
        # NOTE(review): sparse=True makes the embedding lookup produce sparse
        # gradients. torch.optim.Adam rejects sparse gradients, so training
        # with Adam presumably only works while the dense get_regloss() term
        # is added to the loss - confirm before dropping the regularizer.
        self.user_embedding = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_embedding = torch.nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user_idx, item_idx):
        """Score users against items.

        Args:
            user_idx: Integer tensor of user indices (0-dim or 1-D).
            item_idx: Integer tensor of item indices (0-dim or 1-D).

        Returns:
            For 0-dim indices, the scalar dot product of the two latent
            vectors. For 1-D indices of lengths ``a`` and ``b``, the ``a x b``
            matrix of all pairwise scores (matmul semantics, unchanged).
        """
        user_vec = self.user_embedding(user_idx)
        item_vec = self.item_embedding(item_idx)
        # Tensor.t() returns 0-D/1-D tensors as is and transposes 2-D ones.
        # That matches the previous `.T` without its deprecation warning for
        # non-2-D tensors, which every 0-dim index lookup used to trigger.
        return torch.matmul(user_vec, item_vec.t())

    def to_matrix(self):
        """Return the full ``n_users x n_items`` matrix of predicted scores."""
        return torch.matmul(self.user_embedding.weight, self.item_embedding.weight.T)

    def get_regloss(self, reg=0.02):
        """Return the L2 penalty over both complete embedding tables.

        Args:
            reg: Regularization weight. The default 0.02 is the value that
                used to be hard-coded.

        Returns:
            0-dim tensor ``reg * (sum(U ** 2) + sum(V ** 2))``.
        """
        user_sq = torch.pow(self.user_embedding.weight, 2).sum()
        item_sq = torch.pow(self.item_embedding.weight, 2).sum()
        return reg * (user_sq + item_sq)


In [60]:

from torch import autograd


# Train the PyTorch MF model on the toy interactions, one optimizer step per
# observed (user, item) pair. n_epochs and lr_all come from the Surprise cell.
model = MF(n_test_users, n_test_items, n_factors=4)
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr_all)

# np.inf is the canonical spelling; the np.Inf alias was removed in NumPy 2.0.
last_accum_loss = np.inf
for _idx in range(n_epochs):
    accum_loss = 0.

    for data, (user_idx, item_idx) in TestDataset():

        model.zero_grad()
        prediction = model(user_idx, item_idx)
        # Squared error on this single rating plus the L2 penalty on all factors.
        loss = loss_function(prediction, data.clone().detach()) + model.get_regloss()
        accum_loss += loss.item()
        loss.backward()
        optimizer.step()

    print('loss ', _idx + 1, accum_loss)
    # Stop early once the summed epoch loss changes by less than 1e-3.
    if abs(accum_loss - last_accum_loss) < 1e-3:
        break
    last_accum_loss = accum_loss


# Learned reconstruction, rounded for display; compare it with the target below.
print(model.to_matrix().to('cpu').detach().numpy().copy().round(3))

TestDataset().to_numpy()


loss  1 19.586370706558228
loss  2 8.284650295972824
loss  3 4.8067761063575745
loss  4 3.768440306186676
loss  5 3.189052641391754
loss  6 2.8815841376781464
loss  7 2.7084111869335175
loss  8 2.545808583498001
loss  9 2.410690814256668
loss  10 2.309592306613922
loss  11 2.2165973782539368
loss  12 2.1269237995147705
loss  13 2.0438553541898727
loss  14 1.9677636623382568
loss  15 1.8981266170740128
loss  16 1.8345715552568436
loss  17 1.7765497714281082
loss  18 1.7234667837619781
loss  19 1.6749128848314285
loss  20 1.6306789070367813
loss  21 1.5905316919088364
loss  22 1.5541362166404724
loss  23 1.521167442202568
loss  24 1.4913658797740936
loss  25 1.4644881039857864
loss  26 1.4402796477079391
loss  27 1.4184948056936264
loss  28 1.3989096283912659
loss  29 1.3813174366950989
loss  30 1.365525245666504
loss  31 1.3513532727956772
loss  32 1.338634267449379
loss  33 1.3272148370742798
loss  34 1.3169567734003067
loss  35 1.3077347576618195
loss  36 1.2994355261325836
loss  37 1

array([[1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0.]], dtype=float32)

## 確認 

In [11]:
from scipy.sparse import coo_matrix

# Rebuild the 3 x 5 interaction matrix directly with SciPy, as the reference
# target for the MSE checks in the following cells.
row = torch.tensor([0, 0, 0, 1, 1, 1, 2, 2])
col = torch.tensor([0, 1, 4, 0, 1, 3, 1, 2])
data = torch.tensor([1, 1, 1, 1, 1, 1, 1, 1])

# dtype=float gives a float64 matrix even though `data` holds integers.
mat = coo_matrix((data, (row, col)), shape=(3, 5), dtype=float)
mat.toarray()


array([[1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0.]])

In [12]:

# MSE between the Surprise reconstruction (pu @ qi^T) and the dense interaction matrix.
# Every cell of the 3 x 5 matrix is compared, so unobserved pairs count as a target of 0.
MSE = nn.MSELoss()
MSE(torch.tensor(np.matmul(np.array(mf.pu), np.array(mf.qi).T)), torch.tensor(mat.toarray()))

# np.matmul(np.array(mf.pu), np.array(mf.qi).T)
# mat.toarray()

tensor(0.3778, dtype=torch.float64)

In [13]:

# Dense MSE of the PyTorch model's reconstruction against the same 3 x 5 target matrix.
MSE(model.to_matrix(), torch.tensor(mat.toarray()))

tensor(0.2869, dtype=torch.float64, grad_fn=<MseLossBackward0>)

In [14]:
# Sanity check of MSELoss: the squared errors are (1, 0, 1), so the mean is 2/3.
MSE(torch.tensor([1,2,3], dtype=float), torch.tensor([2,2,4], dtype=float))

tensor(0.6667, dtype=torch.float64)

In [42]:
import numpy as np

# Seed NumPy's global generator, then take the first two draws of the sequence.
np.random.seed(123)
print(np.random.random())
np.random.random()

0.6964691855978616


0.28613933495037946

In [43]:
# Next draw from NumPy's global generator (no re-seeding in this cell).
np.random.random()

0.2268514535642031

In [39]:
# Another draw from NumPy's global generator; the value depends on how many
# draws were made since the generator was last seeded.
np.random.random()

0.2268514535642031