## [PyTorch] Matrix Factorization

pytorchで行列分解(Matrix Factorization)をやってみる。



## ソースコード

### github
- jupyter notebook形式のファイルは[こちら](https://github.com/hiroshi0530/wa-src/blob/master/rec/gr/07/07_nb.ipynb)

### google colaboratory
- google colaboratory で実行する場合は[こちら](https://colab.research.google.com/github/hiroshi0530/wa-src/blob/master/rec/gr/07/07_nb.ipynb)


## 実行環境
OSはmacOSです。LinuxやUnixのコマンドとはオプションが異なる場合がありますので、ご注意ください。

In [1]:
# Shell escape: print the macOS product name, version and build of the machine running this notebook.
!sw_vers

ProductName:	macOS
ProductVersion:	11.6.7
BuildVersion:	20G630


In [2]:
# Shell escape: print the version of the `python` executable found on the shell's PATH.
!python -V

Python 3.8.13


pandasのテーブルを見やすいようにHTMLのテーブルにCSSの設定を行います。

In [None]:
# Inject a custom stylesheet targeting the `.dataframe` CSS class so that
# DataFrame tables rendered in the notebook are easier to read.
from IPython.core.display import HTML

# Plain CSS: header alignment/padding, row hover highlight, and base colors/font size.
style = """
<style>
    .dataframe thead tr:only-child th {
        text-align: right;
    }

    .dataframe thead th {
        text-align: left;
        padding: 5px;
    }

    .dataframe tbody tr th {
        vertical-align: top;
        padding: 5px;
    }

    .dataframe tbody tr:hover {
        background-color: #ffff99;
    }

    .dataframe {
        background-color: white;
        color: black;
        font-size: 16px;
    }

</style>
"""
# Leaving the HTML object as the cell's last expression makes the notebook render it,
# which is what applies the <style> block to the page.
HTML(style)

基本的なライブラリをインポートし、そのバージョンを確認しておきます。
学習にはPyTorchを、ネットワーク関係にはnetworkxを利用します。

In [3]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import networkx as nx

from tabulate import tabulate

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

import matplotlib

print('matplotlib  : {}'.format(matplotlib.__version__))
print('networkdx   : {}'.format(nx.__version__))
print('numpy       : {}'.format(np.__version__))
print('torch       : {}'.format(torch.__version__))

seed = 123
random_state = 123

random.seed(seed)
np.random.seed(seed)


from watermark import watermark

print(watermark(python=True, watermark=True, iversions=True, globals_=globals()))

matplotlib  : 3.5.1
networkdx   : 2.7.1
numpy       : 1.22.3
torch       : 1.12.0


## dataの読み込み

In [4]:
# Path of the interaction CSV, relative to the notebook
# (presumably the MovieLens-1M dataset, judging by the file name -- confirm).
in_ml_1m_file_name = "../dataset/ml-1m/ml-1m.csv"

df = pd.read_csv(in_ml_1m_file_name)
# Keep only users with more than 100 non-null item_id entries, then count the
# entries per user and list the most active users first.
df.groupby("user_id").filter(lambda x: x["item_id"].count() > 100).groupby("user_id").count().sort_values(
    "item_id", ascending=False
)

Unnamed: 0,user_id,item_id
0,1,1193
1,1,661
2,1,914
3,1,3408
4,1,2355
...,...,...
1000204,6040,1091
1000205,6040,1094
1000206,6040,562
1000207,6040,1096


In [6]:
# Number of rows per user, sorted descending; `.tail()` selects the least active users.
# NOTE: a notebook cell only renders its last expression, so this result is computed but not shown.
df.groupby("user_id").agg({"item_id": len}).sort_values(by="item_id", ascending=False).tail()
# Summary (max / min / mean / number of users) of the per-user row counts.
# String aliases replace the builtin `max`/`min` and `np.mean` callables, which pandas >= 2.1
# deprecates passing to `agg`; the results and row labels are unchanged. `len` is kept as a
# callable so that the last row is still labelled "len".
df.groupby("user_id").agg({"item_id": len}).agg({"item_id": ["max", "min", "mean", len]})

Unnamed: 0,item_id
max,2314.0
min,20.0
mean,165.597517
len,6040.0


In [7]:
# Count the non-null item_id entries of every user and list the most active users first.
df.groupby("user_id").count().sort_values(by="item_id", ascending=False)

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
4169,2314
1680,1850
4277,1743
1941,1595
1181,1521
...,...
5725,20
3407,20
1664,20
4419,20


In [8]:
@np.vectorize
def v_avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The ``np.vectorize`` decorator lets the function accept scalars as well as
    array-likes; inputs are broadcast and the scalar body runs element-wise.

    Args:
        x: First operand; the value 20 marks the entry as missing.
        y: Second operand.

    Returns:
        ``(x + y) / 2``, or ``np.nan`` where ``x == 20``.
    """
    if x == 20:
        # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every NumPy version.
        return np.nan
    return (x + y) / 2


v_avg_2_mod(1, 3)
# v_avg_2_mod([3,1],[2,33])

array(2.)

In [112]:
class RegLoss(nn.Module):
    """Sum of the L2 norms of every tensor in an iterable."""

    def __init__(self):
        super().__init__()

    def forward(self, parameters):
        """Add up ``W.norm(2)`` over all ``W`` in ``parameters``.

        Args:
            parameters: Iterable of tensors (a generator is fine, it is consumed
                once). Passing a single 2-D tensor iterates over its rows.

        Returns:
            A 0-dim tensor holding the summed norms, or ``None`` when
            ``parameters`` yields nothing.
        """
        total = None
        for weight in parameters:
            l2 = weight.norm(2)
            # The first norm seeds the accumulator; later ones are added to it.
            total = l2 if total is None else total + l2
        return total


class MF(torch.nn.Module):
    """Matrix-factorization model: a score is the dot product of a user and an item embedding.

    Args:
        n_users: Number of rows of the user embedding table.
        n_items: Number of rows of the item embedding table.
        n_factors: Dimension of the latent vectors.
        config: Unused; accepted for interface compatibility.
        dataset: Unused; accepted for interface compatibility.
    """

    def __init__(self, n_users, n_items, n_factors=20, config=None, dataset=None):
        super().__init__()
        # sparse=True makes the embedding gradients sparse tensors, which only some
        # optimizers accept (see the torch.nn.Embedding documentation).
        self.user_embedding = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_embedding = torch.nn.Embedding(n_items, n_factors, sparse=True)

        self.reg_loss = RegLoss()

    def forward(self, user_idx, item_idx):
        """Score the given users against the given items.

        For scalar indices the result is a 0-dim tensor (a single dot product).
        For 1-D index batches of sizes U and I the result is the U x I matrix of
        all user/item pairs, not an element-wise pairing.
        """
        user_vecs = self.user_embedding(user_idx)
        item_vecs = self.item_embedding(item_idx)
        # `.T` on a tensor that is not 2-D is deprecated in PyTorch; a 1-D latent
        # vector needs no transpose for matmul to compute the dot product.
        if item_vecs.dim() >= 2:
            item_vecs = item_vecs.T
        return torch.matmul(user_vecs, item_vecs)

    def to_matrix(self):
        """Return the full (n_users, n_items) score matrix from the embedding weights."""
        return torch.matmul(self.user_embedding.weight, self.item_embedding.weight.T)

    def get_regloss(self, reg_weight=0.02):
        """Return the L2 penalty on all embedding weights.

        Args:
            reg_weight: Multiplier applied to the summed squared weights. The
                default 0.02 is the value that used to be hard-coded.

        Returns:
            0-dim tensor: ``reg_weight * (sum(user_weights**2) + sum(item_weights**2))``.
        """
        squared_sum = torch.pow(self.user_embedding.weight, 2).sum() + torch.pow(self.item_embedding.weight, 2).sum()
        return reg_weight * squared_sum


# model = MF(n_test_users, n_test_items, n_factors=128)
# loss_function = nn.MSELoss()
# optimizer = optim.SGD(model.parameters(), lr=1e-2)
# optimizer = optim.Adam(model.parameters(), lr=1e-2)

In [113]:
# Sanity check: for one-row embedding tables, the element-wise product summed over
# the factor axis and the matrix product hold the same dot product -- only the
# output shape differs ((1,) versus (1, 1)).
a = torch.nn.Embedding(1, 3, sparse=True)
b = torch.nn.Embedding(1, 3, sparse=True)
zero_idx = torch.arange(1)  # index tensor [0], selecting the only row of each table
print((a(zero_idx) * b(zero_idx)).sum(1))
print(a(zero_idx) @ b(zero_idx).T)

# Iterating a 2-D tensor yields its rows, so this evaluates ||[1, 2]|| + ||[3, 5]||.
RegLoss()(torch.tensor([[1, 2], [3, 5]], dtype=float))
# RegLoss()(torch.tensor([[1,2]], dtype=float))

tensor([1.9750], grad_fn=<SumBackward1>)
tensor([[1.9750]], grad_fn=<MmBackward0>)


tensor(8.0670, dtype=torch.float64)

## train

In [117]:
from torch import autograd


# NOTE(review): `n_test_users`, `n_test_items`, `epochs` and `TestDataset` are not defined in
# the visible part of the notebook; they must exist before this cell is run.
model = MF(n_test_users, n_test_items, n_factors=4)
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=5e-2)
# optimizer = optim.Adam(model.parameters(), lr=1e-2, weight_decay=0.0001)


# Summed loss of the previous epoch; initialized to a large sentinel for the first comparison.
last_accum_loss = 10000000
for _idx in range(epochs):
    accum_loss = 0.0

    # One optimizer step per sample: every item yielded by TestDataset() unpacks into a
    # target value and a (user index, item index) pair.
    for data, (user_idx, item_idx) in TestDataset():

        # Clear the gradients left over from the previous sample before the new backward pass.
        model.zero_grad()
        prediction = model(user_idx, item_idx).double()
        # Objective = squared error against the target + the model's L2 penalty.
        loss = loss_function(prediction, torch.tensor(data, dtype=float)) + model.get_regloss()

        print("epoch id : {}".format(_idx))
        print("  prediciton : {}".format(prediction))
        print("  data       : {}".format(data))
        print("  loss       : {}".format(loss))

        accum_loss += loss.item()

        loss.backward()
        optimizer.step()

    print("loss ", _idx + 1, accum_loss)
    # Stop early once the epoch loss changes by less than 1e-3 between consecutive epochs.
    if abs(accum_loss - last_accum_loss) < 1e-3:
        break
    last_accum_loss = accum_loss


# Reconstructed score matrix from the learned embeddings, rounded to 4 decimals.
print(model.to_matrix().to("cpu").detach().numpy().copy().round(4))

# Last expression of the cell, so the notebook displays it next to the printed reconstruction.
TestDataset().to_numpy()

  loss = loss_function(prediction, torch.tensor(data, dtype=float)) + model.get_regloss()


epoch id : 0
  prediciton : -0.10191839933395386
  data       : 1
  loss       : 1.817411004570122
epoch id : 0
  prediciton : 2.638495922088623
  data       : 1
  loss       : 3.286476600209653
epoch id : 0
  prediciton : -5.131297588348389
  data       : 1
  loss       : 38.16143866646257
epoch id : 0
  prediciton : 0.34914860129356384
  data       : 1
  loss       : 0.8352840096631207
epoch id : 0
  prediciton : 0.46298545598983765
  data       : 1
  loss       : 0.7005006711986574
epoch id : 0
  prediciton : -0.6704730987548828
  data       : 1
  loss       : 3.2030238492916396
epoch id : 0
  prediciton : 0.43308961391448975
  data       : 1
  loss       : 0.7257402594153888
epoch id : 0
  prediciton : 0.04556405544281006
  data       : 1
  loss       : 1.316070639813816
loss  1 50.04594570062497
epoch id : 1
  prediciton : -0.7232134342193604
  data       : 1
  loss       : 3.3747780913000156
epoch id : 1
  prediciton : 0.7476077675819397
  data       : 1
  loss       : 0.45972919

array([[1, 1, 0, 0, 1],
       [1, 1, 0, 1, 0],
       [0, 1, 1, 0, 0]])

## evaluate

In [143]:
# Fit the same toy interaction data with the `surprise` library's SVD for comparison.
from surprise import SVD, Reader
from surprise import Dataset as SurpriseDataset

# Eight observed (user, item) interactions, all with rating 1.
train_df = pd.DataFrame(
    {"user_id": [0, 0, 0, 1, 1, 1, 2, 2], "item_id": [0, 1, 4, 0, 1, 3, 1, 2], "rating": [1, 1, 1, 1, 1, 1, 1, 1]}
)

# train_df = pd.DataFrame({'user_id':np.random.choice(['1','2','3','4'],2),
#                          'item_id':np.random.choice(['101','102','103','104'],2),
#                          'rating':np.random.uniform(1,5,2)})


reader = Reader(rating_scale=(1, 2))
# Train on every row of the frame (no hold-out split).
data_train = SurpriseDataset.load_from_df(train_df[["user_id", "item_id", "rating"]], reader).build_full_trainset()


n_factors = 4
n_epochs = 300
lr_all = 5e-2
biased = False

mf = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, biased=biased)
# mf = SVD()
mf.fit(data_train)

# display(train_df)
# mf.predict('1','101')
# train_df

display(train_df)

# Predict for the anti-testset built from the training data.
testset = data_train.build_anti_testset()
predictions = mf.test(testset)
# NOTE: a bare expression in the middle of a cell is not rendered by the notebook.
predictions

# Learned user factors, item factors, user biases and item biases.
print(mf.pu)
print(mf.qi)
print(mf.bu)
print(mf.bi)

# Score matrix reconstructed from the factor matrices: pu @ qi.T.
torch.matmul(torch.tensor(mf.pu), torch.tensor(mf.qi).T)

# testset
#
# def get_top_n(predictions, n=10):
#     '''
#     Return the top-N recommendations for each user, based on a set of predictions.
#     '''
#
#     # First map the predictions to each user.
#     top_n = defaultdict(list)
#     for uid, iid, true_r, est, _ in predictions:
#         top_n[uid].append((iid, est))
#
#     # Then sort the predictions of each user and keep the n highest ones.
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]
#
#     return top_n
#
# top_n = get_top_n(predictions, n=10)
# print(top_n)
#
# # Show the items recommended to each user.
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])
#

Unnamed: 0,user_id,item_id,rating
0,0,0,1
1,0,1,1
2,0,4,1
3,1,0,1
4,1,1,1
5,1,3,1
6,2,1,1
7,2,2,1


[[ 0.34026516 -0.3812599   0.86831833  0.23279721]
 [ 0.54512834 -0.63448514  0.55168978  0.30616053]
 [ 0.11878297 -0.43863877  0.90247906  0.27909642]]
[[ 0.44105927 -0.47926558  0.68640405  0.19886165]
 [ 0.3255163  -0.49533026  0.70836218  0.30300773]
 [ 0.21392641 -0.23687857  0.8836253   0.2046864 ]
 [ 0.58344188 -0.61463119  0.33355343  0.2807894 ]
 [ 0.03005704 -0.36274684  0.83665026  0.21983648]]
[0. 0. 0.]
[0. 0. 0. 0. 0.]


tensor([[0.9751, 0.9852, 0.9780, 0.7879, 0.9262],
        [0.9841, 0.9753, 0.8171, 0.9780, 0.7754],
        [0.9376, 0.9798, 0.9839, 0.7183, 0.9791]], dtype=torch.float64)

In [None]:
## Sparse情報からinteraction matrixを作る

In [132]:
from scipy.sparse import coo_matrix

# COO triplets of the observed interactions: entry k sits at (row[k], col[k]).
row = torch.tensor([0, 0, 0, 1, 1, 1, 2, 2])
col = torch.tensor([0, 1, 4, 0, 1, 3, 1, 2])
# Every observed interaction has value 1 (same integer dtype and length as the indices).
data = torch.ones_like(row)

# 3 x 5 sparse interaction matrix stored as floats.
mat = coo_matrix((data, (row, col)), dtype=float, shape=(3, 5))
# Dense view; as the cell's last expression it is what the notebook displays.
mat.toarray()

array([[1., 1., 0., 0., 1.],
       [1., 1., 0., 1., 0.],
       [0., 1., 1., 0., 0.]])

In [145]:
MSE = nn.MSELoss()
# MSE(torch.tensor([1,2,3], dtype=float), torch.tensor([2,3,4], dtype=float))
#
# Mean squared error between the matrix rebuilt from the fitted factors (pu @ qi.T)
# and the dense interaction matrix; unobserved cells count as zeros.
reconstructed = np.asarray(mf.pu) @ np.asarray(mf.qi).T
MSE(torch.from_numpy(reconstructed), torch.from_numpy(mat.toarray()))

# np.matmul(np.array(mf.pu), np.array(mf.qi).T)
# mat.toarray()

tensor(0.3472, dtype=torch.float64)

## defaultdictの使い方

In [43]:
from collections import defaultdict

# A defaultdict creates missing keys on first access using its factory: int() -> 0.
d = defaultdict(int)

# Merely reading each key is enough to insert it with the default value.
for key in ("das", "a", 0):
    d[key]
d

defaultdict(int, {'das': 0, 'a': 0, 0: 0})

In [None]:
from torch import optim

# MF.__init__ names the latent dimension `n_factors`; the former `k=20` keyword
# raised a TypeError (unexpected keyword argument).
# NOTE(review): `n_user` / `n_item` are not defined in the visible part of the
# notebook -- define them (user and item counts) before running this cell.
model = MF(n_user, n_item, n_factors=20)
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [None]:
# Rebuild the loss and a fresh SGD optimizer bound to the current `model` parameters.
# NOTE(review): these are the same two statements as the tail of the preceding cell.
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [None]:
import random  # needed for random.shuffle below

from torch import autograd

# NOTE(review): `samples_train`, `as_long_tensor` and `as_float_tensor` are not defined
# in the visible part of the notebook; they must exist before this cell is run.

# Start from +inf so the first convergence check can never trigger and the cell
# does not depend on a value left behind by an earlier training run.
last_accum_loss = float("inf")

for epoch in range(10):  # at most 10 passes over the training samples
    accum_loss = 0.0

    # Shuffle the training samples in place.
    random.shuffle(samples_train)

    for u, i, r in samples_train:
        # Gradients accumulate in PyTorch, so reset them for every sample.
        model.zero_grad()

        # Plain tensors take part in autograd directly; the deprecated
        # `autograd.Variable` wrapper is no longer needed.
        user = as_long_tensor(u)  # user index
        item = as_long_tensor(i)  # item index
        rating = as_float_tensor(r)  # target

        # forward pass (prediction)
        prediction = model(user, item)

        # compute loss
        loss = loss_function(prediction, rating)
        # `.item()` extracts the Python float; `loss.data[0]` fails on a 0-dim tensor.
        accum_loss += loss.item()

        # gradient of loss
        loss.backward()

        # update model parameters
        optimizer.step()

    print("MF (PyTorch)", epoch + 1, accum_loss)
    if abs(accum_loss - last_accum_loss) < 1e-3:  # convergence check
        break
    last_accum_loss = accum_loss