# 특잇값 분해(Singular Value Decomposition, SVD)

실수나 복소수로 이루어진 체 K의 원소로 구성되는 m × n 행렬 M에 대해, M은 다음과 같은 세 행렬의 곱으로 분해할 수 있다.

$$\displaystyle M=U\Sigma V^{*}\!$$
여기에서 각 행렬은 다음과 같은 성질을 가진다.

- U는 m × m 크기를 가지는 유니터리 행렬이다.
- Σ는 m × n 크기를 가지는 직사각 대각행렬로, 대각선상의 원소는 모두 음이 아닌 실수이고 나머지 원소는 모두 0이다.
- $V^{*}$는 V의 켤레전치 행렬로, n × n 유니터리 행렬이다.
- 행렬 M을 이와 같은 세 행렬의 곱으로 나타내는 것을 M의 특잇값 분해라고 한다.

![특잇값 분해 시각화](https://upload.wikimedia.org/wikipedia/commons/thumb/c/c8/Singular_value_decomposition_visualisation.svg/360px-Singular_value_decomposition_visualisation.svg.png)

In [1]:
# Put the parent directory at the front of the module search path so the
# project-local `util` package below can be imported from this notebook
import sys

sys.path.insert(0, '..')

from util.data_loader import DataLoader
from util.metric_calculator import MetricCalculator

In [2]:
# Load the MovieLens data set
data_loader = DataLoader(
    num_users=1000,
    num_test_items=5,
    data_path='../data/ml-10M100K/',
)
movielens = data_loader.load()

In [3]:
# Reshape the training ratings into a wide table: one row per user_id,
# one column per movie_id, each cell holding the rating (NaN where absent)
user_movie_matrix = movielens.train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)
# Bare expression so the notebook renders the resulting DataFrame
user_movie_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,62000,62113,62293,62344,62394,62801,62803,63113,63992,64716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.0,,,,,,3.0,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048,,,,,,,,,,,...,,,,,,,,,,
1050,,3.0,,,,3.0,,,,3.0,...,,,,,,,,,,
1051,5.0,,3.0,,3.0,,4.0,,,,...,,,,,,,,,,
1052,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Sparsity summary: how much of the user x movie matrix actually holds a rating
user_num, item_num = user_movie_matrix.shape
# count() tallies the non-null cells per column; summing gives the overall total
non_null_num = user_movie_matrix.count().sum()
non_null_ratio = non_null_num / (user_num * item_num)

# The ratio is the density of the matrix (밀도), not a precision (정밀도) figure,
# so the printed label says so
print(f'사용자 수={user_num}, 아이템 수={item_num}, 밀도={non_null_ratio:.2f}')

사용자 수=1000, 아이템 수=6673, 정밀도=0.02


In [5]:
# Preview the matrix with missing ratings shown as 0; the result is only
# displayed by the notebook, user_movie_matrix is not reassigned here
user_movie_matrix.fillna(0)

movie_id,1,2,3,4,5,6,7,8,9,10,...,62000,62113,62293,62344,62394,62801,62803,63113,63992,64716
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1050,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051,5.0,0.0,3.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
import numpy as np
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# load scipy.sparse.linalg on every SciPy version, in which case the svds call
# below fails with AttributeError. This statement also binds the name `scipy`.
import scipy.sparse.linalg

# Convert the ratings into a user x movie matrix; missing values are filled
# with the mean of all training ratings
user_movie_matrix = movielens.train.pivot(index='user_id', columns='movie_id', values='rating')
# Map each user_id / movie_id label to its row / column position in the matrix
user_id2index = {user_id: i for i, user_id in enumerate(user_movie_matrix.index)}
movie_id2index = {movie_id: i for i, movie_id in enumerate(user_movie_matrix.columns)}
matrix = user_movie_matrix.fillna(movielens.train.rating.mean()).to_numpy()


# Truncated singular value decomposition keeping k factors
P, S, Qt = scipy.sparse.linalg.svds(matrix, k=5)

# Predicted rating matrix: S is returned as a 1-D array of singular values,
# so expand it to a diagonal matrix before multiplying the factors back together
pred_matrix = P @ np.diag(S) @ Qt

print(f"P: {P.shape}, S: {S.shape}, Qt: {Qt.shape}, pred_matrix: {pred_matrix.shape}")

P: (1000, 5), S: (5,), Qt: (5, 6673), pred_matrix: (1000, 6673)


In [7]:
# Recommend with SVD
# NOTE(review): fillna_with_zero is not passed here, so the recommender's
# default applies — presumably zero-filling, since a later cell passes
# fillna_with_zero=False to get mean-filling; confirm in src/svd.py
from src.svd import SVDRecommender
recommender = SVDRecommender()
recommend_result = recommender.recommend(movielens)

In [8]:
# Evaluate the recommendation result against movielens.test
metric_calculator = MetricCalculator()
metrics = metric_calculator.calc(
    movielens.test.rating.tolist(),
    recommend_result.rating.tolist(),
    movielens.test_user2items,
    recommend_result.user2items,
    k=10,
)
print(metrics)

rmse=3.335, Precision@K=0.009, Recall@K=0.029


In [9]:
# Fill missing values with the mean, then re-run the same evaluation
recommend_result = recommender.recommend(movielens, fillna_with_zero=False)
metrics = metric_calculator.calc(
    movielens.test.rating.tolist(),
    recommend_result.rating.tolist(),
    movielens.test_user2items,
    recommend_result.user2items,
    k=10,
)
print(metrics)

rmse=1.046, Precision@K=0.013, Recall@K=0.043


In [10]:
# Relationship between the number of factors and accuracy:
# repeat the mean-filled recommendation and evaluation for each factor count
for factors in [5, 10, 30]:
    recommend_result = recommender.recommend(
        movielens,
        factors=factors,
        fillna_with_zero=False,
    )
    metrics = metric_calculator.calc(
        movielens.test.rating.tolist(),
        recommend_result.rating.tolist(),
        movielens.test_user2items,
        recommend_result.user2items,
        k=10,
    )
    print(metrics)

rmse=1.046, Precision@K=0.013, Recall@K=0.043
rmse=1.042, Precision@K=0.011, Recall@K=0.039
rmse=1.038, Precision@K=0.011, Recall@K=0.036
