# 추천시스템: 협업필터링

## 3. 잠재요인필터링(Latent Factor Filtering)
잠재요인필터링은 최근 아이템 기반 최근접이웃필터링보다 선호되는 추천시스템 알고리즘이다. 여기저 '잠재요인'이 무엇인지는 정확히 정의가 되지 않는다. 이 방법은 간단히 설명하면 **행렬분해(Singular value decomposition)** 연산을 기반으로 하는 필터링 방법이다. 하지만 사용자-아이템 평점행렬의 경우 누락값이 많아 희소행렬(sparse matrix)의 특성을 띄기 때문에 일반적으로 우리가 아는 선형대수학의 행렬분해를 사용하지 않는다. 알고리즘에 대해서는 코드와 함께 설명하도록 하겠다.


(전체적인 데이터 및 사용자-아이템 평점 행렬 생성 방법은 최근접이웃필터링 필사에서 다룬 것과 동일하다.)

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error

### Step 1: 사용자-아이템 평점 행렬 생성

In [7]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

df = pd.merge(ratings, movies, how = 'left', on = 'movieId')
df.head(2)

rating_mat = df.pivot_table('rating', index = 'userId', columns = 'title')
# fill NaN values with 0
rating_mat.fillna(0, inplace = True)
rating_mat

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Step 2: SGD 구현
잠재요인필터링에서의 행렬분해는 확률적경사하강법(SGD) 알고리즘을 이용해 구현한다.
$$ R = P \cdot Q^{T}$$
 + R: 사용자-아이템 평점 행렬(M x N)
 + P: 사용자-잠재요인 평점 행렬(M x K)
 + Q: 아이템-잠재요인 평점 행렬(N x K)
<br>
<br>
<br>
[알고리즘]
 1. $P$와 $Q$ 를 임의의 값을 가진 행렬로 초기화한다.
 2. $P$와 $Q^{T}$를 내적한 후 예측 R 행렬을 계산하고 예측 R과 실제 R의 error을 계산한다.
 3. error을 최소화하는 적절한 $P$와 $Q$를 찾을 때 까지 1-2를 반복한다.

In [4]:
def get_rmse(R, P, Q, non_zeros):
    error = 0
    # pred matrix 틀 생성
    full_pred_mat = np.dot(P, Q.T)
    
    # 실제 R 행렬과 예측 R 행렬이 nonzero인 값만 추출하여 RMSE를 계산한다
    x_nonzero = [non_zero[0] for non_zero in non_zeros]
    y_nonzero = [non_zero[1] for non_zero in non_zeros]
    
    R_nonzero = R[x_nonzero, y_nonzero]
    full_mat_nonzero = full_pred_mat[x_nonzero, y_nonzero]
    
    mse = mean_squared_error(R_nonzero, full_mat_nonzero)
    
    return np.sqrt(mse)

In [13]:
def SGD(R, K, steps = 200, learning_rate = 0.01, r_lambda = 0.01):
    num_user, num_item = R.shape
    
    np.random.seed(1)
    
    # P와 Q 초깃값 설정하기(normal distribution)
    P = np.random.normal(scale = 1/K, size = (num_user, K))
    Q = np.random.normal(scale = 1/K, size = (num_item, K))
    
    # 평점행렬 R에서 누락값이 아닌 데이터만 새로 추출하여 non_zeros를 생성한다.
    non_zeros = [(i, j, R[i, j]) for i in range(num_user) for j in range(num_item) if R[i, j]>0]
    
    # 지정한 step 수만큼 반복
    for step in range(steps):
        for i, j, r in non_zeros:
            eij = r - np.dot(P[i, :], Q[j, :].T)
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])
            
        rmse = get_rmse(R, P, Q, non_zeros)
        
        if (step % 10) == 0:
            print('### Iteration step: ', step, "rmse: ", rmse)
            
    return P, Q

In [14]:
P, Q = SGD(rating_mat.values, K = 50, steps = 200, learning_rate = 0.01, r_lambda = 0.01)

pred_matrix = np.dot(P, Q.T)

### Iteration step:  0 rmse:  2.9023619751336867
### Iteration step:  10 rmse:  0.7335768591017927
### Iteration step:  20 rmse:  0.5115539026853442
### Iteration step:  30 rmse:  0.37261628282537446
### Iteration step:  40 rmse:  0.29608182991810134
### Iteration step:  50 rmse:  0.2520353192341642
### Iteration step:  60 rmse:  0.22487503275269852
### Iteration step:  70 rmse:  0.20685455302331537
### Iteration step:  80 rmse:  0.19413418783028685
### Iteration step:  90 rmse:  0.18470082002720403
### Iteration step:  100 rmse:  0.17742927527209104
### Iteration step:  110 rmse:  0.1716522696470749
### Iteration step:  120 rmse:  0.1669518194687172
### Iteration step:  130 rmse:  0.1630529219199754
### Iteration step:  140 rmse:  0.15976691929679643
### Iteration step:  150 rmse:  0.1569598699945732
### Iteration step:  160 rmse:  0.15453398186715425
### Iteration step:  170 rmse:  0.15241618551077643
### Iteration step:  180 rmse:  0.1505508073962831
### Iteration step:  190 rmse:  

최종 예측 행렬을 생성 후, 편의를 위해 index에 사용자, column에 영화제목을 설정한다.

In [16]:
pred_matrix = pd.DataFrame(pred_matrix, index = rating_mat.index, columns = rating_mat.columns)
pred_matrix

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.564130,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.311890,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.780760,1.997043,0.924908,2.970700,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.887580,1.042618,2.293890,0.396941
4,2.628629,3.035550,2.575746,3.706912,3.430636,0.706441,3.330280,1.978826,4.560368,2.775710,...,1.046116,2.912178,2.479592,2.231915,1.888629,2.211364,0.645603,1.585734,3.542892,0.591540
5,2.116148,3.084761,2.747679,3.783490,3.946990,0.883259,1.958953,1.757317,2.054312,2.775258,...,0.956159,3.893975,2.717024,2.002443,2.053337,3.983639,2.099626,1.423718,2.490428,0.531403
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.153469,3.536398,3.128222,3.875599,3.821362,1.010045,2.284450,2.287194,4.148796,3.487517,...,1.105429,3.647394,3.342759,2.446343,2.406133,3.613071,1.849264,1.526109,2.860815,0.673099
607,2.502048,3.577547,3.095692,4.081789,4.038996,1.095224,3.698198,1.904581,5.347442,3.114650,...,0.963407,3.230442,3.526719,2.084052,2.119598,4.135631,3.124109,2.362195,3.421672,0.758574
608,2.154503,3.019060,2.679379,3.567550,3.483444,0.909505,2.387003,1.644313,3.090541,3.197815,...,0.818661,3.372644,2.936734,2.049884,2.097775,4.452331,3.504461,1.906708,2.419560,0.701739
609,2.566479,3.285659,2.910122,3.717481,3.665580,1.036884,2.908776,1.844237,2.868990,3.175409,...,1.003584,3.033540,2.943151,2.173251,2.234396,3.839926,2.520343,1.603675,2.970382,0.636880


### Step3: 안 본 영화중에 추천하기
앞서 최근접이웃필터링과 동일하다.

In [17]:
# 안본 영화중에 추천
def get_unseen_list(ratings_matrix, userId):
    user_rating = ratings_matrix.iloc[userId, :]
    seen_movies = user_rating[user_rating > 0].index.tolist()
    all_movies = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in all_movies if movie not in seen_movies]
    
    return unseen_list

In [26]:
def get_top_movies(ratings_matrix, pred_matrix, userId, top):
    unseen_list = get_unseen_list(ratings_matrix, userId)
    
    top_movies = pred_matrix.loc[userId, unseen_list].sort_values(ascending = False)[:top]
    top_movies = pd.DataFrame(top_movies.values, index = top_movies.index, columns = ['pred_score'])
    return top_movies

In [27]:
get_top_movies(rating_mat, pred_matrix, 3, 10)

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Naked Gun 33 1/3: The Final Insult (1994),5.872875
Blazing Saddles (1974),5.171396
Troll 2 (1990),4.989779
Hangar 18 (1980),4.963188
Galaxy of Terror (Quest) (1981),4.958124
Android (1982),4.957553
Death Race 2000 (1975),4.956821
Alien Contamination (1980),4.956363
"Road Warrior, The (Mad Max 2) (1981)",4.953514
Saturn 3 (1980),4.949588
