In [1]:
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive; later cells read CSV files from under this path.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import warnings
# Install a blanket 'ignore' filter: every warning category is silenced from here on,
# including deprecation and numerical warnings.
warnings.filterwarnings('ignore')

# 경사하강법을 이용한 행렬 분해
원본 행렬 R을 정의하고, R을 분해할 P와 Q를 정규분포를 따르는 랜덤 값으로 초기화
$$
R \approx PQ^{T}
$$

In [3]:
import numpy as np

# Sparse ratings matrix R: rows are users, columns are items, NaN marks a missing rating.
R = np.array([
    [4.0, np.nan, np.nan, 2.0, np.nan],
    [np.nan, 5.0, np.nan, 3.0, 1.0],
    [np.nan, np.nan, 3.0, 4.0, 4.0],
    [5.0, 2.0, 1.0, 2.0, np.nan],
])

# Number of users (rows) and number of items (columns).
num_users = R.shape[0]
num_items = R.shape[1]

# Number of latent factors.
L = 3

# Seed the global NumPy RNG, then draw P (users x L) first and Q (items x L) second
# from a normal distribution with mean 0 and standard deviation 1/L.
np.random.seed(1)
P = np.random.normal(loc=0.0, scale=1.0 / L, size=(num_users, L))
Q = np.random.normal(loc=0.0, scale=1.0 / L, size=(num_items, L))

R.shape, P.shape, Q.shape

((4, 5), (4, 3), (5, 3))

## 비용 계산 함수
분해된 행렬 P와 Q.T를 내적하여 예측 행렬을 생성하고, 실제 행렬 R에서 NaN이 아닌 값의 위치에 있는 값만 예측 행렬의 값과 비교하여 RMSE 값을 계산 후 반환

In [4]:
# Collect every observed rating as a (row index, column index, actual value) tuple.
# The `> 0` test skips NaN entries (NaN > 0 is False); it would also skip zero or negative values.
non_zeros = [
    (i, j, R[i, j])
    for i, j in np.ndindex(num_users, num_items)
    if R[i, j] > 0
]
non_zeros

[(0, 0, 4.0),
 (0, 3, 2.0),
 (1, 1, 5.0),
 (1, 3, 3.0),
 (1, 4, 1.0),
 (2, 2, 3.0),
 (2, 3, 4.0),
 (2, 4, 4.0),
 (3, 0, 5.0),
 (3, 1, 2.0),
 (3, 2, 1.0),
 (3, 3, 2.0)]

In [5]:
# Compute the RMSE (cost function): compare R with P @ Q.T only at the positions where R holds an actual rating
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
  """Return the RMSE between R and P @ Q.T, measured only at the observed positions.

  Parameters:
    R: 2-D array of actual ratings; only the positions listed in non_zeros are read.
    P, Q: arrays multiplied as P @ Q.T to form the full prediction matrix.
    non_zeros: sequence of tuples whose first two fields are the row index and the
      column index of an observed rating.

  Returns:
    Square root of the mean squared error between the actual and the predicted
    values at the listed positions.
  """
  row_idx, col_idx = [], []
  for entry in non_zeros:
    row_idx.append(entry[0])
    col_idx.append(entry[1])

  # Build predictions for every cell, then keep only the observed cells.
  predicted = (P @ Q.T)[row_idx, col_idx]
  actual = R[row_idx, col_idx]

  return np.sqrt(mean_squared_error(actual, predicted))

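경사하강법에 기반하여 P와 Q의 원소들에 대한 업데이트 수행 (여기서 $e_{u,i} = r_{u,i} - p_u q_i^{T}$ 는 실제 값과 예측 값의 차이, $\eta$ 는 학습률, $\lambda$ 는 규제 계수)
$$
\begin{aligned}
p^{\prime}_u &= p_u + \eta(e_{u, i} \cdot q_i - \lambda \cdot p_u)\\
q^{\prime}_i &= q_i + \eta(e_{u, i} \cdot p_u - \lambda \cdot q_i)
\end{aligned}
$$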

In [7]:
# Training hyper-parameters: number of passes, learning rate, L2 regularisation coefficient.
steps = 1000
learning_rate = 0.01
r_lambda = 0.01  # plays a role similar to alpha in Ridge regression

for step in range(steps):
  for i, j, r in non_zeros:
    # Error between the actual rating and the current prediction for cell (i, j).
    e_ij = r - P[i] @ Q[j]

    # Gradient step with L2 shrinkage. P's row is updated first, so the Q update
    # on the next line already uses the new P[i].
    P[i] += learning_rate * (e_ij * Q[j] - r_lambda * P[i])
    Q[j] += learning_rate * (e_ij * P[i] - r_lambda * Q[j])

  rmse = get_rmse(R, P, Q, non_zeros)

  # Log once every 50 passes.
  if step % 50 == 0:
    print("반복 단계 : {}, RMSE : {}".format(step, rmse))

반복 단계 : 0, RMSE : 3.2388050277987723
반복 단계 : 50, RMSE : 0.4876723101369648
반복 단계 : 100, RMSE : 0.1564340384819247
반복 단계 : 150, RMSE : 0.07455141311978046
반복 단계 : 200, RMSE : 0.04325226798579314
반복 단계 : 250, RMSE : 0.029248328780878973
반복 단계 : 300, RMSE : 0.022621116143829466
반복 단계 : 350, RMSE : 0.019493636196525135
반복 단계 : 400, RMSE : 0.018022719092132704
반복 단계 : 450, RMSE : 0.01731968595344266
반복 단계 : 500, RMSE : 0.016973657887570753
반복 단계 : 550, RMSE : 0.016796804595895633
반복 단계 : 600, RMSE : 0.01670132290188466
반복 단계 : 650, RMSE : 0.01664473691247669
반복 단계 : 700, RMSE : 0.016605910068210026
반복 단계 : 750, RMSE : 0.016574200475705
반복 단계 : 800, RMSE : 0.01654431582921597
반복 단계 : 850, RMSE : 0.01651375177473524
반복 단계 : 900, RMSE : 0.01648146573819501
반복 단계 : 950, RMSE : 0.016447171683479155


In [8]:
# Rebuild the full prediction matrix from the learned factors and show it next to R.
pred_matrix = np.matmul(P, Q.T)

print("원본 행렬 : \n", R)
print()
print("예측 행렬 : \n", pred_matrix.round(3))

원본 행렬 : 
 [[ 4. nan nan  2. nan]
 [nan  5. nan  3.  1.]
 [nan nan  3.  4.  4.]
 [ 5.  2.  1.  2. nan]]

예측 행렬 : 
 [[3.991 0.897 1.306 2.002 1.663]
 [6.696 4.978 0.979 2.981 1.003]
 [6.677 0.391 2.987 3.977 3.986]
 [4.968 2.005 1.006 2.017 1.14 ]]


# 행렬 분해 기반의 잠재 요인 협업 필터링
경사하강법 기반의 행렬 분해 함수 생성

In [9]:
def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda = 0.01):
    """Factorize R into P (users x K) and Q (items x K) with stochastic gradient descent.

    Only entries of R that are greater than 0 are treated as observed ratings; NaN
    entries fail that comparison and are skipped (as would zero or negative values).

    Parameters:
        R: 2-D NumPy array of ratings with shape (num_users, num_items).
        K: number of latent factors; also sets the initialisation scale 1/K.
        steps: number of full passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularisation coefficient applied to both P and Q.

    Returns:
        Tuple (P, Q); P @ Q.T is the predicted rating matrix.

    Side effects:
        Reseeds the global NumPy RNG with 1 and prints the training RMSE on every
        10th step (steps 0, 10, 20, ...).
    """
    num_users, num_items = R.shape

    # Fixed seed so repeated calls start from the same P and Q.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # (row, column, rating) for every observed entry.
    non_zeros = [ (i, j, R[i,j]) for i in range(num_users) for j in range(num_items) if R[i,j] > 0 ]

    for step in range(steps):
        for i, j, r in non_zeros:
            # Prediction error for this rating.
            e_ij = r - ( P[i, :] @ Q[j, :].T )

            # Regularised gradient step. P's row is updated first, so the Q update
            # below already sees the new P[i, :].
            P[i,:] = P[i,:] + learning_rate*(e_ij * Q[j, :] - r_lambda*P[i,:])
            Q[j,:] = Q[j,:] + learning_rate*(e_ij * P[i, :] - r_lambda*Q[j,:])

        # get_rmse builds the full P @ Q.T product, so evaluate it only on the
        # steps that are actually logged instead of on every pass.
        if (step % 10) == 0 :
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step : ", step," rmse : ", rmse)

    return P, Q

In [10]:
import pandas as pd
import numpy as np

# Load the movie metadata and the raw ratings from the mounted Drive folder.
movies = pd.read_csv("/content/drive/MyDrive/DATA-33/1.ML/data/movies.csv")
ratings = pd.read_csv("/content/drive/MyDrive/DATA-33/1.ML/data/ratings.csv")

# Keep only the three columns used below, then build a userId x movieId rating table.
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')

# Join with movies on movieId to bring in the title column.
rating_movies = ratings.merge(movies, on='movieId')
rating_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [11]:
# Preview the first rows of the userId x movieId rating table built in the previous cell.
ratings_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Rebuild the pivot with movie titles as the columns instead of movieId.
# pivot_table aggregates with its default (mean), so rows sharing the same
# userId and title are averaged into one cell.
ratings_matrix = pd.pivot_table(rating_movies, values='rating', index='userId', columns='title')
ratings_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Factorize the userId x title rating table: 50 latent factors, 200 SGD passes,
# learning rate 0.01 and L2 coefficient 0.01.
P, Q = matrix_factorization(
    ratings_matrix.values,
    K=50,
    steps=200,
    learning_rate=0.01,
    r_lambda=0.01,
)

### iteration step :  0  rmse :  2.9023619751336867
### iteration step :  10  rmse :  0.7335768591017927
### iteration step :  20  rmse :  0.5115539026853442
### iteration step :  30  rmse :  0.37261628282537446
### iteration step :  40  rmse :  0.2960818299181014
### iteration step :  50  rmse :  0.2520353192341642
### iteration step :  60  rmse :  0.22487503275269854
### iteration step :  70  rmse :  0.2068545530233154
### iteration step :  80  rmse :  0.19413418783028685
### iteration step :  90  rmse :  0.18470082002720406
### iteration step :  100  rmse :  0.17742927527209104
### iteration step :  110  rmse :  0.1716522696470749
### iteration step :  120  rmse :  0.16695181946871726
### iteration step :  130  rmse :  0.16305292191997542
### iteration step :  140  rmse :  0.15976691929679646
### iteration step :  150  rmse :  0.1569598699945732
### iteration step :  160  rmse :  0.15453398186715425
### iteration step :  170  rmse :  0.15241618551077643
### iteration step :  180  rm

In [14]:
# Predicted rating for every (user, title) pair, labelled with the same index and
# columns as ratings_matrix.
pred_matrix = np.matmul(P, Q.T)
ratings_pred_matrix = pd.DataFrame(
    pred_matrix,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
ratings_pred_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941
4,2.628629,3.03555,2.575746,3.706912,3.430636,0.706441,3.33028,1.978826,4.560368,2.77571,...,1.046116,2.912178,2.479592,2.231915,1.888629,2.211364,0.645603,1.585734,3.542892,0.59154
5,2.116148,3.084761,2.747679,3.78349,3.94699,0.883259,1.958953,1.757317,2.054312,2.775258,...,0.956159,3.893975,2.717024,2.002443,2.053337,3.983639,2.099626,1.423718,2.490428,0.531403


In [15]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the movies that the given user has not rated yet.

    Parameters:
        ratings_matrix: DataFrame indexed by user id with one column per movie;
            a value greater than 0 means the user has rated that movie.
        userId: index label of the user in ratings_matrix.

    Returns:
        List of column labels, in column order, for which the user's rating is
        not greater than 0 (NaN counts as unseen).

    Raises:
        KeyError: if userId is not present in ratings_matrix.index.
    """
    # Series of this user's ratings, indexed by the column labels of ratings_matrix.
    user_rating = ratings_matrix.loc[userId,:]

    # A set makes each membership test O(1); looking every column up in a list
    # made the filter below quadratic in the number of movies.
    already_seen = set(user_rating[ user_rating > 0].index)

    # Keep the original column order while dropping everything already rated.
    unseen_list = [ movie for movie in ratings_matrix.columns.tolist() if movie not in already_seen]

    return unseen_list

In [16]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the user's highest predicted ratings among the unseen movies.

    Parameters:
        pred_df: DataFrame of predicted ratings, indexed by user id with one
            column per movie.
        userId: index label of the user in pred_df.
        unseen_list: column labels to choose from.
        top_n: number of leading entries to keep after sorting.

    Returns:
        Series of predicted ratings for the selected columns, sorted in
        descending order and cut to the first top_n entries.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    ranked = candidate_scores.sort_values(ascending=False)
    recomm_movies = ranked.iloc[:top_n]
    return recomm_movies

In [17]:
# Titles that user 78 has not rated yet.
unseen_list = get_unseen_movies(ratings_matrix, 78)

# Top-10 recommendations for user 78 from the latent-factor predictions.
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 78, unseen_list, top_n=10)

# Present the scores as a one-column DataFrame named 'pred_score'.
recomm_movies = recomm_movies.to_frame(name='pred_score')
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Shakespeare in Love (1998),5.552192
Animal House (1978),5.279583
Lawrence of Arabia (1962),4.98925
Goodfellas (1990),4.962458
Highlander (1986),4.882037
Planet of the Apes (1968),4.789879
"Santa Clause, The (1994)",4.668173
"Fifth Element, The (1997)",4.66493
Caddyshack (1980),4.644259
It's a Wonderful Life (1946),4.64389
