# 행렬분해를 이용한 잠재요인 협업 필터링 

In [4]:
import pandas as pd
import numpy as np

# Load the movie catalogue and the rating log from the ml-latest-small folder.
movies = pd.read_csv('./ml-latest-small/movies.csv')
ratings = pd.read_csv('./ml-latest-small/ratings.csv')

# Only the (user, movie, rating) triple is needed from the rating log.
ratings = ratings.loc[:, ['userId', 'movieId', 'rating']]

# Attach the movie columns to every rating so the matrix can be keyed by
# title. (Pivoting `ratings` on 'movieId' instead would give a
# userId x movieId matrix; that variant is not used here.)
rating_movies = pd.merge(ratings, movies, on='movieId')

# userId x title rating matrix; user/title pairs without a rating become NaN.
ratings_mtx = pd.pivot_table(
    rating_movies,
    values='rating',
    index='userId',
    columns='title',
)



In [43]:

def matrix_factorization(R, K, steps = 200, learning_rate = 0.01, r_lambda = 0.01):
    """Factorize a rating matrix into user and item latent factors with SGD.

    Only cells of ``R`` that are strictly greater than 0 are treated as
    observed ratings; every other cell (0, negative or NaN) is skipped.
    The RMSE over the observed cells is printed on every 10th step.

    Args:
        R: 2-D array-like of ratings with shape (num_users, num_items).
            It is converted with ``np.asarray``.
        K: Number of latent factors.
        steps: Number of full passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularization coefficient.

    Returns:
        Tuple ``(P, Q)`` where ``P`` has shape (num_users, K) and ``Q`` has
        shape (num_items, K); ``np.dot(P, Q.T)`` is the predicted matrix.
    """
    R = np.asarray(R)
    num_users, num_items = R.shape

    # Seed the global NumPy RNG so the initial factors (and therefore the
    # whole run) are reproducible, then draw P and Q from N(0, (1/K)^2).
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, rating) for every cell with R > 0. np.nonzero scans
    # in row-major order, i.e. the same order as a nested users/items loop.
    # NaN compares False against 0; errstate silences the "invalid value"
    # warning some NumPy versions emit for that comparison.
    with np.errstate(invalid='ignore'):
        rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # Repeatedly sweep over the observed ratings, updating P and Q with SGD.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            e_ij = r - np.dot(P[i, :], Q[j, :].T)
            # Regularized SGD updates. Note that Q's update reads the P row
            # updated on the line above; this order is preserved so the
            # factorization results stay unchanged.
            P[i, :] = P[i, :] + learning_rate * (e_ij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (e_ij * P[i, :] - r_lambda * Q[j, :])

        # The RMSE needs a full P.Q^T product, so compute it only on the
        # steps where it is actually reported.
        if (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("iteration step:", step, "rmse : " , rmse)

    return P, Q

from sklearn.metrics import mean_squared_error
def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between ``R`` and ``P . Q^T`` on the listed cells.

    Args:
        R: 2-D NumPy array holding the actual ratings.
        P: Factor matrix whose rows line up with the rows of ``R``.
        Q: Factor matrix whose rows line up with the columns of ``R``.
        non_zeros: Sequence of ``(row, col, value)`` tuples naming the cells
            to compare; only the row and column positions are read here.

    Returns:
        Root-mean-squared error over the listed cells.
    """
    # Split the (row, col, value) triples into parallel index lists.
    row_idx, col_idx = [], []
    for entry in non_zeros:
        row_idx.append(entry[0])
        col_idx.append(entry[1])

    # Rebuild the full prediction matrix from the factors, then pick the
    # same cells out of both the actual and the predicted matrix.
    reconstructed = np.dot(P, Q.T)
    actual = R[row_idx, col_idx]
    predicted = reconstructed[row_idx, col_idx]

    return np.sqrt(mean_squared_error(actual, predicted))


In [44]:
# Factorize the userId x title rating matrix with 50 latent factors, then
# rebuild the dense prediction matrix from the learned factors.
P, Q = matrix_factorization(
    ratings_mtx.values,
    K=50,
    steps=200,
    learning_rate=0.01,
    r_lambda=0.01,
)
pred_mtx = P.dot(Q.T)

iteration step: 0 rmse :  2.9023619751336867
iteration step: 10 rmse :  0.7335768591017927
iteration step: 20 rmse :  0.5115539026853442
iteration step: 30 rmse :  0.37261628282537446
iteration step: 40 rmse :  0.29608182991810134
iteration step: 50 rmse :  0.2520353192341642
iteration step: 60 rmse :  0.2248750327526985
iteration step: 70 rmse :  0.20685455302331537
iteration step: 80 rmse :  0.19413418783028683
iteration step: 90 rmse :  0.184700820027204
iteration step: 100 rmse :  0.17742927527209104
iteration step: 110 rmse :  0.1716522696470749
iteration step: 120 rmse :  0.1669518194687172
iteration step: 130 rmse :  0.1630529219199754
iteration step: 140 rmse :  0.1597669192967964
iteration step: 150 rmse :  0.15695986999457318
iteration step: 160 rmse :  0.15453398186715428
iteration step: 170 rmse :  0.1524161855107764
iteration step: 180 rmse :  0.15055080739628304
iteration step: 190 rmse :  0.14889470913232092


In [45]:
# Wrap the raw predictions in a DataFrame that reuses the rating matrix's
# row and column labels, so predicted scores can be looked up by label.
ratings_pred_mtx = pd.DataFrame(
    data=pred_mtx,
    index=ratings_mtx.index,
    columns=ratings_mtx.columns,
)

# Extract the movies a user has not rated yet.
def get_unseen_movies(rating_mtx, userId):
    """Return the column labels of ``rating_mtx`` that ``userId`` has not rated.

    Args:
        rating_mtx: DataFrame of ratings with one row per user and one
            column per movie. A cell greater than 0 counts as "already
            rated"; anything else (NaN, 0) counts as unseen.
        userId: Row label of the user to inspect.

    Returns:
        List of column labels, in column order, that the user has not rated.
    """
    # Series of this user's ratings, indexed by the matrix's column labels.
    user_rating = rating_mtx.loc[userId, :]

    # Labels with a rating above 0 are the ones already seen. A set is built
    # once so each membership test below is O(1) instead of a list scan.
    already_seen = set(user_rating[user_rating > 0].index)

    # Keep every column label that is not in the already-seen set.
    return [movie for movie in rating_mtx.columns.to_list() if movie not in already_seen]

# Pick the target user and collect the titles they have not rated yet.
usr_id = 9
unseen_list = get_unseen_movies(rating_mtx=ratings_mtx, userId=usr_id)

# Recommend movies with latent-factor collaborative filtering.
def recomm_movie_by_userId(pred_df, userId, unseen_list, top_n = 10):
    """Return the ``top_n`` highest predicted scores among ``unseen_list``.

    Args:
        pred_df: DataFrame of predicted ratings, one row per user and one
            column per movie.
        userId: Row label of the user to recommend for.
        unseen_list: Column labels to consider as candidates.
        top_n: Maximum number of recommendations to return.

    Returns:
        Series of predicted scores indexed by column label, sorted from
        highest to lowest and truncated to the first ``top_n`` entries.
    """
    candidate_scores = pred_df.loc[userId, unseen_list]
    # .iloc makes the top-N cut explicitly positional, so it does not depend
    # on how Series[...] interprets an integer slice for the index dtype at
    # hand (e.g. when the columns are integer movie ids rather than titles).
    return candidate_scores.sort_values(ascending=False).iloc[:top_n]

# Score the unseen titles for the target user and keep the ten best.
recomm_movies = recomm_movie_by_userId(ratings_pred_mtx, usr_id, unseen_list, top_n=10)

# Turn the resulting Series into a one-column table for display.
recomm_movies = pd.DataFrame(
    data=recomm_movies.values,
    index=recomm_movies.index,
    columns=['pred_score'],
)
recomm_movies


Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Rear Window (1954),5.704612
"South Park: Bigger, Longer and Uncut (1999)",5.4511
Rounders (1998),5.298393
Blade Runner (1982),5.244951
Roger & Me (1989),5.191962
Gattaca (1997),5.183179
Ben-Hur (1959),5.130463
Rosencrantz and Guildenstern Are Dead (1990),5.087375
"Big Lebowski, The (1998)",5.03869
Star Wars: Episode V - The Empire Strikes Back (1980),4.989601
