**Movielens 데이터를 가지고 SVD 기반 잠재 요인 협업 필터링 연습**

In [1]:
import numpy as np
import pandas as pd
import warnings 
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [3]:
def get_rmse(R, P, Q, non_zeros):
    err = 0
    
    # 두 개의 분해된 행렬 P와 Q.T의 내적 곱으로 예측 R 행렬 생성
    full_pred_matrix = np.dot(P, Q.T)
    
    x_non_zero_idx = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_idx = [non_zero[1] for non_zero in non_zeros]
    
    R_non_zeros = R[x_non_zero_idx, y_non_zero_idx]
    
    full_pred_matrix_non_zeros = full_pred_matrix[x_non_zero_idx, y_non_zero_idx]
    
    mse = mean_squared_error(R_non_zeros, full_pred_matrix_non_zeros)
    
    return mse
    

In [12]:
def matrix_factorization(R, K, steps=200, lr = 0.01, r_lambda = 0.01):
    num_users, num_items = R.shape
    
    np.random.seed(1)
    
    P = np.random.normal(scale=1 / K, size=(num_users, K))
    Q= np.random.normal(scale=1 / K, size=(num_items, K))
    
    # R > 0인 행 위치, 열 위치 값을 non_zeros 리스트 객체에 저장
    non_zeros = [(i, j, R[i, j]) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]
    
    for step in tqdm(range(steps)):
        for i, j, r in non_zeros:
            # 실제 값과 예측값의 차이인 오류 값 계산
            eij = r - np.dot(P[i, :], Q[j, :].T)
            
            P[i, :] = P[i, :] + lr * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + lr * (eij * P[i, :] - r_lambda * Q[j, :])
            
        rmse = get_rmse(R, P, Q, non_zeros)
        
        if step % 20 == 0:
            print(f'Iter : {step}, RMSE: {rmse}')
    return P, Q

In [5]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')

# title 컬럼을 얻기 이해 movies 와 조인 수행
rating_movies = pd.merge(ratings, movies, on='movieId')

# columns='title' 로 title 컬럼으로 pivot 수행. 
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')

In [13]:
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, lr=0.01, r_lambda = 0.01)
pred_matrix = np.dot(P, Q.T)

  0%|▍                                                                                 | 1/200 [00:01<04:45,  1.44s/it]

Iter : 0, RMSE: 8.423705034701914


 10%|████████▌                                                                        | 21/200 [00:28<04:03,  1.36s/it]

Iter : 20, RMSE: 0.2616873953526066


 20%|████████████████▌                                                                | 41/200 [00:56<03:47,  1.43s/it]

Iter : 40, RMSE: 0.08766445000765151


 30%|████████████████████████▋                                                        | 61/200 [01:59<10:09,  4.38s/it]

Iter : 60, RMSE: 0.05056878035552725


 40%|████████████████████████████████▊                                                | 81/200 [03:30<08:28,  4.27s/it]

Iter : 80, RMSE: 0.0376880828845251


 50%|████████████████████████████████████████▍                                       | 101/200 [04:56<07:23,  4.48s/it]

Iter : 100, RMSE: 0.031481147723579454


 60%|████████████████████████████████████████████████▍                               | 121/200 [06:25<05:44,  4.36s/it]

Iter : 120, RMSE: 0.027872910023915145


 70%|████████████████████████████████████████████████████████▍                       | 141/200 [07:09<01:57,  1.99s/it]

Iter : 140, RMSE: 0.02552546850158906


 80%|████████████████████████████████████████████████████████████████▍               | 161/200 [07:46<01:12,  1.87s/it]

Iter : 160, RMSE: 0.02388075155171797


 90%|████████████████████████████████████████████████████████████████████████▍       | 181/200 [08:24<00:34,  1.81s/it]

Iter : 180, RMSE: 0.02266554560767272


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [09:02<00:00,  2.71s/it]


In [14]:
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index= ratings_matrix.index, columns = ratings_matrix.columns)

ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941


In [15]:
def get_unseen_movies(ratings_matrix, userId):
    
    # userId로 입력받은 사용자의 모든 영화정보 추출하여 Series로 반환함. 
    # 반환된 user_rating 은 영화명(title)을 index로 가지는 Series 객체임. 
    user_rating = ratings_matrix.loc[userId,:]
    
    # user_rating이 0보다 크면 기존에 관람한 영화임. 대상 index를 추출하여 list 객체로 만듬
    already_seen = user_rating[ user_rating > 0].index.tolist()
    
    # 모든 영화명을 list 객체로 만듬. 
    movies_list = ratings_matrix.columns.tolist()
    
    # list comprehension으로 already_seen에 해당하는 movie는 movies_list에서 제외함. 
    unseen_list = [ movie for movie in movies_list if movie not in already_seen]
    
    return unseen_list


def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    
    # 예측 평점 DataFrame에서 사용자id index와 unseen_list로 들어온 영화명 컬럼을 추출하여
    # 가장 예측 평점이 높은 순으로 정렬함. 
    recomm_movies = pred_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recomm_movies

In [16]:
# 사용자가 관람하지 않는 영화명 추출   
unseen_list = get_unseen_movies(ratings_matrix, 9)

# 아이템 기반의 인접 이웃 협업 필터링으로 영화 추천 
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# 평점 데이타를 DataFrame으로 생성. 
recomm_movies = pd.DataFrame(data=recomm_movies.values,index=recomm_movies.index,columns=['pred_score'])
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Rear Window (1954),5.704612
"South Park: Bigger, Longer and Uncut (1999)",5.4511
Rounders (1998),5.298393
Blade Runner (1982),5.244951
Roger & Me (1989),5.191962
Gattaca (1997),5.183179
Ben-Hur (1959),5.130463
Rosencrantz and Guildenstern Are Dead (1990),5.087375
"Big Lebowski, The (1998)",5.03869
Star Wars: Episode V - The Empire Strikes Back (1980),4.989601
