<a href="https://colab.research.google.com/github/jayarnim/RS/blob/main/CollaborativeFiltering/(2)_Matrix_Factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

In [None]:
# Raw-GitHub locations of the four CSV files used by this notebook.
# All files live in the same directory, so the URLs are built from one template.
path_links, path_movies, path_ratings, path_tags = (
    f"https://raw.githubusercontent.com/jayarnim/RS/main/data/{table}.csv"
    for table in ("links", "movies", "ratings", "tags")
)

In [None]:
# Load the four CSV files into DataFrames, in the same order as the path definitions.
links, movies, ratings, tags = map(
    pd.read_csv,
    (path_links, path_movies, path_ratings, path_tags),
)

In [None]:
# Build the user-item matrix: one row per userId, one column per movieId,
# with the rating as the cell value. User/movie pairs without a rating are
# filled with 0.
user_item_matrix = (
    ratings
    .pivot_table(values="rating", index="userId", columns="movieId")
    .fillna(0)
)

# Shape
user_item_matrix.shape

(671, 9066)

# Model Design

In [None]:
def get_rmse(R, P, Q, non_zeros):
    """Compute the RMSE between observed ratings and their reconstruction P @ Q.T.

    Only the positions listed in ``non_zeros`` contribute to the error.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix, indexed as ``R[row, col]``.
    P : numpy.ndarray
        Row (user) factor matrix; ``P @ Q.T`` must have the same shape as ``R``.
    Q : numpy.ndarray
        Column (item) factor matrix.
    non_zeros : sequence
        Sequence of entries whose first two elements are the row index and the
        column index of an observed rating (any further elements are ignored).

    Returns
    -------
    float
        Root mean squared error over the listed positions.

    Raises
    ------
    ValueError
        If ``non_zeros`` is empty, since the mean error is undefined.
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros must contain at least one observed rating")

    # Replace NaNs in the factors so one diverged entry does not poison the
    # whole score. nan_to_num returns a copy, so the caller's arrays are untouched.
    if np.isnan(P).any(): P = np.nan_to_num(P)
    if np.isnan(Q).any(): Q = np.nan_to_num(Q)
    full_pred_matrix = np.dot(P, Q.T)

    # Row / column positions of the observed ratings.
    row_idx = np.asarray([entry[0] for entry in non_zeros], dtype=np.intp)
    col_idx = np.asarray([entry[1] for entry in non_zeros], dtype=np.intp)

    # Observed values from R and the predictions at the same positions.
    observed = R[row_idx, col_idx]
    predicted = full_pred_matrix[row_idx, col_idx]

    mse = np.mean((observed - predicted) ** 2)
    rmse = np.sqrt(mse)

    return rmse

In [None]:
def matrix_factorization(R, K, steps, learning_rate = 0.01, r_lambda=0.01):
    """Factorize a rating matrix into latent factors P and Q with SGD.

    Only entries of ``R`` that are strictly greater than 0 are treated as
    observed ratings; every other entry is ignored during training.

    Parameters
    ----------
    R : numpy.ndarray
        2-D rating matrix of shape (num_users, num_items).
    K : int
        Number of latent factors.
    steps : int
        Number of full passes over the observed ratings.
    learning_rate : float, default 0.01
        SGD step size.
    r_lambda : float, default 0.01
        L2 regularization strength applied to both factor matrices.

    Returns
    -------
    (P, Q) : tuple of numpy.ndarray
        ``P`` has shape (num_users, K) and ``Q`` has shape (num_items, K);
        ``np.dot(P, Q.T)`` is the reconstructed rating matrix.

    Notes
    -----
    Calls ``np.random.seed(1)``, which resets NumPy's global random state, so
    that repeated runs start from the same initial factors.
    """
    num_users, num_items = R.shape

    np.random.seed(1)
    P = np.random.normal(scale = 1.0/K, size = (num_users, K))
    Q = np.random.normal(scale = 1.0/K, size = (num_items, K))

    # Collect (row, col, value) for every entry with R > 0. np.nonzero scans in
    # row-major order, i.e. the same order as a nested user/item loop, without
    # visiting every cell from Python.
    rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    # Update P and Q with stochastic gradient descent.
    for step in tqdm(range(steps)):
        # Reset any NaN that appeared in a diverged factor before the next pass.
        if np.isnan(P).any(): P = np.nan_to_num(P)
        if np.isnan(Q).any(): Q = np.nan_to_num(Q)

        for u, i, r in non_zeros:
            # Snapshot the user factors: both updates below must be evaluated
            # at the same (pre-update) parameters the residual was computed from.
            p_u = P[u, :].copy()
            q_i = Q[i, :]

            # Residual between the observed rating and the current prediction.
            eui = r - np.dot(p_u, q_i)

            # Gradient step on the regularized squared error.
            P[u, :] += learning_rate*(eui * q_i - r_lambda*p_u)
            Q[i, :] += learning_rate*(eui * p_u - r_lambda*q_i)

        # The RMSE needs a full P.Q^T product, so only compute it when it is reported.
        if step % 10 == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print(f"iter step: {step}, rmse: {rmse}")

    return P, Q

# Learning

In [None]:
# Train the factor matrices on the dense rating array, then rebuild the
# full (users x items) matrix of predicted ratings from them.
P, Q = matrix_factorization(R=user_item_matrix.to_numpy(), K=100, steps=50)
predicted_matrix = P @ Q.T

  2%|▏         | 1/50 [00:01<01:23,  1.70s/it]

iter step: 0, rmse: 2.852961036616503


 22%|██▏       | 11/50 [00:19<01:07,  1.73s/it]

iter step: 10, rmse: 0.7942619655432082


 42%|████▏     | 21/50 [00:38<00:53,  1.85s/it]

iter step: 20, rmse: 0.5597026920858378


 62%|██████▏   | 31/50 [00:56<00:33,  1.74s/it]

iter step: 30, rmse: 0.37758995933610895


 82%|████████▏ | 41/50 [01:16<00:16,  1.85s/it]

iter step: 40, rmse: 0.2658245824487776


100%|██████████| 50/50 [01:32<00:00,  1.86s/it]


In [None]:
# Wrap the predicted ratings in a DataFrame that carries the same userId rows
# and movieId columns as the original user-item matrix.
predicted_df = pd.DataFrame(
    data=predicted_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)
predicted_df

movieId,1,2,3,4,5,6,7,8,9,10,...,161084,161155,161594,161830,161918,161944,162376,162542,162672,163949
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.862048,2.545665,2.669003,1.473998,2.449499,2.806511,2.174200,2.171303,2.176480,2.280407,...,1.042588,0.247203,1.483670,0.515817,0.792772,2.516065,2.231368,3.159084,1.905684,2.072894
2,3.035868,3.404810,1.812353,2.377089,2.424039,3.594086,3.643860,2.952377,2.563558,3.912245,...,1.327911,0.259376,1.894361,0.637339,1.002508,3.290543,2.918332,4.151649,2.455332,2.767774
3,2.836235,3.130450,3.175177,2.065942,2.271966,3.893897,3.503871,2.937875,2.969968,3.427088,...,1.489122,0.302352,2.139691,0.747200,1.112490,3.489419,3.250415,4.388252,2.649621,2.992869
4,4.465329,4.641176,3.718999,3.315402,3.667164,3.343913,4.211116,3.311140,2.906580,4.309818,...,1.709778,0.352537,2.564828,0.890046,1.422814,3.879800,3.768254,4.985445,2.972922,3.455076
5,3.835445,3.965718,3.661853,2.335153,2.950381,3.097794,3.914294,3.468331,3.309351,3.568865,...,1.505315,0.274887,2.134446,0.776219,1.190914,3.646647,3.198215,4.606730,2.763974,3.021768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,3.799416,3.548524,2.930668,2.016984,2.324489,3.829280,3.621154,3.037160,2.486559,2.821966,...,1.318761,0.240220,1.963972,0.696063,1.069642,3.347023,2.950014,4.180996,2.505146,2.603361
668,3.749078,3.187730,2.253768,2.183236,2.753531,3.646313,2.715619,2.706966,2.558609,3.386557,...,1.514676,0.302895,2.037719,0.711483,1.058641,2.948853,3.075668,3.924048,2.358137,3.024250
669,4.361787,3.068350,2.366132,2.378154,2.854216,3.533280,3.180171,2.651376,2.307620,3.307014,...,1.446611,0.313106,2.104793,0.766603,1.116537,3.137774,3.175687,4.099801,2.460249,2.904998
670,4.194951,3.717980,3.448439,1.673645,2.508906,2.674795,2.887335,2.952292,2.512035,3.625831,...,1.398783,0.262908,2.118040,0.770212,1.134049,3.459299,3.215640,4.238288,2.587579,2.856409


# Recommendation

In [None]:
def recommenders(user_id, K, user_item_matrix=user_item_matrix, predicted_df=predicted_df):
    """Return the titles of the K best-predicted movies the user has not rated.

    Parameters
    ----------
    user_id : label
        Row label (userId) of the target user in both ``user_item_matrix`` and
        ``predicted_df``.
    K : int
        Maximum number of titles to return.
    user_item_matrix : pandas.DataFrame
        Observed ratings (rows: userId, columns: movieId); 0 marks "not rated".
    predicted_df : pandas.DataFrame
        Predicted ratings with the same row and column labels.

    Returns
    -------
    pandas.Series
        Movie titles taken from the notebook-level ``movies`` frame, ordered
        from highest to lowest predicted rating. The index is the matching row
        labels of ``movies``.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of the frames.

    Notes
    -----
    The default frames are bound when this cell is executed; re-run the cell
    (or pass the frames explicitly) after re-training.
    """
    # Select the user by label here as well, so the ratings row and the
    # predictions row below refer to the same user.
    user_ratings = user_item_matrix.loc[user_id]

    # Masking the row (a Series) keeps only the movieIds the user has not
    # rated; masking a one-row DataFrame would keep every column.
    unseen_items = user_ratings[user_ratings == 0].index

    ranked = predicted_df.loc[user_id, unseen_items].sort_values(ascending=False)
    top_ids = ranked.index[:K]

    # isin() yields rows in `movies` order, so restore the predicted-rating order.
    candidates = movies[movies["movieId"].isin(top_ids)]
    rank_of = pd.Series(np.arange(len(top_ids)), index=top_ids)
    order = np.argsort(candidates["movieId"].map(rank_of).to_numpy(), kind="stable")
    top_k_items = candidates.iloc[order]["title"]
    return top_k_items

In [None]:
# Top-10 recommendations for the user passed as user_id=10.
recommenders(user_id=10, K=10)

632             Maya Lin: A Strong Clear Vision (1994)
1001                              Graduate, The (1967)
1102                                Sling Blade (1996)
1486               There's Something About Mary (1998)
1626                            Out of the Past (1947)
2062                                Matrix, The (1999)
2248                         Christmas Story, A (1983)
2357                       Man Facing Southeast (1986)
3498                        Lion in Winter, The (1968)
5523    Gun Crazy (a.k.a. Deadly Is the Female) (1949)
Name: title, dtype: object