In [4]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, num_users, num_items, n_factors=20):
        super(MatrixFactorization, self).__init__()
        self.n_factors = n_factors

        self.user_factors = torch.randn(num_users, n_factors).requires_grad_()
        self.item_factors = torch.randn(n_factors, num_items).requires_grad_()

    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [6]:
def NMF(input_matrix, factors, epochs=200, learning_rate=0.008, weight_decay=0.01):
    num_users, num_items = input_matrix.shape

    model = MatrixFactorization(num_users, num_items, factors)
    optimizer = torch.optim.SGD([model.item_factors, model.user_factors], lr=learning_rate)
    loss_function = torch.nn.MSELoss()

    loss_curve = []

    for epoch in range(epochs):
        optimizer.zero_grad()
        mask = ~torch.isnan(input_matrix)

        base = (model.user_factors @ model.item_factors)[mask].clone().detach().requires_grad_(True)
        loss = loss_function((model.user_factors @ model.item_factors)[mask], input_matrix[mask])
        loss.backward()
        optimizer.step()

        model.user_factors.data.clamp_(min=0)
        model.item_factors.data.clamp_(min=0)

        loss_curve.append(loss.item())

        if epoch % 25 == 0:
            print(f'[{epoch}] loss: {loss.item():.05f}')
            print("query matrix:", base)
            print("target matrix:", input_matrix[mask])

    return model.user_factors @ model.item_factors

# Data Preprocessing

In [7]:
movies = pd.read_table(
    'dataset/ml-1m/movies.dat',
    sep="::",
    engine="python",
    names=["MovieID", "Title", "Genres"],
    encoding='latin1'
)

In [8]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [9]:
ratings = pd.read_table(
    'dataset/ml-1m/ratings.dat',
    sep="::",
    engine="python",
    names=["UserID", "MovieID", "Rating", "Timestamp"]
)

In [10]:
ratings = ratings[['UserID', 'MovieID', 'Rating']]

In [11]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   UserID   1000209 non-null  int64
 1   MovieID  1000209 non-null  int64
 2   Rating   1000209 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB


In [12]:
ratings[ratings['UserID'] == 1612]

Unnamed: 0,UserID,MovieID,Rating
265393,1612,589,4
265394,1612,1266,5
265395,1612,590,5
265396,1612,599,5
265397,1612,1283,4
265398,1612,3030,5
265399,1612,2401,3
265400,1612,3253,3
265401,1612,3624,4
265402,1612,3671,3


In [13]:
from sklearn.model_selection import train_test_split

# split train dataset and test dataset
X_train, X_test, Y_train, Y_test = train_test_split(ratings[["UserID", "MovieID"]], ratings[["Rating"]], test_size=0.33, stratify=ratings[["Rating"]])

In [14]:
train_dataset = pd.concat([X_train, Y_train], axis=1)
train_ratings_movies = pd.merge(train_dataset, movies, on='MovieID', how="outer")
train_ratings_matrix = train_ratings_movies.pivot_table('Rating', index='UserID', columns='MovieID')

train_ratings_movies_no_review = train_ratings_movies[train_ratings_movies['UserID'].isna()]["MovieID"].values
print(train_ratings_movies_no_review)

for i in train_ratings_movies_no_review:
    train_ratings_matrix[i] = np.nan

train_ratings_matrix

[  51  109  115  143  284  285  286  311  395  396  398  399  400  401
  403  545  584  604  620  625  629  636  644  654  675  676  683  684
  690  693  699  713  721  723  727  729  738  739  752  763  768  770
  772  773  777  792  794  795  797  812  814  816  819  822  825  845
  855  856  857  859  868  871  872  873  890  894  979  983  989 1001
 1045 1052 1065 1075 1106 1108 1109 1110 1122 1133 1137 1140 1141 1143
 1146 1155 1156 1157 1158 1159 1165 1166 1308 1309 1314 1316 1318 1319
 1364 1368 1386 1400 1424 1443 1448 1462 1467 1524 1557 1558 1559 1568
 1577 1578 1628 1697 1698 1705 1706 1708 1709 1710 1716 1723 1724 1738
 1740 1742 1757 1765 1768 1773 1774 1776 1781 1787 1789 1819 1820 1830
 1832 1847 1915 2030 2039 2198 2199 2213 2216 2218 2220 2222 2223 2224
 2225 2226 2228 2229 2230 2235 2251 2270 2274 2277 2319 2438 2489 2508
 2547 2556 2564 2588 2592 2595 2601 2603 2604 2680 2684 2698 2742 2811
 2832 2838 2910 2954 2957 2958 2980 3009 3023 3059 3065 3080 3170 3172
 3191 

  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matr

MovieID,1,2,3,4,5,6,7,8,9,10,...,3583,3589,3607,3630,3650,3750,3829,3856,3888,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037.0,,,,,,,,,,,...,,,,,,,,,,
6038.0,,,,,,,,,,,...,,,,,,,,,,
6039.0,,,,,,,,,,,...,,,,,,,,,,


In [15]:
train_ratings_matrix_tensor = torch.Tensor(train_ratings_matrix.values)

train_ratings_matrix_tensor

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [3., nan, nan,  ..., nan, nan, nan]])

In [18]:
nmf_run = NMF(train_ratings_matrix_tensor, 50, epochs=1000, learning_rate=0.05)

[0] loss: 63.88146
query matrix: tensor([ -0.6690, -14.0964,  -5.4521,  ...,  -0.8872, -14.2313,   6.4256],
       requires_grad=True)
target matrix: tensor([5., 4., 5.,  ..., 4., 4., 5.])
[25] loss: 31.20033
query matrix: tensor([ 7.1579,  3.1850,  5.6819,  ..., 12.1479,  4.6623, 13.6308],
       requires_grad=True)
target matrix: tensor([5., 4., 5.,  ..., 4., 4., 5.])
[50] loss: 30.51765
query matrix: tensor([ 7.1331,  3.1137,  5.6448,  ..., 12.0285,  4.6237, 13.5713],
       requires_grad=True)
target matrix: tensor([5., 4., 5.,  ..., 4., 4., 5.])
[75] loss: 29.86369
query matrix: tensor([ 7.1087,  3.0452,  5.6086,  ..., 11.9156,  4.5869, 13.5128],
       requires_grad=True)
target matrix: tensor([5., 4., 5.,  ..., 4., 4., 5.])
[100] loss: 29.23651
query matrix: tensor([ 7.0846,  2.9792,  5.5733,  ..., 11.8095,  4.5514, 13.4552],
       requires_grad=True)
target matrix: tensor([5., 4., 5.,  ..., 4., 4., 5.])
[125] loss: 28.63456
query matrix: tensor([ 7.0608,  2.9168,  5.5389,  ...

In [19]:
nmf_run_df = pd.DataFrame(nmf_run.detach().numpy())
nmf_run_df.index = train_ratings_matrix.index
nmf_run_df.columns = train_ratings_matrix.columns

nmf_run_df

MovieID,1,2,3,4,5,6,7,8,9,10,...,3583,3589,3607,3630,3650,3750,3829,3856,3888,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,7.707134,6.548327,2.288252,3.880290,10.959547,7.636924,6.360629,2.342788,8.280111,1.857353,...,7.115367,13.141130,5.922108,5.876001,3.390322,4.868571,4.550985,4.799603,4.706707,2.525662
2.0,6.485924,7.497237,7.838967,7.568127,13.973622,9.116263,9.779565,7.565547,9.055064,7.569561,...,7.038107,16.724285,8.680594,7.819685,7.258132,6.729010,7.655116,6.531838,7.043122,5.965865
3.0,8.739446,8.096189,8.108049,6.765227,11.133134,9.539337,7.415297,7.497614,10.398784,10.183462,...,4.505540,12.517785,9.879208,5.879658,6.797884,8.970874,10.461646,10.146134,8.245473,4.718046
4.0,3.319355,9.359033,6.938433,11.342104,10.230756,6.188112,12.438416,9.353330,10.580575,6.368698,...,13.043844,14.436787,5.763062,7.979859,4.938890,7.495041,3.074807,5.233018,8.035683,5.340857
5.0,3.048154,1.907055,3.536581,5.132673,8.118852,2.141488,3.528019,2.347401,6.642125,5.705870,...,3.911346,9.359712,4.832688,2.768316,4.937232,3.974353,5.844777,1.631952,6.980395,3.740635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,7.606223,8.624387,3.887067,4.602986,9.111892,5.316177,8.566813,8.261905,7.502414,3.517835,...,10.043756,15.902511,4.259898,4.885484,6.793024,11.931664,8.691433,6.429267,15.309528,7.506709
6037.0,6.515486,3.931666,4.839049,8.534121,5.008006,2.807758,12.110806,8.492108,6.455790,7.087258,...,5.513833,21.615683,4.894081,8.274766,5.294689,9.584567,10.720985,6.520925,10.493561,10.294248
6038.0,3.630445,4.599528,1.242643,7.517707,5.852960,3.353960,3.698405,2.825566,5.638689,7.194855,...,2.945055,8.995493,4.161080,2.639357,4.698537,9.005645,4.479011,4.898257,5.205176,2.470534
6039.0,5.211269,7.306359,7.575188,12.366103,16.818428,5.614626,15.085896,9.835941,12.267246,6.594033,...,9.738016,17.027313,8.929976,8.286286,8.198546,10.830801,12.442554,10.041059,9.356113,9.209588


In [20]:
train_ratings_matrix

MovieID,1,2,3,4,5,6,7,8,9,10,...,3583,3589,3607,3630,3650,3750,3829,3856,3888,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037.0,,,,,,,,,,,...,,,,,,,,,,
6038.0,,,,,,,,,,,...,,,,,,,,,,
6039.0,,,,,,,,,,,...,,,,,,,,,,


In [21]:
def get_watched_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    already_seen = user_rating[user_rating > 0].index.tolist()
    movies_list = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in movies_list if movie in already_seen]
    return unseen_list

In [22]:
def get_unseen_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    already_seen = user_rating[user_rating > 0].index.tolist()
    movies_list = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    return unseen_list

In [23]:
def recommend_movie_by_userid(ratings_matrix, prd_df, userId, top_n=10):
    unseen_list = get_unseen_movies(ratings_matrix, userId)
    recommend_movies = prd_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recommend_movies

In [24]:
recommend_movie_by_userid(train_ratings_matrix, nmf_run_df, 4, 10)

MovieID
374     22.045078
3390    21.643080
2083    21.571043
779     21.514101
2661    19.751644
2845    19.603918
739     19.477932
399     18.982264
2235    18.860210
2484    18.827888
Name: 4.0, dtype: float32

Validation

In [25]:
test_dataset = pd.concat([X_test, Y_test], axis=1)
test_ratings_movies = pd.merge(test_dataset, movies, on='MovieID', how="outer")
test_ratings_matrix = test_ratings_movies.pivot_table('Rating', index='UserID', columns='MovieID')

test_ratings_movies_no_review = test_ratings_movies[test_ratings_movies['UserID'].isna()]["MovieID"].values

for i in test_ratings_movies_no_review:
    test_ratings_matrix[i] = np.nan

test_ratings_matrix


# train_dataset = pd.concat([X_train, Y_train], axis=1)
# train_ratings_movies = pd.merge(train_dataset, movies, on='MovieID', how="outer")
# train_ratings_matrix = train_ratings_movies.pivot_table('Rating', index='UserID', columns='MovieID')
#
# train_ratings_movies_no_review = train_ratings_movies[train_ratings_movies['UserID'].isna()]["MovieID"].values
# print(train_ratings_movies_no_review)
#
# for i in train_ratings_movies_no_review:
#     train_ratings_matrix[i] = np.nan

  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings

MovieID,1,2,3,4,5,6,7,8,9,10,...,3762,3779,3829,3842,3856,3881,3890,3891,3904,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,5.0,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,,,,,,,,,,,...,,,,,,,,,,
6037.0,,,,,,,,,,,...,,,,,,,,,,
6038.0,,,,,,,,,,,...,,,,,,,,,,
6039.0,,,,,,,,,,,...,,,,,,,,,,


In [26]:
test_ratings_matrix_tensor = torch.Tensor(test_ratings_matrix.values)

test_ratings_matrix_tensor

tensor([[5., nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]])

In [27]:
def get_evaluation(query_matrix, target_matrix):

    mask = ~torch.isnan(target_matrix)

    print(query_matrix[mask])
    print(target_matrix[mask])

    mse = mean_squared_error(query_matrix[mask].detach().numpy(), target_matrix[mask].detach().numpy())
    r2 = r2_score(query_matrix[mask].detach().numpy(), target_matrix[mask].detach().numpy())
    rmse = np.sqrt(mse)

    return rmse, r2

In [28]:
eval_rmse, eval_r2 = get_evaluation(nmf_run, test_ratings_matrix_tensor)

tensor([ 7.7071,  3.0979,  3.9012,  ...,  3.4748, 14.3660, 13.5652],
       grad_fn=<IndexBackward0>)
tensor([5., 5., 4.,  ..., 4., 4., 4.])


In [29]:
print("RMSE:", eval_rmse, "R2 Score:", eval_r2)

RMSE: 4.6760654 R2 Score: -1.4221170889154027
