In [14]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import mean_squared_error, r2_score

In [15]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, num_users, num_items, n_factors=20):
        super(MatrixFactorization, self).__init__()
        self.n_factors = n_factors

        self.user_factors = torch.randn(num_users, n_factors).requires_grad_()
        self.item_factors = torch.randn(n_factors, num_items).requires_grad_()

    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [16]:
def NMF(input_matrix, factors, epochs=200, learning_rate=0.008, weight_decay=0.01):
    num_users, num_items = input_matrix.shape

    model = MatrixFactorization(num_users, num_items, factors)
    optimizer = torch.optim.Adam([model.item_factors, model.user_factors], lr=learning_rate)
    loss_function = torch.nn.MSELoss()

    loss_curve = []

    for epoch in range(epochs):
        optimizer.zero_grad()
        mask = ~torch.isnan(input_matrix)

        base = (model.user_factors @ model.item_factors)[mask].clone().detach().requires_grad_(True)
        loss = loss_function((model.user_factors @ model.item_factors)[mask], input_matrix[mask])
        loss.backward()
        optimizer.step()

        model.user_factors.data.clamp_(min=0)
        model.item_factors.data.clamp_(min=0)

        loss_curve.append(loss.item())

        if epoch % 25 == 0:
            print(f'[{epoch}] loss: {loss.item():.05f}')
            print("query matrix:", base)
            print("target matrix:", input_matrix[mask])

    return model.user_factors @ model.item_factors

# Data Preprocessing

In [17]:
movies = pd.read_table(
    'dataset/ml-1m/movies.dat',
    sep="::",
    engine="python",
    names=["MovieID", "Title", "Genres"],
    encoding='latin1'
)

In [18]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [19]:
ratings = pd.read_table(
    'dataset/ml-1m/ratings.dat',
    sep="::",
    engine="python",
    names=["UserID", "MovieID", "Rating", "Timestamp"]
)

In [20]:
ratings = ratings[['UserID', 'MovieID', 'Rating']]

In [21]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype
---  ------   --------------    -----
 0   UserID   1000209 non-null  int64
 1   MovieID  1000209 non-null  int64
 2   Rating   1000209 non-null  int64
dtypes: int64(3)
memory usage: 22.9 MB


In [22]:
ratings[ratings['UserID'] == 1612]

Unnamed: 0,UserID,MovieID,Rating
265393,1612,589,4
265394,1612,1266,5
265395,1612,590,5
265396,1612,599,5
265397,1612,1283,4
265398,1612,3030,5
265399,1612,2401,3
265400,1612,3253,3
265401,1612,3624,4
265402,1612,3671,3


In [23]:
from sklearn.model_selection import train_test_split

# split train dataset and test dataset
X_train, X_test, Y_train, Y_test = train_test_split(ratings[["UserID", "MovieID"]], ratings[["Rating"]], test_size=0.33, stratify=ratings[["Rating"]])

In [24]:
train_dataset = pd.concat([X_train, Y_train], axis=1)
train_ratings_movies = pd.merge(train_dataset, movies, on='MovieID', how="outer")
train_ratings_matrix = train_ratings_movies.pivot_table('Rating', index='UserID', columns='MovieID')

train_ratings_movies_no_review = train_ratings_movies[train_ratings_movies['UserID'].isna()]["MovieID"].values
print(train_ratings_movies_no_review)

for i in train_ratings_movies_no_review:
    train_ratings_matrix[i] = np.nan

train_ratings_matrix

[  51  109  115  133  138  139  143  284  285  395  399  400  401  402
  403  439  576  584  604  620  625  629  636  642  651  654  675  676
  679  683  684  693  699  701  713  717  721  723  727  738  739  752
  758  768  770  772  773  777  792  794  795  797  812  816  819  822
  825  845  855  856  857  865  868  871  873  878  890  894  979  983
 1001 1045 1052 1065 1070 1075 1106 1108 1109 1110 1115 1122 1137 1140
 1141 1143 1146 1155 1156 1157 1158 1159 1166 1308 1309 1314 1316 1318
 1319 1368 1386 1400 1424 1430 1443 1448 1462 1467 1524 1548 1557 1559
 1568 1577 1578 1628 1630 1697 1698 1705 1706 1708 1709 1710 1714 1716
 1723 1724 1738 1740 1742 1757 1765 1768 1774 1776 1781 1789 1795 1819
 1842 1847 1908 2030 2199 2216 2218 2220 2222 2224 2225 2228 2229 2230
 2235 2270 2274 2308 2319 2489 2508 2547 2556 2563 2564 2576 2588 2595
 2601 2603 2604 2619 2680 2684 2698 2832 2838 2910 2954 2957 2958 2980
 3009 3023 3059 3080 3151 3164 3170 3172 3191 3193 3195 3202 3209 3216
 3220 

  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matrix[i] = np.nan
  train_ratings_matr

MovieID,1,2,3,4,5,6,7,8,9,10,...,3650,3651,3687,3722,3750,3829,3856,3881,3904,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,5.0,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,,,,2.0,,,,,,,...,,,,,,,,,,
6037.0,,,,,,,,,,,...,,,,,,,,,,
6038.0,,,,,,,,,,,...,,,,,,,,,,
6039.0,,,,,,,,,,,...,,,,,,,,,,


In [25]:
train_ratings_matrix_tensor = torch.Tensor(train_ratings_matrix.values)

train_ratings_matrix_tensor

tensor([[5., nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]])

In [26]:
nmf_run = NMF(train_ratings_matrix_tensor, 50, epochs=1000, learning_rate=0.015)

[0] loss: 63.83737
query matrix: tensor([  3.2974, -12.1233,   3.9091,  ...,  -2.4112,   5.8023,  -2.0179],
       requires_grad=True)
target matrix: tensor([5., 5., 5.,  ..., 4., 4., 4.])
[25] loss: 4.60081
query matrix: tensor([3.4695, 2.0561, 4.9114,  ..., 4.7292, 2.2032, 2.3928],
       requires_grad=True)
target matrix: tensor([5., 5., 5.,  ..., 4., 4., 4.])
[50] loss: 2.30418
query matrix: tensor([3.9592, 3.4150, 5.0681,  ..., 5.0668, 3.6508, 3.1090],
       requires_grad=True)
target matrix: tensor([5., 5., 5.,  ..., 4., 4., 4.])
[75] loss: 1.59695
query matrix: tensor([4.1442, 3.7306, 4.9758,  ..., 4.6533, 3.5841, 3.1010],
       requires_grad=True)
target matrix: tensor([5., 5., 5.,  ..., 4., 4., 4.])
[100] loss: 1.25771
query matrix: tensor([4.5826, 3.9507, 4.9814,  ..., 4.7583, 3.7294, 3.1790],
       requires_grad=True)
target matrix: tensor([5., 5., 5.,  ..., 4., 4., 4.])
[125] loss: 1.05683
query matrix: tensor([4.6821, 4.0936, 4.9516,  ..., 4.6734, 3.7815, 3.2016],
     

In [27]:
nmf_run_df = pd.DataFrame(nmf_run.detach().numpy())
nmf_run_df.index = train_ratings_matrix.index
nmf_run_df.columns = train_ratings_matrix.columns

nmf_run_df

MovieID,1,2,3,4,5,6,7,8,9,10,...,3650,3651,3687,3722,3750,3829,3856,3881,3904,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,4.848040,3.189363,2.599701,2.690886,2.953907,3.678515,3.402231,3.288023,3.323039,2.818041,...,12.815477,5.754415,5.867908,1.573529,7.264834,4.858797,7.227087,9.045208,8.839818,4.306300
2.0,5.184196,3.293521,4.060660,3.187594,3.678262,4.638399,4.006110,2.754123,2.728889,3.260011,...,7.168482,4.334311,6.159217,4.471703,9.590286,4.683125,5.551384,6.608367,7.004068,4.611779
3.0,4.603663,3.660726,3.496584,2.995654,1.843099,3.046447,4.167714,1.578693,2.922004,4.626464,...,7.681478,10.284344,7.429128,2.998708,9.455853,5.171427,6.520154,7.916936,7.808958,4.928069
4.0,6.499626,3.105597,3.058495,3.606492,1.636151,3.057977,3.793099,2.145013,2.955956,4.683189,...,7.948126,5.622811,8.894956,3.011746,5.312291,3.247801,9.198235,5.583578,7.543453,3.350332
5.0,2.995261,2.670078,1.695522,2.369923,2.496359,3.105526,1.326250,3.117325,1.659283,1.886879,...,8.577444,3.394811,6.682137,1.812343,4.125773,0.663778,7.230131,6.825421,5.622892,3.093388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,4.508447,1.963145,2.215807,2.131046,1.930634,3.393058,2.899270,2.352770,1.273264,2.753289,...,7.603426,5.365980,6.680629,3.139781,5.240784,2.933779,6.155086,5.026023,6.491913,3.457594
6037.0,3.734843,3.183382,2.735752,2.945406,2.873638,3.903618,2.387072,3.928575,2.921319,2.714924,...,10.053030,4.552134,5.344594,2.279794,3.588158,5.819675,6.160648,6.099261,7.344584,5.512695
6038.0,4.741640,3.173543,1.882627,5.334695,2.996348,3.710971,3.187050,2.869212,2.387277,2.812062,...,6.236632,4.914486,10.420261,3.624340,4.529824,1.948795,7.209620,7.638494,5.576539,4.213588
6039.0,4.663660,3.484951,2.564434,2.689116,2.723353,3.012788,3.103903,2.083166,2.095345,3.290568,...,8.155901,7.123187,5.781430,2.202739,7.233784,3.454859,7.355768,7.458550,7.475792,5.371361


In [28]:
train_ratings_matrix

MovieID,1,2,3,4,5,6,7,8,9,10,...,3650,3651,3687,3722,3750,3829,3856,3881,3904,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,5.0,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,,,,2.0,,,,,,,...,,,,,,,,,,
6037.0,,,,,,,,,,,...,,,,,,,,,,
6038.0,,,,,,,,,,,...,,,,,,,,,,
6039.0,,,,,,,,,,,...,,,,,,,,,,


In [29]:
def get_watched_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    already_seen = user_rating[user_rating > 0].index.tolist()
    movies_list = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in movies_list if movie in already_seen]
    return unseen_list

In [30]:
def get_unseen_movies(ratings_matrix, userId):
    user_rating = ratings_matrix.loc[userId, :]
    already_seen = user_rating[user_rating > 0].index.tolist()
    movies_list = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in movies_list if movie not in already_seen]
    return unseen_list

In [31]:
def recommend_movie_by_userid(ratings_matrix, prd_df, userId, top_n=10):
    unseen_list = get_unseen_movies(ratings_matrix, userId)
    recommend_movies = prd_df.loc[userId, unseen_list].sort_values(ascending=False)[:top_n]
    return recommend_movies

In [32]:
recommend_movie_by_userid(train_ratings_matrix, nmf_run_df, 4, 10)

MovieID
138     13.889814
1774    13.615558
604     13.323882
402     13.278839
819     13.213208
890     12.978251
1319    12.744153
3220    12.725655
773     12.675980
3023    12.626591
Name: 4.0, dtype: float32

Validation

In [33]:
test_dataset = pd.concat([X_test, Y_test], axis=1)
test_ratings_movies = pd.merge(test_dataset, movies, on='MovieID', how="outer")
test_ratings_matrix = test_ratings_movies.pivot_table('Rating', index='UserID', columns='MovieID')

test_ratings_movies_no_review = test_ratings_movies[test_ratings_movies['UserID'].isna()]["MovieID"].values

for i in test_ratings_movies_no_review:
    test_ratings_matrix[i] = np.nan

test_ratings_matrix


# train_dataset = pd.concat([X_train, Y_train], axis=1)
# train_ratings_movies = pd.merge(train_dataset, movies, on='MovieID', how="outer")
# train_ratings_matrix = train_ratings_movies.pivot_table('Rating', index='UserID', columns='MovieID')
#
# train_ratings_movies_no_review = train_ratings_movies[train_ratings_movies['UserID'].isna()]["MovieID"].values
# print(train_ratings_movies_no_review)
#
# for i in train_ratings_movies_no_review:
#     train_ratings_matrix[i] = np.nan

  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings_matrix[i] = np.nan
  test_ratings

MovieID,1,2,3,4,5,6,7,8,9,10,...,3779,3800,3828,3829,3842,3856,3887,3888,3890,3907
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036.0,,,,,,3.0,,,,,...,,,,,,,,,,
6037.0,,,,,,,,,,,...,,,,,,,,,,
6038.0,,,,,,,,,,,...,,,,,,,,,,
6039.0,,,,,,,,,,,...,,,,,,,,,,


In [34]:
test_ratings_matrix_tensor = torch.Tensor(test_ratings_matrix.values)

test_ratings_matrix_tensor

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [3., nan, nan,  ..., nan, nan, nan]])

In [35]:
def get_evaluation(query_matrix, target_matrix):

    mask = ~torch.isnan(target_matrix)

    print(query_matrix[mask])
    print(target_matrix[mask])

    mse = mean_squared_error(query_matrix[mask].detach().numpy(), target_matrix[mask].detach().numpy())
    r2 = r2_score(query_matrix[mask].detach().numpy(), target_matrix[mask].detach().numpy())
    rmse = np.sqrt(mse)

    return rmse, r2

In [36]:
eval_rmse, eval_r2 = get_evaluation(nmf_run, test_ratings_matrix_tensor)

tensor([3.7976, 2.3852, 4.4166,  ..., 3.7037, 2.7374, 2.0808],
       grad_fn=<IndexBackward0>)
tensor([4., 4., 4.,  ..., 4., 4., 5.])


In [37]:
print("RMSE:", eval_rmse, "R2 Score:", eval_r2)

RMSE: 1.4187696 R2 Score: -0.7970131893439534
