In [21]:
import numpy as np
import pandas as pd

In [22]:
# Read in the data
movies = pd.read_csv('movie.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tag.csv')

In [23]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
movies['movieId'].unique().size

4999

In [25]:
ratings = ratings.sample(frac=1)
train_rating = ratings[:int(0.8*len(ratings))]
test_rating = ratings[int(0.8*len(ratings)):]

In [28]:
train_rating.shape, test_rating.shape

((652406, 4), (163102, 4))

In [29]:
movies['title'].unique().size

4999

In [30]:
train_rating['userId'].unique().size

7119

In [31]:
train_rating['userId'].unique().size*movies['title'].unique().size

35587881

In [32]:
# su = 0
# nu = 0
# for i, row in ratings.iterrows():
#     if(row['userId'] == 1):
#         su+=row['rating']
#         nu+=1
# su, nu, su/nu

In [33]:
user_item_matrix = train_rating.pivot(index='movieId', columns='userId', values='rating')

In [34]:
user_item_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,7111,7112,7113,7114,7115,7116,7117,7118,7119,7120
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,5.0,,4.0,,4.0,...,,,,,,4.0,4.0,,5.0,4.5
2,3.5,,,,,,,,,,...,,,,,,,,,,4.0
3,,4.0,,,,3.0,3.0,5.0,,,...,,,,,,,4.0,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,4.0,3.5,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5089,,,,,,,,,,,...,,,,,,,,,,
5090,,,,,,,,,,,...,,,,,,,,,,
5092,,,,,,,,,,,...,,,,,,,,,,
5093,,,,,,,,,,,...,,,,,,,,,,


The movies which did not have any ratings have been removed

In [35]:
um_mat_np = user_item_matrix.to_numpy()
masks = np.isnan(um_mat_np)
masked_arr = np.ma.masked_array(um_mat_np, masks)


In [36]:
rating_means = np.mean(masked_arr, axis=1)

In [37]:
filled_matrix = (masked_arr.T).filled(rating_means).T
print(filled_matrix)
filled_matrix = filled_matrix - rating_means.data[:,np.newaxis]

[[3.96819961 3.96819961 4.         ... 3.96819961 5.         4.5       ]
 [3.5        3.29044517 3.29044517 ... 3.29044517 3.29044517 4.        ]
 [3.17877095 4.         3.17877095 ... 3.17877095 3.17877095 3.17877095]
 ...
 [2.625      2.625      2.625      ... 2.625      2.625      2.625     ]
 [2.54545455 2.54545455 2.54545455 ... 2.54545455 2.54545455 2.54545455]
 [2.11904762 2.11904762 2.11904762 ... 2.11904762 2.11904762 2.11904762]]


In [38]:
rating_means

masked_array(data=[3.9681996086105675, 3.290445168295331,
                   3.17877094972067, ..., 2.625, 2.5454545454545454,
                   2.119047619047619],
             mask=[False, False, False, ..., False, False, False],
       fill_value=1e+20)

In [39]:
filled_matrix

array([[0.        , 0.        , 0.03180039, ..., 0.        , 1.03180039,
        0.53180039],
       [0.20955483, 0.        , 0.        , ..., 0.        , 0.        ,
        0.70955483],
       [0.        , 0.82122905, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [40]:
u, s, vh = np.linalg.svd(filled_matrix, full_matrices=False)

In [42]:
s_sqrt = s**0.5
s_sqrt_mt = np.diag(s_sqrt)

In [43]:
v = vh.T

In [44]:
k = 1000
U = u[:,:k]@s_sqrt_mt[:k,:k]

V = v[:,:k]@s_sqrt_mt[:k,:k]

In [45]:
U@V.T

array([[-2.05616550e-02, -1.03388740e-02,  3.09676074e-02, ...,
         1.14214773e-02,  1.01364961e+00,  5.20415399e-01],
       [ 2.30077226e-01, -6.35768394e-03, -2.17449539e-02, ...,
        -1.24089053e-02,  1.96123290e-02,  6.78838535e-01],
       [-2.56495481e-02,  7.73656505e-01,  6.04581331e-02, ...,
        -2.30053553e-03, -9.27158375e-04,  6.70333018e-03],
       ...,
       [ 6.16885523e-03,  1.72187164e-02, -7.09242893e-02, ...,
         4.84585906e-03,  1.27167814e-02, -5.54122469e-03],
       [ 5.85895277e-02, -2.64617231e-02, -2.02643211e-02, ...,
         1.59453798e-02, -1.17673279e-04,  2.45877912e-02],
       [ 3.48797385e-02, -5.19286733e-03,  4.56842672e-02, ...,
        -2.48882764e-02,  2.09142390e-02, -1.92634023e-03]])

In [59]:
svdout = U@V.T+ rating_means.data[:,np.newaxis]

In [60]:
def rmse(true, pred):
    return np.sqrt(np.mean((true-pred)**2))/len(true)

In [61]:
svdout.shape

(4788, 7119)

In [73]:
def rmse(true, pred):
    x = true - pred
    return sum([xi*xi for xi in x])/len(x)

pred = []
users_index = {user: i for i, user in enumerate(ratings['userId'].unique())}
items_index = {item: i for i, item in enumerate(ratings['movieId'].unique())}
for _,row in test_rating.iterrows():
    user = row['userId']
    item = row['movieId']
    u_index = users_index[user]
    if item in items_index:
        i_index = items_index[item]
        try: pred_rating = svdout[u_index, i_index]
        except: pass
    else:
        pred_rating = np.mean(svdout[u_index, :])
    pred.append(pred_rating)

print("Root Mean Squared Error", rmse(test_rating['rating'], pred))

Root Mean Squared Error 1.6429991642744506
