In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the movie, rating and tag tables from CSV files in the working directory
movies, ratings, tags = (
    pd.read_csv(csv_name) for csv_name in ('movie.csv', 'ratings.csv', 'tag.csv')
)

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Number of distinct movie ids in the movies table
len(movies['movieId'].unique())

4999

In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,02/04/2005 23:53
1,1,29,3.5,02/04/2005 23:31
2,1,32,3.5,02/04/2005 23:33
3,1,47,3.5,02/04/2005 23:32
4,1,50,3.5,02/04/2005 23:29
...,...,...,...,...
815503,7120,168,5.0,02/04/2007 19:44
815504,7120,253,4.0,02/04/2007 19:30
815505,7120,260,5.0,02/04/2007 19:27
815506,7120,261,4.0,02/04/2007 19:49


In [6]:
# Number of distinct titles in the movies table
len(movies['title'].unique())

4999

In [7]:
# Number of distinct users that appear in the ratings table
len(ratings['userId'].unique())

7119

In [8]:
# Cell count of a full grid: distinct users times distinct titles
len(ratings['userId'].unique()) * len(movies['title'].unique())

35587881

In [9]:
# su = 0
# nu = 0
# for i, row in ratings.iterrows():
#     if(row['userId'] == 1):
#         su+=row['rating']
#         nu+=1
# su, nu, su/nu

In [10]:
# Reshape the long ratings table into a wide one: one row per movieId, one
# column per userId, holding the rating (NaN where a pair has no rating).
user_item_matrix = ratings.pivot(values='rating', index='movieId', columns='userId')

In [11]:
user_item_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,7111,7112,7113,7114,7115,7116,7117,7118,7119,7120
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,4.0,,,5.0,,4.0,,4.0,...,,,,,,4.0,4.0,,5.0,4.5
2,3.5,,,,3.0,,,,,,...,,,,,,,,,,4.0
3,,4.0,,,,3.0,3.0,5.0,,,...,,,,,,,4.0,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,4.0,3.5,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5089,,,,,,,,,,,...,,,,,,,,,,
5090,,,,,,,,,,,...,,,,,,,,,,
5092,,,,,,,,,,,...,,,,,,,,,,
5093,,,,,,,,,,,...,,,,,,,,,,


Movies that have no ratings do not appear as rows here: `pivot` only creates a row for each `movieId` that occurs in `ratings`.

In [12]:
# Dense copy of the pivoted table plus a boolean mask over its NaN cells,
# combined into a masked array so that statistics skip the missing ratings.
um_mat_np = user_item_matrix.to_numpy()
masks = np.isnan(um_mat_np)
masked_arr = np.ma.array(um_mat_np, mask=masks)


In [13]:
# Mean of the unmasked entries in every row (one value per row of the pivot)
rating_means = masked_arr.mean(axis=1)

In [14]:
# Replace every masked cell with the mean of its row. The transposes put the
# rows on the last axis so the per-row means broadcast inside `filled`.
filled_matrix = masked_arr.transpose().filled(rating_means).transpose()
print(filled_matrix)
# Centre each row on its own mean; cells that were filled in become 0.
filled_matrix = filled_matrix - rating_means.data[:, None]

[[3.95932269 3.95932269 4.         ... 3.95932269 5.         4.5       ]
 [3.5        3.26839827 3.26839827 ... 3.26839827 3.26839827 4.        ]
 [3.18686131 4.         3.18686131 ... 3.18686131 3.18686131 3.18686131]
 ...
 [2.77272727 2.77272727 2.77272727 ... 2.77272727 2.77272727 2.77272727]
 [2.55633803 2.55633803 2.55633803 ... 2.55633803 2.55633803 2.55633803]
 [2.         2.         2.         ... 2.         2.         2.        ]]


In [15]:
rating_means

masked_array(data=[3.959322693655119, 3.2683982683982684,
                   3.1868613138686133, ..., 2.772727272727273,
                   2.556338028169014, 2.0],
             mask=[False, False, False, ..., False, False, False],
       fill_value=1e+20)

In [16]:
filled_matrix

array([[0.        , 0.        , 0.04067731, ..., 0.        , 1.04067731,
        0.54067731],
       [0.23160173, 0.        , 0.        , ..., 0.        , 0.        ,
        0.73160173],
       [0.        , 0.81313869, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [17]:
# Reduced SVD of the mean-centred matrix: filled_matrix == u @ np.diag(s) @ vh,
# with s holding the singular values in descending order.
u, s, vh = np.linalg.svd(filled_matrix, full_matrices=False)

In [18]:
# Square roots of the singular values, as a vector and as a diagonal matrix
s_sqrt = np.sqrt(s)
s_sqrt_mt = np.diag(s_sqrt)

In [20]:
# Transpose so the right singular vectors are the columns of v
v = np.transpose(vh)

In [21]:
# Number of leading singular components kept in the low-rank factorisation
k = 1000

# Give each factor the square root of the retained singular values, so that
# U @ V.T equals the rank-k truncation u[:, :k] @ diag(s[:k]) @ vh[:k, :].
# Scaling the columns by broadcasting yields the same values as multiplying by
# the dense diagonal matrix s_sqrt_mt[:k, :k], without the k x k matmul.
U = u[:, :k] * s_sqrt[:k]

V = v[:, :k] * s_sqrt[:k]

In [25]:
# Rank-k reconstruction of the mean-centred matrix
np.matmul(U, V.T)

array([[-0.01538027,  0.01618134,  0.04114364, ...,  0.0074478 ,
         1.0226725 ,  0.53218587],
       [ 0.24435209,  0.01400658,  0.01788566, ...,  0.01667478,
         0.00131165,  0.70989839],
       [-0.02412664,  0.85312412,  0.08008229, ...,  0.01655657,
         0.00572701, -0.00515875],
       ...,
       [-0.01559434, -0.02591435, -0.01862048, ...,  0.02754127,
         0.02184493, -0.00246039],
       [ 0.06428235, -0.01878038, -0.05657099, ...,  0.00766186,
        -0.02391253,  0.02084825],
       [ 0.04063342, -0.00437884,  0.07808655, ..., -0.0315639 ,
         0.02288696, -0.00334668]])

In [26]:
# Add the per-row means back onto the rank-k reconstruction
UsV = np.matmul(U, V.T) + rating_means.data[:, None]

In [28]:
UsV

array([[3.94394242, 3.97550403, 4.00046633, ..., 3.96677049, 4.98199519,
        4.49150856],
       [3.51275036, 3.28240485, 3.28628393, ..., 3.28507305, 3.26970992,
        3.97829666],
       [3.16273467, 4.03998543, 3.26694361, ..., 3.20341789, 3.19258833,
        3.18170256],
       ...,
       [2.75713294, 2.74681292, 2.7541068 , ..., 2.80026854, 2.79457221,
        2.77026688],
       [2.62062038, 2.53755765, 2.49976704, ..., 2.56399989, 2.5324255 ,
        2.57718628],
       [2.04063342, 1.99562116, 2.07808655, ..., 1.9684361 , 2.02288696,
        1.99665332]])

In [27]:
def rmse(true, pred):
    """Return the root-mean-square error between observed and predicted values.

    Parameters
    ----------
    true : array-like
        Observed values.
    pred : array-like
        Predicted values; must be broadcastable against ``true``.

    Returns
    -------
    float
        ``sqrt(mean((true - pred) ** 2))`` taken over all elements.
    """
    # asanyarray accepts plain sequences while keeping ndarray subclasses
    # (e.g. masked arrays) intact.
    diff = np.asanyarray(true) - np.asanyarray(pred)
    # np.mean already normalises by the element count; dividing the result by
    # len(true) again would make the metric shrink as the sample grows.
    return np.sqrt(np.mean(diff ** 2))

In [None]:
# Walk over every row of `ratings` and find the position of that row's user
# among the columns of user_item_matrix.
# NOTE(review): in the lines visible here `movie` and `u_ind` are assigned but
# never used, so the cell looks unfinished — confirm the intended body.
# NOTE(review): np.where compares against every column on each iteration;
# user_item_matrix.columns.get_loc(user) or a precomputed id -> position map
# would avoid the repeated scan — verify before changing.
for _, row in ratings.iterrows():
    user = row['userId']
    movie = row['movieId']

    u_ind = np.where(user_item_matrix.columns == user)[0][0]
