# MovieLens / SVD

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

In [17]:
# Load the movie catalogue (movieId, title, genres).
df_movies = pd.read_csv("../data/movie-lens-small/movies.csv")
# Preview ten rows; the fixed seed keeps this preview identical on every
# Restart & Run All instead of showing a different random sample each time.
df_movies.sample(10, random_state=42)

Unnamed: 0,movieId,title,genres
5733,30745,Gozu (Gokudô kyôfu dai-gekijô: Gozu) (2003),Comedy|Crime|Drama|Horror|Mystery
4470,6598,Step Into Liquid (2002),Documentary
4415,6516,Anastasia (1956),Drama
2409,3198,Papillon (1973),Crime|Drama
8105,100527,Safe Haven (2013),Drama|Mystery|Romance
914,1213,Goodfellas (1990),Crime|Drama
9200,151315,Ride Along 2 (2016),Action|Comedy
4067,5797,"Company of Wolves, The (1984)",Fantasy|Horror
1883,2502,Office Space (1999),Comedy|Crime
5803,31851,Sons of the Desert (1933),Comedy


In [2]:
# Read the ratings table (userId, movieId, rating, timestamp) and report
# its (rows, columns) size.
df = pd.read_csv(
    filepath_or_buffer="../data/movie-lens-small/ratings.csv",
)
df.shape

(100836, 4)

In [3]:
# Preview ten ratings; seeded so the displayed rows are reproducible on re-run.
df.sample(10, random_state=42)

Unnamed: 0,userId,movieId,rating,timestamp
44719,298,3254,2.5,1466278105
60223,387,48780,4.5,1174624345
66443,428,1320,2.5,1111487532
9867,64,6874,4.0,1161520239
79658,495,46970,3.5,1458636913
8639,59,648,5.0,953609356
24936,177,344,4.0,1435533575
64025,414,5812,3.5,1095465993
4705,28,60040,3.0,1234572131
35640,240,339,3.0,849122226


## Pivot Dataframe into a Matrix

In [4]:
# Reshape the long ratings table into a wide user x movie grid:
# one row per userId, one column per movieId, each cell holding the rating
# (NaN where the user has not rated the movie).
df_pivot = df.pivot(
    values="rating",
    index="userId",
    columns="movieId",
)
df_pivot.shape

(610, 9724)

In [5]:
# Preview ten users' rows of the pivot; seeded so the output is reproducible.
df_pivot.sample(10, random_state=42)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
368,,,3.0,,,4.0,,,,3.0,...,,,,,,,,,,
311,,,,,,,,,,,...,,,,,,,,,,
30,,,,,,,,,,,...,,,,,,,,,,
320,,,,,,,,,,,...,,,,,,,,,,
228,,,,,,,,,,,...,,,,,,,,,,
153,2.0,2.0,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
170,,,,,3.0,,,,,3.0,...,,,,,,,,,,
343,,,,,,,,,,,...,,,,,,,,,,
17,4.5,,,,,,,,,,...,,,,,,,,,,


## Calculating Sparseness

In [6]:
# Number of distinct users that appear in the ratings table.
# nunique() counts directly instead of materialising a throwaway Python list.
userIDs = df["userId"].nunique()
userIDs

610

In [7]:
# Number of distinct movies that appear in the ratings table.
# nunique() counts directly instead of materialising a throwaway Python list.
MovieIDs = df["movieId"].nunique()
MovieIDs

9724

In [8]:
# Sparseness = 1 - (rows in the ratings table / cells in the user x movie grid),
# i.e. the fraction of the grid that carries no rating.
sparseness = 1 - (len(df) / (userIDs * MovieIDs))
sparseness

0.9830003169443864

## Removing Rating Bias

In [9]:
# Mean rating per user, taken across the movie columns of the pivot
# (DataFrame.mean skips NaN by default, so unrated movies are ignored),
# converted to a plain 1-D NumPy array.
users_avg_rating = np.array(df_pivot.mean(axis="columns"))
users_avg_rating

array([4.36637931, 3.94827586, 2.43589744, 3.55555556, 3.63636364,
       3.49363057, 3.23026316, 3.57446809, 3.26086957, 3.27857143,
       3.78125   , 4.390625  , 3.64516129, 3.39583333, 3.44814815,
       3.7244898 , 4.20952381, 3.73207171, 2.60739687, 3.59090909,
       3.26072235, 2.57142857, 3.64876033, 3.65      , 4.80769231,
       3.23809524, 3.54814815, 3.02017544, 4.14197531, 4.73529412,
       3.92      , 3.75490196, 3.78846154, 3.41860465, 4.08695652,
       2.63333333, 4.14285714, 3.21794872, 4.        , 3.76699029,
       3.25345622, 3.56590909, 4.55263158, 3.35416667, 3.87593985,
       4.        , 3.05357143, 4.03030303, 4.26190476, 2.78064516,
       3.77576602, 4.47692308, 5.        , 3.03030303, 2.84      ,
       3.80434783, 3.39285714, 3.90178571, 4.35514019, 3.72727273,
       4.05128205, 4.08196721, 3.63099631, 3.7688588 , 4.02941176,
       4.02028986, 3.97222222, 3.23373016, 4.36956522, 4.32258065,
       3.6       , 4.15555556, 3.71190476, 4.27118644, 3.23188

In [10]:
# Strip the pandas labels and keep only the raw user x movie rating grid
# as a NumPy array (NaN marks unrated cells).
pivot = df_pivot.to_numpy(copy=False)
pivot

array([[4. , nan, 4. , ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [2.5, 2. , 2. , ..., nan, nan, nan],
       [3. , nan, nan, ..., nan, nan, nan],
       [5. , nan, nan, ..., nan, nan, nan]])

In [11]:
# Centre every user's ratings on that user's own mean. The mean vector is
# turned into a column so it broadcasts across the movie axis; NaN cells
# stay NaN.
normalized_matrix = pivot - users_avg_rating[:, np.newaxis]
normalized_matrix

array([[-0.36637931,         nan, -0.36637931, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       ...,
       [-0.63417569, -1.13417569, -1.13417569, ...,         nan,
                nan,         nan],
       [-0.27027027,         nan,         nan, ...,         nan,
                nan,         nan],
       [ 1.31144393,         nan,         nan, ...,         nan,
                nan,         nan]])

In [12]:
# Fill the NaN placeholders of unrated cells with 0, which after centring
# means "no deviation from the user's mean".
normalized_matrix = np.nan_to_num(normalized_matrix, copy=True, nan=0.0)
normalized_matrix

array([[-0.36637931,  0.        , -0.36637931, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.63417569, -1.13417569, -1.13417569, ...,  0.        ,
         0.        ,  0.        ],
       [-0.27027027,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.31144393,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

## Apply SVD

In [13]:
# svds computes a truncated SVD (only k singular triplets). It is designed for
# sparse matrices / linear operators but also accepts the dense ndarray used
# here. The import lives in the notebook's top import cell.
n_factors = 50  # number of latent factors (singular values) to keep

u, sigma, vt = svds(normalized_matrix, k=n_factors)
# svds returns the singular values as a 1-D vector; expand them into a
# diagonal matrix so that u . s . vt rebuilds the low-rank approximation.
s = np.diag(sigma)

print(u.shape, s.shape, vt.shape)

(610, 50) (50, 50) (50, 9724)


In [14]:
# Rebuild the low-rank approximation (u . s . vt) and add each user's mean
# back so the values are on the original rating scale again.
regenerated = u.dot(s).dot(vt) + users_avg_rating[:, np.newaxis]
regenerated

array([[4.30616055, 4.29639677, 4.40595086, ..., 4.36676653, 4.36676653,
        4.36724272],
       [3.94598477, 3.93225963, 3.9367516 , ..., 3.94819836, 3.94819836,
        3.94793259],
       [2.42520627, 2.4731044 , 2.40444357, ..., 2.43590095, 2.43590095,
        2.43595447],
       ...,
       [2.40823265, 1.92951625, 1.90144494, ..., 3.13490895, 3.13490895,
        3.11973288],
       [3.31689671, 3.27012154, 3.27617457, ..., 3.270299  , 3.270299  ,
        3.27067519],
       [5.03909959, 3.68865446, 3.63611918, ..., 3.6879992 , 3.6879992 ,
        3.70389295]])

In [15]:
# Wrap the reconstructed matrix in a DataFrame that carries the same
# userId row labels and movieId column labels as the pivot table.
df_predicted = pd.DataFrame(
    regenerated,
    index=df_pivot.index,
    columns=df_pivot.columns,
)
df_predicted

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.306161,4.296397,4.405951,4.370563,4.342908,4.163808,4.204276,4.341084,4.379745,4.277329,...,4.366767,4.367710,4.365823,4.365823,4.366767,4.365823,4.366767,4.366767,4.366767,4.367243
2,3.945985,3.932260,3.936752,3.953256,3.956456,3.932349,3.931181,3.950689,3.952856,3.980600,...,3.948198,3.948010,3.948387,3.948387,3.948198,3.948387,3.948198,3.948198,3.948198,3.947933
3,2.425206,2.473104,2.404444,2.442252,2.462691,2.432406,2.441692,2.451316,2.427452,2.472199,...,2.435901,2.435910,2.435892,2.435892,2.435901,2.435892,2.435901,2.435901,2.435901,2.435954
4,3.591391,3.542791,3.567743,3.501296,3.618028,3.607064,3.677209,3.523268,3.549899,3.693275,...,3.556530,3.558903,3.554156,3.554156,3.556530,3.554156,3.556530,3.556530,3.556530,3.550811
5,3.826923,3.623178,3.619512,3.618412,3.607475,3.735154,3.644817,3.634681,3.622817,3.590988,...,3.636455,3.636679,3.636232,3.636232,3.636455,3.636232,3.636455,3.636455,3.636455,3.636856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.009458,3.507370,3.666368,3.681474,3.544303,3.565989,2.483123,3.630113,3.709401,3.767946,...,3.657625,3.658174,3.657075,3.657075,3.657625,3.657075,3.657625,3.657625,3.657625,3.654925
607,3.828102,3.820760,3.968557,3.798083,3.856450,3.849551,3.745672,3.783262,3.761308,3.698775,...,3.785918,3.785485,3.786352,3.786352,3.785918,3.786352,3.785918,3.785918,3.785918,3.784522
608,2.408233,1.929516,1.901445,3.131823,3.122469,3.380301,3.126344,3.065574,3.095069,4.166806,...,3.134909,3.136695,3.133122,3.133122,3.134909,3.133122,3.134909,3.134909,3.134909,3.119733
609,3.316897,3.270122,3.276175,3.254087,3.262333,3.332641,3.277084,3.271505,3.260878,3.281328,...,3.270299,3.270369,3.270229,3.270229,3.270299,3.270229,3.270299,3.270299,3.270299,3.270675


## Test

In [18]:
# Collect every rating given by one user and attach the movie metadata.
# The filter uses the `user` variable (it was previously hard-coded to 22,
# so changing `user` had no effect).
user = 22
df_user = (
    df[df["userId"] == user]
    # Inner join on movieId adds the title and genres columns from df_movies.
    .merge(df_movies, how="inner", on="movieId")
)
df_user

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,22,107,4.5,1268330607,Muppet Treasure Island (1996),Adventure|Children|Comedy|Musical
1,22,216,4.0,1268726081,Billy Madison (1995),Comedy
2,22,253,4.0,1268726368,Interview with the Vampire: The Vampire Chroni...,Drama|Horror
3,22,318,5.0,1268726193,"Shawshank Redemption, The (1994)",Crime|Drama
4,22,356,5.0,1268726309,Forrest Gump (1994),Comedy|Drama|Romance|War
...,...,...,...,...,...,...
114,22,69757,0.5,1268726748,(500) Days of Summer (2009),Comedy|Drama|Romance
115,22,70286,0.5,1268726596,District 9 (2009),Mystery|Sci-Fi|Thriller
116,22,71464,0.5,1268727107,"Serious Man, A (2009)",Comedy|Drama
117,22,72998,3.5,1268726172,Avatar (2009),Action|Adventure|Sci-Fi|IMAX
