In [174]:
import numpy as np 
import pandas as pd

K = 50

In [175]:
users = pd.read_csv('users.dat',names=['userId','gender','age','occupation','zipCode'],sep='::', engine='python')
users['stdUserId'] = users.index
users.tail()

Unnamed: 0,userId,gender,age,occupation,zipCode,stdUserId
6035,6036,F,25,15,32603,6035
6036,6037,F,45,1,76006,6036
6037,6038,F,56,1,14706,6037
6038,6039,F,45,0,1060,6038
6039,6040,M,25,6,11106,6039


In [176]:
movies = pd.read_csv('movies.dat',names=['movieId','title','genres'],sep='::', engine='python')
movies['stdMovieId'] = movies.index
movies.tail()

Unnamed: 0,movieId,title,genres,stdMovieId
3878,3948,Meet the Parents (2000),Comedy,3878
3879,3949,Requiem for a Dream (2000),Drama,3879
3880,3950,Tigerland (2000),Drama,3880
3881,3951,Two Family House (2000),Drama,3881
3882,3952,"Contender, The (2000)",Drama|Thriller,3882


In [177]:
ratings = pd.read_csv('ratings.dat',names=['userId','movieId','rating','timestamp'],sep='::', engine='python')
ratings.tail()

Unnamed: 0,userId,movieId,rating,timestamp
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569


In [178]:
# Merge useful information
merged = movies.merge(ratings, on='movieId').drop(['movieId','timestamp','title','genres'], axis=1)
merged = users.merge(merged, on='userId').drop(['userId','gender','age','occupation','zipCode'], axis=1)
merged.set_index(['stdUserId', 'stdMovieId'])
merged.tail()

Unnamed: 0,stdUserId,stdMovieId,rating
1000204,6039,3614,4
1000205,6039,3634,4
1000206,6039,3666,4
1000207,6039,3682,4
1000208,6039,3749,5


In [179]:
# Generating the matrix
matrix = np.ndarray(shape=(users.shape[0],movies.shape[0]), dtype=np.float64)
matrix[merged.stdUserId.values,merged.stdMovieId.values] = merged.rating.values
matrix[:10, :10]


array([[5., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 2., 0., 0., 0., 0.],
       [4., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 4., 0., 0., 0., 0.],
       [4., 0., 0., 3., 0., 0., 0., 0., 0., 0.],
       [5., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [5., 5., 0., 0., 0., 0., 4., 0., 0., 0.]])

In [180]:
# Check if matrix was built correctly
def checkMatrix(mu: int, mm: int) -> bool:
  global matrix, merged
  for u in range(0, mu):
    for m in range(0, mm):
      query = merged.query(f'stdUserId == {u} and stdMovieId == {m}')
      check = (matrix[u,m] == 0 and query.empty) \
        or (len(query.rating.values) > 0 and matrix[u,m] == query.rating.values[0])
      if not check:
        return False
  return True

checkMatrix(30, 30)

True

In [181]:
# Computing SVD

U,s,V = np.linalg.svd(matrix)
s.shape


(3883,)

In [182]:
# Truncating the data

sliced = V.T[:, :K]

In [183]:
# Getting top N most similar instances by cosine similarity

def topCosineSimilarity(data, stdMovieId, topN):
  idx = stdMovieId
  movieRow = data[idx, :]
  magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
  similarity = np.dot(movieRow, data.T) / (magnitude[idx] * magnitude)
  sortIdxs = np.argsort(-similarity)
  return sortIdxs[:topN]

def topSimiliarMovies(data, movies, stdMovieId, topN):
  topIdxs = topCosineSimilarity(data, stdMovieId, topN)
  topMovies = movies[movies.stdMovieId.isin(topIdxs)]
  topMovies.set_index('stdMovieId', inplace=True)
  return topMovies.loc[topIdxs]

In [184]:
# Running search Batman Forever (1995)

stdMovieId = movies.query(f'title == "Batman Forever (1995)"')['stdMovieId'].values[0]

topMovies = topSimiliarMovies(sliced,movies,stdMovieId,5)
topMovies.reset_index(inplace=True)
topMovies.drop(['movieId', 'stdMovieId'], axis=1, inplace=True)
topMovies.rename(columns={'title':'Título', 'genres': 'Gêneros'})

  similarity = np.dot(movieRow, data.T) / (magnitude[idx] * magnitude)


Unnamed: 0,Título,Gêneros
0,Batman Forever (1995),Action|Adventure|Comedy|Crime
1,Batman & Robin (1997),Action|Adventure|Crime
2,Batman Returns (1992),Action|Adventure|Comedy|Crime
3,"Avengers, The (1998)",Action|Adventure
4,Waterworld (1995),Action|Adventure


In [185]:
# Running search Bambi (1942)

stdMovieId = movies.query(f'title == "Bambi (1942)"')['stdMovieId'].values[0]

topMovies = topSimiliarMovies(sliced,movies,stdMovieId,5)
topMovies.reset_index(inplace=True)
topMovies.drop(['movieId', 'stdMovieId'], axis=1, inplace=True)
topMovies.rename(columns={'title':'Título', 'genres': 'Gêneros'})

  similarity = np.dot(movieRow, data.T) / (magnitude[idx] * magnitude)


Unnamed: 0,Título,Gêneros
0,Bambi (1942),Animation|Children's
1,Pinocchio (1940),Animation|Children's
2,101 Dalmatians (1961),Animation|Children's
3,Charlotte's Web (1973),Animation|Children's
4,"Sword in the Stone, The (1963)",Animation|Children's


In [186]:
# Running search Fight Club (1999)	

stdMovieId = movies.query(f'title == "Fight Club (1999)"')['stdMovieId'].values[0]

topMovies = topSimiliarMovies(sliced,movies,stdMovieId,5)
topMovies.reset_index(inplace=True)
topMovies.drop(['movieId', 'stdMovieId'], axis=1, inplace=True)
topMovies.rename(columns={'title':'Título', 'genres': 'Gêneros'})

  similarity = np.dot(movieRow, data.T) / (magnitude[idx] * magnitude)


Unnamed: 0,Título,Gêneros
0,Fight Club (1999),Drama
1,Eyes Wide Shut (1999),Drama
2,Man on the Moon (1999),Comedy|Drama
3,Bringing Out the Dead (1999),Drama|Horror
4,Sleepy Hollow (1999),Horror|Romance
