In [None]:
# Read the movies database
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# Load the raw ratings (tab-separated) and the movie catalogue (pipe-separated).
ratings_df = pd.read_csv(
    'u.data',
    sep='\t',
    index_col=False,
    names=['userId', 'movieId', 'rating'],
)
movies_df = pd.read_csv(
    'u.item',
    sep='|',
    header=0,
    index_col=False,
    encoding='ISO-8859-1',
)

# Mean rating given by each user: one row per userId, with the mean in 'average'.
average_df = (
    ratings_df[['userId', 'rating']]
    .groupby('userId', as_index=False)
    .mean()
    .rename(columns={'rating': 'average'})
)

# Attach each user's mean to their ratings and subtract it, so that
# 'rating_centered' is the rating relative to that user's own average.
ratings_df = (
    ratings_df
    .merge(average_df, how='left', on='userId')
    .assign(rating_centered=lambda df: df['rating'] - df['average'])
)
ratings_df

In [None]:
# Build the matrix of centered ratings: rows are users, columns are movies.
# The raw IDs are used directly as array indices, so row 0 and column 0 are
# never written and stay at zero (assuming IDs start at 1 -- TODO confirm).
nUsers = ratings_df.userId.unique().size
nMovies = ratings_df.movieId.unique().size

user_idx = ratings_df.userId.to_numpy(dtype=int)
movie_idx = ratings_df.movieId.to_numpy(dtype=int)

# Size each axis by the largest ID actually present, never smaller than the
# number of distinct IDs. When the IDs are contiguous this is exactly
# (nUsers+1, nMovies+1); when there are gaps in the IDs it avoids the
# IndexError that sizing by the distinct count alone would cause.
# `initial=` also keeps max() defined for an empty ratings table.
n_rows = user_idx.max(initial=nUsers) + 1
n_cols = movie_idx.max(initial=nMovies) + 1

# (user, movie) pairs with no rating remain 0, i.e. "equal to the user's mean".
ratingsM = np.zeros((n_rows, n_cols))

# One vectorised assignment instead of a Python-level loop over iterrows().
# NOTE(review): assumes each (userId, movieId) pair appears at most once;
# with duplicated pairs NumPy does not guarantee which value is kept.
ratingsM[user_idx, movie_idx] = ratings_df.rating_centered.to_numpy()
    
    

In [None]:
# Collaborative filtering - Memory based
# --------------------------------------
#
# Item view: recommend movies that are similar to the one the user picked.
#
# Suppose the user chooses movie 50 (other IDs worth trying: 102, 500, 449).
# Which movies can we recommend to them?

myMovie = 50

# Pairwise cosine distance between movies. Movies are the columns of
# ratingsM, so the matrix is transposed to put one movie per row.
distances = cosine_distances(ratingsM.T)

print("My movie is: ", movies_df.title[movies_df.movieId == myMovie].iloc[0])
print()

# ###################################################################
# Insert here the code to get the 10 movies closest to my movie
# ###################################################################


In [None]:
# Collaborative filtering - Model based
# --------------------------------------
# First compute the latent-factor matrices X (users x k) and Y (k x movies)
# with the Alternating Least Squares algorithm, so that X.dot(Y) approximates
# ratingsM. Recommendations are then made from similarities between factors.

# Metaparameters
k = 100            # number of latent factors
l = 0.1            # lambda: L2 regularisation, same value for X and Y
accuracy = 0.999   # stop once loss / previous_loss exceeds this ratio
max_iters = 500    # safety cap so the loop always terminates

# X and Y initialization
# X is overwritten by the first half-step, but it is still drawn so that the
# random stream (and therefore the initial Y) is the same for a given seed.
np.random.seed(42)
X = np.random.normal(size=(ratingsM.shape[0], k))
Y = np.random.normal(size=(k, ratingsM.shape[1]))

# Alternating Least Squares
# Every cell of ratingsM is fitted (zeros included), so all users share one
# k x k system per half-step, and likewise all movies. Each half-step is
# therefore a single linear solve instead of a Python loop over rows/columns.
ridge = l * np.eye(k)
converged = False
pL = np.inf        # np.Inf was removed in NumPy 2.0; np.inf works everywhere
n_iters = 0
while not converged and n_iters < max_iters:
    # Fix Y, solve for X:  (Y Y^T + lambda I) X^T = Y R^T
    X = np.linalg.solve(Y.dot(Y.T) + ridge, Y.dot(ratingsM.T)).T

    # Fix X, solve for Y:  (X^T X + lambda I) Y = X^T R
    Y = np.linalg.solve(X.T.dot(X) + ridge, X.T.dot(ratingsM))

    # Regularised squared reconstruction error
    L = np.square(ratingsM - X.dot(Y)).sum()
    L = L + l * (np.square(np.linalg.norm(X)) + np.square(np.linalg.norm(Y)))

    # Improvement stop criterion: the loss barely changed since the last sweep.
    # A NaN loss never satisfies this comparison, hence the max_iters cap.
    converged = (L / pL) > accuracy

    pL = L
    n_iters += 1

if not converged:
    print("ALS stopped after", n_iters, "iterations without meeting the stop criterion")
    

In [None]:
# Let's make predictions
# Similarity between movies is now measured on the item latent factors
# (the columns of Y) instead of on the raw ratings.
myMovie = 50

# One movie per row after transposing Y; pairwise cosine distance between them.
distances = cosine_distances(Y.T)

print("My movie is: ", movies_df.title[movies_df.movieId == myMovie].iloc[0])
print()

# #####################################################################
# Insert here the code to print the 10 movies closest to my movie
# #####################################################################