In [1]:
import os
os.chdir('../movies')
from movieLens import MovieLens

In [2]:
ml = MovieLens()

# Algorithm

In [3]:
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter

In [4]:
ratings = ml.ratings
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Method from the Surprise library to load the DataFrame 
# Define the Reader object to parse the dataframe
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Load the dataframe as a ratings dataset
ratingsDataset = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Build the full trainset
trainSet = ratingsDataset.build_full_trainset()

## User item rating matrix

Matriz en la que encontramos los ratings por usuario para cada una de las películas existentes. Una columna para los usuarios y una fila en la que se encuentran todas las películas disponibles. El valor de la celda corresponde con el rating otorgado por el usuario a la película

In [6]:
# Cosine similarity function
sim_options = {'name': 'cosine',  # alternative: pearson
               'user_based': True # compute  similarities between users
               }

model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


## User similarity matrix

Comparamos usuarios para detectar aquellos con perfil similar al de referencia (al que queremos recomendar)

In [7]:
trainSet.all_users()

range(0, 610)

In [8]:
len(ratings[ratings['userId']==1])

232

In [9]:
# Reference user = user to recommend to
referenceUser = 1 

# Set the number of desired similar users
k = 10

# Get top N similar users to our reference user
referenceUserInnerID = trainSet.to_inner_uid(referenceUser)
similarityRow = simsMatrix[referenceUserInnerID]

## Look up similar users

In [13]:
similarUsers = []
for innerID, score in enumerate(similarityRow):
    if (innerID != referenceUserInnerID):
        similarUsers.append( (innerID, score) )

print(len(similarUsers))

# Get top N
# Sort the elements in decreasing order by score and select top N
kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
kNeighbors

609


[(1, 1.0),
 (11, 1.0),
 (76, 1.0),
 (84, 1.0),
 (183, 1.0),
 (244, 1.0),
 (252, 1.0),
 (290, 1.0),
 (314, 1.0),
 (357, 1.0)]

### Alternative. Select users up to a similarity threshold of 0.9

In [15]:
from surprise import KNNWithMeans

# Build the similarity matrix
model2 = KNNWithMeans(sim_options=sim_options)
model2.fit(trainSet)
simsMatrix = model2.compute_similarities()

# Get the inner ID of the reference user
referenceUserInnerID = trainSet.to_inner_uid(referenceUser)

# Select users up to a similarity threshold of 0.9
similarityThreshold = 0.9
similarUsers = [(innerID, score) for (innerID, score) in enumerate(simsMatrix[referenceUserInnerID])
                if innerID != referenceUserInnerID and score > similarityThreshold]

# Get top N
# Sort the elements in decreasing order by score and select top N
kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
kNeighbors

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


[(1, 1.0),
 (11, 1.0),
 (76, 1.0),
 (84, 1.0),
 (183, 1.0),
 (244, 1.0),
 (252, 1.0),
 (290, 1.0),
 (314, 1.0),
 (357, 1.0)]

Same results as the threshold is set to 0.9 and all users have a coincidence of 1.0

## Candidate generation and scoring

Selecionamos las películas que podríamos recomendar en primera instancia 

### First approach

Normalizamos los ratings y multiplicamos por el coeficiente de semejanza entre el usuario elegido y el de referencia

In [22]:
# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainSet.ur[innerID]
    for rating in theirRatings:
        candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
        
# Sort the candidates by score
candidates = sorted(candidates.items(), key=lambda t: t[1], reverse=True)
candidates

[(238, 3.9),
 (763, 2.5),
 (253, 2.4),
 (244, 2.2),
 (252, 2.0),
 (993, 2.0),
 (987, 2.0),
 (1103, 2.0),
 (887, 1.9),
 (1066, 1.9),
 (219, 1.8),
 (254, 1.8),
 (1000, 1.8),
 (809, 1.8),
 (1055, 1.8),
 (362, 1.7000000000000002),
 (457, 1.7),
 (917, 1.7),
 (753, 1.6),
 (774, 1.6),
 (1054, 1.6),
 (1330, 1.6),
 (1345, 1.6),
 (398, 1.6),
 (331, 1.6),
 (233, 1.5),
 (249, 1.5),
 (444, 1.5),
 (1125, 1.5),
 (1365, 1.5),
 (1312, 1.4),
 (1324, 1.4),
 (926, 1.4),
 (1352, 1.3),
 (1129, 1.2000000000000002),
 (2677, 1.2000000000000002),
 (239, 1.0),
 (246, 1.0),
 (248, 1.0),
 (257, 1.0),
 (258, 1.0),
 (532, 1.0),
 (304, 1.0),
 (570, 1.0),
 (701, 1.0),
 (996, 1.0),
 (999, 1.0),
 (1001, 1.0),
 (734, 1.0),
 (1003, 1.0),
 (877, 1.0),
 (895, 1.0),
 (912, 1.0),
 (15, 1.0),
 (68, 1.0),
 (70, 1.0),
 (73, 1.0),
 (166, 1.0),
 (431, 1.0),
 (756, 1.0),
 (757, 1.0),
 (764, 1.0),
 (782, 1.0),
 (794, 1.0),
 (3711, 1.0),
 (595, 1.0),
 (28, 1.0),
 (3712, 1.0),
 (3545, 1.0),
 (2079, 1.0),
 (2457, 1.0),
 (365, 1.0),
 (2

### Second approach

In [23]:
# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates2 = defaultdict(float)
similaritySum = sum([similarity[1] for similarity in kNeighbors])
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1] / similaritySum
    theirRatings = trainSet.ur[innerID]
    for itemID, rating in theirRatings:
        candidates2[itemID] += round(userSimilarityScore * rating,2)

# Sort the candidates by score
candidates2 = sorted(candidates2.items(), key=lambda t: t[1], reverse=True)
#candidates2

El primer enfoque es más sencillo pero puede estar sesgado hacia películas muy valorados, ya que sólo tiene en cuenta la suma de las valoraciones. El segundo enfoque tiene en cuenta tanto la valoración como la puntuación de similitud, lo que puede hacerlo más preciso y menos sesgado. Sin embargo, requiere más cálculos y su aplicación puede resultar más compleja.

## Candidate filtering and recommendations

Filtramos aquellas recomendaciones con un score pequeño y que ya haya visto el usuario

In [24]:
 # Build a set of movies the user has already seen
watched = set(trainSet.ur[referenceUserInnerID])

Utilizamos un set porque únicamente nos interesa saber los items que el reference user ya ha visto, plus es un objeto eficiente para datasets largos

In [30]:
# Initialize a list to store the recommendations
recommendations = []

# Get top-rated items from similar users:
pos = 0
for itemID, ratingSum in candidates:
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        recommendation = ml.getMovieName(int(movieID)), ratingSum
        recommendations.append(recommendation)
        pos += 1
        if (pos > 10):
            break

rec_movies = [rec[0] for rec in recommendations]
rec_movies

['Dark Knight, The',
 'Harry Potter and the Chamber of Secrets',
 'Interstellar',
 'Inception',
 'Wolf of Wall Street, The',
 'Shine',
 'Titanic',
 'To Kill a Mockingbird',
 'Notebook, The',
 'Avengers, The',
 'Gladiator']

ratingSum represents the total similarity of the reference user to all other users who rated that item

### Results for second approach

In [29]:
# Initialize a list to store the recommendations
recommendations = []

# Get top-rated items from similar users:
pos = 0
for itemID, ratingSum in candidates2:
    if not itemID in watched:
        movieID = trainSet.to_raw_iid(itemID)
        recommendation = ml.getMovieName(int(movieID)), ratingSum
        recommendations.append(recommendation)
        pos += 1
        if (pos > 10):
            break

rec_movies2 = [rec[0] for rec in recommendations]
rec_movies2

['Dark Knight, The',
 'Harry Potter and the Chamber of Secrets',
 'Interstellar',
 'Inception',
 'Wolf of Wall Street, The',
 'Shine',
 'Titanic',
 'To Kill a Mockingbird',
 'Notebook, The',
 'Avengers, The',
 'Gladiator']

In [31]:
print(rec_movies == rec_movies2)

True


Mismas recomendaciones para ambos métodos