In [1]:
import os
os.chdir('../movies')
from movieLens import MovieLens

In [2]:
ml = MovieLens()

# Algorithm

In [3]:
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
import heapq
from collections import defaultdict
from operator import itemgetter

In [4]:
ratings = ml.ratings
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Method from the Surprise library to load the DataFrame 
# Define the Reader object to parse the dataframe
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Load the dataframe as a ratings dataset
ratingsDataset = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Build the full trainset
trainSet = ratingsDataset.build_full_trainset()

## User item rating matrix

Matriz en la que encontramos los ratings por usuario para cada una de las películas existentes. Una columna para los usuarios y una fila en la que se encuentran todas las películas disponibles. El valor de la celda corresponde con el rating otorgado por el usuario a la película

In [6]:
# Cosine similarity function
sim_options = {'name': 'pearson',  # alternative: cosine
               'user_based': True, # compute  similarities between users
               'min_support':5     # minimum number of common items between users
               }

model = KNNWithMeans(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


La métrica establecida corresponde con el coeficiente de pearson, dado funciona mejor que la métrica "cosine similarity" en terminos de similitud entre usuarios porque tiene en cuenta como "puntúan" los usuarios las películas

## User similarity matrix

Comparamos usuarios para detectar aquellos con perfil similar al de referencia (al que queremos recomendar)

In [7]:
trainSet.all_users()

range(0, 610)

In [8]:
len(ratings[ratings['userId']==1])

232

In [9]:
# Reference user = user to recommend to
referenceUser = 1 

# Set the number of desired similar users
k = 10

# Get top N similar users to our reference user
# referenceUserInnerID = trainSet.to_inner_uid(referenceUser)

## Look up similar users

In [10]:
def getNeighbors(referenceUser,k,trainSet,threshold=None):
    
    # Get top N similar users to our reference user
    referenceUserInnerID = trainSet.to_inner_uid(referenceUser)
    
    if threshold is None:
        similarityRow = simsMatrix[referenceUserInnerID]
        similarUsers = []
        for innerID, score in enumerate(similarityRow):
            if (innerID != referenceUserInnerID):
                similarUsers.append((innerID, score))
                
    # Alternative. Select users up to a similarity threshold
    else:
        similarUsers = [(innerID, score) for (innerID, score) in enumerate(simsMatrix[referenceUserInnerID])
                if innerID != referenceUserInnerID and score > threshold]
    
    #print(len(similarUsers))
   
    # Get top N
    # Sort the elements in decreasing order by score and select top N
    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
    return kNeighbors
    

In [11]:
# Option 1. No threshold
kNeighbors = getNeighbors(referenceUser,10,trainSet)
kNeighbors

[(510, 0.9258200997725514),
 (8, 0.9185586535436918),
 (12, 0.8783100656536799),
 (365, 0.8728715609439696),
 (400, 0.8669214468630108),
 (534, 0.8664002254439634),
 (89, 0.8215838362577491),
 (156, 0.8017837257372732),
 (138, 0.7905694150420948),
 (475, 0.7869358789643607)]

In [12]:
# Option 2. Set a threshold value
kNeighbors2 = getNeighbors(referenceUser,10,trainSet,0.8)
kNeighbors2

[(510, 0.9258200997725514),
 (8, 0.9185586535436918),
 (12, 0.8783100656536799),
 (365, 0.8728715609439696),
 (400, 0.8669214468630108),
 (534, 0.8664002254439634),
 (89, 0.8215838362577491),
 (156, 0.8017837257372732)]

Vemos que el número de perfiles similares disminuye como consecuencia de estabecer un coeficiente de similitud mínimo (threshold = 0.8)

## Candidate generation and scoring

Selecionamos las películas que podríamos recomendar en primera instancia 

### First approach

Normalizamos los ratings y multiplicamos por el coeficiente de semejanza entre el usuario elegido y el de referencia

In [13]:
# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1]
    theirRatings = trainSet.ur[innerID]
    for rating in theirRatings:
        candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
        
# Sort the candidates by score
candidates = sorted(candidates.items(), key=lambda t: t[1], reverse=True)
#candidates

### Second approach

In [14]:
# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates2 = defaultdict(float)
similaritySum = sum([similarity[1] for similarity in kNeighbors])
for similarUser in kNeighbors:
    innerID = similarUser[0]
    userSimilarityScore = similarUser[1] / similaritySum
    theirRatings = trainSet.ur[innerID]
    for itemID, rating in theirRatings:
        candidates2[itemID] += round(userSimilarityScore * rating,2)

# Sort the candidates by score
candidates2 = sorted(candidates2.items(), key=lambda t: t[1], reverse=True)
#candidates2

El primer enfoque es más sencillo pero puede estar sesgado hacia películas muy valorados, ya que sólo tiene en cuenta la suma de las valoraciones. El segundo enfoque tiene en cuenta tanto la valoración como la puntuación de similitud, lo que puede hacerlo más preciso y menos sesgado. Sin embargo, requiere más cálculos y su aplicación puede resultar más compleja.

## Candidate filtering and recommendations

Filtramos aquellas recomendaciones con un score pequeño y que ya haya visto el usuario. Para ello utilizamos un set porque únicamente nos interesa saber los items que el reference user ya ha visto, plus es un objeto eficiente para datasets largos

In [15]:
def filterRec(referenceUser,trainSet, candidates):
    
    # Get top N similar users to our reference user
    referenceUserInnerID = trainSet.to_inner_uid(referenceUser)
    
    # Build a set of movies the user has already seen
    watched = set(trainSet.ur[referenceUserInnerID])
    
    # Initialize a list to store the recommendations
    recommendations = []

    # Get top-rated items from similar users:
    pos = 0
    for itemID, ratingSum in candidates:
        if not itemID in watched:
            movieID = trainSet.to_raw_iid(itemID)
            recommendation = ml.getMovieName(int(movieID)), ratingSum
            recommendations.append(recommendation)
            pos += 1
            if (pos >= 10):
                break

    rec_movies = [rec[0] for rec in recommendations]
    return rec_movies

ratingSum represents the total similarity of the reference user to all other users who rated that item

In [16]:
# Results for the first approach
rec_movies = filterRec(referenceUser,trainSet, candidates)
rec_movies

['Lord of the Rings: The Fellowship of the Ring, The',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)',
 'Lord of the Rings: The Return of the King, The',
 'Fight Club',
 'Matrix, The',
 'Dark Knight Rises, The',
 'Forrest Gump',
 'Lord of the Rings: The Two Towers, The',
 'Gladiator',
 'Incredibles, The']

In [17]:
# Results for the second approach
rec_movies2 = filterRec(referenceUser,trainSet, candidates2)
rec_movies2

['Lord of the Rings: The Fellowship of the Ring, The',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)',
 'Lord of the Rings: The Return of the King, The',
 'Fight Club',
 'Matrix, The',
 'Dark Knight Rises, The',
 'Forrest Gump',
 'Lord of the Rings: The Two Towers, The',
 'Gladiator',
 'Inception']

In [18]:
print(rec_movies == rec_movies2)

False


Vemos que ambos métodos obtienen recomendaciones muy similares. No obstante, difieren en la última película sugerida

# Metrics