In [1]:
import os
os.chdir('../movies')
from movieLens import MovieLens

In [2]:
# Build the MovieLens helper once. Later cells read its `movies`, `movies_genres`,
# `ratings`, `movieID_to_name` and `name_to_movieID` attributes and call its
# `filterMoviesByGenres` method.
ml = MovieLens()

## Algorithm

Recomendador basado en contenido, considerando los diferentes géneros asociados a las películas y su año de estreno.

In [3]:
import math
import numpy as np
import heapq

# Cosine similarity metric (genres)
def getGenreSimilarityScore(movieId_1, movieId_2, genres):
    """Return the cosine similarity between the genre vectors of two movies.

    Args:
        movieId_1: Key of the first movie in ``genres``.
        movieId_2: Key of the second movie in ``genres``.
        genres: Mapping from movie id to a numeric, indexable genre vector.
            The vectors are compared position by position over the length of
            the first one.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero norm (for
        example a movie with no genre flagged) the similarity is undefined and
        ``0.0`` is returned, so the movie is simply ranked as "not similar".
    """
    genres1 = genres[movieId_1]
    genres2 = genres[movieId_2]

    sumxx = sumxy = sumyy = 0

    for i in range(len(genres1)):
        x = genres1[i]
        y = genres2[i]

        sumxx += x * x
        sumyy += y * y
        sumxy += x * y

    denominator = math.sqrt(sumxx * sumyy)
    if denominator == 0:
        # Avoid ZeroDivisionError (or a NaN score with numpy scalars, which
        # would corrupt any ranking built on top of this value).
        return 0.0
    return sumxy / denominator


# Exponential decay function
def getYearSimilarityScore(year_movie1, year_movie2, decay_years=10.0):
    """Return a similarity in [0, 1] that decays exponentially with the year gap.

    Args:
        year_movie1: Release year of the first movie (anything ``int()``
            accepts), or ``None``/NaN when unknown.
        year_movie2: Release year of the second movie, same conventions.
        decay_years: Scale of the decay; the score is
            ``exp(-|year1 - year2| / decay_years)``. Must be non-zero.
            Defaults to ``10.0``.

    Returns:
        ``0`` when either year is missing, otherwise the decayed score
        (``1.0`` for identical years).
    """
    def _is_missing(year):
        # None and NaN both mean "unknown year"; NaN would make int() raise.
        if year is None:
            return True
        try:
            return math.isnan(year)
        except (TypeError, ValueError):
            # Non-float inputs such as the string "1995" are not missing.
            return False

    if _is_missing(year_movie1) or _is_missing(year_movie2):
        return 0

    dif = abs(int(year_movie1) - int(year_movie2))
    return math.exp(-dif / decay_years)

In [4]:
# KNN
def getNeighbors(movie_title,k,ml):
    """Return the titles of the ``k`` movies whose genres are closest to a movie.

    Every movie whose title differs from ``movie_title`` is scored against the
    reference movie with ``getGenreSimilarityScore``; the ``k`` highest scores
    win. The release-year score is deliberately not part of the final score.

    Args:
        movie_title: Title of the reference movie, as stored in
            ``ml.movies['title']``.
        k: Number of recommendations to return.
        ml: Object exposing ``movies`` (DataFrame with ``title`` and
            ``movieId`` columns), ``movies_genres`` (movie id -> genre vector)
            and ``movieID_to_name`` (movie id -> title).

    Returns:
        List with the titles of the top ``k`` recommendations, best first.

    Raises:
        IndexError: If ``movie_title`` does not appear in ``ml.movies``.
    """
    genres = ml.movies_genres

    is_reference = ml.movies['title'] == movie_title
    reference_ids = ml.movies.loc[is_reference, 'movieId'].values
    if len(reference_ids) == 0:
        # Same exception type as the previous bare `.values[0]`, but with a
        # message that tells the user what went wrong.
        raise IndexError("Movie title not found in ml.movies: {!r}".format(movie_title))
    # Looked up once instead of on every loop iteration.
    reference_id = reference_ids[0]

    # List of (score, movieId) tuples for every other movie.
    neighbors = [
        (getGenreSimilarityScore(reference_id, candidate_id, genres), candidate_id)
        for candidate_id in ml.movies.loc[~is_reference, 'movieId']
    ]

    # Extract the top-K most similar movies based on the similarity score.
    top = heapq.nlargest(k, neighbors, key=lambda t: t[0])

    # Translate the winning movie ids into titles.
    return [ml.movieID_to_name.get(movie_id) for _, movie_id in top]


#     # Compute average sim score of K neighbors weighted by user ratings
#     simTotal = weightedSum = 0
#     for (simScore, rating) in k_neighbors:
#         if (simScore > 0):
#             simTotal += simScore
#             weightedSum += simScore * rating

#     if (simTotal == 0):
#         raise PredictionImpossible('No neighbors')

#     predictedRating = weightedSum / simTotal

#     return predictedRating

Tras analizar los resultados del modelo que considera ambas variables, se ha identificado que la inclusión del año de estreno de la película afecta negativamente a los resultados, ya que elimina recomendaciones intuitivas, como puede ser la segunda parte de una película. Como consecuencia, "year_score" no se incluye en los cálculos de las recomendaciones.

## Evaluation

In [5]:
# Reference movie used throughout the evaluation.
movie_title = "Toy Story"

# Top-15 genre-based neighbours; `recommendations` is reused by the metric cells below.
recommendations = getNeighbors(movie_title,15,ml)
recommendations

['Antz',
 'Toy Story 2',
 'Adventures of Rocky and Bullwinkle, The',
 "Emperor's New Groove, The",
 'Monsters, Inc.',
 'Wild, The',
 'Shrek the Third',
 'Tale of Despereaux, The',
 'Asterix and the Vikings (Astérix et les Vikings)',
 'Turbo',
 'The Good Dinosaur',
 'Moana',
 'Space Jam',
 'Shrek',
 "Twelve Tasks of Asterix, The (Les douze travaux d'Astérix)"]

### Another way to obtain recommendations

Método alternativo al anterior empleando las librerías de Python (scikit-learn). Recomendaciones de películas basadas en la similitud de géneros utilizando la técnica de similitud coseno sobre vectores TF-IDF.

In [6]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

def getRecommendations(movie_title,k,ml):
    """Return the ``k`` titles most similar to a movie using TF-IDF genre vectors.

    Args:
        movie_title: Title of the reference movie, as stored in
            ``ml.movies['title']``.
        k: Number of recommendations to return.
        ml: Object exposing ``movies``, a DataFrame with a ``title`` column and
            a ``genres`` column holding an iterable of genre strings per movie.

    Returns:
        pandas Series with the titles of the ``k`` most similar movies, best
        first. The reference movie itself is never part of the result.

    Raises:
        IndexError: If ``movie_title`` does not appear in ``ml.movies``.
    """
    movies = ml.movies.copy()
    movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))

    # TF-IDF vectorization of the movie genres
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['genres'])

    # Row *position* of the reference movie. The index label is not used
    # because it only matches the matrix row when the index is 0..n-1.
    positions = np.flatnonzero((movies['title'] == movie_title).values)
    if len(positions) == 0:
        raise IndexError("Movie title not found in ml.movies: {!r}".format(movie_title))
    pos = int(positions[0])

    # Only the reference row is needed, so compare it against all movies
    # instead of building the full N x N similarity matrix.
    similarities = cosine_similarity(tfidf_matrix[pos:pos + 1], tfidf_matrix).ravel()

    # Descending order; the stable sort keeps ties in their original row order.
    order = np.argsort(-similarities, kind='stable')
    # Drop the reference movie explicitly: movies with identical genres also
    # score 1.0, so it is not guaranteed to be the first element.
    order = order[order != pos][:max(k, 0)]

    return movies['title'].iloc[order]

In [None]:
# Kept under its own name so the KNN `recommendations` evaluated in the Metrics
# section are not overwritten when the notebook is run from top to bottom.
recommendations_tfidf = getRecommendations("Toy Story",15,ml)
recommendations_tfidf

## Metrics

In [7]:
def computePopularity(movie_title,recommendations,ml,shared_genres = False):
    """Compare recommendations with what fans of the reference movie also liked.

    A "fan" is a user who rated ``movie_title`` with 4.0 or more. The other
    movies those fans rated 4.0 or more are counted, and the ``N`` most
    frequent ones (``N = len(recommendations)``) form the popular list.

    Args:
        movie_title: Title of the reference movie; must be a key of
            ``ml.name_to_movieID``.
        recommendations: Collection of recommended titles to evaluate.
        ml: Object exposing ``ratings`` (DataFrame with ``userId``,
            ``movieId`` and ``rating`` columns), ``name_to_movieID``,
            ``movieID_to_name`` and, when ``shared_genres`` is set,
            ``filterMoviesByGenres(title)`` returning a DataFrame with a
            ``movieId`` column.
        shared_genres: When true, only movies returned by
            ``ml.filterMoviesByGenres(movie_title)`` can enter the popular list.

    Returns:
        Tuple ``(popular_movies, score)``: the list of popular titles and the
        percentage (0-100, two decimals) of ``recommendations`` found in it.
        The score is ``0.0`` when ``recommendations`` is empty.

    Raises:
        KeyError: If ``movie_title`` is not in ``ml.name_to_movieID``.
    """
    ratings = ml.ratings  # read-only here, so no defensive copy is needed
    movie_id = ml.name_to_movieID[movie_title]

    liked = ratings['rating'] >= 4.0

    # Users who liked the reference movie
    fan_ids = ratings.loc[liked & (ratings['movieId'] == movie_id), 'userId'].unique()

    # Other movies those users liked (the reference movie itself is excluded)
    candidates = liked & ratings['userId'].isin(fan_ids) & (ratings['movieId'] != movie_id)
    if shared_genres:
        # Additionally restrict to the movies selected by the genre filter
        genre_movie_ids = ml.filterMoviesByGenres(movie_title)['movieId'].unique()
        candidates &= ratings['movieId'].isin(genre_movie_ids)
    filtered_df = ratings[candidates]

    # Count the number of times each film appears and keep the top N
    k = len(recommendations)
    movie_counts = filtered_df.groupby('movieId').size().reset_index(name='count')
    movie_counts = movie_counts.sort_values('count', ascending=False)
    top_n = movie_counts.head(k)

    # Translate the movieId to title
    popular_movies = [ml.movieID_to_name[key] for key in top_n['movieId']]

    if k == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return popular_movies, 0.0

    # Share of the recommendations that are also popular, as a percentage.
    # Rounding after scaling avoids artifacts such as 56.67000000000001.
    intersection = set(recommendations).intersection(popular_movies)
    score = round(100.0 * len(intersection) / k, 2)

    return popular_movies,score

### Popularity

Comparación de las recomendaciones con las películas mejor valoradas por aquellos usuarios a los que les haya gustado la película de referencia para las recomendaciones.

In [8]:
# Unfiltered popularity: score the `recommendations` against everything the fans liked.
popular_movies,score = computePopularity("Toy Story",recommendations,ml)
print("Similarity between recommendations and popular movies:"+str(score)+"%")

Similarity between recommendations and popular movies:0.0%


### Popularity based only on genres

Comparación de las recomendaciones con las películas mejor valoradas por aquellos usuarios a los que les haya gustado la película de referencia para las recomendaciones, pero considerando únicamente aquellas películas que tengan al menos un género en común con la película de referencia.

In [9]:
# Same metric with shared_genres=True, i.e. restricted to the movies selected
# by ml.filterMoviesByGenres for the reference title.
popular_movies2,score2 = computePopularity("Toy Story",recommendations,ml,True)
print("Similarity between recommendations and popular movies:"+str(score2)+"%")

Similarity between recommendations and popular movies:6.67%


Vemos que el algoritmo obtiene mejores resultados cuando filtramos las películas populares por géneros en común con la película de referencia. En este caso, pasamos de tener cero a una coincidencia entre las recomendaciones y las películas populares

Por otro lado, podemos examinar las diferencias entre las películas populares según se filtre o no

In [14]:
# Are both popular lists identical? Then show the titles that only appear in the unfiltered list.
print(popular_movies == popular_movies2)
list(set(popular_movies) - set(popular_movies2))

False


["Schindler's List",
 'Sixth Sense, The',
 'Matrix, The',
 'Usual Suspects, The',
 'Forrest Gump',
 'Pulp Fiction',
 'Shawshank Redemption, The',
 'Silence of the Lambs, The',
 'Saving Private Ryan']

In [15]:
# Titles that only appear in the genre-filtered popular list.
list(set(popular_movies2) - set(popular_movies))

['Lord of the Rings: The Return of the King, The',
 'Apollo 13',
 'Shrek',
 'Back to the Future',
 'Princess Bride, The',
 'Lord of the Rings: The Fellowship of the Ring, The',
 'Finding Nemo',
 'Aladdin',
 'Monty Python and the Holy Grail']