In [1]:
import os
os.chdir('../movies')
from movieLens import MovieLens

In [2]:
ml = MovieLens()

## Algorithm

In [3]:
import math
import numpy as np
import heapq

# Cosine similarity metric (genres)
def getGenreSimilarityScore(movieId_1, movieId_2, genres):
    genres1 = genres[movieId_1]
    genres2 = genres[movieId_2]

    sumxx,sumxy,sumyy = 0,0,0

    for i in range(len(genres1)):
        x = genres1[i]
        y = genres2[i]

        sumxx += x*x
        sumyy += y*y
        sumxy += x*y

    res =sumxy/math.sqrt(sumxx*sumyy)
    return res


# Exponential decay function
def getYearSimilarityScore(year_movie1, year_movie2):
    if year_movie1 == None or year_movie2 == None:
        res = 0
    else:
        dif = abs(int(year_movie1)-int(year_movie2))
        res = math.exp(-dif / 10.0)
    return res

Content based recommender of movies based on the genre and the year of the movies

In [4]:
# KNN
def getNeighbors(movie_title,ml):

    # Load up genre vectors for every movie
    # ml = MovieLens()
    #genres = ml.getGenres()
    genres = ml.movies_genres

    neighbors = []
    #movie_title = "Toy Story"
    movie_df = ml.movies[ml.movies['title'] == movie_title]
    filtered_df = ml.movies[ml.movies['title'] != movie_title]

    for index, row in filtered_df.iterrows():
        #print(row['movieId'])
        #print(index)
        genre_score = getGenreSimilarityScore(movie_df['movieId'].values[0],row['movieId'],genres)
        #year_score = getYearSimilarityScore(movie_df['year'].values[0],row['year'])

        #Compute the global score
        score = genre_score #* year_score
        #print(score)

        # Add the result
        neighbors.append((score,row['movieId']))

    # Extract the top-K most-similar ratings based on the similarity score)
    # Returns a list of tuples(score,movieId)
    top = heapq.nlargest(15, neighbors, key=lambda t: t[0])
    
    top_movies = []
    
    for item in top:
        title = ml.movieID_to_name.get(item[1])
        #print(title)
        top_movies.append(title)
    
    # Returns the list with the title of the top k recommendations
    return top_movies


#     # Compute average sim score of K neighbors weighted by user ratings
#     simTotal = weightedSum = 0
#     for (simScore, rating) in k_neighbors:
#         if (simScore > 0):
#             simTotal += simScore
#             weightedSum += simScore * rating

#     if (simTotal == 0):
#         raise PredictionImpossible('No neighbors')

#     predictedRating = weightedSum / simTotal

#     return predictedRating

## Evaluation

In [5]:
movie_name = "Toy Story"

In [6]:
recommendations = getNeighbors(movie_name,ml)
recommendations

['Antz',
 'Toy Story 2',
 'Adventures of Rocky and Bullwinkle, The',
 "Emperor's New Groove, The",
 'Monsters, Inc.',
 'Wild, The',
 'Shrek the Third',
 'Tale of Despereaux, The',
 'Asterix and the Vikings (Astérix et les Vikings)',
 'Turbo',
 'The Good Dinosaur',
 'Moana',
 'Space Jam',
 'Shrek',
 "Twelve Tasks of Asterix, The (Les douze travaux d'Astérix)"]

Another way to obtain recommendations

In [7]:
movies = ml.movies
movies.head()

Unnamed: 0,movieId,title,year,genres
0,1,Toy Story,1995.0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,1995.0,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men,1995.0,"[Comedy, Romance]"
3,4,Waiting to Exhale,1995.0,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II,1995.0,[Comedy]


In [8]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity

movies['genres'] = movies['genres'].apply(lambda x: ' '.join(x))
tfidf = TfidfVectorizer(stop_words='english') 
tfidf_matrix = tfidf.fit_transform(movies['genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def getRecommendations(title, cosine_sim=cosine_sim, movies=movies): 
    # Obtiene el índice de la película que coincide con el título 
    idx = movies[movies['title'] == title].index[0]         
    # Obtiene la similitud coseno de la película con todas las demás películas 
    sim_scores = list(enumerate(cosine_sim[idx]))         
    # Ordena las películas por similitud coseno 
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)         
    # Obtiene los índices de las 10 películas más similares 
    sim_scores = sim_scores[1:11]    
    movie_indices = [i[0] for i in sim_scores]         
    # Devuelve las 10 películas más similares 
    return movies['title'].iloc[movie_indices]

getRecommendations("Toy Story")

1706                                                Antz
2355                                         Toy Story 2
2809             Adventures of Rocky and Bullwinkle, The
3000                           Emperor's New Groove, The
3568                                      Monsters, Inc.
6194                                           Wild, The
6486                                     Shrek the Third
6948                             Tale of Despereaux, The
7760    Asterix and the Vikings (Astérix et les Vikings)
8219                                               Turbo
Name: title, dtype: object

## Metrics

### Popularity

Compare the recommendations with the best rated movies by those users who liked the reference movie.

In [9]:
# Get users who liked the original movie
ratings = ml.ratings
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
movie_id = ml.name_to_movieID[movie_name]

# Select only the rows corresponding to a rating equal or higher than 4 for the desired movie
movie_ratings = ratings[(ratings['movieId'] == movie_id)& (ratings['rating'] >= 4.0)]
len(movie_ratings)

# Select the id of the users who have liked the reference movie and select the other movies they liked
user_ids = movie_ratings['userId'].unique()
filtered_df = ratings[ratings['userId'].isin(user_ids)]

# Select only films other than the one chosen as the basis for recommendation and with a rating >=4
filtered_df = filtered_df[(filtered_df['movieId'] != movie_id)& (filtered_df['rating'] >= 4.0)]

# Count the number of times each film appears and select the top N (15)
n=15
movie_counts = filtered_df.groupby('movieId').size().reset_index(name='count')
movie_counts = movie_counts.sort_values('count', ascending=False)
top_n = movie_counts.head(n)

# Translate the movieId to title
popular_movies = [ml.movieID_to_name[key] for key in top_n['movieId']]
popular_movies

['Shawshank Redemption, The',
 'Forrest Gump',
 'Star Wars: Episode IV - A New Hope',
 'Pulp Fiction',
 'Silence of the Lambs, The',
 'Star Wars: Episode V - The Empire Strikes Back',
 'Matrix, The',
 'Jurassic Park',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)',
 'Star Wars: Episode VI - Return of the Jedi',
 'Sixth Sense, The',
 "Schindler's List",
 'Saving Private Ryan',
 'Usual Suspects, The',
 'Lion King, The']

In [11]:
# Check how many of these films appear in the recommendations
intersection = set(recommendations).intersection(set(popular_movies))
intersection

set()

In [12]:
score = round(len(intersection)/len(recommendations),4)*100
print("Similarity between recommendations and popular movies:"+str(score)+"%")

Similarity between recommendations and popular movies:0.0%


### Popularity based only on gender

Compare the recommendations with the best rated movies by those users who liked the reference movie. Consider only the movies that have at least one genre in common with the reference one.

In [13]:
def filterMoviesByGenres(df, desired_genre_ids, genre_to_genreID, genres):

    # Filter the dataframe to include only the movies that contain at least one of the desired genres
    mask = [any(genres[int(movie_id)][gid] for gid in desired_genre_ids) for movie_id in df['movieId'].astype(int)]
    filtered_df = df[mask]

    return filtered_df

In [14]:
desired_genre_ids = ml.movies_genres[movie_id]

# Filter the dataset
filtered_movies = filterMoviesByGenres(movies,desired_genre_ids,ml.genre_to_genreID,ml.movies_genres)
filtered_movie_ids = filtered_movies['movieId'].unique()

In [15]:
movie_id = ml.name_to_movieID[movie_name]

# Select only the rows corresponding to a rating equal or higher than 4 for the desired movie
movie_ratings = ratings[(ratings['movieId'] == movie_id)& (ratings['rating'] >= 4.0)]
len(movie_ratings)

# Select the id of the users who have liked the reference movie and select the other movies they liked
user_ids = movie_ratings['userId'].unique()
filtered_df = ratings[ratings['userId'].isin(user_ids)]

# Select only films other than the one chosen as the basis for recommendation and with a rating >=4
# and that share at least one genre with the reference film
filtered_df = filtered_df[(filtered_df['movieId'] != movie_id)& (filtered_df['rating'] >= 4.0) & (filtered_df['movieId'].isin(filtered_movie_ids))]
# len(filtered_df)

# Count the number of times each film appears and select the top N (15)
n=15
movie_counts = filtered_df.groupby('movieId').size().reset_index(name='count')
movie_counts = movie_counts.sort_values('count', ascending=False)
top_n = movie_counts.head(n)

# Translate the movieId to title
popular_movies = [ml.movieID_to_name[key] for key in top_n['movieId']]
popular_movies

['Star Wars: Episode IV - A New Hope',
 'Star Wars: Episode V - The Empire Strikes Back',
 'Jurassic Park',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark)',
 'Star Wars: Episode VI - Return of the Jedi',
 'Lion King, The',
 'Aladdin',
 'Back to the Future',
 'Shrek',
 'Apollo 13',
 'Lord of the Rings: The Return of the King, The',
 'Lord of the Rings: The Fellowship of the Ring, The',
 'Princess Bride, The',
 'Monty Python and the Holy Grail',
 'Finding Nemo']

In [16]:
# Check how many of these films appear in the recommendations
intersection = set(recommendations).intersection(set(popular_movies))
intersection

{'Shrek'}

In [17]:
score = round(len(intersection)/len(recommendations),4)*100
print("Similarity between recommendations and popular movies:"+str(score)+"%")

Similarity between recommendations and popular movies:6.67%


By considering only the movies that shared at least one genre with the reference movie we improve the results