# Movie Recommendation based on Genres using cosine similarity

Dataset - https://www.kaggle.com/datasets/ayushimishra2809/movielens-dataset<br>
This dataset contains 2 csv files.<br>
First one has movie id, title and genres.<br>
Second one has user id, movie id, rating and timestamp

In [2]:
# import necessary packages
import pandas as pd

In [3]:
# Movies dataframe
movies_df = pd.read_csv("movies.csv",header=0)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Ratings dataframe
ratings_df = pd.read_csv("ratings.csv",header=0)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


### Recommendation based on genre
For the first part of this recommendation, we are going to recommend movies based on genre <br>
i.e., if a user has watched an action/war movie, then suggest another similar movie to the user

In [5]:
# We need only movies_df for this section
movie_df = movies_df.copy()
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# We will use movie_df from now
movie_df['genres'] = movie_df['genres'].str.split("|")
genres = (movie_df['genres'].values) # separating the values in genres column
print(genres)

[list(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'])
 list(['Adventure', 'Children', 'Fantasy']) list(['Comedy', 'Romance'])
 ... list(['Comedy']) list(['Drama']) list(['(no genres listed)'])]


In [7]:
genres_unique = list(set(sum(genres,[])))
genres_unique

['Sci-Fi',
 'Thriller',
 'Children',
 'Adventure',
 'Animation',
 'IMAX',
 'Comedy',
 'Drama',
 '(no genres listed)',
 'Musical',
 'Mystery',
 'Western',
 'Horror',
 'Romance',
 'Crime',
 'Fantasy',
 'Film-Noir',
 'Action',
 'Documentary',
 'War']

In [8]:
for g in genres_unique:
    movie_df[g] = 0

In [9]:
def populate_genre(row):
    for g in genres_unique:
        if g in row['genres']:
            row[g] = 1
    return row

In [10]:
unique_geners = genres_unique
unique_geners[-1] = 'genres'
movie_df[unique_geners] = movie_df[unique_geners].apply(populate_genre,axis=1)

In [11]:
movie_df.head()

Unnamed: 0,movieId,title,genres,Sci-Fi,Thriller,Children,Adventure,Animation,IMAX,Comedy,...,Mystery,Western,Horror,Romance,Crime,Fantasy,Film-Noir,Action,Documentary,War
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,0,1,...,0,0,0,0,0,1,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# dropping the columns that we are not using 
movie_df = movie_df.drop(['(no genres listed)'],axis=1)

In [13]:
# check for null values
movie_df.isna().sum()

movieId        0
title          0
genres         0
Sci-Fi         0
Thriller       0
Children       0
Adventure      0
Animation      0
IMAX           0
Comedy         0
Drama          0
Musical        0
Mystery        0
Western        0
Horror         0
Romance        0
Crime          0
Fantasy        0
Film-Noir      0
Action         0
Documentary    0
War            0
dtype: int64

In [14]:
# calculating cosine similarity of genres between movies

from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(movie_df.drop(['movieId','title','genres'],axis=1))

In [15]:
print(similarity_matrix.shape)

(10329, 10329)


In [16]:
# mapping the default index of the dataframe with the movie titles
indices = pd.Series(movie_df.index, index=movie_df['title'])

In [17]:
print(indices)

title
Toy Story (1995)                           0
Jumanji (1995)                             1
Grumpier Old Men (1995)                    2
Waiting to Exhale (1995)                   3
Father of the Bride Part II (1995)         4
                                       ...  
Cosmic Scrat-tastrophe (2015)          10324
Le Grand Restaurant (1966)             10325
A Very Murray Christmas (2015)         10326
The Big Short (2015)                   10327
Marco Polo: One Hundred Eyes (2015)    10328
Length: 10329, dtype: int64


In [18]:
# function that returns movie recommendations based on cosine similarity
def recommend_movie(title, similarity_matrix, movie_df, num_results):
    # input: title -> movie title
    #        similarity_matrix -> cosine similarity matrix for the entire df
    #        num_results -> Number of movies to be recommended
    
    # getting the index of the input movie
    movie_index = indices[title]
    
    # getting the row from the similarity matrix corresponding to this movie
    similarity_scores = list(enumerate(similarity_matrix[movie_index]))
    
    # Adding a new column to the dataframe with the similarity score between this movie and all of the rest
    movie_df['similarity score'] = [s[1] for s in similarity_scores]
    
    # sort dataframe based on the similarity score
    movie_df = movie_df.sort_values(by=['similarity score'],ascending=False)
    
    return movie_df[['movieId','title','genres','similarity score']].head(num_results)
    


In [19]:
# To get a recommendation, pass the movie title and call the function 
title = "Waiting to Exhale (1995)"
num_results = 10
recommend_movie(title,similarity_matrix,movie_df,num_results)

Unnamed: 0,movieId,title,genres,similarity score
1777,2245,Working Girl (1988),"[Comedy, Drama, Romance]",1.0
5191,7129,Queen of Hearts (1989),"[Comedy, Drama, Romance]",1.0
4513,6037,Summer Lovers (1982),"[Comedy, Drama, Romance]",1.0
5888,8969,Bridget Jones: The Edge of Reason (2004),"[Comedy, Drama, Romance]",1.0
10004,112850,Words and Pictures (2013),"[Comedy, Drama, Romance]",1.0
1838,2324,Life Is Beautiful (La Vita è bella) (1997),"[Comedy, Drama, Romance, War]",1.0
7888,62434,Zack and Miri Make a Porno (2008),"[Comedy, Drama, Romance]",1.0
1036,1277,Cyrano de Bergerac (1990),"[Comedy, Drama, Romance]",1.0
4224,5525,Mostly Martha (Bella Martha) (2001),"[Comedy, Drama, Romance]",1.0
7822,61048,Expired (2007),"[Comedy, Drama, Romance]",1.0
