# Movie Recommendation based on Genres using cosine similarity

Dataset - https://www.kaggle.com/datasets/ayushimishra2809/movielens-dataset<br>
This dataset contains 2 csv files.<br>
First one has movie id, title and genres.<br>
Second one has user id, movie id, rating and timestamp

In [2]:
# import necessary packages
import pandas as pd

In [3]:
# Movies dataframe
movies_df = pd.read_csv("movies.csv",header=0)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Ratings dataframe
ratings_df = pd.read_csv("ratings.csv",header=0)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


### Recommendation based on genre
For the first part of this recommendation, we are going to recommend movies based on genre <br>
i.e., if a user has watched an action/war movie, then suggest another similar movie to the user

In [160]:
# We need only movies_df for this section
movie_df = movies_df.copy()
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [161]:
# We will use movie_df from now
movie_df['genres'] = movie_df['genres'].str.split("|")
genres = (movie_df['genres'].values) # separating the values in genres column
print(genres)

[list(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'])
 list(['Adventure', 'Children', 'Fantasy']) list(['Comedy', 'Romance'])
 ... list(['Comedy']) list(['Drama']) list(['(no genres listed)'])]


In [162]:
genres_unique = list(set(sum(genres,[])))
genres_unique

['Mystery',
 'Children',
 'Drama',
 'Comedy',
 'Documentary',
 'Romance',
 'IMAX',
 'Thriller',
 'Action',
 'Film-Noir',
 'Adventure',
 'Animation',
 'Sci-Fi',
 'Fantasy',
 'Crime',
 'War',
 '(no genres listed)',
 'Horror',
 'Western',
 'Musical']

In [163]:
for g in genres_unique:
    movie_df[g] = 0

In [164]:
def populate_genre(row):
    for g in genres_unique:
        if g in row['genres']:
            row[g] = 1
    return row

In [165]:
unique_geners = genres_unique
unique_geners[-1] = 'genres'
movie_df[unique_geners] = movie_df[unique_geners].apply(populate_genre,axis=1)

In [166]:
movie_df.head()

Unnamed: 0,movieId,title,genres,Mystery,Children,Drama,Comedy,Documentary,Romance,IMAX,...,Adventure,Animation,Sci-Fi,Fantasy,Crime,War,(no genres listed),Horror,Western,Musical
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,1,0,1,0,0,0,...,1,1,0,1,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,1,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [167]:
# dropping the columns that we are not using 
movie_df = movie_df.drop(['(no genres listed)'],axis=1)

In [168]:
# check for null values
movie_df.isnull().sum()

movieId        0
title          0
genres         0
Mystery        0
Children       0
Drama          0
Comedy         0
Documentary    0
Romance        0
IMAX           0
Thriller       0
Action         0
Film-Noir      0
Adventure      0
Animation      0
Sci-Fi         0
Fantasy        0
Crime          0
War            0
Horror         0
Western        0
Musical        0
dtype: int64

In [169]:
# calculating cosine similarity of genres between movies

from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(movie_df.drop(['movieId','title','genres'],axis=1))

In [170]:
print(similarity_matrix.shape)

(10329, 10329)


In [172]:
# mapping the default index of the dataframe with the movie titles
indices = pd.Series(movie_df.index, index=movie_df['title'])

In [173]:
print(indices)

title
Toy Story (1995)                           0
Jumanji (1995)                             1
Grumpier Old Men (1995)                    2
Waiting to Exhale (1995)                   3
Father of the Bride Part II (1995)         4
                                       ...  
Cosmic Scrat-tastrophe (2015)          10324
Le Grand Restaurant (1966)             10325
A Very Murray Christmas (2015)         10326
The Big Short (2015)                   10327
Marco Polo: One Hundred Eyes (2015)    10328
Length: 10329, dtype: int64


In [180]:
# function that returns movie recommendations based on cosine similarity
def recommend_movie(title, similarity_matrix, movie_df, num_results):
    # input: title -> movie title
    #        similarity_matrix -> cosine similarity matrix for the entire df
    #        num_results -> Number of movies to be recommended
    
    # getting the index of the input movie
    movie_index = indices[title]
    
    # getting the row from the similarity matrix corresponding to this movie
    similarity_scores = list(enumerate(similarity_matrix[movie_index]))
    
    # Adding a new column to the dataframe with the similarity score between this movie and all of the rest
    movie_df['similarity score'] = [s[1] for s in similarity_scores]
    
    # sort dataframe based on the similarity score
    movie_df = movie_df.sort_values(by=['similarity score'],ascending=False)
    
    return movie_df[['movieId','title','genres','similarity score']].head(num_results)
    


In [181]:
# To get a recommendation, pass the movie title and call the function 
title = "Waiting to Exhale (1995)"
num_results = 10
recommend_movie(title,similarity_matrix,movie_df,num_results)

Unnamed: 0,movieId,title,genres,similarity score
4615,6241,Pauline at the Beach (Pauline à la Plage) (1983),"[Comedy, Drama, Romance]",1.0
4807,6549,How to Deal (2003),"[Comedy, Drama, Romance]",1.0
4838,6596,"Divorce, Le (2003)","[Comedy, Drama, Romance]",1.0
4858,6626,"Cemetery Club, The (1993)","[Comedy, Drama, Romance]",1.0
8579,78174,Sex and the City 2 (2010),"[Comedy, Drama, Romance]",1.0
4899,6696,Bollywood/Hollywood (2002),"[Comedy, Drama, Musical, Romance]",1.0
4901,6700,"Other Side of the Bed, The (Otro lado de la ca...","[Comedy, Drama, Musical, Romance]",1.0
4907,6710,Dummy (2002),"[Comedy, Drama, Romance]",1.0
4908,6711,Lost in Translation (2003),"[Comedy, Drama, Romance]",1.0
631,753,"Month by the Lake, A (1995)","[Comedy, Drama, Romance]",1.0
