## Building a Movie Recommendation System
## Data set is downloaded from Kaggle
### Follow the link to download the dataset " https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?select=movies_metadata.csv "

In [1]:
# importing neccessary libraries.
import pandas as pd
import numpy as np

# Other libraries which are required will be imported whereever used.

In [2]:
# Loading dataset 'movies_metadata.csv'
data = pd.read_csv('movies_metadata.csv', low_memory = False)
data = data[0:12000]  #using only 12000 datapoints for faster computation and avoid space complexity.
data.head(5)  #printing 1st rows of the dataset
data.shape

(12000, 24)

## The data has unnessacary coloumns and null values so we use data cleaning techniques to acheive a meaning full data set 

In [3]:
data = data[['original_title', 'genres', 'overview', 'runtime', 'vote_average']] # extracting only required columns 
data.head(5)

Unnamed: 0,original_title,genres,overview,runtime,vote_average
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",81.0,7.7
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,104.0,6.9
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,101.0,6.5
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",127.0,6.1
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,106.0,5.7


In [4]:
# Extracting only the genre names from genre column and replacing them inplace of original column
data['genres'] = data['genres'].apply(lambda x : [i['name'] for i in eval(x)])
data['genres'] = data['genres'].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))
data.head(5)

Unnamed: 0,original_title,genres,overview,runtime,vote_average
0,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",81.0,7.7
1,Jumanji,Adventure Fantasy Family,When siblings Judy and Peter discover an encha...,104.0,6.9
2,Grumpier Old Men,Romance Comedy,A family wedding reignites the ancient feud be...,101.0,6.5
3,Waiting to Exhale,Comedy Drama Romance,"Cheated on, mistreated and stepped on, the wom...",127.0,6.1
4,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...,106.0,5.7


In [5]:
data['genres'].isnull().sum()
data['genres'] = data['genres'].fillna('general')
data.head(5)

Unnamed: 0,original_title,genres,overview,runtime,vote_average
0,Toy Story,Animation Comedy Family,"Led by Woody, Andy's toys live happily in his ...",81.0,7.7
1,Jumanji,Adventure Fantasy Family,When siblings Judy and Peter discover an encha...,104.0,6.9
2,Grumpier Old Men,Romance Comedy,A family wedding reignites the ancient feud be...,101.0,6.5
3,Waiting to Exhale,Comedy Drama Romance,"Cheated on, mistreated and stepped on, the wom...",127.0,6.1
4,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...,106.0,5.7


In [6]:
#Importing tfidf vectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer() # creating a tfidf vectorizer
tfidf_matrix = tfidf.fit_transform(data['genres'])
tfidf_matrix.shape

(12000, 20)

## For the above matrix we need to find the similarity score 

In [7]:
# caluculating dot product using sklearn's linear kernel
from sklearn.metrics.pairwise import linear_kernel
# calculating the cosine similarity on tfidf matrix
cosine_sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
# Now lets map all the movie names('original_title') to the index
movie_index = pd.Series(data.index, index = data['original_title']).drop_duplicates()

In [9]:
movie_id = movie_index['Assassins']
movie_id

22

In [10]:
# defining a function to recommend movies
def get_recommendations(Name, topN):
    movie_id = movie_index[Name]
    # now we need to calculate the pair wise similarity score for all movies's with the selected movie
    cosine_scores = list(enumerate(cosine_sim_matrix[movie_id]))
    # Sorting the cosine_similarity based on score
    cosine_scores = sorted(cosine_scores, key = lambda x:x[1], reverse = True)
    # Get the scores of Top N most similar movies
    cosine_scores_topN = cosine_scores[0:topN+1]
    
    # mapping scores and movies and printing similar movies
    movie_idx = [i[0] for i in cosine_scores_topN]
    movie_scores = [i[1] for i in cosine_scores_topN]
    similar_movies = pd.DataFrame(columns = ["Name", "Score"])
    similar_movies['Name'] = data.loc[movie_idx, 'original_title']
    similar_movies['Score'] = movie_scores
    similar_movies.reset_index(inplace = True)
    print(similar_movies)

In [11]:
get_recommendations('Across the Sea of Time', topN = 10)

    index                      Name     Score
0      36    Across the Sea of Time  1.000000
1    1545      Seven Years in Tibet  0.840401
2    4559                Black Robe  0.840401
3    5093                  Barabbas  0.840401
4    6911  Aguirre, der Zorn Gottes  0.840401
5    6935        A Passage to India  0.840401
6    7628        Sodom and Gomorrah  0.840401
7    7994                   Tai-Pan  0.840401
8    8427      Passage to Marseille  0.840401
9    8556      Mutiny on the Bounty  0.840401
10  11233                   Ivanhoe  0.840401
