<a href="https://colab.research.google.com/github/insh-samnani/Movie-Recommendation-System-Python/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing dependencies.

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import difflib

In [None]:
#Reading the two source CSV files (titles and credits) from GitHub into pandas dataframes.

titles_url='https://raw.githubusercontent.com/insh-samnani/DataSets/main/titles.csv'
credits_url='https://raw.githubusercontent.com/insh-samnani/DataSets/main/credits.csv'
titles=pd.read_csv(titles_url)
credits=pd.read_csv(credits_url)

In [None]:
#Previewing the first five rows of the "titles" dataframe.

titles.head(5)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,48,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,27.612,8.2
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,18.216,7.8
3,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,17.505,7.8
4,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,95.337,7.7


In [None]:
#Previewing the first five rows of the "credits" dataframe.

credits.head(5)

Unnamed: 0,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [None]:
#Both imdb_score and imdb_votes are required later, so every row missing either of them is dropped.
#In the recorded run this reduced the data from 5806 to 5267 rows, which is still enough for recommending movies.
#The shape is printed before and after the drop, followed by the remaining null count per column.

print(titles.shape)
titles=titles.dropna(subset=['imdb_score','imdb_votes'])
print(titles.shape)
titles.isna().sum()

(5806, 15)
(5267, 15)


id                         0
title                      0
type                       0
description                4
release_year               0
age_certification       2278
runtime                    0
genres                     0
production_countries       0
seasons                 3391
imdb_id                    0
imdb_score                 0
imdb_votes                 0
tmdb_popularity           82
tmdb_score               226
dtype: int64

In [None]:
#Counting fully duplicated rows in "titles" (all columns are compared), so that they could be dropped if any exist.

titles.duplicated().sum()

0

In [None]:
#Joining every title with its credit rows on the shared 'id' column (pandas' default inner join), then counting the null values per column.
#The columns that still contain nulls are not among the columns kept for the algorithm in the next step.

titles_with_credits=pd.merge(titles,credits,on='id')
titles_with_credits.isna().sum()

id                          0
title                       0
type                        0
description                31
release_year                0
age_certification       28059
runtime                     0
genres                      0
production_countries        0
seasons                 59995
imdb_id                     0
imdb_score                  0
imdb_votes                  0
tmdb_popularity            10
tmdb_score                754
person_id                   0
name                        0
character                8395
role                        0
dtype: int64

In [None]:
#Keeping only the columns that are used later and removing duplicated rows.
#drop_duplicates() returns a new dataframe instead of modifying the existing one, so its result must be assigned back;
#without the assignment the duplicated rows would only be hidden in the displayed output and stay in the data.

titles_with_credits=titles_with_credits[['id','title','type','genres','imdb_score','imdb_votes','person_id']].drop_duplicates()
titles_with_credits

Unnamed: 0,id,title,type,genres,imdb_score,imdb_votes,person_id
0,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,3748
1,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,14658
2,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,7064
3,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,3739
4,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,48933
...,...,...,...,...,...,...,...
72845,tm1098060,Shadow Parties,MOVIE,"['action', 'thriller']",6.2,9.0,1347054
72846,tm1098060,Shadow Parties,MOVIE,"['action', 'thriller']",6.2,9.0,157590
72847,tm1098060,Shadow Parties,MOVIE,"['action', 'thriller']",6.2,9.0,129059
72848,tm1098060,Shadow Parties,MOVIE,"['action', 'thriller']",6.2,9.0,2050199


### **POPULARITY BASED RECOMMENDATION SYSTEM**

In [None]:
#Counting the non-null imdb_score entries per title in the merged dataframe; this count is used as the title's number of ratings.
#Rows are grouped by 'title', so different titles that share the same name are pooled into one group.

titles_with_no_of_ratings=(
    titles_with_credits
    .groupby('title')['imdb_score']
    .count()
    .reset_index()
    .rename(columns={'imdb_score':'no_of_ratings'})
)
print(titles_with_no_of_ratings.shape)
titles_with_no_of_ratings.head()

(4910, 2)


Unnamed: 0,title,no_of_ratings
0,#Alive,20
1,#AnneFrank. Parallel Stories,6
2,#FollowFriday,12
3,#FriendButMarried,29
4,#FriendButMarried 2,20


In [None]:
#Computing the average imdb_score per title.
#Rows are grouped by 'title', so different titles that share the same name contribute to one common average.

titles_with_avg_ratings=(
    titles_with_credits
    .groupby('title')['imdb_score']
    .mean()
    .reset_index()
    .rename(columns={'imdb_score':'avg_ratings'})
)
print(titles_with_avg_ratings.shape)
titles_with_avg_ratings.head()

(4910, 2)


Unnamed: 0,title,avg_ratings
0,#Alive,6.3
1,#AnneFrank. Parallel Stories,6.5
2,#FollowFriday,2.7
3,#FriendButMarried,6.9
4,#FriendButMarried 2,6.9


In [None]:
#Combining the rating counts and the average ratings into one dataframe, matched on 'title'.

popularity=pd.merge(titles_with_no_of_ratings,titles_with_avg_ratings,on='title')
print(popularity.shape)
popularity.head()

(4910, 3)


Unnamed: 0,title,no_of_ratings,avg_ratings
0,#Alive,20,6.3
1,#AnneFrank. Parallel Stories,6,6.5
2,#FollowFriday,12,2.7
3,#FriendButMarried,29,6.9
4,#FriendButMarried 2,20,6.9


In [None]:
#Keeping only the movies whose number of ratings is 50 or above.
#Sorting them by average rating in descending order, so that the highest rated movies come first.
#Printing how many movies qualified, then keeping the top 20 as the trending list.

has_enough_ratings=popularity['no_of_ratings']>=50
qualified=popularity[has_enough_ratings].sort_values('avg_ratings',ascending=False)
print(qualified.shape)
popularity1=qualified.head(20)
popularity1

(181, 3)


Unnamed: 0,title,no_of_ratings,avg_ratings
1405,Forrest Gump,77,8.8
3354,Saving Private Ryan,83,8.6
1123,Django Unchained,113,8.4
2877,Once Upon a Time in America,83,8.3
4904,jeen-yuhs,65,8.1
3283,Rush,78,8.1
4718,Warrior,71,8.052113
753,Casino Royale,57,8.0
4042,The Imitation Game,68,8.0
2467,Marriage Story,57,7.9


In [None]:
#Attaching the descriptive columns (type, description, genres) from "titles" to the popularity1 dataframe.
#A title name can match more than one row of "titles", so only the first match per title is kept after the merge.

with_details=popularity1.merge(titles,on='title')
with_details=with_details.drop_duplicates('title')
popularity1=with_details[['title','no_of_ratings','avg_ratings','type','description','genres']]
print(popularity1.shape)
popularity1

(20, 6)


Unnamed: 0,title,no_of_ratings,avg_ratings,type,description,genres
0,Forrest Gump,77,8.8,MOVIE,A man with a low IQ has accomplished great thi...,"['drama', 'romance', 'comedy']"
1,Saving Private Ryan,83,8.6,MOVIE,"As U.S. troops storm the beaches of Normandy, ...","['drama', 'war']"
2,Django Unchained,113,8.4,MOVIE,"With the help of a German bounty hunter, a fre...","['western', 'drama']"
3,Once Upon a Time in America,83,8.3,MOVIE,A former Prohibition-era Jewish gangster retur...,"['drama', 'crime', 'european']"
4,jeen-yuhs,65,8.1,SHOW,"Kanye West docu-series, over 20 years in the m...","['documentation', 'music']"
5,Rush,78,8.1,MOVIE,A biographical drama centered on the rivalry b...,"['drama', 'action', 'sport', 'european']"
6,Warrior,71,8.052113,MOVIE,The youngest son of an alcoholic former boxer ...,"['drama', 'action', 'sport']"
8,Casino Royale,57,8.0,MOVIE,"Le Chiffre, a banker to the world's terrorists...","['thriller', 'action', 'european']"
9,The Imitation Game,68,8.0,MOVIE,Based on the real life story of legendary cryp...,"['thriller', 'drama', 'war']"
10,Marriage Story,57,7.9,MOVIE,A stage director and an actress struggle throu...,"['drama', 'romance', 'comedy']"


### **COLLABORATIVE FILTERING BASED RECOMMENDATION SYSTEM**

In [None]:
#Before building the collaborative filter, a quick look at the shape and first five rows of the merged dataframe prepared earlier.

print(titles_with_credits.shape)
titles_with_credits.head(5)

(72850, 7)


Unnamed: 0,id,title,type,genres,imdb_score,imdb_votes,person_id
0,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,3748
1,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,14658
2,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,7064
3,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,3739
4,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,48933


In [None]:
#Counting how many movies each person_id appears with. The person_id column comes from the credits data and is treated as the "user" here.
#In the recorded run the 72850 rows belonged to 51656 distinct person ids.

titles_with_credits.groupby('person_id')['imdb_score'].agg('count')

person_id
7          3
8          3
11         3
16         1
18         2
          ..
2371458    1
2371513    1
2371527    1
2371563    1
2371585    1
Name: imdb_score, Length: 51656, dtype: int64

In [None]:
#Only users with more than 5 rated movies are considered. "more_rated" is a boolean series (indexed by person_id) marking them.
#In the recorded run only 717 users passed this threshold.

ratings_per_user=titles_with_credits.groupby('person_id')['imdb_score'].count()
more_rated=ratings_per_user>5
more_rated.loc[more_rated]

person_id
58         True
61         True
130        True
132        True
135        True
           ... 
784991     True
836709     True
845008     True
1531470    True
2161480    True
Name: imdb_score, Length: 717, dtype: bool

In [None]:
#Storing the person ids of the users marked True in "more_rated" in a variable.

users=more_rated.index[more_rated.to_numpy()]
users

Int64Index([     58,      61,     130,     132,     135,     138,     142,
                145,     160,     198,
            ...
             701746,  724830,  743974,  745013,  749431,  784991,  836709,
             845008, 1531470, 2161480],
           dtype='int64', name='person_id', length=717)

In [None]:
#Keeping only the rows of the merged dataset whose person_id belongs to the users who rated more than 5 movies.
#In the recorded run this kept 5919 of the 72850 ratings.

rated_by_active_user=titles_with_credits['person_id'].isin(users)
filtered_ratings_by_users=titles_with_credits.loc[rated_by_active_user]
filtered_ratings_by_users

Unnamed: 0,id,title,type,genres,imdb_score,imdb_votes,person_id
0,tm84618,Taxi Driver,MOVIE,"['crime', 'drama']",8.3,795222.0,3748
37,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11472
38,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,1549
39,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11474
40,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11473
...,...,...,...,...,...,...,...
72709,tm846586,Layla Majnun,MOVIE,"['drama', 'romance']",6.1,272.0,49309
72768,tm878575,The Heartbreak Club,MOVIE,"['comedy', 'drama', 'romance']",6.2,188.0,697818
72779,tm985215,Princess 'Daya'Reese,MOVIE,"['romance', 'comedy']",7.2,45.0,206514
72795,tm1004011,Time to Dance,MOVIE,"['drama', 'romance']",2.2,950.0,63367


In [None]:
#Within 'filtered_ratings_by_users', finding the movies that have more than 5 ratings. In the recorded run there were 245 such movies.
#The titles of these movies are stored in "movies" for the filtering done in the next step.

ratings_per_movie=filtered_ratings_by_users.groupby('title')['imdb_score'].count()
completely_filtered=ratings_per_movie>5
movies=completely_filtered.index[completely_filtered.to_numpy()]
movies

Index(['1898: Our Last Men in the Philippines', '7 Khoon Maaf', '7SEEDS', '83',
       'A Futile and Stupid Gesture', 'A Tale Dark & Grimm',
       'A Very Harold & Kumar Christmas', 'A Very Murray Christmas',
       'A Whisker Away', 'A.I.C.O. -Incarnation-',
       ...
       'Wet Hot American Summer: First Day of Camp',
       'Wet Hot American Summer: Ten Years Later', 'Wolverine', 'Yaar Gaddar',
       'Yuva', 'Zero', 'Zindagi Na Milegi Dobara', 'Zoolander',
       'anohana: The Flower We Saw That Day', 'jeen-yuhs'],
      dtype='object', name='title', length=245)

In [None]:
#Keeping only the ratings of the movies that have more than 5 ratings in total.
#In short, in the recorded run the final filtered dataset was reduced further from 5919 rows to 2025 rows.

in_kept_movies=filtered_ratings_by_users['title'].isin(movies)
filtered_ratings=filtered_ratings_by_users.loc[in_kept_movies]
filtered_ratings

Unnamed: 0,id,title,type,genres,imdb_score,imdb_votes,person_id
37,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11472
38,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,1549
39,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11474
40,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11473
41,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11475
...,...,...,...,...,...,...,...
70535,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,629401
70537,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,14986
70539,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,222
70541,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,18723


In [None]:
#Checking whether the final filtered dataset contains duplicated rows and dropping them.
#drop_duplicates() returns a new dataframe instead of modifying the existing one, so its result must be assigned back;
#without the assignment the duplicated rows would stay in "filtered_ratings".
#In the recorded run 41 duplicates were found, which leaves 1984 unique rows.

print(filtered_ratings.duplicated().sum())
filtered_ratings=filtered_ratings.drop_duplicates()
filtered_ratings

41


Unnamed: 0,id,title,type,genres,imdb_score,imdb_votes,person_id
37,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11472
38,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,1549
39,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11474
40,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11473
41,tm127384,Monty Python and the Holy Grail,MOVIE,"['comedy', 'fantasy']",8.2,530877.0,11475
...,...,...,...,...,...,...,...
70535,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,629401
70537,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,14986
70539,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,222
70541,tm1006127,Arlo the Alligator Boy,MOVIE,"['animation', 'comedy', 'family']",6.5,1079.0,18723


In [None]:
#Reshaping the ratings into a pivot table: one row per title, one column per person_id, imdb_score as the cell value.
#If a (title, person_id) pair occurs more than once, pivot_table's default aggregation (mean) combines the scores.

piv_tab=pd.pivot_table(filtered_ratings,values='imdb_score',index='title',columns='person_id')

In [None]:
#Not every user has rated every movie, so the missing cells (NaN) are filled with 0.
#Each movie is then a vector with one component per user (531 components according to the original note).

piv_tab=piv_tab.fillna(0)
piv_tab

person_id,130,132,135,138,142,145,160,208,222,249,...,595436,600008,603750,617769,629401,724830,745013,749431,845008,2161480
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1898: Our Last Men in the Philippines,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7 Khoon Maaf,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SEEDS,0.0,0.0,0.0,6.3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.4
A Futile and Stupid Gesture,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zindagi Na Milegi Dobara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoolander,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
anohana: The Flower We Saw That Day,8.2,0.0,0.0,8.2,0.0,0.0,0.0,8.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#Computing the pairwise cosine similarity between all movie vectors (the rows of the pivot table).
#The variable keeps its original name "euc_dist" because later cells read it, but it holds cosine similarities
#(a higher value means more similar), not euclidean distances.
#The matrix is computed once and its shape is displayed from the stored result.

euc_dist=cosine_similarity(piv_tab)
euc_dist.shape

(245, 245)

In [None]:
#Now we will be making our function of recommendations

def recommendation_system(title, n=3):
  """Print the titles of the movies most similar to `title`.

  Reads the module-level `piv_tab` (rows indexed by movie title) and `euc_dist`
  (the similarity matrix computed from `piv_tab`, higher value = more similar).

  Parameters:
    title: a movie title that must be present in `piv_tab.index`.
    n: how many similar movies to print (defaults to 3, the original behaviour).

  Returns:
    None. The recommended titles are printed, one per line, most similar first.

  Raises:
    IndexError: if `title` is not present in `piv_tab.index`.
  """
  positions=np.where(piv_tab.index==title)[0] #Row positions whose title equals the requested one
  if len(positions)==0:
    raise IndexError("title not found in piv_tab: "+repr(title))
  ind=positions[0]
  ranked=sorted(enumerate(euc_dist[ind]),key=lambda pair:pair[1],reverse=True) #(position, similarity) pairs, most similar first
  #The queried movie is removed by its position instead of by slicing off the first entry:
  #another movie with an identical rating vector also scores 1 and may be sorted ahead of it.
  neighbours=[pos for pos,_ in ranked if pos!=ind][:n]
  for pos in neighbours:
    print(piv_tab.index[pos])

In [None]:
#Taking the movie title from the user, correcting it to the closest available title (BECAUSE USER CAN MIS SPELL THE MOVIE TITLE), and finally displaying the recommended movies to user

movie=input("ENTER THE MOVIE. WE WILL SUGGEST YOU THE BEST: ")
available_titles=filtered_ratings['title'].unique().tolist() #Each possible movie title once; the raw column repeats a title for every rating row
close_movie=difflib.get_close_matches(movie,available_titles) #Titles that closely match the input, best match first; empty when nothing is similar enough
if close_movie:
  close_movie_match=close_movie[0] #Extracting the first (best) close match
  print("\n\nTHE SUGGESTED MOVIES ARE: ")
  recommendation_system(close_movie_match)
else:
  #Without this guard close_movie[0] would raise an IndexError for an input that matches no title
  print("\n\nNO CLOSE MATCH FOUND FOR: "+movie)

ENTER THE MOVIE. WE WILL SUGGEST YOU THE BEST: 833


THE SUGGESTED MOVIES ARE: 
PK
Paan Singh Tomar
Zero
