Importing the dependencies

In [2]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [3]:
# loading the data from the csv file to apandas dataframe
movies_data = pd.read_csv('movies.csv')


In [4]:
# printing the first 5 rows of the dataframe

movies_data = movies_data.loc[movies_data['popularity']>=10]
movies_data.head()

Unnamed: 0.1,Unnamed: 0,title,id,imdb_id,original_language,original_title,overview,tagline,popularity,release_date,runtime
0,0,Toy Story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,21.946943,1995-10-30,81.0
1,1,Jumanji,8844,113497,en,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,17.015539,1995-12-15,104.0
2,2,Grumpier Old Men,15602,113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,11.7129,1995-12-22,101.0
5,5,Heat,949,113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,17.924927,1995-12-15,170.0
9,9,GoldenEye,710,113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,14.686036,1995-11-16,130.0


In [5]:
crew_data = pd.read_csv('crew.csv')

In [6]:
# printing the first 5 rows of the dataframe
crew_data.head()

Unnamed: 0.1,Unnamed: 0,id,job,department,name,gender,credit_id
0,0,862,Director,Directing,John Lasseter,2.0,52fe4284c3a36847f8024f49
1,1,862,Screenplay,Writing,Joss Whedon,2.0,52fe4284c3a36847f8024f4f
2,2,862,Screenplay,Writing,Andrew Stanton,2.0,52fe4284c3a36847f8024f55
3,3,862,Screenplay,Writing,Joel Cohen,2.0,52fe4284c3a36847f8024f5b
4,4,862,Screenplay,Writing,Alec Sokolow,0.0,52fe4284c3a36847f8024f61


In [7]:
# get only the directors of the crews
director_data = crew_data[crew_data['job'] == 'Director']

director_data.head()


Unnamed: 0.1,Unnamed: 0,id,job,department,name,gender,credit_id
0,0,862,Director,Directing,John Lasseter,2.0,52fe4284c3a36847f8024f49
109,109,8844,Director,Directing,Joe Johnston,2.0,52fe44bfc3a36847f80a7c7d
122,122,15602,Director,Directing,Howard Deutch,2.0,52fe466a9251416c75077a89
126,126,31357,Director,Directing,Forest Whitaker,2.0,52fe44779251416c91011acb
141,141,11862,Director,Directing,Charles Shyer,2.0,52fe44959251416c75039eef


In [8]:
#rename the column
director_data.rename(columns={"name": "director"}, inplace=True)

director_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  director_data.rename(columns={"name": "director"}, inplace=True)


Unnamed: 0.1,Unnamed: 0,id,job,department,director,gender,credit_id
0,0,862,Director,Directing,John Lasseter,2.0,52fe4284c3a36847f8024f49
109,109,8844,Director,Directing,Joe Johnston,2.0,52fe44bfc3a36847f80a7c7d
122,122,15602,Director,Directing,Howard Deutch,2.0,52fe466a9251416c75077a89
126,126,31357,Director,Directing,Forest Whitaker,2.0,52fe44779251416c91011acb
141,141,11862,Director,Directing,Charles Shyer,2.0,52fe44959251416c75039eef


In [9]:
genres_data = pd.read_csv('genres.csv')[['id', 'original_title', 'genre']]
genres_data = genres_data.drop_duplicates()
genres_data = genres_data.groupby(['id', 'original_title'])['genre'].apply(' '.join)

genres_data = genres_data.apply(lambda x: ' '.join(sorted(x.split())))



In [10]:
# printing the first 5 rows of the dataframe
genres_data.head()

id  original_title      
2   Ariel                                        Crime Drama
3   Varjoja paratiisissa                        Comedy Drama
5   Four Rooms                                  Comedy Crime
6   Judgment Night                     Action Crime Thriller
11  Star Wars               Action Adventure Fiction Science
Name: genre, dtype: object

In [11]:
keywords_data = pd.read_csv('keywords.csv')[['id', 'keywords']]
keywords_data = keywords_data.drop_duplicates()
keywords_data = keywords_data.groupby('id')['keywords'].apply(' '.join)
keywords_data = keywords_data.apply(lambda x: ' '.join(sorted(x.split())))

In [12]:
# printing the first 5 rows of the dataframe
keywords_data.head(20)

id
2     factory falling film helsinki in independent l...
3          film garbage helsinki independent salesclerk
5     angeles bet director episode eve film hoodlum ...
6     boxing chicago dealer drug escape match night one
11    android death empire force galactic galaxy her...
12    aftercreditsstinger barrier child clownfish du...
13    amputee based bench bully disabled family flas...
14    adultery affair age agent bittersweet camcorde...
15    art banker capitalist child collector florida ...
16    and and blindness crime czech dance dancing de...
17    abuse adolescence child child daughter loss mo...
18    against alien ancient ancient archeologist arm...
19    babel chase class delirium depravity destructi...
20    and daughter daughter death director dying far...
21                             surfboard surfer surfing
22    aftercreditsstinger alcoholic blacksmith capuc...
24    animation arts blood bride coma female fu gore...
25    arabia corps golf marine marine petrol 

In [13]:
# join dataframes on id

# join 1
df1 = pd.merge(movies_data, director_data, on="id")

df1.loc[df1['original_title'] == 'It Comes at Night']

Unnamed: 0,Unnamed: 0_x,title,id,imdb_id,original_language,original_title,overview,tagline,popularity,release_date,runtime,Unnamed: 0_y,job,department,director,gender,credit_id
3292,43289,It Comes at Night,418078,4695012,en,It Comes at Night,Secure within a desolate home as an unnatural ...,,20.504587,2017-06-09,91.0,449984,Director,Directing,Trey Edward Shults,2.0,57e8c30f9251412ae3019076


In [14]:
# join 2
df2 = pd.merge(df1, keywords_data, on="id")

df2.head()

Unnamed: 0,Unnamed: 0_x,title,id,imdb_id,original_language,original_title,overview,tagline,popularity,release_date,runtime,Unnamed: 0_y,job,department,director,gender,credit_id,keywords
0,0,Toy Story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,21.946943,1995-10-30,81.0,0,Director,Directing,John Lasseter,2.0,52fe4284c3a36847f8024f49,boy boy comes door friends friendship jealousy...
1,1,Jumanji,8844,113497,en,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,17.015539,1995-12-15,104.0,109,Director,Directing,Joe Johnston,2.0,52fe44bfc3a36847f80a7c7d,based board book children's disappearance game...
2,2,Grumpier Old Men,15602,113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,11.7129,1995-12-22,101.0,122,Director,Directing,Howard Deutch,2.0,52fe466a9251416c75077a89,best duringcreditsstinger fishing friend men old
3,5,Heat,949,113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,17.924927,1995-12-15,170.0,143,Director,Directing,Michael Mann,2.0,52fe4292c3a36847f802916d,and bank bank betrayal cat chase crime crimina...
4,9,GoldenEye,710,113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,14.686036,1995-11-16,130.0,280,Director,Directing,Martin Campbell,2.0,52fe426ec3a36847f801e14b,accused army base bomb car computer cossack cu...


In [15]:
# join 3
df3 = pd.merge(df2, genres_data, on="id")

df3.head()

Unnamed: 0,Unnamed: 0_x,title,id,imdb_id,original_language,original_title,overview,tagline,popularity,release_date,runtime,Unnamed: 0_y,job,department,director,gender,credit_id,keywords,genre
0,0,Toy Story,862,114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,21.946943,1995-10-30,81.0,0,Director,Directing,John Lasseter,2.0,52fe4284c3a36847f8024f49,boy boy comes door friends friendship jealousy...,Animation Comedy Family
1,1,Jumanji,8844,113497,en,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,17.015539,1995-12-15,104.0,109,Director,Directing,Joe Johnston,2.0,52fe44bfc3a36847f80a7c7d,based board book children's disappearance game...,Adventure Family Fantasy
2,2,Grumpier Old Men,15602,113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,11.7129,1995-12-22,101.0,122,Director,Directing,Howard Deutch,2.0,52fe466a9251416c75077a89,best duringcreditsstinger fishing friend men old,Comedy Romance
3,5,Heat,949,113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,17.924927,1995-12-15,170.0,143,Director,Directing,Michael Mann,2.0,52fe4292c3a36847f802916d,and bank bank betrayal cat chase crime crimina...,Action Crime Drama Thriller
4,9,GoldenEye,710,113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.,14.686036,1995-11-16,130.0,280,Director,Directing,Martin Campbell,2.0,52fe426ec3a36847f801e14b,accused army base bomb car computer cossack cu...,Action Adventure Thriller


In [16]:
# number of rows and columns in the data frame

df3.shape

(3239, 19)

In [17]:

df3 = df3.loc[df3['popularity'] >= 10]
df3.shape
df1.loc[df1['popularity'] < 10]

Unnamed: 0,Unnamed: 0_x,title,id,imdb_id,original_language,original_title,overview,tagline,popularity,release_date,runtime,Unnamed: 0_y,job,department,director,gender,credit_id


In [18]:
# selecting the relevant features for recommendation

selected_features = ['genre','keywords','director', 'popularity', 'tagline']
print(selected_features)

['genre', 'keywords', 'director', 'popularity', 'tagline']


In [19]:
# replacing the null valuess with null string

for feature in selected_features:
  df3[feature] = df3[feature].fillna('')

In [20]:
df4 = df3[['id', 'genre','keywords','director', 'popularity', 'tagline', 'original_title', 'imdb_id']]
df4.head()


Unnamed: 0,id,genre,keywords,director,popularity,tagline,original_title,imdb_id
0,862,Animation Comedy Family,boy boy comes door friends friendship jealousy...,John Lasseter,21.946943,,Toy Story,114709
1,8844,Adventure Family Fantasy,based board book children's disappearance game...,Joe Johnston,17.015539,Roll the dice and unleash the excitement!,Jumanji,113497
2,15602,Comedy Romance,best duringcreditsstinger fishing friend men old,Howard Deutch,11.7129,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,113228
3,949,Action Crime Drama Thriller,and bank bank betrayal cat chase crime crimina...,Michael Mann,17.924927,A Los Angeles Crime Saga,Heat,113277
4,710,Action Adventure Thriller,accused army base bomb car computer cossack cu...,Martin Campbell,14.686036,No limits. No fears. No substitutes.,GoldenEye,113189


In [22]:
# combining rows
#define how to aggregate various fields
agg_functions = {'genre':  ' '.join, 'keywords':  ' '.join, 'director':  'first', 'popularity': 'first', 'tagline': 'first', 'original_title': 'first', 'imdb_id': 'first'}

#create new DataFrame by combining rows with same id values
df_new = df4.groupby(df4['id']).aggregate(agg_functions)
df_new['keywords']=df_new['keywords']. apply(str. lower)
df_new['tagline']=df_new['tagline']. apply(str. lower)
df_new['genre']=df_new['genre']. apply(str. lower)

In [23]:
df_train = df_new


df_train = df_train.reset_index(drop=True)

df_train['id'] = df_train.index

df_train.head()

Unnamed: 0,genre,keywords,director,popularity,tagline,original_title,imdb_id,id
0,action adventure fiction science,android death empire force galactic galaxy her...,George Lucas,42.149697,"a long time ago in a galaxy far, far away...",Star Wars,76759,0
1,animation family,aftercreditsstinger barrier child clownfish du...,Andrew Stanton,25.497794,there are 3.7 trillion fish in the ocean. they...,Finding Nemo,266543,1
2,comedy drama romance,amputee based bench bully disabled family flas...,Robert Zemeckis,48.307194,"the world will never be the same, once you've ...",Forrest Gump,109830,2
3,drama,adultery affair age agent bittersweet camcorde...,Sam Mendes,20.726578,look closer.,American Beauty,169547,3
4,drama mystery,art banker capitalist child collector florida ...,Orson Welles,15.811921,it's terrific!,Citizen Kane,33467,4


In [24]:
# combining all the 3 selected features

combined_features = df_train['genre']+' '+df_train['keywords']+' '+df_train['director']+' '+df_train['tagline']+' '+str(df_train['popularity'])



In [25]:
print(combined_features)

0       action adventure fiction science android death...
1       animation family aftercreditsstinger barrier c...
2       comedy drama romance amputee based bench bully...
3       drama adultery affair age agent bittersweet ca...
4       drama mystery art banker capitalist child coll...
                              ...                        
2907    action thriller suspense Alex Merkin  0       ...
2908    drama movie music romance tv 1960s catskills m...
2909    animation comedy family romance animation come...
2910    action crime thriller s.w.a.t. Tony Giglio  0 ...
2911    action adventure crime saint the Ernie Barbara...
Length: 2912, dtype: object


In [26]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [27]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [28]:
print(feature_vectors)

  (0, 3352)	0.02663747186820855
  (0, 2712)	0.02663747186820855
  (0, 71)	0.02663747186820855
  (0, 5145)	0.02663747186820855
  (0, 6835)	0.02663747186820855
  (0, 6037)	0.02663747186820855
  (0, 60)	0.02663747186820855
  (0, 65)	0.02663747186820855
  (0, 70)	0.02663747186820855
  (0, 77)	0.02663747186820855
  (0, 69)	0.02663747186820855
  (0, 104)	0.02663747186820855
  (0, 68)	0.02663747186820855
  (0, 17)	0.02663747186820855
  (0, 67)	0.02663747186820855
  (0, 2)	0.02663747186820855
  (0, 10)	0.0532749437364171
  (0, 66)	0.02663747186820855
  (0, 103)	0.02663747186820855
  (0, 19)	0.02663747186820855
  (0, 101)	0.02663747186820855
  (0, 49)	0.07991241560462566
  (0, 74)	0.02663747186820855
  (0, 86)	0.02663747186820855
  (0, 87)	0.02663747186820855
  :	:
  (2911, 60)	0.06404947185912964
  (2911, 65)	0.06404947185912964
  (2911, 70)	0.06404947185912964
  (2911, 77)	0.06404947185912964
  (2911, 69)	0.06404947185912964
  (2911, 104)	0.06404947185912964
  (2911, 68)	0.06404947185912964
 

Cosine Similarity

In [29]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [30]:
print(similarity)

[[1.         0.03630099 0.03985671 ... 0.03868416 0.09197035 0.09248856]
 [0.03630099 1.         0.06680819 ... 0.12585912 0.07975538 0.07191536]
 [0.03985671 0.06680819 1.         ... 0.0919637  0.07192974 0.07753597]
 ...
 [0.03868416 0.12585912 0.0919637  ... 1.         0.09397225 0.09301558]
 [0.09197035 0.07975538 0.07192974 ... 0.09397225 1.         0.26047014]
 [0.09248856 0.07191536 0.07753597 ... 0.09301558 0.26047014 1.        ]]


In [31]:
print(similarity.shape)

(2912, 2912)


In [32]:
#similarity scores to df
similarity_df = pd.DataFrame(similarity)

# Stack the DataFrame to convert it to a long format
long_simi_df = similarity_df.stack().reset_index()

# Rename the columns to something more meaningful
long_simi_df.columns = ['movie1', 'movie2', 'similarity']

# Print the resulting long-form DataFrame
long_simi_df.head()

Unnamed: 0,movie1,movie2,similarity
0,0,0,1.0
1,0,1,0.036301
2,0,2,0.039857
3,0,3,0.040576
4,0,4,0.036711


In [33]:
df_train.head()

Unnamed: 0,genre,keywords,director,popularity,tagline,original_title,imdb_id,id
0,action adventure fiction science,android death empire force galactic galaxy her...,George Lucas,42.149697,"a long time ago in a galaxy far, far away...",Star Wars,76759,0
1,animation family,aftercreditsstinger barrier child clownfish du...,Andrew Stanton,25.497794,there are 3.7 trillion fish in the ocean. they...,Finding Nemo,266543,1
2,comedy drama romance,amputee based bench bully disabled family flas...,Robert Zemeckis,48.307194,"the world will never be the same, once you've ...",Forrest Gump,109830,2
3,drama,adultery affair age agent bittersweet camcorde...,Sam Mendes,20.726578,look closer.,American Beauty,169547,3
4,drama mystery,art banker capitalist child collector florida ...,Orson Welles,15.811921,it's terrific!,Citizen Kane,33467,4


In [34]:
#output similarity scores

long_simi_df.to_csv('cosine_similarity.csv')

df_train.to_csv('df_ml.csv')

Getting the movie name from the user

In [97]:
# getting the movie name from the user

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : The Godfather


In [98]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)

['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Heat', 'GoldenEye', 'Balto', 'Casino', 'Sense and Sensibility', 'Get Shorty', 'Copycat', 'Assassins', 'Powder', 'Leaving Las Vegas', 'Twelve Monkeys', 'Babe', 'Restoration', 'Mortal Kombat', 'To Die For', 'Se7en', 'Pocahontas', 'The Usual Suspects', 'Mighty Aphrodite', 'The Postman', 'The Indian in the Cupboard', "Don't Be a Menace to South Central While Drinking Your Juice in the Hood", 'Friday', 'From Dusk Till Dawn', 'Bed of Roses', 'Screamers', 'Mary Reilly', 'Broken Arrow', 'La Haine', 'Braveheart', 'Taxi Driver', 'Rumble in the Bronx', 'Jade', 'Apollo 13', 'Rob Roy', 'Batman Forever', 'Belle de Jour', 'Blue in the Face', 'Casper', 'Die Hard: With a Vengeance', 'First Knight', 'Hackers', 'Johnny Mnemonic', 'Living in Oblivion', 'The Net', 'The Prophecy', 'Showgirls', 'Species', 'Strange Days', 'Three Wishes', 'Waterworld', 'Before Sunrise', 'Clerks', 'Disclosure', 'Ed Wood', 'Forget Paris', 'A Goofy Movie', 'Interview with the Vampire'

In [99]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['The Godfather', 'The Godfather: Part II', 'The Godfather: Part III']


In [100]:
close_match = find_close_match[0]
print(close_match)

The Godfather


In [101]:
# finding the id of the movie with title

index_of_the_movie = df_train[df_train.original_title == close_match]['id'].values[0]
print(index_of_the_movie)

104


In [86]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.03248994202629654), (1, 0.09310512690346968), (2, 0.056866484506674954), (3, 0.06164337806042359), (4, 0.04671573016177202), (5, 0.08771936830960747), (6, 0.04000303764763575), (7, 0.04642971314417308), (8, 0.06945744571640157), (9, 0.039256360787770436), (10, 0.05966049975517512), (11, 0.02877315267226387), (12, 0.18006037782357692), (13, 0.052983664598590434), (14, 0.09609311558212917), (15, 0.08350937650398225), (16, 0.0389882354262502), (17, 0.0897061615047003), (18, 0.05942908978744638), (19, 0.04195216804039269), (20, 0.083092663936143), (21, 0.03983381672176201), (22, 0.05498578820476184), (23, 0.03873568682044868), (24, 0.04549948888121131), (25, 0.038905811704594895), (26, 0.06965253731707027), (27, 0.05865450142865722), (28, 0.04090689236900682), (29, 0.0414482037077152), (30, 0.07295977283888758), (31, 0.054781029346471326), (32, 0.034250453353527596), (33, 0.04841683412805187), (34, 0.029917252492363523), (35, 0.05233630165365751), (36, 0.04948604688361242), (37, 0.0

In [87]:
len(similarity_score)

2912

In [88]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

[(104, 1.0), (108, 0.3538679074429417), (601, 0.25812686128255624), (106, 0.25699336046236654), (1437, 0.24099333821518312), (231, 0.23202461347596598), (2492, 0.20848631739338347), (2553, 0.20841997976463988), (2604, 0.20674521246613742), (2697, 0.20348726434961759), (314, 0.20157591461899366), (2556, 0.1928322507115731), (2849, 0.18744314583305405), (1227, 0.18398697059048055), (870, 0.18241739580589234), (12, 0.18006037782357692), (1986, 0.17726892376384462), (340, 0.17673603322858), (2328, 0.17612929905951696), (2507, 0.17072526776625196), (1717, 0.1705252538057504), (1977, 0.16949860748632792), (142, 0.1688938973153744), (68, 0.1652935690851569), (2194, 0.16424284152448557), (757, 0.160885792207595), (185, 0.15740392462754446), (661, 0.15715441361006438), (810, 0.1570247836006175), (1445, 0.15654947803530161), (572, 0.15628668475792984), (872, 0.15480222667284038), (2043, 0.15402765629852322), (2910, 0.1536332247768489), (2379, 0.15339937975064227), (896, 0.15320324071162048), (93

In [89]:
df_train.head()
df_train.columns

Index(['genre', 'keywords', 'director', 'popularity', 'tagline',
       'original_title', 'id'],
      dtype='object')

In [96]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = df_train[df_train.index == index]['original_title'].values[0]
  if (i<31):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . The Godfather
2 . The Godfather: Part III
3 . Closer
4 . The Godfather: Part II
5 . Avanti!
6 . The Conversation
7 . The Bag Man
8 . Live by Night
9 . Everly
10 . War Dogs
11 . GoodFellas
12 . Black Mass
13 . Precious Cargo
14 . Gloria
15 . Gomorra
16 . Apocalypse Now
17 . Copie Conforme
18 . M
19 . The Drop
20 . Kill the Messenger
21 . Mobsters
22 . All Good Things
23 . True Romance
24 . Edward Scissorhands
25 . Serbuan maut
26 . Dracula
27 . Leaving Las Vegas
28 . Bandits
29 . Get Shorty
30 . Suspect


Movie Recommendation Sytem