In [32]:
# %%
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib

# %%
movies = pd.read_csv("25k IMDb movie Dataset.csv")
movies = movies[:10000] 

movies['index'] = movies.index
movies.head()

# %%
movies.shape

# %%
# check null values
movies.isnull().count()

# %%
movies.dtypes

# %%
movies_features = movies["movie title"]+' '+movies["Generes"]+' '+movies["Overview"]+' '+movies["Plot Kyeword"]+' '+movies["Director"]+' '+movies["Top 5 Casts"]+' '+movies["Writer"]

# %%
movies_features

# %%
# converting the text data to feature vectors
vectorizer = TfidfVectorizer()
features_vectors = vectorizer.fit_transform(movies_features.values.astype('U'))

# %%
print(features_vectors)


  (0, 7265)	0.11135593414249477
  (0, 21128)	0.08857254954311414
  (0, 41233)	0.129662359487015
  (0, 27122)	0.10799051005976786
  (0, 9051)	0.12476501264593641
  (0, 20997)	0.09186370008047764
  (0, 9827)	0.11135593414249477
  (0, 42022)	0.07639212279732435
  (0, 9603)	0.09376898290007274
  (0, 31392)	0.07537740060737265
  (0, 21353)	0.08346233642784325
  (0, 13438)	0.13786274962012654
  (0, 20597)	0.08210464175591048
  (0, 22910)	0.15023201492937283
  (0, 21304)	0.0909822825227626
  (0, 27139)	0.08084298023813224
  (0, 3346)	0.2822325839426894
  (0, 28724)	0.12406511597143956
  (0, 1282)	0.2318742395240804
  (0, 37095)	0.07899329201137673
  (0, 21051)	0.2188842586361907
  (0, 14625)	0.28401327675213506
  (0, 17105)	0.12406511597143956
  (0, 45428)	0.1134999187250632
  (0, 41493)	0.054199937762294766
  :	:
  (9999, 28248)	0.11172704747915282
  (9999, 4615)	0.14466590753208974
  (9999, 41343)	0.16779624487872316
  (9999, 42659)	0.12315710287049289
  (9999, 12113)	0.12795230828495543
  

In [2]:
from joblib import dump
dump(movies_features, 'recommend.pkl')

['recommend.pkl']

In [33]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(features_vectors)

print(similarity)

[[1.         0.02348164 0.39631002 ... 0.00569485 0.01843284 0.01118846]
 [0.02348164 1.         0.01045086 ... 0.03573155 0.02204707 0.00447476]
 [0.39631002 0.01045086 1.         ... 0.01744734 0.04451629 0.01305248]
 ...
 [0.00569485 0.03573155 0.01744734 ... 1.         0.00258819 0.002747  ]
 [0.01843284 0.02204707 0.04451629 ... 0.00258819 1.         0.00663715]
 [0.01118846 0.00447476 0.01305248 ... 0.002747   0.00663715 1.        ]]


In [4]:
# getting the book name from the user
movie_name = input('Enter your favourite movie name : ')

Enter your favourite movie name : avatar


In [5]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies["movie title"].tolist()
print(list_of_all_titles)



In [6]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Avatar', 'Daata', 'Rafathar']


In [7]:
close_match = find_close_match[0]
print(close_match)

Avatar


In [8]:
# finding the index of the movie with title

index_of_the_movie = movies[movies["movie title"] == close_match]['index'].values[0]
print(index_of_the_movie)

71


In [9]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.008711448317845578), (1, 0.02488100814027201), (2, 0.004675293354351403), (3, 0.015659644076981078), (4, 0.01649021923916333), (5, 0.01812174029990944), (6, 0.006620619256072879), (7, 0.009828248987090596), (8, 0.04288010933564499), (9, 0.014117578591274181), (10, 0.01141994439000146), (11, 0.016961719307672287), (12, 0.01398784459859085), (13, 0.028632773691182476), (14, 0.02763186448201139), (15, 0.06149430256751954), (16, 0.011569713747694183), (17, 0.008708666543930509), (18, 0.004974032295260309), (19, 0.01230438419919331), (20, 0.012609118465533297), (21, 0.02984348813538391), (22, 0.008116007733160853), (23, 0.00891452669756497), (24, 0.01730669204856794), (25, 0.008317883315813334), (26, 0.029236476059780886), (27, 0.010223679862754124), (28, 0.013239117998044173), (29, 0.03166915656345714), (30, 0.0036881244887999624), (31, 0.0006309720311684803), (32, 0.0010295686187399154), (33, 0.012309761820079018), (34, 0.036354184169613946), (35, 0.014456025085779955), (36, 0.0176

In [10]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies)

[(71, 1.0), (2323, 0.5718659597676591), (2758, 0.5127502700915246), (1834, 0.4390464349847079), (59, 0.36236342818287537), (8397, 0.28574572264773723), (185, 0.24579496800706824), (120, 0.19658067524436337), (8220, 0.192122188839411), (9167, 0.18843192491934962), (9296, 0.18763110535569932), (9201, 0.18565113700749422), (8231, 0.1842590460918666), (9898, 0.17213504309722882), (171, 0.16649916142710347), (7921, 0.13013666404696891), (4640, 0.11945499478079913), (2097, 0.1193875041500276), (8394, 0.11115150014627326), (261, 0.11027197459161987), (8759, 0.10930907554309464), (3430, 0.10874926812550996), (8695, 0.10874265864278579), (3731, 0.10840698692058116), (2790, 0.10788062005061964), (9429, 0.1033176751538176), (9703, 0.10087332863548666), (8138, 0.09948124484716751), (1308, 0.09893550271152479), (610, 0.0987897293611963), (277, 0.09737468687825819), (190, 0.09644261619842848), (7119, 0.09617164341042225), (1696, 0.09566041471909177), (1443, 0.0950875934393681), (9317, 0.094368223808

In [11]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies[movies.index==index]['movie title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Avatar
2 . Avatar 5
3 . Avatar 4
4 . Avatar 3
5 . Avatar: The Way of Water
6 . The Abyss
7 . Terminator 2: Judgment Day
8 . Aliens
9 . Jerry Maguire
10 . Aloha
11 . Singles
12 . Shortbus
13 . Almost Famous
14 . Hedwig and the Angry Inch
15 . The Terminator
16 . Untitled Three Stooges Sequel
17 . Kill Order
18 . American Outlaws
19 . Ad Astra
20 . Knight and Day
21 . Rabbit Hole
22 . The Marine 6: Close Quarters
23 . Armageddon Time
24 . Avenging Force
25 . Knuckledust
26 . We Bought a Zoo
27 . The Long Home
28 . Fast Times at Ridgemont High
29 . We Own the Night


In [12]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies['movie title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies[movies['movie title'] == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies[movies.index==index]['movie title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite movie name : rambo
Movies suggested for you : 

1 . Rambo
2 . Rambo: First Blood Part II
3 . Rambo III
4 . The Expendables
5 . Rambo: Last Blood
6 . Rocky IV
7 . Rocky III
8 . Rocky II
9 . Rocky Balboa
10 . Rocky V
11 . Cliffhanger
12 . Cobra
13 . The Expendables 2
14 . Creed
15 . Tango & Cash
16 . Driven
17 . The Expendables 3
18 . Creed II
19 . Tough As They Come
20 . First Blood
21 . Hot Shots! Part Deux
22 . I Am That Man
23 . Little America
24 . The Rescue
25 . Take Home Pay
26 . Objective, Burma!
27 . Escape Plan: The Extractors
28 . Nighthawks
29 . Thirteen Lives


In [13]:
movies[movies['movie title'] == "Spy"]

Unnamed: 0,movie title,Run Time,Rating,User Rating,Generes,Overview,Plot Kyeword,Director,Top 5 Casts,Writer,year,path,index
716,Spy,2 hours,7,243K,"['Action', 'Comedy']",A desk-bound CIA analyst volunteers to go unde...,"['fellatio', 'prosthetic penis', 'vulgarity', ...",Paul Feig,"['Melissa McCarthy', 'Rose Byrne', 'Jude Law',...",Paul Feig,-2015,/title/tt3079380/,716


In [34]:
# saving movie recommender model
import pickle

with open('Movie_recommend.pkl', 'wb') as file:
    pickle.dump((vectorizer,similarity,movies), file)



In [35]:
# # Load the TfidfVectorizer and cosine similarity objects from files using pickle
with open('Movie_recommend.pkl', 'rb') as file:
    vectorizer1,similarity1,movies1 = pickle.load(file)


In [36]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies1['movie title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies1[movies1['movie title'] == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity1[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies1[movies.index==index]['movie title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite movie name : rampoage
Movies suggested for you : 

1 . Rampage
2 . The Iron Giant
3 . Monster Hunter
4 . Ape vs. Monster
5 . Gammera the Invincible
6 . Reptilicus
7 . Big Ass Spider!
8 . Kong: Skull Island
9 . King Kong vs. Godzilla
10 . Shin Ultraman
11 . Monsters: Dark Continent
12 . Axe Giant: The Wrath of Paul Bunyan
13 . San Andreas
14 . Robo Warriors
15 . Jack the Giant Killer
16 . The Spider
17 . Godzilla 1985
18 . San Andreas 2
19 . DeepStar Six
20 . Ape
21 . Thunder of Gigantic Serpent
22 . Cloverfield
23 . Starship Troopers
24 . Glass Trap
25 . Super Shark
26 . Kyodai hiroin mugen no seresutia
27 . Teenage Mutant Ninja Turtles
28 . Jack the Giant Slayer
29 . Dire Wolf
