In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [6]:
# Load the movie metadata CSV into the DataFrame `md`, which every later cell
# reads from. low_memory=False asks pandas to parse the file in one go instead
# of in chunks.
# NOTE(review): '/content/movie_metadata.csv' is an absolute path (it looks like
# a Colab upload location); the notebook will not run elsewhere unless this is
# changed.
md = pd.read_csv('/content/movie_metadata.csv', low_memory=False)
# Show the first rows as a quick sanity check of the load.
md.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0




In [7]:
# List every column name; later cells rely on 'movie_title', 'plot_keywords',
# 'director_name' and 'imdb_score' being present.
print(md.columns)

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')


In [8]:
# Build the free-text 'description' column that the TF-IDF step vectorises:
# the movie title followed by its plot keywords (when that column exists).
#
# Missing values are replaced with '' *before* concatenating. String
# concatenation with NaN yields NaN, so filling only afterwards would throw
# away the keywords of any row whose title is missing.
if 'plot_keywords' in md.columns:
    md['plot_keywords'] = md['plot_keywords'].fillna('')
    md['description'] = md['movie_title'].fillna('') + ' ' + md['plot_keywords']
else:
    # No keyword column available: fall back to the title alone.
    md['description'] = md['movie_title'].fillna('')

In [9]:
# TF-IDF vectoriser configured to drop scikit-learn's built-in English
# stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the per-movie description text and encode every
# row of `md` as a TF-IDF vector.
tfidf_matrix = tfidf.fit_transform(md['description'])

# Shape is (number of movies, number of vocabulary terms).
print(tfidf_matrix.shape)

(5043, 8650)


In [10]:
# Pairwise dot products between every pair of movie TF-IDF rows. The displayed
# result has ones on the diagonal, i.e. each row has unit length, so these dot
# products can be read as cosine similarities.
# NOTE(review): the result is a dense (n_movies x n_movies) array, so memory
# grows quadratically with the number of movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Display the matrix as a sanity check.
cosine_sim

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [63]:
# Lookup table: stripped movie title -> row label in `md`.
# Series.drop_duplicates() removes duplicate *values*; the values here are the
# (already unique) row labels, so it would not remove repeated titles. The
# duplicates have to be dropped on the index instead, keeping the first row of
# each title, so that `indices[title]` is always a single row number.
indices = pd.Series(md.index, index=md['movie_title'].str.strip())
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the movies whose descriptions are most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up. Leading/trailing whitespace is ignored; the
        match is otherwise exact (case-sensitive).
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose row/column ``i`` corresponds to row
        ``i`` of ``md``. Defaults to the matrix computed earlier in the
        notebook.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list of (movie_title, imdb_score) tuples, most similar first, or the
    string "Movie not found in the dataset." when the title is unknown.
    """
    title = title.strip()  # Strip whitespace from input title
    if title not in indices.index:
        return "Movie not found in the dataset."
    idx = int(indices[title])
    # sorted() is stable, so movies with equal scores keep their row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first:
    # another row with an identical description (or an all-zero vector) can tie
    # with it and push it away from position 0.
    neighbours = [i for i, _ in ranked if i != idx and i < len(md)][:top_n]
    return [(md['movie_title'].iloc[i], md['imdb_score'].iloc[i]) for i in neighbours]

In [64]:
# Row of the movie with the highest IMDb score (idxmax returns the label of
# the first maximum).
best_movie = md.loc[md['imdb_score'].idxmax()]

# Report its title, director and score, one per line.
print(
    f"Best Movie: {best_movie['movie_title']}",
    f"Director: {best_movie['director_name']}",
    f"IMDb Score: {best_movie['imdb_score']}",
    sep="\n",
)

Best Movie: Towering Inferno             
Director: John Blanchard
IMDb Score: 9.5


In [65]:
# Dump the title column to eyeball how titles are stored (note the trailing
# whitespace in the output below).
print(md['movie_title'])

0                                                 Avatar 
1               Pirates of the Caribbean: At World's End 
2                                                Spectre 
3                                  The Dark Knight Rises 
4       Star Wars: Episode VII - The Force Awakens    ...
                              ...                        
5038                             Signed Sealed Delivered 
5039                           The Following             
5040                                A Plague So Pleasant 
5041                                    Shanghai Calling 
5042                                   My Date with Drew 
Name: movie_title, Length: 5043, dtype: object


In [79]:
# Ask for a title and print its most similar movies with their IMDb scores.
movie_name = input("Enter a movie name: ")
# NOTE(review): the list is ranked by description similarity, not by IMDb
# score, so this heading may not describe what is printed - confirm intent.
print("\nbetter then the provided movie: ")
recommendations = get_recommendations(movie_name)
if isinstance(recommendations, str):
    # An unknown title comes back as a message string. Looping over it would
    # unpack single characters and raise ValueError, so print it instead.
    print(f"\t{recommendations}")
else:
    for movie, imdb_score in recommendations:
        # Pad the title to a fixed width so the scores line up in a column.
        print(f"\t{movie.ljust(80)}\t(IMDb Score: {imdb_score})")

Enter a movie name: The Dark Knight Rises

better then the provided movie: 
	Deadfall                                                                        	(IMDb Score: 6.3)
	Swelter                                                                         	(IMDb Score: 4.6)
	The Devil's Own                                                                 	(IMDb Score: 6.1)
	AWOL-72                                                                         	(IMDb Score: 3.9)
	The One                                                                         	(IMDb Score: 5.9)
	Let's Be Cops                                                                   	(IMDb Score: 6.5)
	The Marine 4: Moving Target                                                     	(IMDb Score: 5.2)
	Police Academy                                                                  	(IMDb Score: 6.7)
	No Good Deed                                                                    	(IMDb Score: 5.6)
	First Knight           