In [2]:
import ast
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load datasets
# The folder holding the two TMDB CSV files can be overridden with the TMDB_DATA_DIR
# environment variable so the notebook runs on other machines; when it is unset the
# original local download folder is used.
DATA_DIR = Path(os.environ.get("TMDB_DATA_DIR", r"C:\Users\Komal Pandey\Downloads\archive (20)"))
movies = pd.read_csv(DATA_DIR / "tmdb_5000_movies.csv")
credits = pd.read_csv(DATA_DIR / "tmdb_5000_credits.csv")

In [4]:
# Preview a single row of the movies table to inspect its columns before merging.
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [5]:
# Preview a single row of the credits table (movie_id, title, cast, crew).
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
# Merge on the TMDB id (movies.id == credits.movie_id) rather than on the title:
# titles are not guaranteed to be unique, and a title join pairs every same-titled
# movie with every same-titled credits row, producing spurious duplicate rows.
# credits' own `title` column is dropped first so the merged frame keeps exactly one
# `title` column (the one from movies).
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

In [7]:
# Keep only relevant columns
# Rows with any missing value are dropped by building a new frame (no in-place
# mutation of a column subset), and the index is renumbered so that row labels equal
# row positions, which keeps label-based and position-based lookups in agreement.
movies = (
    movies[['movie_id','title','overview','genres','keywords','cast','crew']]
    .dropna()
    .reset_index(drop=True)
)

In [8]:
# Utility functions
def convert(obj):
    """Parse a stringified list of dicts and return every entry's ``name``.

    ``obj`` is evaluated with ``ast.literal_eval``; each element of the resulting
    sequence must support ``['name']`` lookup.
    """
    names = []
    for entry in ast.literal_eval(obj):
        names.append(entry['name'])
    return names

def convert_limited(obj, limit=3):
    """Return the ``name`` of the first ``limit`` entries of a stringified list.

    ``obj`` is evaluated with ``ast.literal_eval`` and sliced with ``[:limit]``
    before the names are collected, so entries past the limit are never inspected.
    """
    leading_entries = ast.literal_eval(obj)[:limit]
    return [entry['name'] for entry in leading_entries]

def fetch_director(obj):
    """Return a one-element list with the first crew member whose job is 'Director'.

    ``obj`` is a stringified list of dicts evaluated with ``ast.literal_eval``.
    Entries are scanned lazily in order and scanning stops at the first match.
    An empty list is returned when no entry has that job.
    """
    not_found = object()
    director_names = (
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    )
    first_director = next(director_names, not_found)
    if first_director is not_found:
        return []
    return [first_director]


In [9]:
# Apply conversions
# Each stringified column is replaced by a list of names: all genres, the first three
# keywords, the first three cast members, and the director (if any) from the crew.
for column, parser in (
    ('genres', convert),
    ('keywords', lambda raw: convert_limited(raw, 3)),
    ('cast', lambda raw: convert_limited(raw, 3)),
    ('crew', fetch_director),
):
    movies[column] = movies[column].apply(parser)


In [10]:
# Tokenize overview
# Split each overview string on whitespace so it becomes a list of word tokens.
movies['overview'] = movies['overview'].map(str.split)


In [11]:
# Remove spaces inside tokens
# Multi-word names are collapsed into a single token (spaces removed) in every
# list-valued metadata column.
def _strip_inner_spaces(names):
    """Return a copy of ``names`` with every space removed from each string."""
    return [name.replace(" ", "") for name in names]

for column in ['genres','keywords','cast','crew']:
    movies[column] = movies[column].apply(_strip_inner_spaces)



In [12]:
# Create tags
# Concatenate the token lists row by row, in the order
# overview, genres, keywords, cast, crew.
tags = movies['overview']
for column in ['genres', 'keywords', 'cast', 'crew']:
    tags = tags + movies[column]
movies['tags'] = tags

In [13]:
# New dataframe
# Independent copy holding only the id, the title and the combined tag list.
new_df = movies.loc[:, ['movie_id', 'title', 'tags']].copy()


In [14]:
# Lowercase + remove punctuation
def clean_and_join(tags_list):
    """Lowercase every token, join with single spaces, and strip punctuation.

    Any character that is neither a word character (``\\w``) nor whitespace is
    removed. A token made only of such characters becomes an empty string, so it
    still contributes its separating space to the result.
    """
    lowered_text = " ".join(token.lower() for token in tags_list)
    return re.sub(r'[^\w\s]', '', lowered_text)

# Store the cleaned, space-joined tag text alongside the raw token list.
new_df['tags_clean'] = new_df['tags'].map(clean_and_join)

In [15]:
# Stemming
ps = PorterStemmer()
def stem(text):
    """Stem every whitespace-separated word of ``text`` and re-join with spaces."""
    stemmed_words = map(ps.stem, text.split())
    return " ".join(stemmed_words)

# Overwrite the cleaned tag text with its stemmed form.
new_df['tags_clean'] = new_df['tags_clean'].map(stem)

In [16]:
# Vectorization
# Bag-of-words counts over the cleaned tag text, limited to 5000 features with the
# built-in English stop-word list, then densified to a NumPy array.
# NOTE(review): the text has already been stemmed at this point, so stop words whose
# stem differs from the listed form may not be filtered out. Confirm this is acceptable.
cv = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = cv.fit_transform(new_df['tags_clean'])
vectors = tag_counts.toarray()

In [17]:
# Similarity matrix
# Pairwise cosine similarity between the rows of `vectors`; recommend() reads
# similarity[i] as the scores of row i against every row.
similarity = cosine_similarity(vectors)


In [18]:
# Recommendation function
def recommend(movie, top_n=5):
    """Print the titles of the movies whose tags are most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new_df['title']``. If several rows share the
        title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to print.

    Returns
    -------
    None
        Titles are printed one per line, most similar first. If the title is not
        present, "Movie not found in dataset." is printed instead.
    """
    # Work with row *positions*: `similarity` is a plain positional array, while
    # new_df's index labels are not guaranteed to coincide with positions.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        print("Movie not found in dataset.")
        return
    movie_pos = int(matches[0])

    scores = np.asarray(similarity[movie_pos])
    # Stable descending sort: rows with equal scores keep their row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the query row explicitly instead of assuming it sorts first; a row
    # with an identical tag vector can tie with it at the top.
    ranked = ranked[ranked != movie_pos][:max(top_n, 0)]

    titles = new_df['title']
    for pos in ranked:
        print(titles.iloc[pos])


In [19]:
# Example usage
# Prints the recommended titles for 'Spectre', one per line.
recommend('Spectre')


Quantum of Solace
Never Say Never Again
Skyfall
From Russia with Love
Thunderball
