In [1]:
import numpy as np
import pandas as pd
import os
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import kagglehub

In [2]:
# Print the full path of every file found beneath the Kaggle input directory
# (prints nothing when that directory does not exist on this machine).
kaggle_input_files = (
    os.path.join(folder, filename)
    for folder, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in kaggle_input_files:
    print(input_file)

In [3]:
# Fetch the TMDB movie metadata dataset through kagglehub and report the
# local directory that the download call returned.
dataset_handle = "tmdb/tmdb-movie-metadata"
data_path = kagglehub.dataset_download(dataset_handle)
print("Dataset path:", data_path)

Dataset path: C:\Users\koush\.cache\kagglehub\datasets\tmdb\tmdb-movie-metadata\versions\2


In [4]:
# Read both CSV files from the directory returned by kagglehub instead of a
# hard-coded, user-specific absolute path, so the notebook runs on any machine.
movies_df = pd.read_csv(os.path.join(data_path, 'tmdb_5000_movies.csv'))
credits_df = pd.read_csv(os.path.join(data_path, 'tmdb_5000_credits.csv'))

In [5]:
# Join the credits onto the movie metadata, keep only the columns that feed
# the tag text, and drop every row that has a missing value.
# The steps are chained (no inplace mutation of a column slice), and
# reset_index(drop=True) renumbers the rows 0..n-1 so that index labels equal
# row positions; later code mixes label lookups with positional access
# (similarity matrix rows, `.iloc`), which only agrees when there are no gaps.
# NOTE(review): joining on 'title' cross-matches distinct films that share a
# title - consider joining on an id key if both files expose one (confirm schema).
movies_df = (
    movies_df
    .merge(credits_df, on='title')
    [['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

In [6]:
def extract_names(text):
    """Return the 'name' value of every dict in a stringified list of dicts.

    Args:
        text: String holding a Python-literal list of dicts, each with a
            'name' key.

    Returns:
        List of the 'name' values, in their original order.
    """
    names = []
    for entry in ast.literal_eval(text):
        names.append(entry['name'])
    return names

def extract_top_names(text, limit=3):
    """Return the 'name' value of the first `limit` dicts in a stringified list.

    Args:
        text: String holding a Python-literal list of dicts, each with a
            'name' key.
        limit: Slice bound applied to the parsed list before names are read.

    Returns:
        List of the 'name' values of the leading entries, in order.
    """
    parsed = ast.literal_eval(text)
    leading_entries = parsed[:limit]
    names = []
    for entry in leading_entries:
        names.append(entry['name'])
    return names

def extract_director(text):
    """Return the names of all entries whose 'job' is exactly 'Director'.

    Args:
        text: String holding a Python-literal list of dicts, each with
            'job' and 'name' keys.

    Returns:
        List of 'name' values for the matching entries (possibly empty).
    """
    directors = []
    for member in ast.literal_eval(text):
        if member['job'] != 'Director':
            continue
        directors.append(member['name'])
    return directors

def sanitize_list(items):
    """Return a new list with every space character removed from each string.

    Args:
        items: Iterable of strings.

    Returns:
        List of the same strings, in order, with " " stripped out everywhere.
    """
    cleaned = []
    for entry in items:
        cleaned.append(entry.replace(" ", ""))
    return cleaned

In [7]:
# Replace each stringified metadata column with a plain list of names:
# every genre and keyword, the leading cast entries (default limit of 3),
# and the crew members whose job is 'Director'.
column_parsers = {
    'genres': extract_names,
    'keywords': extract_names,
    'cast': extract_top_names,
    'crew': extract_director,
}
for column, parser in column_parsers.items():
    movies_df[column] = movies_df[column].apply(parser)

In [8]:
# Remove the spaces inside every name so that a multi-word name becomes a
# single token (e.g. a first and last name are glued together).
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies_df[column] = movies_df[column].apply(sanitize_list)

# Split the overview text on whitespace so it is a list of words, like the
# other columns.
movies_df['overview'] = movies_df['overview'].apply(lambda overview: overview.split())

In [9]:
# Concatenate the per-movie token lists into one list, then join it into a
# single space-separated string per movie.
tag_tokens = (
    movies_df['overview']
    + movies_df['genres']
    + movies_df['keywords']
    + movies_df['cast']
    + movies_df['crew']
)
# Build the slimmed-down frame with assign() rather than writing a column onto
# a column slice, which is what triggers pandas' SettingWithCopyWarning.
movies_df = movies_df[['movie_id', 'title']].assign(tags=tag_tokens.apply(" ".join))

In [10]:
# Turn each movie's tag string into a row of token counts, using a vocabulary
# capped at 5000 features with English stop words excluded.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
tag_counts = vectorizer.fit_transform(movies_df['tags'])
# Densify the result so later cells receive a regular 2-D array.
vectorized_data = tag_counts.toarray()

In [11]:
# Pairwise cosine similarity between all rows of vectorized_data (one row per
# movie); entry [i, j] scores how alike the tag vectors of rows i and j are.
similarity_matrix = cosine_similarity(vectorized_data)

In [14]:
def recommend_movies(movie_title, top_n=3, movies=None, similarity=None):
    """Print the titles of the movies most similar to `movie_title`.

    Args:
        movie_title: Exact title to look up in the 'title' column.
        top_n: Number of recommendations to print (default 3).
        movies: DataFrame with a 'title' column; defaults to the notebook's
            `movies_df`.
        similarity: Square matrix of pairwise similarities whose row/column
            order matches the row order of `movies`; defaults to the
            notebook's `similarity_matrix`.

    Raises:
        IndexError: If no row has the requested title.
    """
    # Fall back to the notebook-level objects when no explicit data is given.
    if movies is None:
        movies = movies_df
    if similarity is None:
        similarity = similarity_matrix

    # Look the title up by row POSITION, not index label: the similarity
    # matrix is positional, and the frame's labels may contain gaps (rows
    # dropped without a reset), in which case a label points at the wrong row.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    movie_pos = int(matches[0])

    # Rank every row by similarity to the query row, highest first.
    ranked = sorted(
        enumerate(similarity[movie_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Exclude the query row explicitly instead of assuming it sorts first;
    # ties (e.g. rows with identical tags) could otherwise put it elsewhere.
    picks = [pos for pos, _ in ranked if pos != movie_pos][:top_n]
    for pos in picks:
        print(movies['title'].iloc[pos])

In [15]:
# Quick sanity check: print the recommendations for one known title.
recommend_movies('Avatar')

Titan A.E.
Small Soldiers
Ender's Game


In [16]:
# Persist the movie table and the similarity matrix for reuse outside the
# notebook. The `with` blocks guarantee each file is flushed and closed,
# which a bare open() passed straight to pickle.dump() does not.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(movies_df, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity_matrix, similarity_file)