In [1]:
import ast
import os
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
# Directory holding the two TMDB CSV files. It defaults to the original
# machine-specific location, but can be redirected without editing the
# notebook by setting the TMDB_DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get(
    "TMDB_DATA_DIR",
    r"C:\Users\Lenovo\Documents\Projects\RS_TMDB_5000\dataset\TMB500",
))

# NOTE: `credits` shadows Python's interactive `credits` builtin; the name is
# kept because the following cells refer to it.
credits = pd.read_csv(DATA_DIR / "tmdb_5000_credits.csv")
movies = pd.read_csv(DATA_DIR / "tmdb_5000_movies.csv")

In [3]:
# Cast the 'title' column of both frames to str so the two columns share a
# dtype regardless of what pandas inferred while reading the CSVs.
for titled_frame in (movies, credits):
    titled_frame['title'] = titled_frame['title'].astype(str)

In [4]:
# Merge the two datasets on the 'title' column, keep only the relevant
# columns, and drop rows with missing values.
#
# NOTE(review): a title join pairs every movie row with every credits row
# that shares the same title, so repeated titles multiply rows. If both files
# carry a unique movie id, joining on that would be safer -- TODO confirm.
merged_df = (
    movies.merge(credits, on='title')
    .loc[:, ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    # Renumber rows 0..n-1 so that index labels coincide with row positions;
    # dropna() leaves gaps in the index, and positional lookups into the
    # similarity matrix need the two to agree.
    .reset_index(drop=True)
)

In [5]:
def parse_list(obj):
    """Extract the 'name' value of every entry in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Python-literal text of a list of dicts, each having a 'name' key.

    Returns
    -------
    list
        The 'name' values, in the order the entries appear.
    """
    names = []
    for entry in ast.literal_eval(obj):
        names.append(entry['name'])
    return names

# Replace the stringified genre and keyword lists with plain lists of names.
for list_column in ('genres', 'keywords'):
    merged_df[list_column] = merged_df[list_column].apply(parse_list)

In [6]:
def get_top_cast(obj, limit=5):
    """Return the names of the leading entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Python-literal text of a list of dicts, each having a 'name' key.
    limit : int or None, optional
        How many entries to keep, counted from the start of the list.
        Defaults to 5; ``None`` keeps every entry. Expected to be
        non-negative.

    Returns
    -------
    list
        The 'name' values of the first `limit` entries, in list order.
    """
    leading_members = ast.literal_eval(obj)[:limit]
    return [member['name'] for member in leading_members]

# Reduce each stringified cast list to the names of its leading entries.
merged_df['cast'] = merged_df['cast'].map(get_top_cast)

In [7]:
def get_director(obj):
    """Return the first 'Director' entry's name from a stringified crew list.

    Parameters
    ----------
    obj : str
        Python-literal text of a list of dicts with 'job' and 'name' keys.

    Returns
    -------
    list
        A one-element list holding the name of the first entry whose 'job'
        equals 'Director', or an empty list when there is no such entry.
    """
    not_found = object()
    # Lazy scan: stops at the first director, later entries are never touched.
    director_names = (
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    )
    first_director = next(director_names, not_found)
    if first_director is not_found:
        return []
    return [first_director]

# Reduce each stringified crew list to a list holding at most the director.
merged_df['crew'] = merged_df['crew'].map(get_director)

In [9]:
def remove_spaces(L):
    """Return a new list with every space character removed from each string.

    Parameters
    ----------
    L : iterable of str
        Strings to process.

    Returns
    -------
    list
        The strings of `L`, in order, with all ' ' characters deleted.
    """
    collapsed = []
    for entry in L:
        collapsed.append(entry.replace(' ', ''))
    return collapsed

In [10]:
# Collapse multi-word names into single tokens (e.g. "Sam Worthington" ->
# "SamWorthington") in every list-valued column.
for token_column in ('cast', 'crew', 'keywords', 'genres'):
    merged_df[token_column] = merged_df[token_column].apply(remove_spaces)


In [11]:
# Concatenate the four token lists of each row, then flatten them into one
# lower-cased, space-separated string stored in the 'tags' column.
merged_df['tags'] = (
    merged_df['genres'] + merged_df['keywords'] + merged_df['cast'] + merged_df['crew']
).map(lambda tokens: " ".join(tokens).lower())

In [12]:
# Shared Porter stemmer instance used by stem_words
ps = PorterStemmer()

def stem_words(text):
    """Stem each whitespace-separated token of `text`.

    Parameters
    ----------
    text : str
        Space-separated tokens.

    Returns
    -------
    str
        The stemmed tokens, in order, joined by single spaces.
    """
    stemmed_tokens = map(ps.stem, text.split())
    return " ".join(stemmed_tokens)

# Stem every token of the 'tags' strings
merged_df['tags'] = merged_df['tags'].map(stem_words)

In [13]:
# Bag-of-words counts over the tag strings: English stop words are removed
# and the vocabulary is capped at 5000 terms.
cv = CountVectorizer(stop_words='english', max_features=5000)
vectors = (
    cv.fit_transform(merged_df['tags'])
    .toarray()  # dense array with one row per row of merged_df
)

# Pairwise cosine similarity between all rows of `vectors`.
similarity_matrix = cosine_similarity(vectors)

In [21]:
def recommend_movie(movie_title, top_n=5):
    """Print the titles of the movies most similar to `movie_title`.

    The title is looked up by exact match in ``merged_df['title']``; every
    other row is then ranked by its score in the matching row of
    ``similarity_matrix``.

    Parameters
    ----------
    movie_title : str
        Title to look up. When several rows share it, the first one is used.
    top_n : int, optional
        Maximum number of recommendations to print (default 5).

    Returns
    -------
    None
        Results are printed; an unknown title prints a "not found" message.
    """
    title_matches = (merged_df['title'] == movie_title).to_numpy()
    if not title_matches.any():
        print("Movie not Found in the dataset.")
        return

    # Use the row *position*, not the index label: similarity_matrix is
    # addressed by position, and the two differ whenever merged_df's index
    # is not exactly 0..n-1 (for example after rows have been dropped).
    movie_position = int(np.flatnonzero(title_matches)[0])
    similarity_scores = similarity_matrix[movie_position]

    # Exclude the queried row explicitly instead of assuming it sorts first;
    # rows with an identical score could otherwise push it into the results.
    other_positions = [
        position
        for position in range(len(similarity_scores))
        if position != movie_position
    ]
    # Stable sort: rows with equal scores keep their original order.
    other_positions.sort(key=lambda position: similarity_scores[position], reverse=True)
    top_positions = other_positions[:max(top_n, 0)]

    recommendations = merged_df['title'].iloc[top_positions].tolist()
    print("Recommendations for '{}':".format(movie_title))
    for title in recommendations:
        print(title)

if __name__ == "__main__":
    # Strip accidental leading/trailing whitespace from the typed title so it
    # does not defeat the exact-match lookup done by recommend_movie.
    user_movie = input("Enter a movie name to get recommendations: ").strip()
    recommend_movie(user_movie)

Recommendations for 'Spider-Man':
Spider-Man 3
Spider-Man 2
Thor: The Dark World
Ghost Rider: Spirit of Vengeance
The Monkey King 2
