In [None]:
import pandas as pd
import numpy as np
import ast

In [None]:
# Load the two input CSVs; both paths are relative to the notebook's working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [None]:
movies.head()

In [None]:
credits.head()

In [None]:
movies.shape

In [None]:
movies.duplicated().sum()

In [None]:
credits.shape

In [None]:
# Rename the key column to 'id' so it matches the column the merge below joins on.
credits.rename(columns={'movie_id':'id'}, inplace=True)

In [None]:
credits.head()

In [None]:
# Join credits with the movie metadata on 'id' (pd.merge defaults to an inner join).
# NOTE(review): a later cell renames 'title_x', which suggests both frames carry a
# 'title' column and the merge suffixes them as 'title_x' / 'title_y' — confirm.
# Re-running this cell merges `credits` into the already-merged frame.
movies  = pd.merge(credits, movies, on='id')

In [None]:
movies.shape

In [None]:
movies.head()

In [None]:
# Expose the merge-suffixed 'title_x' column under the plain name 'title'.
movies.rename(columns={'title_x':'title'}, inplace=True)

In [None]:
movies.head()

In [None]:
# Keep only the columns used to build the recommender. An explicit copy makes
# `movies_dataset` independent of `movies`, so the column assignments in later
# cells cannot trigger SettingWithCopyWarning.
movies_dataset = movies[['id', 'title', 'cast', 'crew', 'genres', 'keywords', 'overview']].copy()

In [None]:
movies_dataset.shape

In [None]:
movies_dataset.head()

In [None]:
print(movies_dataset['cast'][0])

In [None]:
def get_top_5_cast(cast_str):
    """Return the names of the first five cast entries encoded in ``cast_str``.

    Parameters
    ----------
    cast_str : str
        Text holding a Python-literal list of dicts, each with a ``'name'`` key.

    Returns
    -------
    list
        Up to five names in their original order, or ``[]`` when the input
        cannot be parsed or does not have the expected shape.
    """
    try:
        # Convert string to list of dictionaries
        cast_list = ast.literal_eval(cast_str)
        return [person['name'] for person in cast_list[:5]]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Best-effort parsing: malformed or missing values yield an empty list.
        # Only parse/shape errors are caught, so KeyboardInterrupt and
        # unrelated failures are no longer silently swallowed.
        return []

In [None]:
movies_dataset = movies_dataset.copy()  # avoids SettingWithCopyWarning
# Parse each row's raw 'cast' string into a list of names, stored in a temporary column.
movies_dataset['get_top_5_cast'] = movies_dataset['cast'].apply(get_top_5_cast)

In [None]:
print(movies_dataset[['title', 'get_top_5_cast']].head())

In [None]:
movies_dataset.head()

In [None]:
# Replace the raw 'cast' strings with the parsed name lists.
movies_dataset['cast'] = movies_dataset['get_top_5_cast']

In [None]:
movies_dataset.head()

In [None]:
# The temporary column has been copied into 'cast', so remove it.
# Re-running this cell raises KeyError because the column is already gone.
movies_dataset.drop('get_top_5_cast', axis=1, inplace=True)

In [None]:
movies_dataset.head()

In [None]:
# Function to extract director's name
def get_director(crew_str):
    """Return the name of the first crew entry whose ``'job'`` is ``'Director'``.

    Parameters
    ----------
    crew_str : str
        Text holding a Python-literal list of dicts with ``'job'`` and
        ``'name'`` keys.

    Returns
    -------
    str or None
        The first director's name; ``None`` when there is no director entry
        or the input cannot be parsed.
    """
    try:
        crew_list = ast.literal_eval(crew_str)  # Convert string to list
        return next(
            (member.get('name') for member in crew_list if member.get('job') == 'Director'),
            None,
        )
    except (ValueError, SyntaxError, TypeError, AttributeError):
        # Best-effort parsing: malformed or missing values map to None.
        # Only parse/shape errors are caught, so KeyboardInterrupt and
        # unrelated failures are no longer silently swallowed.
        return None

# Apply the function and store the result in a new 'director' column.
movies_dataset = movies_dataset.copy()  # To avoid SettingWithCopyWarning
# Rows with no director entry, or with unparseable crew data, receive None here.
movies_dataset['director'] = movies_dataset['crew'].apply(get_director)

In [None]:
movies_dataset.head()

In [None]:
# Remove the raw 'crew' strings; `columns=` already selects the axis, so no `axis` argument is needed.
movies_dataset.drop(columns=['crew'], inplace=True)

In [None]:
# From here on 'crew' holds just the director's name (a string or None), not the raw crew data.
movies_dataset.rename(columns={'director':'crew'}, inplace=True)

In [None]:
movies_dataset.head()

In [None]:
print(movies_dataset['genres'][0])

In [None]:
# Function to extract just the genre names
def extract_genres(genre_str):
    """Return the ``'name'`` of every genre entry encoded in ``genre_str``.

    Parameters
    ----------
    genre_str : str
        Text holding a Python-literal list of dicts, each with a ``'name'`` key.

    Returns
    -------
    list
        Genre names in their original order, or ``[]`` when the input cannot
        be parsed or does not have the expected shape.
    """
    try:
        return [genre['name'] for genre in ast.literal_eval(genre_str)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Best-effort parsing: malformed or missing values yield an empty list.
        # Only parse/shape errors are caught, so KeyboardInterrupt and
        # unrelated failures are no longer silently swallowed.
        return []

# Replace the raw 'genres' strings with lists of genre names.
# Not idempotent: a second run feeds lists to the parser, which then returns [] for every row.
movies_dataset['genres'] = movies_dataset['genres'].apply(extract_genres)

In [None]:
movies_dataset.head()

In [None]:
print(movies_dataset['keywords'][0])

In [None]:
def extract_keywords(keyword_str):
    """Return the ``'name'`` of every keyword entry encoded in ``keyword_str``.

    Parameters
    ----------
    keyword_str : str
        Text holding a Python-literal list of dicts, each with a ``'name'`` key.

    Returns
    -------
    list
        Keyword names in their original order, or ``[]`` when the input cannot
        be parsed or does not have the expected shape.
    """
    try:
        return [keyword['name'] for keyword in ast.literal_eval(keyword_str)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Best-effort parsing: malformed or missing values yield an empty list.
        # Only parse/shape errors are caught, so KeyboardInterrupt and
        # unrelated failures are no longer silently swallowed.
        return []

# Replace the raw 'keywords' strings with lists of keyword names.
# Not idempotent: a second run feeds lists to the parser, which then returns [] for every row.
movies_dataset['keywords'] = movies_dataset['keywords'].apply(extract_keywords)

In [None]:
movies_dataset.head()

In [None]:
movies_dataset.isnull().sum()

In [None]:
movies_dataset.shape

In [None]:
# Drop rows that have any missing value, then renumber the rows 0..n-1.
# Without the reset, index labels keep gaps where rows were removed, while the
# vectors / similarity matrix built later are ordered purely by row position;
# any lookup that uses `.index[...]` as a position would then hit the wrong row.
movies_dataset.dropna(inplace=True)
movies_dataset.reset_index(drop=True, inplace=True)

In [None]:
movies_dataset.shape

In [None]:
def remove_spaces(name_list):
    """Return a copy of ``name_list`` with every space removed from each item.

    Parameters
    ----------
    name_list : iterable of str
        Names such as ``'Sam Worthington'``.

    Returns
    -------
    list
        The names with spaces stripped out (``'SamWorthington'``), or ``[]``
        when the input is not iterable or holds items without ``str.replace``
        semantics.
    """
    try:
        return [name.replace(" ", "") for name in name_list]
    except (TypeError, AttributeError):
        # Best-effort: non-iterable input or non-string items yield an empty list.
        # Only these two error types are caught, so KeyboardInterrupt and
        # unrelated failures are no longer silently swallowed.
        return []

# Collapse multi-word names into single tokens in every list-valued column.
for list_column in ('cast', 'genres', 'keywords'):
    movies_dataset[list_column] = movies_dataset[list_column].apply(remove_spaces)

# 'crew' holds one director name per row: strip spaces from strings, pass anything else through.
movies_dataset['crew'] = movies_dataset['crew'].apply(
    lambda x: x.replace(" ", "") if isinstance(x, str) else x
)
# 'overview': join list values into one string; non-list values are left unchanged.
movies_dataset['overview'] = movies_dataset['overview'].apply(
    lambda x: ' '.join(x) if isinstance(x, list) else x
)

In [None]:
movies_dataset.isnull().sum()

In [None]:
movies_dataset.head()

In [None]:
#movies_dataset.duplicated().sum()

In [None]:
movies_dataset.head()

In [None]:
# Build one space-separated text blob per movie from the cast, genres, keywords,
# overview and crew columns. The list-valued columns are joined with spaces first;
# 'overview' and 'crew' are concatenated as-is, so they must be strings at this point.
movies_dataset['tags'] = (
    movies_dataset['cast'].apply(lambda x: ' '.join(x)) + ' ' +
    movies_dataset['genres'].apply(lambda x: ' '.join(x)) + ' ' +
    movies_dataset['keywords'].apply(lambda x: ' '.join(x)) + ' ' +
    movies_dataset['overview'] + ' ' +
    movies_dataset['crew']
)

In [None]:
movies_dataset.head()

In [None]:
movies_dataset['overview'][0]

In [None]:
# Split each overview into a list of whitespace-separated words.
# NOTE(review): 'tags' has already been built and 'overview' is dropped a few
# cells later, so this appears to have no downstream effect — confirm.
movies_dataset['overview'] = movies_dataset['overview'].apply(lambda x:x.split())

In [None]:
movies_dataset.head()

In [None]:
# The source columns are folded into 'tags'; remove them (`columns=` already selects the axis).
movies_dataset.drop(
    columns=['cast', 'genres', 'keywords', 'crew', 'overview'],
    inplace=True,
)

In [None]:
# Lower-case the tags so later token handling is case-insensitive.
movies_dataset['tags'] = movies_dataset['tags'].str.lower()

In [None]:
movies_dataset.head()

In [None]:
from nltk.stem.porter import PorterStemmer
# Single stemmer instance, read as a global by `stem` below.
ps = PorterStemmer()

In [None]:
def stem(text):
    """Stem each whitespace-separated token of ``text`` and rejoin with single spaces.

    Uses the module-level ``ps`` stemmer. Runs of whitespace in the input
    collapse to one space in the result because of the split/join round trip.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# Stem every token of every tag string in place.
movies_dataset['tags'] = movies_dataset['tags'].apply(stem)

In [None]:
print(movies_dataset['tags'][0])

In [None]:
# Convert the tags into vectors so that each movie's tags can be compared with
# every other movie's tags to find similarities between movies.
# A bag-of-words representation is used.

from sklearn.feature_extraction.text import CountVectorizer
# Vocabulary is limited to 5000 features and English stop words are excluded.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
# Fit the vocabulary on the tags and convert the counts to a dense array;
# row i of `vectors` corresponds to the i-th row (by position) of `movies_dataset`.
vectors = cv.fit_transform(movies_dataset['tags']).toarray()

In [None]:
vectors

In [None]:
len(cv.get_feature_names_out())

In [None]:
cv.get_feature_names_out()

In [None]:
# Print the whole learned vocabulary as a plain Python list (long output).
tag_names = cv.get_feature_names_out()
print(list(tag_names))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cosine_similarity(vectors).shape

In [None]:
# Pairwise cosine similarity between all rows of `vectors`;
# entry [i, j] compares the i-th and j-th rows, i.e. rows by position, not by index label.
similarity = cosine_similarity(vectors)

In [None]:
similarity[0]

In [None]:
import pickle

# Save only the movie titles (for dropdown and indexing).
# The index is reset so that row labels equal row positions in `similarity`.
# NOTE(review): assumes the consuming app looks rows up via the frame's index — confirm.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies_dataset[['title']].reset_index(drop=True), movies_file)

# Save the similarity matrix (used in recommend function).
# `with` closes each file handle deterministically, so the pickles are fully flushed to disk.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [None]:
from difflib import get_close_matches

In [None]:
def recommend(movie):
    """Print the ten titles most similar to ``movie``.

    The query is matched case-insensitively against ``movies_dataset['title']``.
    When there is no exact match, the closest title according to
    ``difflib.get_close_matches`` (cutoff 0.6) is used and announced; when
    nothing is close enough, a "not found" message is printed.

    Parameters
    ----------
    movie : str
        Title to look up.

    Returns
    -------
    None
        Results are printed rather than returned.
    """
    movie = movie.lower()
    all_titles = movies_dataset['title'].str.lower().tolist()

    if movie not in all_titles:
        close_matches = get_close_matches(movie, all_titles, n=1, cutoff=0.6)
        if not close_matches:
            print("❌ Movie not found. Please check the title.")
            return
        movie = close_matches[0]
        suggested_title = movies_dataset['title'].iloc[all_titles.index(movie)]
        print(f"✅ Did you mean: {suggested_title}? Following are the recommendations...\n")

    # Use the row *position*, not the index label: `similarity` is a plain array
    # ordered like the rows of `movies_dataset`, and labels stop matching
    # positions as soon as rows have been dropped without resetting the index.
    movie_position = all_titles.index(movie)

    distances = similarity[movie_position]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first
    # (a movie with identical tags could tie with it at the top).
    movies_list = [pair for pair in ranked if pair[0] != movie_position][:10]

    for position, _score in movies_list:
        print(movies_dataset['title'].iloc[position])

In [None]:
recommend('avatar')

In [None]:
recommend(input('Search Movie'))