In [2]:
import pandas as pd
import numpy as np
import ast
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_and_preprocess_data(movies_path, credits_path):
    """Load the movies and credits CSVs and build one text 'tags' field per movie.

    The two files are joined on ``title``; rows with a missing value in any
    of the columns used are dropped. For each remaining movie the tags are
    the overview words followed by genre names, keyword names, the first
    three cast names and the first crew member whose job is ``'Director'``.
    Spaces inside each name are removed so a multi-word name becomes a single
    token (e.g. "Johnny Depp" -> "JohnnyDepp"), and the joined tag string is
    lowercased.

    Args:
        movies_path: Path to the movies CSV.
        credits_path: Path to the credits CSV.

    Returns:
        A DataFrame with columns ``movie_id``, ``title`` and ``tags`` and a
        fresh 0..n-1 index, so a row's index label equals its position (and
        therefore its row/column in a similarity matrix built from it).
    """
    credits = pd.read_csv(credits_path)
    movies = pd.read_csv(movies_path)

    # NOTE(review): joining on title pairs up every combination of rows that
    # share a title, so films with identical titles get cross-matched.
    # Joining on an id column would avoid that - confirm the CSV schema first.
    movies = movies.merge(credits, on='title')

    columns = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
    # Non-inplace dropna avoids mutating a slice of the merged frame, and
    # resetting the index keeps labels aligned with row positions afterwards.
    movies = movies[columns].dropna().reset_index(drop=True)

    def extract_names(raw, limit=None):
        """Return the space-stripped 'name' of each entry (first `limit` only)."""
        entries = ast.literal_eval(raw)
        return [entry['name'].replace(" ", "") for entry in entries[:limit]]

    def extract_director(raw):
        """Return a one-item list with the first director's name, or []."""
        for member in ast.literal_eval(raw):
            if member['job'] == 'Director':
                return [member['name'].replace(" ", "")]
        return []

    # Each term is a Series of lists; `+` concatenates the lists row by row.
    tags = (
        movies['overview'].apply(lambda text: text.split())
        + movies['genres'].apply(extract_names)
        + movies['keywords'].apply(extract_names)
        + movies['cast'].apply(lambda raw: extract_names(raw, 3))
        + movies['crew'].apply(extract_director)
    )

    new_df = movies[['movie_id', 'title']].copy()
    new_df['tags'] = tags.apply(lambda words: " ".join(words).lower())

    return new_df

def build_model(df):
    """Return the pairwise cosine-similarity matrix of the movies' tag text.

    The ``tags`` column of ``df`` is turned into TF-IDF vectors (English stop
    words removed) and every row is compared against every other row, so
    entry ``[i, j]`` of the result is the similarity between the movies at
    positions ``i`` and ``j`` of ``df``.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    features = vectorizer.fit_transform(df['tags'])

    # With a single argument the features are compared against themselves.
    return cosine_similarity(features)

def get_recommendations(movie_title, df, similarity):
    """Print the titles of up to ten movies most similar to ``movie_title``.

    Args:
        movie_title: Exact title to look up in ``df['title']``. If several
            rows share the title, the first one is used.
        df: DataFrame with a ``title`` column, in the same row order as
            ``similarity``.
        similarity: Square matrix (anything indexable as ``similarity[i]``)
            where row ``i`` holds the scores of the movie at position ``i``
            of ``df`` against every movie.

    Prints "Movie not found in dataset." when no row has that title. An
    ``IndexError`` caused by ``similarity`` not matching ``df`` is no longer
    reported as a missing movie; it propagates to the caller.
    """
    # Look the title up by row *position*: the similarity matrix is ordered
    # positionally, while index labels may have gaps (e.g. after dropna).
    positions = (df['title'] == movie_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        print("Movie not found in dataset.")
        return

    idx = int(positions[0])
    scores = similarity[idx]

    # Drop the query movie explicitly instead of assuming it sorts first;
    # a tie or duplicate row could otherwise displace a real recommendation.
    candidates = (pair for pair in enumerate(scores) if pair[0] != idx)
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:10]

    print(f"Recommendations for '{movie_title}':")
    titles = df['title']
    for position, _score in top_matches:
        print(titles.iloc[position])

if __name__ == "__main__":
    # Locations of the two input CSV files.
    movies_csv = '/content/drive/MyDrive/Colab Notebooks/new/tmdb_5000_movies.csv'
    credits_csv = '/content/drive/MyDrive/Colab Notebooks/new/tmdb_5000_credits.csv'

    # Build the tag table, then the movie-by-movie similarity matrix.
    movies_df = load_and_preprocess_data(movies_csv, credits_csv)
    similarity_matrix = build_model(movies_df)

    # Smoke test: print recommendations for one known title.
    get_recommendations('The Dark Knight Rises', movies_df, similarity_matrix)

    # Persist both artifacts so they can be reloaded later without
    # recomputing them.
    artifacts = (
        ('movie_list.pkl', movies_df),
        ('similarity.pkl', similarity_matrix),
    )
    for filename, payload in artifacts:
        with open(filename, 'wb') as f:
            pickle.dump(payload, f)

Recommendations for 'The Dark Knight Rises':
The Dark Knight
Batman Returns
Batman Begins
Batman Forever
Batman
Batman
Batman: The Dark Knight Returns, Part 2
Batman & Robin
Batman v Superman: Dawn of Justice
Slow Burn
