In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the movie metadata from movies.csv (path is relative to this notebook)
movies_df = pd.read_csv(filepath_or_buffer="../data/movies.csv")

In [3]:
# Preprocess the genres column: split each pipe-delimited genre string into a list.
def _split_genres(raw_genres):
    """Return one movie's genres as a list of genre names.

    Values that are already lists are returned unchanged, so re-running this
    cell after the column has been converted does not fail. Anything that is
    neither a string nor a list (e.g. a missing value) becomes an empty list.
    """
    if isinstance(raw_genres, list):
        return raw_genres
    if isinstance(raw_genres, str):
        return raw_genres.split('|')
    return []


movies_df['genres'] = movies_df['genres'].apply(_split_genres)

In [4]:
# TF-IDF works on text, so flatten every genre list into one space-separated string
movies_df['genres_string'] = movies_df['genres'].map(' '.join)

In [5]:
# Create TF-IDF vectors for the genres.
# TfidfVectorizer's default token pattern treats "-" as a separator, so a
# hyphenated genre name (e.g. "Sci-Fi", if present in the data) would be split
# into two terms and weigh more than other genres in the similarity.
# The pattern below still requires tokens of at least two characters, but
# allows hyphens inside a token so each hyphenated genre stays a single term.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w[\w-]*\w\b")
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['genres_string'])

In [6]:
# Pairwise cosine similarity of every movie's TF-IDF vector against every other
# (Y=None compares the rows of X with themselves)
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=None)

In [7]:
# Function to get movie recommendations based on content-based filtering
def get_recommendations(movie_title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in ``movies_df['title']``. If several rows
        share the title, the first matching row is used.
    cosine_sim : array-like, optional
        Pairwise similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``movies_df``. Defaults to the matrix
        computed earlier in the notebook.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, most similar first. The queried
        movie itself is never included.

    Raises
    ------
    IndexError
        If no row of ``movies_df`` has the given title.
    """
    # Locate the movie by *position*, not by index label: cosine_sim is
    # addressed positionally, and labels only coincide with positions when
    # movies_df has a default RangeIndex.
    title_matches = (movies_df['title'] == movie_title).to_numpy().nonzero()[0]
    if len(title_matches) == 0:
        raise IndexError(f"No movie titled {movie_title!r} found in movies_df")
    movie_pos = int(title_matches[0])

    # Pair every *other* movie with its similarity score. The queried movie is
    # excluded explicitly: movies with identical genres tie with it at the top
    # score, so it is not guaranteed to be the first element after sorting.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[movie_pos])
        if pos != movie_pos
    ]

    # list.sort is stable (also with reverse=True), so tied movies keep their
    # original dataset order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the best matches (a negative top_n yields none)
    movie_indices = [pos for pos, _ in sim_scores[:max(top_n, 0)]]

    return movies_df['title'].iloc[movie_indices]

In [8]:
# Demo: look up the movies most similar to one known title and print them
movie_title = 'Toy Story (1995)'
recommendations = get_recommendations(movie_title=movie_title)
print(
    f"Recommendations for {movie_title}:\n{recommendations}"
)

Recommendations for Toy Story (1995):
1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object
