In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Load the dataset (replace the path if needed)
url = "https://raw.githubusercontent.com/ibrahimnor/Movies-Recommender-Dataset/main/movies_metadata.csv"
df = pd.read_csv(url, low_memory=False)

# Keep relevant columns and discard rows with any missing value.
# dropna() leaves gaps in the index labels, while the TF-IDF and similarity
# matrices built from this frame are addressed by row position; renumbering
# the index 0..n-1 keeps labels and positions identical.
df = df[['title', 'overview', 'genres']].dropna().reset_index(drop=True)

# Display the dataset
print("Dataset Sample:")
print(df.head())
print(f"\nDataset Shape: {df.shape}")


In [None]:
import ast
import re

# Clean genres column
def parse_genres(genre_str):
    """Turn a stringified list of genre dicts into a space-separated name string.

    Args:
        genre_str: Text holding a Python literal such as
            "[{'id': 18, 'name': 'Drama'}]".

    Returns:
        The "name" values joined by single spaces, or "" when the input
        cannot be parsed or is not a list of dicts that each carry a
        string "name" entry.
    """
    try:
        genres = ast.literal_eval(genre_str)
        return " ".join(genre["name"] for genre in genres)
    except (ValueError, SyntaxError, TypeError, KeyError, RecursionError):
        # Best effort: malformed or unexpectedly shaped cells contribute no
        # genre text. Only parse/shape errors are swallowed, so interrupts
        # and unrelated failures still surface.
        return ""

# Flatten each row's genre literal into plain genre names
df['genres_cleaned'] = df['genres'].map(parse_genres)

# Text used for similarity: genre names, a space, then the overview
df['content'] = df['genres_cleaned'].add(" ").add(df['overview'])

# Preview the derived columns
print("\nProcessed Data Sample:")
print(df.head()[['title', 'genres_cleaned', 'content']])


In [None]:
# Vectorise the combined text: English stop words removed, vocabulary capped
# at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])

print("\nTF-IDF Matrix Shape:", tfidf_matrix.shape)


In [None]:
# Pairwise cosine similarity of every movie against every other movie.
# With a single argument the rows are compared against themselves.
# NOTE(review): the result is a dense (n_movies x n_movies) array by default,
# which can need a lot of memory for a large catalogue.
cosine_sim = cosine_similarity(tfidf_matrix)

print("\nCosine Similarity Matrix Shape:", cosine_sim.shape)


In [None]:
# Function to recommend movies based on content similarity
def recommend_movies(title, cosine_sim=cosine_sim, df=df, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Exact movie title to look up in ``df['title']``. When several
            rows share the title, the first one is used.
        cosine_sim: Square similarity matrix whose row/column ``i`` refers to
            the ``i``-th row (by position) of ``df``. The default is the
            object bound to ``cosine_sim`` when this function is defined.
        df: DataFrame with a ``title`` column, row-aligned with
            ``cosine_sim``. The default is the object bound to ``df`` when
            this function is defined.
        top_n: Maximum number of recommendations to return.

    Returns:
        A Series of up to ``top_n`` titles ordered from most to least
        similar; the queried row itself is never included.

    Raises:
        IndexError: If ``title`` does not occur in ``df['title']``.
    """
    # Locate the movie by row position, not by index label: the similarity
    # matrix is positional, and df's labels may have gaps (e.g. after dropna).
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title {title!r} not found in the dataset")
    idx = int(matches[0])

    # Similarity of the chosen movie to every movie, as a flat array.
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Descending order; the stable sort keeps ties in original row order.
    order = np.argsort(-scores, kind='stable')

    # Drop the query row explicitly rather than assuming it ranks first
    # (duplicates, all-zero vectors or rounding can put another row on top).
    order = order[order != idx][:top_n]

    # Return the titles of recommended movies
    return df['title'].iloc[order]

# Smoke-test the recommender on one well-known title
sample_movie = "The Godfather"
print(f"\nTop Recommendations for '{sample_movie}':")
sample_recommendations = recommend_movies(sample_movie)
print(sample_recommendations)


In [None]:
# Run the recommender over a handful of titles
movies_to_test = [
    "The Godfather",
    "Pulp Fiction",
    "Toy Story",
]

for movie in movies_to_test:
    # Header first, then the lookup, so the header shows even if the lookup fails
    print(f"\nRecommendations for '{movie}':")
    recommendations = recommend_movies(movie)
    print(recommendations)


In [None]:
# Modify content to give more weight to genres: the genre names appear twice
# ahead of the overview, so genre terms count double in the term frequencies.
df['weighted_content'] = (df['genres_cleaned'] + " ") * 2 + df['overview']

# Fit a separate vectorizer with the same settings as the baseline one.
# Re-fitting tfidf_vectorizer itself would overwrite its learned vocabulary,
# leaving it out of step with the baseline tfidf_matrix.
tfidf_vectorizer_weighted = TfidfVectorizer(**tfidf_vectorizer.get_params())
tfidf_matrix_weighted = tfidf_vectorizer_weighted.fit_transform(df['weighted_content'])
cosine_sim_weighted = cosine_similarity(tfidf_matrix_weighted, tfidf_matrix_weighted)

# Test the improved recommender
print("\nImproved Recommendations for 'The Godfather':")
print(recommend_movies("The Godfather", cosine_sim=cosine_sim_weighted))
