In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### # Step 1: Load the movies dataset

In [5]:
movies = pd.read_csv("movies.csv")

### # Step 2: Fill any missing genres with empty string

In [6]:
movies['genres'] = movies['genres'].fillna('')

### # Step 3: Convert genres into lowercase and format text

In [7]:
movies['genres'] = movies['genres'].str.replace('|', ' ').str.lower()

### # Step 4: Initialize TF-IDF Vectorizer to convert genre text to numerical vectors

In [8]:
tfidf = TfidfVectorizer(stop_words='english')  # remove common words like 'the', 'and'
tfidf_matrix = tfidf.fit_transform(movies['genres']) 

### # Step 5: Compute cosine similarity between all movies based on genre vectors

In [9]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

### # Step 6: Create a mapping from movie titles to their DataFrame index

In [10]:
indices = pd.Series(movies.index, index=movies['title'].str.lower())

### # Step 7: Function to recommend similar movies

In [11]:
def recommend_movies(title, num_recommendations=5):
    title = title.lower().strip()

    # Try exact match first
    if title in indices:
        idx = indices[title]
    else:
        # Try to find a partial match
        matches = movies[movies['title'].str.lower().str.contains(title, na=False)]
        if matches.empty:
            return f"No movies found matching '{title}'."
        
        # Pick the best match (first one) and get its index
        idx = matches.index[0]
        print(f"No exact match found. Using closest match: '{movies.loc[idx, 'title']}'")

    # Get similarity scores for this movie with all others
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies by similarity score in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Skip the first one (it’s the same movie)
    sim_scores = sim_scores[1:num_recommendations+1]

    # Get indices of recommended movies
    movie_indices = [i[0] for i in sim_scores]

    return movies['title'].iloc[movie_indices]


### # Step 8: Test the recommender

In [13]:
print("Recommendations for 'Titanic':")
print(recommend_movies("Titanic"))

Recommendations for 'Titanic':
No exact match found. Using closest match: 'Titanic (1997)'
24                Leaving Las Vegas (1995)
27                       Persuasion (1995)
42    How to Make an American Quilt (1995)
45            When Night Is Falling (1995)
66                     Bed of Roses (1996)
Name: title, dtype: object
