# 🎯 Movie Recommendation System using TF-IDF and Cosine Similarity

This notebook builds TF-IDF features from Netflix title metadata (title, director, cast, genres, and description) and uses cosine similarity between those features to recommend similar movies and shows.

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

#### Load Data

In [41]:
# Read the Netflix catalogue and replace every missing value with an empty
# string, so text columns can later be concatenated without NaN handling.
df = pd.read_csv('../data/netflix_titles.csv').fillna('')

#### Preprocessing

In [44]:
# Metadata columns used to build the text representation of each title.
features = [
    'title',
    'director',
    'cast',
    'listed_in',
    'description',
]

# Work on an independent copy so that cleaning never alters the loaded frame.
df_features = df.loc[:, features].copy()

def clean_text(text):
    """Normalise a metadata string before vectorisation.

    Lower-cases the text, replaces every character that is neither a word
    character nor whitespace with a space, collapses runs of whitespace into a
    single space, and trims both ends.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    # Trim only after the substitutions: punctuation at either end is turned
    # into a space, so stripping first would leave stray leading/trailing
    # blanks in the result.
    return re.sub(r'\s+', ' ', text).strip()

# Normalise every metadata column element-wise with clean_text.
for column in features:
    df_features[column] = df_features[column].map(clean_text)

# Join the cleaned columns (title, director, cast, listed_in, description, in
# that order) into one space-separated text field per row.
df_features['feature_soup'] = df_features['title'].str.cat(
    df_features[['director', 'cast', 'listed_in', 'description']],
    sep=' ',
)

#### TF-IDF Vectorization

In [47]:
# Vectoriser configuration:
#   * stop_words='english'  -> remove English stop words
#   * min_df=2              -> minimum document frequency for a term
#   * max_df=0.85           -> maximum document frequency for a term
#   * ngram_range=(1, 2)    -> index unigrams and bigrams
tfidf = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.85, ngram_range=(1, 2))

# Learn the vocabulary and encode every row's combined text in a single pass.
tfidf_matrix = tfidf.fit_transform(df_features['feature_soup'])
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")

TF-IDF Matrix Shape: (8807, 51561)


#### Cosine Similarity Matrix

In [49]:
# Called with a single matrix, cosine_similarity compares it against itself,
# producing one row and one column per title.
similarity_matrix = cosine_similarity(tfidf_matrix)
print(f"Similarity Matrix Shape: {similarity_matrix.shape}")

# Lookup from a title's text to the index label of its row in df.
title_to_index = pd.Series(data=df.index, index=df['title'])

Similarity Matrix Shape: (8807, 8807)


#### Recommendation Function

In [51]:
def get_movie_recommendations(title, top_n=10):
    """Return the titles most similar to ``title``.

    The title is looked up case-insensitively in ``df['title']``. When there is
    no exact match, the first title that contains the query as a substring is
    used instead. Candidate rows are ranked by their score in
    ``similarity_matrix`` (highest first) and the matched row itself is always
    excluded from the result.

    Args:
        title: Title to look up.
        top_n: Maximum number of recommendations to return. Values below 1
            yield an empty result.

    Returns:
        A DataFrame with the columns ``title``, ``type``, ``director``,
        ``listed_in`` and ``similarity_score``, indexed like the corresponding
        rows of ``df``. An empty ``pd.DataFrame()`` is returned when no title
        matches the query.
    """
    query = title.lower()
    titles_lower = df['title'].str.lower()

    # Work with row *positions* rather than index labels so that the lookup
    # into similarity_matrix stays correct even if df does not carry a default
    # 0..n-1 index.
    exact_positions = np.flatnonzero((titles_lower == query).to_numpy())
    if exact_positions.size:
        pos = int(exact_positions[0])
    else:
        print(f"Movie '{title}' not found. Trying partial matches...")

        # A blank query is a substring of every title; treat it as "no match"
        # instead of silently recommending neighbours of the first row.
        if query.strip():
            partial_mask = titles_lower.str.contains(query, regex=False, na=False)
            partial_positions = np.flatnonzero(partial_mask.to_numpy())
        else:
            partial_positions = np.array([], dtype=int)

        if not partial_positions.size:
            print(f"No matches found for '{title}'")
            return pd.DataFrame()

        pos = int(partial_positions[0])
        match_title = df['title'].iloc[pos]
        print(f"Using closest match: '{match_title}'")

    scores = np.asarray(similarity_matrix[pos], dtype=float).ravel()
    # Stable sort on the negated scores: highest similarity first, ties kept in
    # their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query row explicitly. Simply skipping the first ranked entry is
    # wrong when another row ties with it (e.g. a duplicate entry scoring 1.0).
    ranked = ranked[ranked != pos]
    # Clamp so that a negative top_n cannot slice from the end of the ranking.
    ranked = ranked[:max(top_n, 0)]

    recommendations = df.iloc[ranked][['title', 'type', 'director', 'listed_in']].copy()
    recommendations['similarity_score'] = scores[ranked]

    return recommendations

#### Test

In [54]:
movie_title = "Twilight"
recommendations = get_movie_recommendations(movie_title)

# Print a numbered, human-readable summary of every recommended title.
print(f"\nMovies similar to '{movie_title}':")
for i, row in enumerate(recommendations.to_dict('records'), start=1):
    print(f"{i}. {row['title']} ({row['type']}) - Similarity: {row['similarity_score']:.2f}")
    print(f"   Genres: {row['listed_in']}")
    # Skip the director line when the field is empty.
    if row['director']:
        print(f"   Director: {row['director']}")
    print()


Movies similar to 'Twilight':
1. The Twilight Saga: Breaking Dawn: Part 1 (Movie) - Similarity: 0.59
   Genres: Dramas, Romantic Movies
   Director: Bill Condon

2. The Twilight Saga: Eclipse (Movie) - Similarity: 0.59
   Genres: Dramas, Romantic Movies
   Director: David Slade

3. The Twilight Saga: Breaking Dawn: Part 2 (Movie) - Similarity: 0.58
   Genres: Dramas, Romantic Movies
   Director: Bill Condon

4. The Twilight Saga: New Moon (Movie) - Similarity: 0.58
   Genres: Dramas, Romantic Movies
   Director: Chris Weitz

5. The Vanished (Movie) - Similarity: 0.11
   Genres: Thrillers
   Director: Peter Facinelli

6. Burlesque (Movie) - Similarity: 0.10
   Genres: Dramas, Romantic Movies
   Director: Steve Antin

7. The Adventures of Sharkboy and Lavagirl (Movie) - Similarity: 0.10
   Genres: Children & Family Movies
   Director: Robert Rodriguez

8. Samson (Movie) - Similarity: 0.10
   Genres: Action & Adventure, Dramas, Faith & Spirituality
   Director: Bruce MacDonald, Gabriel S