# 🎯 Movie Recommendation System using Word2Vec (Content-Based Filtering)

This notebook builds a content-based recommender system using Word2Vec embeddings trained on Netflix title metadata (movies and TV shows): each title's name, genres (`listed_in`), and description.

In [81]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re

#### Load data
We use `netflix_titles.csv`, which contains metadata for Netflix movies and TV shows.

In [84]:
# Read the raw Netflix catalogue from the data directory next to the notebook
# folder, then preview the first three rows as the cell's output.
netflix_df = pd.read_csv(filepath_or_buffer='../data/netflix_titles.csv')
netflix_df.head(n=3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


#### Preprocessing

In [87]:
# Snapshot the human-readable titles first: the loop below overwrites the
# 'title' column with token lists. Keys are the frame's row labels, so the
# lookup keeps working after rows are filtered out at the end of this cell.
original_titles = dict(zip(netflix_df.index, netflix_df['title']))

# NOTE(review): stopwords.words() and word_tokenize() presumably need the NLTK
# 'stopwords' and 'punkt' resources installed; no nltk.download() call is
# visible in this notebook - confirm the environment provides them.
stop_words = set(stopwords.words('english') + list(string.punctuation))

# Translation table that deletes ASCII punctuation. Built once here instead of
# once per word inside the cleaning step.
punctuation_table = str.maketrans('', '', string.punctuation)


def _clean_tokens(text, strip_punctuation=False):
    """Turn one text field into a list of unique, lower-cased tokens.

    Stop words, pure-punctuation tokens and tokens of length <= 2 are dropped.
    With ``strip_punctuation=True`` any punctuation characters left inside the
    surviving tokens are removed as well (tokens that become empty are dropped).

    Duplicates are removed with ``dict.fromkeys`` so that tokens stay in
    first-seen order. A plain ``set`` would scramble the order differently on
    every interpreter launch, which changes the context windows Word2Vec sees.
    """
    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word not in stop_words and len(word) > 2
    ]
    if strip_punctuation:
        tokens = [word.translate(punctuation_table) for word in tokens]
        tokens = [word for word in tokens if word]
    return list(dict.fromkeys(tokens))


# NOTE: this loop overwrites the text columns of netflix_df with token lists.
# Re-run the load cell before re-running this one; otherwise astype(str) would
# be applied to the token lists themselves.
for col in ['title', 'listed_in', 'description']:
    # Only the free-text description gets the extra in-token punctuation strip.
    strip = col == 'description'
    netflix_df[col] = (
        netflix_df[col]
        .fillna('')
        .astype(str)
        .apply(lambda text: _clean_tokens(text, strip))
    )

# One "sentence" per title: title tokens, then genre tokens, then description tokens.
netflix_df['tokens'] = netflix_df.apply(
    lambda row: row['title'] + row['listed_in'] + row['description'],
    axis=1
)

# Rows that ended up with no tokens at all cannot be embedded; drop them.
netflix_df = netflix_df[netflix_df['tokens'].apply(len) > 0]

#### Word2Vec Model Training

In [89]:
# Word2Vec training is stochastic. Fix the seed and use a single worker thread
# so that re-running the notebook gives comparable vectors. Per the gensim
# documentation, bit-for-bit reproducibility across interpreter launches also
# requires setting the PYTHONHASHSEED environment variable.
W2V_SEED = 42

# NOTE(review): the sample run at the end of this notebook reports a final
# score of 2.50 for every top hit, which is the maximum (1 + 1 + 0.5). That
# means all three similarities are ~1.0 - the embeddings may be under-trained
# for this small corpus. Consider revisiting the training settings.
w2v_model = Word2Vec(
    sentences=netflix_df['tokens'].tolist(),  # plain list of token lists
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, even those that occur only once
    workers=1,
    seed=W2V_SEED,
)
print("Vocabulary size:", len(w2v_model.wv))

Vocabulary size: 23817


#### Recommendation system

In [91]:
def _known_tokens(tokens, wv):
    """Return the tokens from ``tokens`` that have a vector in ``wv``."""
    return [word for word in tokens if word in wv]


def _safe_similarity(wv, tokens_a, tokens_b):
    """Return ``wv.n_similarity(tokens_a, tokens_b)``, or None if it cannot be computed."""
    try:
        return wv.n_similarity(tokens_a, tokens_b)
    except (KeyError, ZeroDivisionError):
        # Unknown token or empty token list: treat as "no score" instead of
        # hiding every possible error behind a bare except.
        return None


def get_recommendations(title: str, df: "pd.DataFrame", model, orig_titles: dict,
                        top_n: int = 10, min_category_score: float = 0.7,
                        title_weight: float = 0.5) -> "pd.DataFrame":
    """Recommend titles similar to ``title`` using Word2Vec token similarities.

    The query is matched case-insensitively against the full original titles:
    an exact match wins, otherwise the first title containing the query is
    used. Every other row is then scored against that target on three token
    sets (genres, description, title) with ``model.wv.n_similarity``.

    Parameters
    ----------
    title : str
        Full or partial title to look up. A blank query is reported as not found.
    df : pandas.DataFrame
        Frame whose 'title', 'listed_in' and 'description' columns hold token lists.
    model : object
        Trained model exposing keyed vectors as ``model.wv``.
    orig_titles : dict
        Maps a row label of ``df`` to its human-readable title. Rows missing
        from the mapping fall back to their title tokens joined by spaces.
    top_n : int, default 10
        Maximum number of recommendations to return.
    min_category_score : float, default 0.7
        Candidates whose genre similarity is not strictly above this value are skipped.
    title_weight : float, default 0.5
        Factor applied to the title similarity before it is added to the final score.

    Returns
    -------
    pandas.DataFrame
        Columns "recommendation", "title", "score_title", "score_category",
        "score_description" and "final_score", sorted best-first and limited to
        ``top_n`` rows. An empty DataFrame (after printing a message) if the
        title is not found, the target lacks usable genre or description
        tokens, or no candidate passes the genre threshold.
    """
    wv = model.wv

    # Human-readable title for every row, in row order.
    display_titles = [
        orig_titles.get(idx, ' '.join(tokens))
        for idx, tokens in zip(df.index, df['title'])
    ]
    lowered_titles = [str(shown).lower() for shown in display_titles]

    # Match on the full title so multi-word queries work; prefer an exact hit
    # over a partial one.
    query = title.strip().lower()
    target_pos = None
    if query:
        target_pos = next(
            (pos for pos, shown in enumerate(lowered_titles) if shown == query),
            None,
        )
        if target_pos is None:
            target_pos = next(
                (pos for pos, shown in enumerate(lowered_titles) if query in shown),
                None,
            )

    if target_pos is None:
        print(f"Movie '{title}' not found in dataset.")
        return pd.DataFrame()

    target_row = df.iloc[target_pos]
    target_idx = target_row.name
    target_title = display_titles[target_pos]
    print(f"Found movie: {target_title}")

    target_tokens = {
        'category': _known_tokens(target_row['listed_in'], wv),
        'desc': _known_tokens(target_row['description'], wv),
        'title': _known_tokens(target_row['title'], wv),
    }

    if not target_tokens['category'] or not target_tokens['desc']:
        print(f"Not enough data for movie '{title}'.")
        return pd.DataFrame()

    results = []

    # Zipping the columns avoids building a Series per row as iterrows() does.
    candidates = zip(
        df.index, display_titles, df['title'], df['listed_in'], df['description']
    )
    for idx, movie_title, title_raw, category_raw, desc_raw in candidates:
        if idx == target_idx:
            continue

        category = _known_tokens(category_raw, wv)
        desc = _known_tokens(desc_raw, wv)
        if not category or not desc:
            continue

        # Genre similarity is the gate: check it first and skip cheaply.
        cat_score = _safe_similarity(wv, category, target_tokens['category'])
        if cat_score is None or cat_score <= min_category_score:
            continue

        desc_score = _safe_similarity(wv, desc, target_tokens['desc'])
        if desc_score is None:
            continue

        # Title similarity is optional and down-weighted.
        title_score = 0
        title_tokens = _known_tokens(title_raw, wv)
        if title_tokens and target_tokens['title']:
            raw_title_score = _safe_similarity(wv, title_tokens, target_tokens['title'])
            if raw_title_score is not None:
                title_score = raw_title_score * title_weight

        results.append([
            movie_title, target_title, title_score, cat_score, desc_score
        ])

    if not results:
        print(f"No similar titles found for '{title}'.")
        return pd.DataFrame()

    rec_df = pd.DataFrame(
        results,
        columns=["recommendation", "title", "score_title", "score_category", "score_description"]
    )

    rec_df["final_score"] = rec_df["score_title"] + rec_df["score_category"] + rec_df["score_description"]

    return rec_df.sort_values(
        by=["final_score", "score_category", "score_description", "score_title"],
        ascending=False
    ).head(top_n)

#### Test

In [94]:
# Smoke test: look up one title and list its five closest matches.
sample_movie = 'Twilight'
recommendations = get_recommendations(
    sample_movie,
    netflix_df,
    w2v_model,
    original_titles,
    top_n=5,
)

print(f"\nMovies similar to '{sample_movie}':")
has_hits = not recommendations.empty
if has_hits:
    # One plain dict per recommendation row, in ranked order.
    for row in recommendations.to_dict('records'):
        print(f"- {row['recommendation']} (Score: {row['final_score']:.2f})")

Found movie: The Twilight Saga: Breaking Dawn: Part 1

Movies similar to 'Twilight':
- The Jane Austen Book Club (Score: 2.50)
- Dear John (Score: 2.50)
- In Line (Score: 2.50)
- The Desert Bride (Score: 2.50)
- Up North (Score: 2.50)
