In [14]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from datetime import datetime

# ==========================
# 1. Load model once
# ==========================
model = SentenceTransformer('all-MiniLM-L6-v2')

# ==========================
# 2. Load all movies + precompute embeddings
# ==========================
folder_path = '../data/Docs'  # ← change this
movies = []
texts = []

for fn in os.listdir(folder_path):
    if fn.endswith('.json'):
        with open(os.path.join(folder_path, fn), encoding='utf-8') as f:
            data = json.load(f)
            movies.append(data)
            
            # Build a rich text that includes all important fields
            parts = [
                data.get('Title', ''),
                data.get('Overview', ''),
                data.get('Director', ''),
                data.get('Genres', '').replace(',', ' '),
                data.get('Tagline', ''),
                str(data.get('Release_Date', '')[:4]),  # year only
                data.get('Cast', '').replace(',', ' '),
            ]
            rich_text = ' | '.join(filter(None, parts))
            texts.append(rich_text)

print(f"Loaded {len(movies)} movies")

embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
dimension = embeddings.shape[1]

# ==========================
# 3. Create FAISS index
# ==========================
index = faiss.IndexFlatIP(dimension)  # Inner product = cosine similarity (because we normalized)
index.add(embeddings.astype(np.float32))

# ==========================
# 4. Smart Hybrid Search Function
# ==========================
def smart_search(query: str, top_k: int = 10):
    query_lower = query.lower()
    words = query_lower.split()
    
    # === 1. Extract year and director tokens ===
    query_year = None
    director_tokens = []
    
    for w in words:
        if w.isdigit() and 1900 <= int(w) <= 2030:
            query_year = int(w)
        elif len(w) >= 3:  # avoid matching "a", "to", etc.
            director_tokens.append(w)

    # === 2. Encode query ===
    query_vec = model.encode([query], normalize_embeddings=True).astype(np.float32)
    
    # Get more candidates
    D, I = index.search(query_vec, top_k * 5)  # 50 candidates
    
    # === 3. Aggressive rescoring ===
    candidates = []
    for dist, idx in zip(D[0], I[0]):
        movie = movies[idx]
        semantic_score = dist  # this is cosine similarity (higher = better)

        boost = 0.0

        # Year EXACT match = massive boost
        if query_year:
            movie_year = int(movie['Release_Date'][:4]) if movie.get('Release_Date') else 0
            if movie_year == query_year:
                boost += 2.0  # This is the game changer

        # Director match = huge boost
        director_name = movie.get('Director', '').lower()
        if any(token in director_name for token in director_tokens):
            boost += 2.5  # Even stronger for director

        # Bonus: if query contains "nolan" and movie is by Nolan → ultimate boost
        if 'nolan' in query_lower and 'nolan' in director_name:
            boost += 3.0

        final_score = semantic_score + boost
        candidates.append((final_score, movie))

    # Sort and return
    candidates.sort(key=lambda x: x[0], reverse=True)
    return [movie for _, movie in candidates[:top_k]]

Loaded 3500 movies


Batches: 100%|██████████| 110/110 [00:07<00:00, 14.02it/s]


In [19]:
# ==========================
# Test it!
# ==========================
queries = [
    "nolan action 2010",
    "Joss Whedon action "
]

for q in queries:
    print(f"\n=== Query: '{q}' ===")
    results = smart_search(q, top_k=5)
    for i, m in enumerate(results, 1):
        print(f"{i}. {m['Title']} ({m['Release_Date'][:4]}) - {m['Director']} - {m['Genres']}")


=== Query: 'nolan action 2010' ===
1. Inception (2010) - Christopher Nolan - Action, Science Fiction, Adventure
2. The Dark Knight (2008) - Christopher Nolan - Drama, Action, Crime, Thriller
3. Batman Begins (2005) - Christopher Nolan - Action, Crime, Drama
4. The Prestige (2006) - Christopher Nolan - Drama, Mystery, Science Fiction
5. MacGruber (2010) - Jorma Taccone - Action, Comedy

=== Query: 'Joss Whedon action ' ===
1. Sleepover (2004) - Joe Nussbaum - Family, Comedy
2. Observe and Report (2009) - Jody Hill - Comedy, Crime, Drama
3. Wag the Dog (1997) - Barry Levinson - Comedy, Drama
4. Office Space (1999) - Mike Judge - Comedy
5. Unbroken (2014) - Angelina Jolie - Drama, War
