In [20]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from datetime import datetime

# ==========================
# 1. Load model once
# ==========================
model = SentenceTransformer('all-MiniLM-L6-v2')

# ==========================
# 2. Load all movies + precompute embeddings
# ==========================
folder_path = '../data/Docs'  # ← change this
movies = []
texts = []

for fn in os.listdir(folder_path):
    if fn.endswith('.json'):
        with open(os.path.join(folder_path, fn), encoding='utf-8') as f:
            data = json.load(f)
            movies.append(data)
            
            # Build a rich text that includes all important fields
            parts = [
                data.get('Title', ''),
                data.get('Overview', ''),
                data.get('Director', ''),
                data.get('Genres', '').replace(',', ' '),
                data.get('Tagline', ''),
                str(data.get('Release_Date', '')[:4]),  # year only
                data.get('Cast', '').replace(',', ' '),
            ]
            rich_text = ' | '.join(filter(None, parts))
            texts.append(rich_text)

print(f"Loaded {len(movies)} movies")

embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
dimension = embeddings.shape[1]

# ==========================
# 3. Create FAISS index
# ==========================
index = faiss.IndexFlatIP(dimension)  # Inner product = cosine similarity (because we normalized)
index.add(embeddings.astype(np.float32))

# ==========================
# 4. Smart Hybrid Search Function
# ==========================
def smart_search(query: str, top_k: int = 10):
    query_lower = query.lower().strip()
    words = query_lower.split()
    
    # Extract year
    query_year = None
    for w in words:
        if w.isdigit() and 1900 <= int(w) <= 2100:
            query_year = int(w)
            break

    # Remove year from words so it doesn't interfere with director matching
    clean_words = [w for w in words if not (w.isdigit() and 1900 <= int(w) <= 2100)]

    # Encode query
    query_vec = model.encode([query], normalize_embeddings=True).astype(np.float32)
    D, I = index.search(query_vec, top_k * 8)  # get more candidates

    results = []
    for score, idx in zip(D[0], I[0]):
        if idx == -1: continue
        movie = movies[idx]
        
        boost = score  # base semantic similarity (0–1)

        movie_year = movie.get('Release_Date', '')[:4]
        movie_director = movie.get('Director', '').lower().strip()
        movie_genres = movie.get('Genres', '').lower()

        # === 1. Year match → huge boost ===
        if query_year and movie_year == str(query_year):
            boost += 3.0

        # === 2. Director match → THE most important boost ===
        director_match = False
        if clean_words:
            # Full name match (e.g. "joss whedon")
            full_name = ' '.join(clean_words[-2:])  # last two words most likely director
            if len(clean_words) >= 2 and full_name in movie_director:
                boost += 5.0
                director_match = True
            # Or partial match
            elif any(word in movie_director for word in clean_words if len(word) >= 4):
                boost += 4.0
                director_match = True

        # === 3. Genre boost (action, comedy, etc.) ===
        genre_keywords = ['action', 'comedy', 'drama', 'thriller', 'horror', 'sci-fi', 'romance', 'animation']
        matched_genres = [g for g in genre_keywords if g in query_lower and g in movie_genres]
        boost += len(matched_genres) * 0.6

        # === 4. Title match bonus ===
        movie_title_lower = movie.get('Title', '').lower()
        if any(word in movie_title_lower for word in clean_words if len(word) >= 3):
            boost += 1.5

        results.append((boost, movie, director_match))

    # Sort by score
    results.sort(key=lambda x: x[0], reverse=True)

    # Final list
    final_movies = [m for _, m, _ in results[:top_k]]
    return final_movies

Loaded 3500 movies


Batches: 100%|██████████| 110/110 [00:08<00:00, 13.71it/s]


In [21]:
# ==========================
# Test it!
# ==========================
queries = [
    "nolan action 2010",
    "Joss Whedon action "
]

for q in queries:
    print(f"\n=== Query: '{q}' ===")
    results = smart_search(q, top_k=5)
    for i, m in enumerate(results, 1):
        print(f"{i}. {m['Title']} ({m['Release_Date'][:4]}) - {m['Director']} - {m['Genres']}")


=== Query: 'nolan action 2010' ===
1. Inception (2010) - Christopher Nolan - Action, Science Fiction, Adventure
2. The Dark Knight (2008) - Christopher Nolan - Drama, Action, Crime, Thriller
3. Batman Begins (2005) - Christopher Nolan - Action, Crime, Drama
4. The Dark Knight Rises (2012) - Christopher Nolan - Action, Crime, Drama, Thriller
5. The Prestige (2006) - Christopher Nolan - Drama, Mystery, Science Fiction

=== Query: 'Joss Whedon action ' ===
1. Chain Reaction (1996) - Andrew Davis - Science Fiction, Action, Drama, Thriller
2. eXistenZ (1999) - David Cronenberg - Action, Thriller, Science Fiction
3. Wild Card (2015) - Simon West - Thriller, Crime, Action
4. Beowulf (2007) - Robert Zemeckis - Adventure, Action, Animation
5. Reindeer Games (2000) - John Frankenheimer - Thriller, Action, Crime
