In [12]:
import json
import os
import re
from datetime import datetime

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ==========================
# 1. Load model once
# ==========================
model = SentenceTransformer('all-MiniLM-L6-v2')

# ==========================
# 2. Load all movies + precompute embeddings
# ==========================
folder_path = '../data/Docs'  # ← change this


def _as_text(value):
    """Coerce a JSON field value to text.

    ``None`` (missing key or JSON ``null``) becomes ``''``, lists/tuples are
    comma-joined, and anything else goes through ``str()``. This keeps the
    ``.replace`` / slicing calls below from raising on non-string values.
    """
    if value is None:
        return ''
    if isinstance(value, (list, tuple)):
        return ', '.join(str(item) for item in value)
    return str(value)


def build_movie_text(data):
    """Build the text that gets embedded for one movie record.

    Concatenates title, overview, director, genres, tagline, release year
    (first four characters of ``Release_Date``) and cast, separated by
    ``' | '``. Empty fields are dropped so they do not add bare separators.

    Args:
        data: dict parsed from one movie JSON file.

    Returns:
        The joined text; ``''`` if every field is empty.
    """
    parts = [
        _as_text(data.get('Title')),
        _as_text(data.get('Overview')),
        _as_text(data.get('Director')),
        _as_text(data.get('Genres')).replace(',', ' '),
        _as_text(data.get('Tagline')),
        _as_text(data.get('Release_Date'))[:4],  # year only
        _as_text(data.get('Cast')).replace(',', ' '),
    ]
    return ' | '.join(filter(None, parts))


movies = []
texts = []

# os.listdir returns names in arbitrary order; sorting makes the row ids in
# the FAISS index reproducible from run to run.
for fn in sorted(os.listdir(folder_path)):
    if not fn.endswith('.json'):
        continue
    # Only the parse needs the file handle; close it before doing anything else.
    with open(os.path.join(folder_path, fn), encoding='utf-8') as f:
        data = json.load(f)
    movies.append(data)
    texts.append(build_movie_text(data))

print(f"Loaded {len(movies)} movies")

# Fail with a clear message instead of an IndexError on `embeddings.shape[1]`.
if not movies:
    raise FileNotFoundError(f"No .json movie files found in {folder_path!r}")

embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
dimension = embeddings.shape[1]

# ==========================
# 3. Create FAISS index
# ==========================
index = faiss.IndexFlatIP(dimension)  # Inner product = cosine similarity (because we normalized)
# Row i of the index corresponds to movies[i]; smart_search relies on this.
index.add(embeddings.astype(np.float32))

# ==========================
# 4. Smart Hybrid Search Function
# ==========================
def _release_year(movie):
    """Return the movie's release year as an int, or None if it cannot be read."""
    prefix = str(movie.get('Release_Date') or '')[:4]
    return int(prefix) if re.fullmatch(r"[0-9]{4}", prefix) else None


def smart_search(query: str, top_k: int = 10, year_boost: float = 0.35,
                 director_boost: float = 0.45, candidate_multiplier: int = 3):
    """Semantic search over the movie index, re-ranked by year and director hits.

    Uses the module-level ``model`` (query encoder), ``index`` (FAISS inner
    product index) and ``movies`` (records aligned with the index rows).

    The query is embedded and the ``top_k * candidate_multiplier`` nearest
    movies are retrieved. Each candidate's score is its inner-product score
    plus ``year_boost`` if its release year equals a year found in the query,
    plus ``director_boost`` if any query word equals a word of its director
    name. Boosts only re-order the retrieved candidates; a movie outside that
    pool is never returned, so raise ``candidate_multiplier`` to give the
    boosts more reach.

    Args:
        query: Free-text query, e.g. ``"nolan action 2010"``.
        top_k: Maximum number of movies to return.
        year_boost: Score added on an exact release-year match.
        director_boost: Score added on a whole-word director-name match.
        candidate_multiplier: How many times ``top_k`` candidates to pull from
            FAISS before re-ranking (values below 1 are treated as 1).

    Returns:
        Up to ``top_k`` movie dicts, best first. Empty list if ``top_k`` is not
        positive or the index is empty.
    """
    if top_k <= 0 or index.ntotal == 0:
        return []

    # Split on word characters so trailing punctuation ("nolan, 2010.") does
    # not hide a year or a name.
    tokens = re.findall(r"\w+", query.lower())

    # First all-ASCII-digit token in a plausible range is taken as the year.
    # (str.isdigit() would also accept characters like '²' that int() rejects.)
    query_year = None
    for token in tokens:
        if re.fullmatch(r"[0-9]+", token) and 1900 <= int(token) <= 2030:
            query_year = int(token)
            break
    token_set = set(tokens)

    # Base semantic embedding
    query_vec = model.encode([query], normalize_embeddings=True).astype(np.float32)

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with id -1, which would silently index movies[-1].
    n_candidates = min(top_k * max(1, candidate_multiplier), index.ntotal)
    scores, indices = index.search(query_vec, n_candidates)

    # Rescore with boosts
    final_results = []
    for score, idx in zip(scores[0], indices[0]):
        if idx < 0:  # padding entry, not a real hit
            continue
        movie = movies[idx]

        # 1. Semantic score (inner product from FAISS)
        final_score = float(score)

        # 2. Year boost
        if query_year is not None and _release_year(movie) == query_year:
            final_score += year_boost

        # 3. Director boost: whole-word match. A substring test would let a
        #    token like "in" hit "Quentin Tarantino".
        director_words = set(re.findall(r"\w+", str(movie.get('Director') or '').lower()))
        if token_set & director_words:
            final_score += director_boost

        final_results.append((final_score, movie))

    # Sort by final score and return top_k
    final_results.sort(key=lambda pair: pair[0], reverse=True)
    return [movie for _, movie in final_results[:top_k]]



Loaded 3500 movies


Batches: 100%|██████████| 110/110 [00:07<00:00, 13.97it/s]


In [13]:
# ==========================
# Smoke test: run each sample query through smart_search
# and print the ranked hits, one per line.
# ==========================
queries = [
    "nolan action 2010",
]

for query_text in queries:
    print(f"\n=== Query: '{query_text}' ===")
    results = smart_search(query_text, top_k=5)
    # Ranks are 1-based for display.
    for rank, movie in enumerate(results, start=1):
        print(f"{rank}. {movie['Title']} ({movie['Release_Date'][:4]}) - {movie['Director']} - {movie['Genres']}")


=== Query: 'nolan action 2010' ===
1. The Dark Knight (2008) - Christopher Nolan - Drama, Action, Crime, Thriller
2. MacGruber (2010) - Jorma Taccone - Action, Comedy
3. Redbelt (2008) - David Mamet - Drama
4. Thunderball (1965) - Terence Young - Adventure, Action, Thriller
5. Turbo (2013) - David Soren - Animation, Family
