In [24]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# ========================== LOAD MODEL ==========================
# Sentence-embedding model; the same instance encodes the corpus here and
# must be used to encode queries against the index built below.
model = SentenceTransformer('all-MiniLM-L6-v2')

# ========================== LOAD MOVIES ==========================
folder_path = '../data/Docs'
movies = []   # raw movie records; movies[i] corresponds to row i of the index
texts = []    # text embedded for movies[i] (kept aligned with `movies`)


def _movie_text(data):
    """Join a movie record's descriptive fields into one ' | '-separated string.

    Keys that are missing, or present but null/empty in the JSON, contribute
    nothing to the result.
    """
    def field(key):
        # `or ''` also covers keys that exist with a JSON null value, which
        # `data.get(key, '')` would hand back as None.
        return data.get(key) or ''

    parts = [
        field('Title'),
        field('Overview'),
        field('Director'),
        field('Genres').replace(',', ' '),
        field('Tagline'),
        field('Release_Date')[:4],          # year prefix only
        field('Cast').replace(',', ' '),
        field('clean_text'),
    ]
    return ' | '.join(filter(None, parts))


# os.listdir() returns entries in arbitrary order; sorting keeps the
# movie -> index-row assignment identical from run to run.
for fn in sorted(os.listdir(folder_path)):
    if not fn.endswith('.json'):
        continue
    path = os.path.join(folder_path, fn)
    # Only the parse needs the file handle; close it before further work.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    movies.append(data)

    # Very rich text so the embedding sees every descriptive field
    rich_text = _movie_text(data)
    texts.append(rich_text)

print(f"Loaded {len(movies)} movies")

if not texts:
    # Fail here with a clear message instead of an opaque shape error below.
    raise ValueError(f"No .json movie files found in {folder_path!r}")

embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
dimension = embeddings.shape[1]

# Inner product over normalized vectors, i.e. cosine similarity
index = faiss.IndexFlatIP(dimension)
index.add(embeddings.astype(np.float32))


# ========================== FINAL SMART SEARCH ==========================
# Bonuses added on top of the raw similarity score during re-ranking.
_YEAR_BOOST = 5.0
_DIRECTOR_FULL_NAME_BOOST = 10.0
_DIRECTOR_PARTIAL_BOOST = 7.0
_GENRE_BOOST = 1.0

# Number of nearest neighbours fetched per requested result before re-ranking.
_CANDIDATES_PER_RESULT = 15

# One tuple per genre, listing every spelling accepted for it in either the
# query or a movie's 'Genres' field (so a "sci-fi" query matches a movie
# labelled "Science Fiction").
_GENRE_GROUPS = (
    ('action',),
    ('adventure',),
    ('comedy',),
    ('drama',),
    ('thriller',),
    ('sci-fi', 'science fiction'),
    ('horror',),
    ('animation',),
    ('crime',),
)


def _parse_year(token):
    """Return ``token`` as an int if it is a year in 1900-2100, else None."""
    if not token.isdigit():
        return None
    try:
        year = int(token)
    except ValueError:
        # Characters such as '²' pass isdigit() but are not valid int literals.
        return None
    return year if 1900 <= year <= 2100 else None


def smart_search(query: str, top_k: int = 10):
    """Semantic search re-ranked by year, director and genre hints in the query.

    The query is embedded with the global ``model`` and the nearest
    ``top_k * 15`` rows of the global ``index`` are fetched. Each candidate
    starts from its similarity score and gains:

    * +5  if its release year equals a year (1900-2100) named in the query
      (when several years are named, the last one is used);
    * +10 if the first two or last two non-year query words appear, as a
      phrase, inside its director name; otherwise +7 if any non-year query
      word of four or more characters appears inside the director name;
    * +1  for every genre named in the query that the movie also lists.

    Only the fetched candidates are re-ranked, so a movie whose metadata
    matches but whose embedding falls outside that pool cannot be returned.

    Args:
        query: Free-text query.
        top_k: Maximum number of movies to return.

    Returns:
        Up to ``top_k`` movie records from the global ``movies`` list, best
        first. Empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        # Nothing requested; also avoids asking the index for zero neighbours.
        return []

    q = query.strip().lower()

    # Split tokens into the (last) year mentioned and everything else.
    query_year = None
    words_no_year = []
    for w in q.split():
        year = _parse_year(w)
        if year is None:
            words_no_year.append(w)
        else:
            query_year = year

    # Everything below depends only on the query, so build it once rather
    # than once per candidate.
    name_candidates = []
    if len(words_no_year) >= 2:
        name_candidates.append(f"{words_no_year[-2]} {words_no_year[-1]}")  # last two words
        name_candidates.append(f"{words_no_year[0]} {words_no_year[1]}")    # first two words
    significant_words = [w for w in words_no_year if len(w) >= 4]
    query_genre_groups = [
        group for group in _GENRE_GROUPS
        if any(alias in q for alias in group)
    ]
    year_text = str(query_year) if query_year is not None else None

    # Query vector
    q_vec = model.encode([query], normalize_embeddings=True).astype(np.float32)
    D, I = index.search(q_vec, top_k * _CANDIDATES_PER_RESULT)

    results = []
    for score, idx in zip(D[0], I[0]):
        if idx == -1:
            # Padding entry: the index held fewer rows than requested.
            continue
        movie = movies[idx]
        boost = float(score)

        # `or ''` tolerates fields that are present but null in the record.
        director = (movie.get('Director') or '').lower().strip()
        movie_year_str = (movie.get('Release_Date') or '')[:4]
        genres_lower = (movie.get('Genres') or '').lower()

        # 1. Exact release-year match
        if year_text is not None and movie_year_str == year_text:
            boost += _YEAR_BOOST

        # 2. Director match: a two-word phrase beats a single-word hit
        if any(name in director for name in name_candidates):
            boost += _DIRECTOR_FULL_NAME_BOOST
        elif any(w in director for w in significant_words):
            boost += _DIRECTOR_PARTIAL_BOOST

        # 3. One bonus per genre shared by the query and the movie
        genre_hits = sum(
            1 for group in query_genre_groups
            if any(alias in genres_lower for alias in group)
        )
        boost += genre_hits * _GENRE_BOOST

        results.append((boost, movie))

    # Stable sort: equal scores keep their similarity order.
    results.sort(key=lambda x: x[0], reverse=True)
    return [movie for _, movie in results[:top_k]]

Loaded 3500 movies


Batches: 100%|██████████| 110/110 [00:13<00:00,  8.13it/s]


In [25]:
# ==========================
# Smoke test: show the five best hits for a couple of sample queries
# ==========================
queries = ["nolan action 2010", "Joss Whedon action "]

for q in queries:
    # Header first, then one numbered line per hit (rank, title, year, director, genres).
    print(f"\n=== Query: '{q}' ===")
    results = smart_search(q, top_k=5)
    for i, m in enumerate(results, start=1):
        print(f"{i}. {m['Title']} ({m['Release_Date'][:4]}) - {m['Director']} - {m['Genres']}")


=== Query: 'nolan action 2010' ===
1. Inception (2010) - Christopher Nolan - Action, Science Fiction, Adventure
2. The Dark Knight (2008) - Christopher Nolan - Drama, Action, Crime, Thriller
3. Batman Begins (2005) - Christopher Nolan - Action, Crime, Drama
4. The Dark Knight Rises (2012) - Christopher Nolan - Action, Crime, Drama, Thriller
5. Interstellar (2014) - Christopher Nolan - Adventure, Drama, Science Fiction

=== Query: 'Joss Whedon action ' ===
1. The Avengers (2012) - Joss Whedon - Science Fiction, Action, Adventure
2. Serenity (2005) - Joss Whedon - Science Fiction, Action, Adventure, Thriller
3. eXistenZ (1999) - David Cronenberg - Action, Thriller, Science Fiction
4. MacGruber (2010) - Jorma Taccone - Action, Comedy
5. The Three Musketeers (2011) - Paul W.S. Anderson - Adventure, Action, Thriller
