In [22]:
import json
import os
import re

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ========================== LOAD MODEL ==========================
model = SentenceTransformer('all-MiniLM-L6-v2')

# ========================== LOAD MOVIES ==========================
folder_path = '../data/Docs'
movies = []
texts = []

for fn in os.listdir(folder_path):
    if fn.endswith('.json'):
        path = os.path.join(folder_path, fn)
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
            movies.append(data)

            # Very rich text so the embedding knows everything
            parts = [
                data.get('Title', ''),
                data.get('Overview', ''),
                data.get('Director', ''),
                data.get('Genres', '').replace(',', ' '),
                data.get('Tagline', ''),
                data.get('Release_Date', '')[:4],
                data.get('Cast', '').replace(',', ' '),
                data.get('clean_text', '')
            ]
            rich_text = ' | '.join(filter(None, parts))
            texts.append(rich_text)

print(f"Loaded {len(movies)} movies")

embeddings = model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
dimension = embeddings.shape[1]

index = faiss.IndexFlatIP(dimension)      # cosine similarity
index.add(embeddings.astype(np.float32))


# ========================== FINAL SMART SEARCH ==========================
def smart_search(query: str, top_k: int = 10, candidate_multiplier: int = 10):
    """Hybrid movie search: semantic retrieval re-ranked with metadata boosts.

    The query is embedded with the module-level ``model`` and the nearest
    ``top_k * candidate_multiplier`` rows are fetched from the module-level
    ``index``. Each candidate's similarity score is then increased when the
    query names its release year, its director, or one of its genres, and the
    best ``top_k`` movies are returned.

    Only movies returned by the semantic search are re-ranked: a film whose
    embedding is far from the query is never considered, even if its director
    or year matches. Raise ``candidate_multiplier`` to widen the pool.

    Args:
        query: Free-text query, e.g. ``"nolan action 2010"``.
        top_k: Maximum number of movies to return.
        candidate_multiplier: How many semantic candidates to fetch per
            requested result before re-ranking (values below 1 are treated
            as 1).

    Returns:
        A list of at most ``top_k`` movie dicts (entries of ``movies``), best
        match first. Empty when ``query`` is blank or ``top_k`` is not
        positive.
    """
    # Boost weights, relative to a cosine similarity that lies in [-1, 1].
    YEAR_BOOST = 4.0         # release year named in the query
    FULL_NAME_BOOST = 8.0    # two consecutive query words match the director
    SURNAME_BOOST = 6.0      # a single query word matches a director name part
    GENRE_BOOST = 0.8        # per genre named in the query and on the movie
    MIN_NAME_LEN = 4         # ignore short words for single-word name matching
    # Each tuple lists interchangeable spellings of one genre.
    GENRE_GROUPS = (
        ('action',), ('adventure',), ('comedy',), ('drama',), ('thriller',),
        ('sci-fi', 'science fiction'), ('horror',), ('animation',),
    )

    q = (query or '').lower().strip()
    if top_k <= 0 or not q:
        return []

    # Never ask the index for more rows than exist (or for k <= 0).
    n_candidates = min(top_k * max(candidate_multiplier, 1), len(movies))
    if n_candidates <= 0:
        return []

    # ---- Query features: computed once, not once per candidate ----
    query_tokens = re.findall(r'\w+', q)

    # 1. Year (1900-2100). Word boundaries let "2010," or "(2010)" match.
    year_match = re.search(r'\b(?:19[0-9]{2}|20[0-9]{2}|2100)\b', q)
    query_year = year_match.group() if year_match else None

    # 2. Name features: adjacent word pairs and individual longer words.
    query_bigrams = set(zip(query_tokens, query_tokens[1:]))
    name_tokens = {tok for tok in query_tokens if len(tok) >= MIN_NAME_LEN}

    # 3. Genres the user asked for, under any of their spellings.
    wanted_genres = [group for group in GENRE_GROUPS
                     if any(alias in q for alias in group)]

    # ---- Semantic candidates ----
    q_vec = np.asarray(model.encode([query], normalize_embeddings=True),
                       dtype=np.float32)
    D, I = index.search(q_vec, n_candidates)

    results = []
    for score, idx in zip(D[0], I[0]):
        if idx == -1:            # padding label when fewer rows are available
            continue
        movie = movies[int(idx)]

        boost = float(score)     # cosine similarity of query and movie text

        # `or ''` also covers fields stored as JSON null.
        director = (movie.get('Director') or '').lower()
        genres = (movie.get('Genres') or '').lower()
        movie_year = (movie.get('Release_Date') or '')[:4]

        # Exact release-year match.
        if query_year is not None and movie_year == query_year:
            boost += YEAR_BOOST

        # Director match on whole words, so "wood" does not hit "Eastwood".
        director_tokens = re.findall(r'\w+', director)
        director_bigrams = set(zip(director_tokens, director_tokens[1:]))
        if query_bigrams & director_bigrams:
            boost += FULL_NAME_BOOST          # e.g. "joss whedon"
        elif name_tokens.intersection(director_tokens):
            boost += SURNAME_BOOST            # e.g. "nolan"

        # Genre match: one hit per requested genre present on the movie.
        genre_hits = sum(
            1 for group in wanted_genres
            if any(alias in genres for alias in group)
        )
        boost += genre_hits * GENRE_BOOST

        results.append((boost, movie))

    # Highest boosted score first; ties keep semantic-search order.
    results.sort(key=lambda pair: pair[0], reverse=True)
    return [movie for _, movie in results[:top_k]]




Loaded 3500 movies


Batches: 100%|██████████| 110/110 [00:14<00:00,  7.64it/s]


In [23]:
# ==========================
# Smoke test: run a couple of sample queries and print the top 5 hits
# ==========================
queries = [
    "nolan action 2010",
    "Joss Whedon action "
]

for query_text in queries:
    print(f"\n=== Query: '{query_text}' ===")
    results = smart_search(query_text, top_k=5)
    # One line per hit: rank, title, release year, director, genres.
    for rank, movie in enumerate(results, start=1):
        title = movie['Title']
        year = movie['Release_Date'][:4]
        director = movie['Director']
        genres = movie['Genres']
        print(f"{rank}. {title} ({year}) - {director} - {genres}")


=== Query: 'nolan action 2010' ===
1. Inception (2010) - Christopher Nolan - Action, Science Fiction, Adventure
2. The Dark Knight (2008) - Christopher Nolan - Drama, Action, Crime, Thriller
3. Batman Begins (2005) - Christopher Nolan - Action, Crime, Drama
4. Interstellar (2014) - Christopher Nolan - Adventure, Drama, Science Fiction
5. The Prestige (2006) - Christopher Nolan - Drama, Mystery, Science Fiction

=== Query: 'Joss Whedon action ' ===
1. The Avengers (2012) - Joss Whedon - Science Fiction, Action, Adventure
2. Serenity (2005) - Joss Whedon - Science Fiction, Action, Adventure, Thriller
3. eXistenZ (1999) - David Cronenberg - Action, Thriller, Science Fiction
4. MacGruber (2010) - Jorma Taccone - Action, Comedy
5. The Three Musketeers (2011) - Paul W.S. Anderson - Adventure, Action, Thriller
