In [2]:
# 1. Install once
# pip install sentence-transformers faiss-cpu tqdm
# (use faiss-gpu if you have a good GPU)

import os
import json
import glob
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pickle

# -----------------------------
# CONFIGURATION
# -----------------------------
FOLDER = "../data/Docs"           # ← your folder with .json files
INDEX_FILE = "data/movie_index.faiss"
METADATA_FILE = "data/movie_metadata.pkl"

# -----------------------------
# 1. Load all movies
# -----------------------------
def load_movies():
    files = glob.glob(os.path.join(FOLDER, "*.json"))
    movies = []
    for f in tqdm(files, desc="Loading JSONs"):
        with open(f, "r", encoding="utf8") as jf:
            data = json.load(jf)
            movies.append(data)
    return movies, files

movies, file_paths = load_movies()
print(f"Loaded {len(movies)} movies")

# -----------------------------
# 2. Create clean searchable text
# -----------------------------
def make_text(movie):
    parts = [
        movie.get("Title", ""),
        movie.get("Overview", ""),
        movie.get("Tagline", ""),
        movie.get("Director", ""),
        movie.get("Cast", ""),
        movie.get("Genres", ""),
        str(movie.get("Release_Date", "")[:4])
    ]
    return " | ".join([p.strip() for p in parts if p])

texts = [make_text(m) for m in movies]

# -----------------------------
# 3. Load embedding model (small & fast or big & accurate)
# -----------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")        # Super fast (5-10k movies in <10 sec)
# model = SentenceTransformer("all-mpnet-base-v2")     # More accurate (slower)
# model = SentenceTransformer("BAAI/bge-small-en-v1.5") # Best speed/quality 2025

print("Generating embeddings...")
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True)
embeddings = np.array(embeddings).astype('float32')

# Normalize for cosine similarity
faiss.normalize_L2(embeddings)

# -----------------------------
# 4. Build FAISS index
# -----------------------------
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)   # Inner Product = cosine after normalization
index.add(embeddings)
print(f"FAISS index ready with {index.ntotal} movies")

# -----------------------------
# 5. Save everything
# -----------------------------
faiss.write_index(index, INDEX_FILE)
with open(METADATA_FILE, "wb") as f:
    pickle.dump({
        "movies": movies,
        "file_paths": file_paths,
        "texts": texts
    }, f)

print("Index saved! Ready for search.")

Loading JSONs: 100%|██████████| 3500/3500 [00:00<00:00, 13360.06it/s]


Loaded 3500 movies
Generating embeddings...


Batches: 100%|██████████| 110/110 [00:09<00:00, 11.34it/s]

FAISS index ready with 3500 movies





RuntimeError: Error in faiss::FileIOWriter::FileIOWriter(const char *) at /Users/runner/work/faiss-wheels/faiss-wheels/faiss/faiss/impl/io.cpp:102: Error: 'f' failed: could not open data/movie_index.faiss for writing: No such file or directory