In [3]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# Step 1: Load the embedding model (requires the sentence-transformers package)
model = SentenceTransformer('all-MiniLM-L6-v2')  # A lightweight model for semantic embeddings

# Step 2: Specify the folder containing your JSON files
folder_path = '../data/Docs'  # Replace with your actual folder path

# Step 3: Load all JSON files and extract clean_text for embedding
movies = []  # Full movie records; movies[i] corresponds to row i added to the index
texts = []   # Texts to embed, kept parallel to `movies`

# os.listdir() returns names in arbitrary order. Sorting makes the row order
# (and therefore the ids stored in the saved index file) reproducible between
# runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    file_path = os.path.join(folder_path, filename)
    # Only parsing needs the file handle; release it before touching the lists.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    movies.append(data)  # Store full movie info
    # `or ''` also covers a clean_text key that is present but null/empty,
    # so the encoder always receives a string for such records.
    texts.append(data.get('clean_text') or '')

print(f"Loaded {len(movies)} movies.")

# Without any documents there is nothing to embed and no embedding dimension
# to build an index from, so stop here with an explicit message.
if not texts:
    raise ValueError(f"No .json files found in {folder_path!r}; nothing to embed or index.")

# Step 4: Generate embeddings for all texts
embeddings = model.encode(texts, convert_to_numpy=True)
print(f"Generated embeddings with shape: {embeddings.shape}")

# Step 5: Create FAISS index (requires the faiss-cpu package)
dimension = embeddings.shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(dimension)  # L2 distance index (simple and effective for small datasets)
index.add(embeddings.astype(np.float32))  # The index is fed float32 vectors

# Optional: Save the index to disk for reuse.
# NOTE(review): only the vectors are persisted here; reloading the file is
# only meaningful together with the same `movies` ordering built above.
faiss.write_index(index, 'movie_index.faiss')

# Step 6: Function to perform semantic search
def search_movies(query, top_k=5):
    """Return the movies whose embedded text is closest to ``query``.

    The query string is encoded with the module-level ``model``, cast to
    float32 and looked up in the module-level ``index``.

    Args:
        query: Free-text search string.
        top_k: Number of neighbours requested from the index.

    Returns:
        A list of dicts, in the order the index returned them, each with:
          - 'movie': the record from ``movies`` at the returned position
          - 'distance': the distance value reported by the index for that hit
        Positions reported as -1 are skipped, so the list may hold fewer
        than ``top_k`` items.
    """
    # A one-element batch: the index search expects a 2-D array of queries.
    query_vector = model.encode([query], convert_to_numpy=True).astype(np.float32)

    hit_distances, hit_ids = index.search(query_vector, top_k)

    # Row 0 holds the results for the single query submitted above.
    return [
        {'movie': movies[movie_id], 'distance': dist}
        for movie_id, dist in zip(hit_ids[0], hit_distances[0])
        if movie_id != -1
    ]



Loaded 3500 movies.
Generated embeddings with shape: (3500, 384)


In [4]:
# Example usage: run one free-text query against the index built earlier
query = "nolan action 2010"
results = search_movies(query, top_k=5)

# Show each hit's metadata followed by its distance, one block per movie
for res in results:
    movie = res['movie']  # the full record loaded from the JSON file
    print(f"Title: {movie['Title']}")
    print(f"Overview: {movie['Overview']}")
    print(f"Director: {movie['Director']}")
    print(f"Genres: {movie['Genres']}")
    print(f"Release Date: {movie['Release_Date']}")
    print(f"Distance: {res['distance']}")
    print("---")  # separator between results

Title: MacGruber
Overview: Ex-special operative MacGruber is called back into action to take down his archenemy, Dieter Von Cunth, who's in possession of a nuclear warhead and bent on destroying Washington, DC.
Director: Jorma Taccone
Genres: Action, Comedy
Release Date: 2010-05-21
Distance: 0.9583710432052612
---
Title: Jason Goes to Hell: The Final Friday
Overview: Jason Voorhees is tracked down and blown to bits by a special FBI task force, reborn with the bone-chilling ability to assume the identity of anyone he touches.
Director: Adam Marcus
Genres: Horror
Release Date: 1993-08-13
Distance: 1.1253694295883179
---
Title: The Dark Knight
Overview: Batman raises the stakes in his war on crime. With the help of Lt. Jim Gordon and District Attorney Harvey Dent, Batman sets out to dismantle the remaining criminal organizations that plague the streets. The partnership proves to be effective, but they soon find themselves prey to a reign of chaos unleashed by a rising criminal mastermind 