In [1]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Fail fast when the key is absent or empty, so later cells do not start
# without credentials.
api_key = os.getenv("NOMIC_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("NOMIC_API_KEY not found. Check your .env file.")


In [2]:
# Phase 2 — Build embeddings with the Nomic API (no ChromaDB index is created in this cell)

import os
import json
from nomic import embed
from pathlib import Path

# Input corpus location (relative to the working directory).
DATA = Path("data")
CHUNKS_PATH = DATA / "chunked_data.jsonl"
DB_DIR = Path("chroma_db")  # defined here but not used anywhere in this cell

if not CHUNKS_PATH.exists():
    raise FileNotFoundError("chunked_data.jsonl not found in /data folder")

# Load chunks: one JSON object per non-blank line (JSONL).
with CHUNKS_PATH.open("r", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

print("Loaded", len(chunks), "chunks.")

# Metadata fields copied from each chunk record; absent fields become None.
METADATA_KEYS = (
    "chunk_id",
    "chunk_type",
    "act",
    "scene",
    "speaker",
    "is_soliloquy",
    "start_page",
    "end_page",
)

# Parallel lists: position i in ids/texts/metas describes the same chunk.
# chunk_id stays a hard requirement (KeyError if missing).
ids = [str(c["chunk_id"]) for c in chunks]
# A null or missing "text" becomes "" instead of raising, matching how the
# FAISS indexing cell reads the same file.
texts = [(c.get("text") or "").strip() for c in chunks]
metas = [{key: c.get(key) for key in METADATA_KEYS} for c in chunks]

# Generate embeddings in fixed-size batches, one embed.text call per batch.
BATCH_SIZE = 64
embeddings = []

for i in range(0, len(texts), BATCH_SIZE):
    batch = texts[i:i + BATCH_SIZE]
    resp = embed.text(
        texts=batch,
        model="nomic-embed-text-v1.5"
    )
    embeddings.extend(resp["embeddings"])

# The lists above are only useful while they stay aligned one-to-one.
if len(embeddings) != len(texts):
    raise RuntimeError(
        f"Embedding count mismatch: got {len(embeddings)} vectors for {len(texts)} texts"
    )

print("Generated embeddings:", len(embeddings))


Loaded 871 chunks.
Generated embeddings: 871


In [9]:
# ============================================
# Phase 2 — FAISS Indexing using Nomic Embeddings
# Clean, stable, no Chroma dependency issues
# ============================================

from dotenv import load_dotenv
# Read .env into the environment before NOMIC_API_KEY is checked further down.
load_dotenv()

import os
import json
from pathlib import Path

from nomic import embed
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS


# -----------------------------
# Paths
# -----------------------------
# Input corpus and output index locations (relative to the working directory).
DATA = Path("data")
CHUNKS_PATH = DATA.joinpath("chunked_data.jsonl")
FAISS_DIR = "faiss_index"

# Precondition 1: the chunk file must be present.
if not CHUNKS_PATH.exists():
    raise FileNotFoundError("chunked_data.jsonl not found")

# Precondition 2: either the NOMIC_API_KEY variable is set (non-empty) or a
# ~/.nomic entry exists; the home directory is only probed when the variable
# is missing.
if not os.environ.get("NOMIC_API_KEY") and not (Path.home() / ".nomic").exists():
    raise ValueError("Nomic API not configured")


# ============================
# Custom Nomic Embedding Adapter
# ============================
class NomicEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to ``nomic.embed.text``.

    Corpus texts and search queries are embedded with different Nomic task
    types (``search_document`` and ``search_query`` respectively), which is
    the pairing Nomic documents for retrieval with its embedding models.

    Args:
        model: Model name forwarded to ``embed.text``.
        batch_size: Maximum number of texts sent per ``embed.text`` call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """

    def __init__(self, model="nomic-embed-text-v1.5", batch_size=64):
        # A non-positive batch size would either break range() or silently
        # produce no embeddings, so reject it up front.
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.model = model
        self.batch_size = batch_size

    def embed_documents(self, texts):
        """Embed corpus texts in batches.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one embedding per input text, in input order.
            An empty input yields an empty list without calling the API.
        """
        texts = list(texts)  # allow any iterable, not only sliceable sequences
        embeddings = []
        for start in range(0, len(texts), self.batch_size):
            batch = texts[start:start + self.batch_size]
            resp = embed.text(
                texts=batch,
                model=self.model,
                task_type="search_document",
            )
            embeddings.extend(resp["embeddings"])
        return embeddings

    def embed_query(self, text):
        """Embed a single search query.

        Args:
            text: The query string.

        Returns:
            The embedding for ``text``.
        """
        # Queries use the query task type so they are matched against
        # document-type vectors as the model expects.
        resp = embed.text(
            texts=[text],
            model=self.model,
            task_type="search_query",
        )
        return resp["embeddings"][0]


# ============================
# Load Chunks (JSONL)
# ============================
# Parallel lists: position i in each one describes the same chunk.
texts, metas, ids = [], [], []

# Fields copied from every record into its metadata dict; a field that is
# absent from the record is stored as None.
META_FIELDS = (
    "chunk_id",
    "chunk_type",
    "act",
    "scene",
    "speaker",
    "is_soliloquy",
    "start_page",
    "end_page",
)

with CHUNKS_PATH.open("r", encoding="utf-8") as chunk_file:
    for raw_line in chunk_file:
        # Only non-blank lines carry a JSON record.
        if raw_line.strip():
            rec = json.loads(raw_line)
            ids.append(str(rec.get("chunk_id")))
            # A null or missing "text" is treated as an empty string.
            body = rec.get("text") or ""
            texts.append(body.strip())
            metas.append({field: rec.get(field) for field in META_FIELDS})

print("Loaded chunks:", len(texts))


# ============================
# Create FAISS Vector Store
# ============================
embedding_fn = NomicEmbeddings()

# Fail early with a clear message; an empty corpus would otherwise surface as
# an opaque error from inside the vector store.
if not texts:
    raise ValueError("No chunks loaded; cannot build a FAISS index from an empty corpus.")

# Use the chunk ids collected while loading as document ids, so stored
# documents can be addressed by chunk_id. Fall back to auto-generated ids when
# the collected ids are not unique (e.g. records without a chunk_id).
doc_ids = ids if len(set(ids)) == len(ids) else None

faiss_store = FAISS.from_texts(
    texts=texts,
    embedding=embedding_fn,
    metadatas=metas,
    ids=doc_ids,
)

# Save FAISS index to disk
faiss_store.save_local(FAISS_DIR)

print("FAISS index created and saved at:", FAISS_DIR)


Loaded chunks: 871
FAISS index created and saved at: faiss_index
