In [2]:
import json, faiss, sqlite3, numpy as np
from sentence_transformers import SentenceTransformer
# Build the FAISS vector index and the SQLite metadata table from the processed
# chunks. Position i in `chunks` is the shared key: it is the FAISS id of the
# i-th embedding and the `id` primary key of the i-th row in `meta`.

# Context manager closes the file handle; explicit encoding avoids relying on
# the platform default when reading the JSON.
with open("../data/processed/all_chunks.json", encoding="utf-8") as fh:
    chunks = json.load(fh)
if not chunks:
    # Fail early with a clear message instead of an opaque shape error below
    # (and before the metadata table is cleared).
    raise ValueError("all_chunks.json contains no chunks; nothing to index")
model  = SentenceTransformer("all-MiniLM-L12-v2")

# Only the first 512 characters of each chunk are embedded; the full text is
# still stored in SQLite below.
txts  = [c["text"][:512] for c in chunks]
# Embeddings are requested normalized, so the inner-product index built below
# ranks by cosine similarity.
embs  = model.encode(txts, normalize_embeddings=True)
ids   = np.arange(len(txts)).astype("int64")

# Flat inner-product index wrapped in an id map so search returns our own ids.
index = faiss.IndexFlatIP(embs.shape[1])
index = faiss.IndexIDMap2(index)
index.add_with_ids(embs, ids)
faiss.write_index(index, "../data/faiss.index")

# `con` is intentionally left open: it is a cell-level name that other cells
# may rely on.
con = sqlite3.connect("../data/meta.db")
# `with con` commits on success and rolls back on error, so the table is never
# left half-updated.
with con:
    con.execute("""CREATE TABLE IF NOT EXISTS meta
                   (id INTEGER PRIMARY KEY, chunk_id TEXT, source TEXT,
                    doc_path TEXT, loc TEXT, text TEXT)""")
    # The FAISS file is rewritten from scratch on every run, so the metadata
    # must be too: without this, rows left over from an earlier run with more
    # chunks would survive INSERT OR REPLACE and no longer match any vector.
    con.execute("DELETE FROM meta")
    con.executemany("INSERT OR REPLACE INTO meta VALUES (?,?,?,?,?,?)",
                    [(i,c["chunk_id"],c["source"],c["doc_path"],json.dumps(c["loc"]),c["text"])
                     for i,c in enumerate(chunks)])


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import faiss, sqlite3, json, os, numpy as np
# Confirm that the index file and the metadata database are present on disk.
print("FAISS file  exists:", os.path.exists("../data/faiss.index"))
print("Meta DB size:", os.path.getsize("../data/meta.db")//1024, "KB")

# quick sanity search
index = faiss.read_index("../data/faiss.index")
# Take the probe dimension from the loaded index rather than hard-coding it, so
# the check keeps working if the embedding model changes. A fixed seed makes
# the printed ids reproducible across Restart & Run All.
rng   = np.random.default_rng(0)
vec   = rng.standard_normal(index.d).astype("float32")   # dummy vector
# search expects a 2-D (n_queries, dim) float32 array; D holds scores, I the ids.
D,I   = index.search(vec.reshape(1,-1), 5)
print("Random search returns IDs:", I)


FAISS file  exists: True
Meta DB size: 216 KB
Random search returns IDs: [[25 92 91  5  4]]
