In [6]:
!pip install -qU langchain-voyageai

In [7]:
!pip install -qU langchain-community faiss-cpu

In [1]:
import voyageai
import faiss
import numpy as np
import pandas as pd
from langchain_core.documents import Document
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Shared Voyage AI client. Later cells call `vo.embed(...)` on it directly,
# both for the recipe texts and for the search query.
# NOTE(review): no API key is passed here, so the client presumably reads it
# from the environment — confirm it is set before running.
vo = voyageai.Client()

# Unused alternative kept for reference: the LangChain embeddings wrapper.
# Enabling it would also require importing VoyageAIEmbeddings, which the
# import list above does not do.
# embeddings = VoyageAIEmbeddings(model="voyage-3.5")

In [7]:
# Read recipes
#
# As implied by the parsing below, every non-blank line of embedding_text.txt
# is "|"-separated: the first field carries an "ID:" label, the second a
# "Name:" label, and any remaining fields are free text. The ID and name go
# into the document metadata; the name plus the remaining fields form the
# text that is embedded later.


def _strip_label(field, label):
    """Return `field` without a leading `label`, trimmed of whitespace.

    Only a prefix is removed, so a label-like substring that appears later in
    the value (e.g. "Name:" inside a recipe title) is left untouched.
    """
    field = field.strip()
    if field.startswith(label):
        field = field[len(label):]
    return field.strip()


documents = []
with open("embedding_text.txt", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue

        parts = [p.strip() for p in line.split("|")]
        if len(parts) < 2:
            # Fail with the offending line instead of an opaque IndexError.
            raise ValueError(
                f"embedding_text.txt line {line_no}: expected "
                f"'ID: ... | Name: ... | ...', got {line.strip()!r}"
            )

        recipe_id = _strip_label(parts[0], "ID:")
        name = _strip_label(parts[1], "Name:")

        page_content = f"Name: {name}. " + " ".join(parts[2:])
        metadata = {"recipe_id": recipe_id, "name": name}

        documents.append(Document(page_content=page_content, metadata=metadata))

print(f"Loaded {len(documents)} recipes")

# Ask the model for one throwaway embedding to learn the vector width.
probe_response = vo.embed(["hello"], model="voyage-3.5")
dim = len(probe_response.embeddings[0])

# Empty flat L2 index of that width, wrapped in a LangChain FAISS store.
# No embedding function is attached: vectors are computed with `vo` and
# pushed into the index by hand.
index = faiss.IndexFlatL2(dim)
vector_store = FAISS(
    index=index,
    embedding_function=None,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

Loaded 491593 recipes


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [8]:
# Embed every recipe that is not yet in the vector store, batch by batch,
# and checkpoint the store to disk along the way.
#
# Invariant maintained here: the vector stored at FAISS position p belongs to
# the document whose id is vector_store.index_to_docstore_id[p]. To keep it,
# only the documents that are actually new are embedded, and vectors, docstore
# entries and position mappings are always added together.

# IDs that already have a vector in the store (empty for a fresh store).
already_done = set(vector_store.index_to_docstore_id.values())
print(f"Already in index: {len(already_done)} docs")

batch_size = 100
for i in range(0, len(documents), batch_size):
    # Keep only recipes that still need embedding. An ID repeated inside the
    # batch is taken once (first occurrence) so the number of vectors matches
    # the number of docstore entries.
    new_docs = {}
    for doc in documents[i:i + batch_size]:
        _id = doc.metadata["recipe_id"]
        if _id not in already_done and _id not in new_docs:
            new_docs[_id] = doc

    # Skip if this batch is already embedded
    if not new_docs:
        continue

    texts = [doc.page_content for doc in new_docs.values()]

    # Direct Voyage embed
    response = vo.embed(texts, model="voyage-3.5")

    # Convert to numpy array (FAISS expects float32)
    embeddings_array = np.array(response.embeddings, dtype="float32")
    if embeddings_array.shape[0] != len(new_docs):
        raise RuntimeError(
            f"Expected {len(new_docs)} embeddings for batch starting at {i}, "
            f"got {embeddings_array.shape[0]}"
        )

    # Positions the new vectors will occupy: FAISS appends at ntotal.
    start_pos = vector_store.index.ntotal

    # Add vectors to FAISS
    vector_store.index.add(embeddings_array)

    # Add docs to InMemoryDocstore
    vector_store.docstore.add(new_docs)

    # Map FAISS index positions → IDs (same order as `texts`)
    for offset, _id in enumerate(new_docs):
        vector_store.index_to_docstore_id[start_pos + offset] = _id

    # Remember these IDs so a repeat in a later batch is not embedded again.
    already_done.update(new_docs)

    # Progress + checkpoint
    if i % 1000 == 0:
        print(f"✅ Done {i}/{len(documents)}")
        vector_store.save_local("faiss_index")

# Final save
vector_store.save_local("faiss_index")
print("🎉 Resume complete, all new recipes embedded & saved")


✅ Done 0/491593
✅ Done 1000/491593
✅ Done 2000/491593
✅ Done 3000/491593
✅ Done 4000/491593
✅ Done 5000/491593
✅ Done 6000/491593
✅ Done 7000/491593
✅ Done 8000/491593
✅ Done 9000/491593
✅ Done 10000/491593
✅ Done 11000/491593
✅ Done 12000/491593
✅ Done 13000/491593
✅ Done 14000/491593
✅ Done 15000/491593
✅ Done 16000/491593
✅ Done 17000/491593
✅ Done 18000/491593
✅ Done 19000/491593
✅ Done 20000/491593
✅ Done 21000/491593
✅ Done 22000/491593
✅ Done 23000/491593
✅ Done 24000/491593
✅ Done 25000/491593
✅ Done 26000/491593
✅ Done 27000/491593
✅ Done 28000/491593
✅ Done 29000/491593
✅ Done 30000/491593
✅ Done 31000/491593
✅ Done 32000/491593
✅ Done 33000/491593
✅ Done 34000/491593
✅ Done 35000/491593
✅ Done 36000/491593
✅ Done 37000/491593
✅ Done 38000/491593
✅ Done 39000/491593
✅ Done 40000/491593
✅ Done 41000/491593
✅ Done 42000/491593
✅ Done 43000/491593
✅ Done 44000/491593
✅ Done 45000/491593
✅ Done 46000/491593
✅ Done 47000/491593
✅ Done 48000/491593
✅ Done 49000/491593
✅ Done 50000/

In [12]:
# Reload the store that the embedding cell saved under "faiss_index".
# No embeddings object is supplied because queries are embedded by hand below.
# NOTE: allow_dangerous_deserialization=True opts in to unpickling data from
# that folder — only load an index folder you created yourself.
vector_store = FAISS.load_local(
    folder_path="faiss_index",
    embeddings=None,
    allow_dangerous_deserialization=True,
)

# Turn the search phrase into a vector with the same model used for recipes.
query = "italian diabetic friendly chicken"
query_response = vo.embed([query], model="voyage-3.5")
query_vec = query_response.embeddings

# Five nearest recipes to the query vector.
results = vector_store.similarity_search_by_vector(embedding=query_vec[0], k=5)

for hit in results:
    meta = hit.metadata
    print(meta["recipe_id"], meta["name"])


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


413213 Healthy Chicken Cacciatore
172094 Zesty Chicken Italiano
454992 Chicken Scampi (Diabetic)
390367 Italian Chicken With Tomatoes and Mushrooms
330631 Divine Italian Chicken Salad
