In [None]:
import pandas as pd
#takes raw text and converts into format that langchain can understand
from langchain_community.document_loaders import TextLoader
#split the document into meaningful chunks
from langchain_text_splitters import CharacterTextSplitter
#converting chunks into document embeddings
from langchain_openai import OpenAIEmbeddings
#vectorDB to store embeddings
from langchain_chroma import Chroma

In [None]:
recipes = pd.read_csv("final_recipes_data.csv")

In [7]:
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
recipes.columns

Index(['id', 'name', 'ingredients', 'steps', 'servings', 'ingredients_cleaned',
       'tags_merged'],
      dtype='object')

In [6]:
import ast

def safe_eval(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # try to "fix" some broken escape chars
            x = x.replace('\\"', '"').replace("\\'", "'").replace("\\", "")
            try:
                return ast.literal_eval(x)
            except Exception:
                return [x]  # fallback: wrap raw string in list
    return x

recipes["ingredients"] = recipes["ingredients"].apply(safe_eval)


In [7]:
import ast

def safe_literal_eval(x):
    if isinstance(x, str):  # only try if it's a string
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return x  # if still broken, return as-is
    return x  # already list (or other type)

recipes["ingredients"] = recipes["ingredients"].apply(safe_literal_eval)
recipes["ingredients_cleaned"] = recipes["ingredients_cleaned"].apply(safe_literal_eval)
recipes["steps"] = recipes["steps"].apply(safe_literal_eval)
recipes["tags_merged"] = recipes["tags_merged"].apply(
    lambda x: [tag.strip() for tag in x.split(",")] if isinstance(x, str) else x
)

In [8]:
recipes["tags_merged"].dtype

dtype('O')

In [9]:
recipes["embedding_text"] = recipes.apply(
    lambda row: (
        f"ID: {row['id']} | "
        f"Name: {row['name']} | "
        f"Ingredients: {' | '.join(row['ingredients']) if isinstance(row['ingredients'], list) else row['ingredients']} | "
        f"Tags: {' | '.join(row['tags_merged']) if isinstance(row['tags_merged'], list) else row['tags_merged']} | "
        f"Servings: {int(row['servings'])}"
    ),
    axis=1
)

In [10]:
recipes["embedding_text"].to_csv("embedding_text.txt", index=False, header=False, sep='\n')

In [11]:
recipes["embedding_text"]

0         ID: 71247 | Name: Cherry Streusel Cobbler | In...
1         ID: 76133 | Name: Reuben and Swiss Casserole B...
2         ID: 503816 | Name: Yam-Pecan Recipe | Ingredie...
3         ID: 418749 | Name: Tropical  Orange Layer Cake...
4         ID: 392934 | Name: Safe to Eat Raw Chocolate C...
                                ...                        
491588    ID: 173790 | Name: Sausage Meatloaf OAMC | Ing...
491589    ID: 301838 | Name: Potato Salad With Olives Ca...
491590    ID: 130682 | Name: Chocolate Banana Pound Cake...
491591    ID: 353659 | Name: Cheesy Ground Beef and Rice...
491592    ID: 510848 | Name: Chicken and Dumplings &ndas...
Name: embedding_text, Length: 491593, dtype: object

In [12]:
from langchain.schema import Document

# load file directly
with open("embedding_text.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# convert each line into a Document
documents = [
    Document(page_content=line.strip())
    for line in lines if line.strip()
]

print(f"Created {len(documents)} documents")
print("Example:", documents[0].page_content[:200])

Created 491593 documents
Example: ID: 71247 | Name: Cherry Streusel Cobbler | Ingredients: cherry pie filling | condensed milk | melted margarine | cinnamon | nutmeg | light brown sugar | flour | margarine | chopped nuts | oats | butt


In [8]:
from langchain.text_splitter import CharacterTextSplitter

raw = TextLoader("embedding_text.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(
    chunk_size=1000,  # instead of 1
    chunk_overlap=0,
    separator="\n"
)
documents = text_splitter.split_documents(raw)
print(len(documents))  # should be ~10–20k instead of 491k



Created a chunk of size 1033, which is longer than the specified 1000
Created a chunk of size 1044, which is longer than the specified 1000
Created a chunk of size 1097, which is longer than the specified 1000
Created a chunk of size 1003, which is longer than the specified 1000
Created a chunk of size 1118, which is longer than the specified 1000
Created a chunk of size 1113, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of size 1010, which is longer than the specified 1000
Created a chunk of size 1009, which is longer than the specified 1000
Created a chunk of size 1008, which is longer than the specified 1000
Created a chunk of size 1111, which is longer than the specified 1000
Created a chunk of size 1121, which is longer than the specified 1000
Created a chunk of size 1017, which is longer than the specified 1000


212716


In [11]:
from langchain_core.documents import Document
import re

new_documents = []

for doc in documents:
    # Split by newline followed by ID:
    recipes = re.split(r'\n(?=ID:)', doc.page_content)
    for recipe in recipes:
        new_documents.append(Document(page_content=recipe, metadata=doc.metadata))

print(len(new_documents))
print(new_documents[1].page_content)
print(new_documents[2].page_content)  # preview first recipe


491407
ID: 76133 | Name: Reuben and Swiss Casserole Bake | Ingredients: corned beef chopped | sauerkraut cold water | swiss cheese shredded | rye bread | butter | Tags: 60-minutes-or-less | casseroles | cheese | eggs-dairy | main-dish | oven | Servings: 4
ID: 503816 | Name: Yam-Pecan Recipe | Ingredients: unsalted butter | vegetable oil | all - purpose flour | baking soda | chopped pecans | mashed canned louisiana yams juice glaze | unsalted butter | juice yam | Tags: 3-steps-or-less | 4-hours-or-less | american | breads | cajun | comfort-food | easy | gifts | holiday-event | north-american | nuts | quick-breads | southern-united-states | Servings: 8


In [13]:
print(new_documents[0].page_content)
print(new_documents[1].page_content)
print(new_documents[2].page_content)

ID: 71247 | Name: Cherry Streusel Cobbler | Ingredients: cherry pie filling | condensed milk | melted margarine | cinnamon | nutmeg | light brown sugar | flour | margarine | chopped nuts | oats | butter - flavored cooking spray | Tags: 60-minutes-or-less | cherries | cobblers-and-crisps | desserts | fruit | north-american | oven | pitted-fruit | Servings: 6
ID: 76133 | Name: Reuben and Swiss Casserole Bake | Ingredients: corned beef chopped | sauerkraut cold water | swiss cheese shredded | rye bread | butter | Tags: 60-minutes-or-less | casseroles | cheese | eggs-dairy | main-dish | oven | Servings: 4
ID: 503816 | Name: Yam-Pecan Recipe | Ingredients: unsalted butter | vegetable oil | all - purpose flour | baking soda | chopped pecans | mashed canned louisiana yams juice glaze | unsalted butter | juice yam | Tags: 3-steps-or-less | 4-hours-or-less | american | breads | cajun | comfort-food | easy | gifts | holiday-event | north-american | nuts | quick-breads | southern-united-states | 

In [14]:
import pickle

with open("all_recipes.pkl", "wb") as f:
    pickle.dump(new_documents, f)

In [13]:
import voyageai
import time
import os

# 1️⃣ Load your recipes from text file
with open("embedding_text.txt", "r", encoding="utf-8") as f:
    recipes = [line.strip() for line in f if line.strip()]

print(f"Loaded {len(recipes)} recipes for embedding.")

# 2️⃣ Initialize Voyage client
vo = voyageai.Client()  # make sure VOYAGE_API_KEY is set

# 3️⃣ Parameters
batch_size = 100            # safe batch size under free limits
output_dir = "embeddings_batches"
os.makedirs(output_dir, exist_ok=True)

# 4️⃣ Embed in batches and save each batch to disk
for i in range(0, len(recipes), batch_size):
    batch = recipes[i:i + batch_size]
    try:
        result = vo.embed(batch, model="voyage-3.5", input_type="document")
        
        # Save this batch to disk
        batch_filename = os.path.join(output_dir, f"embeddings_{i}_{i+len(batch)}.pkl")
        with open(batch_filename, "wb") as f:
            import pickle
            pickle.dump(result.embeddings, f)
        
        print(f"✅ Batch {i} → {i + len(batch)} done, saved to {batch_filename}")
        
        time.sleep(1)  # small pause to avoid rate limits
    except Exception as e:
        print(f"❌ Error at batch {i} → {i + len(batch)}: {e}")
        time.sleep(5)

print("✅ Finished embedding all recipes.")


Loaded 491593 recipes for embedding.
✅ Batch 0 → 100 done, saved to embeddings_batches\embeddings_0_100.pkl
✅ Batch 100 → 200 done, saved to embeddings_batches\embeddings_100_200.pkl
✅ Batch 200 → 300 done, saved to embeddings_batches\embeddings_200_300.pkl
✅ Batch 300 → 400 done, saved to embeddings_batches\embeddings_300_400.pkl
✅ Batch 400 → 500 done, saved to embeddings_batches\embeddings_400_500.pkl
✅ Batch 500 → 600 done, saved to embeddings_batches\embeddings_500_600.pkl
✅ Batch 600 → 700 done, saved to embeddings_batches\embeddings_600_700.pkl
✅ Batch 700 → 800 done, saved to embeddings_batches\embeddings_700_800.pkl
✅ Batch 800 → 900 done, saved to embeddings_batches\embeddings_800_900.pkl
✅ Batch 900 → 1000 done, saved to embeddings_batches\embeddings_900_1000.pkl
✅ Batch 1000 → 1100 done, saved to embeddings_batches\embeddings_1000_1100.pkl
✅ Batch 1100 → 1200 done, saved to embeddings_batches\embeddings_1100_1200.pkl
✅ Batch 1200 → 1300 done, saved to embeddings_batches\emb

In [None]:
import pickle
from langchain_community.vectorstores import Chroma
import os
import chromadb

persist_directory = "chroma_recipes"
os.makedirs(persist_directory, exist_ok=True)

# Initialize persistent Chroma client
client = chromadb.PersistentClient(path=persist_directory)
collection = client.create_collection("recipes", embedding_function=None)

# Load batch files
batch_files = sorted(os.listdir("embeddings_batches"))

for batch_idx, file in enumerate(batch_files):
    if not file.endswith(".pkl"):
        continue
    with open(os.path.join("embeddings_batches", file), "rb") as f:
        embeddings = pickle.load(f)  # list of embeddings

    # Create fake IDs and dummy texts (if you want to store texts too, load them from embedding_text.txt)
    batch_size = len(embeddings)
    texts = [f"recipe_{batch_idx}_{i}" for i in range(batch_size)]
    ids = [f"{batch_idx}_{i}" for i in range(batch_size)]

    # Add embeddings
    collection.add(
        documents=texts,
        embeddings=embeddings,
        ids=ids
    )

    print(f"✅ Batch {batch_idx} added to Chroma")

print("✅ All embeddings inserted into Chroma!")


✅ Batch 0 added to Chroma
✅ Batch 1 added to Chroma
✅ Batch 2 added to Chroma
✅ Batch 3 added to Chroma
✅ Batch 4 added to Chroma
✅ Batch 5 added to Chroma
✅ Batch 6 added to Chroma
✅ Batch 7 added to Chroma
✅ Batch 8 added to Chroma
✅ Batch 9 added to Chroma
✅ Batch 10 added to Chroma
✅ Batch 11 added to Chroma
✅ Batch 12 added to Chroma
✅ Batch 13 added to Chroma
✅ Batch 14 added to Chroma
✅ Batch 15 added to Chroma
✅ Batch 16 added to Chroma
✅ Batch 17 added to Chroma
✅ Batch 18 added to Chroma
✅ Batch 19 added to Chroma
✅ Batch 20 added to Chroma
✅ Batch 21 added to Chroma
✅ Batch 22 added to Chroma
✅ Batch 23 added to Chroma
✅ Batch 24 added to Chroma
✅ Batch 25 added to Chroma
✅ Batch 26 added to Chroma
✅ Batch 27 added to Chroma
✅ Batch 28 added to Chroma
✅ Batch 29 added to Chroma
✅ Batch 30 added to Chroma
✅ Batch 31 added to Chroma
✅ Batch 32 added to Chroma
✅ Batch 33 added to Chroma
✅ Batch 34 added to Chroma
✅ Batch 35 added to Chroma
✅ Batch 36 added to Chroma
✅ Batch 37 