In [1]:
import pandas as pd

In [2]:
recipes = pd.read_csv("final_recipes_data.csv")

In [3]:
from dotenv import load_dotenv

load_dotenv()

True

In [4]:
#takes raw text and converts into format that langchain can understand
from langchain_community.document_loaders import TextLoader
#split the document into meaningful chunks
from langchain_text_splitters import CharacterTextSplitter
#converting chunks into document embeddings
from langchain_openai import OpenAIEmbeddings
#vectorDB to store embeddings
from langchain_chroma import Chroma

In [5]:
recipes.columns

Index(['id', 'name', 'ingredients', 'steps', 'servings', 'ingredients_cleaned',
       'tags_merged'],
      dtype='object')

In [6]:
import ast

def safe_eval(x):
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # try to "fix" some broken escape chars
            x = x.replace('\\"', '"').replace("\\'", "'").replace("\\", "")
            try:
                return ast.literal_eval(x)
            except Exception:
                return [x]  # fallback: wrap raw string in list
    return x

recipes["ingredients"] = recipes["ingredients"].apply(safe_eval)


In [7]:
import ast

def safe_literal_eval(x):
    if isinstance(x, str):  # only try if it's a string
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return x  # if still broken, return as-is
    return x  # already list (or other type)

recipes["ingredients"] = recipes["ingredients"].apply(safe_literal_eval)
recipes["ingredients_cleaned"] = recipes["ingredients_cleaned"].apply(safe_literal_eval)
recipes["steps"] = recipes["steps"].apply(safe_literal_eval)
recipes["tags_merged"] = recipes["tags_merged"].apply(
    lambda x: [tag.strip() for tag in x.split(",")] if isinstance(x, str) else x
)

In [8]:
recipes["tags_merged"].dtype

dtype('O')

In [9]:
recipes["embedding_text"] = recipes.apply(
    lambda row: (
        f"ID: {row['id']} | "
        f"Name: {row['name']} | "
        f"Ingredients: {' | '.join(row['ingredients']) if isinstance(row['ingredients'], list) else row['ingredients']} | "
        f"Tags: {' | '.join(row['tags_merged']) if isinstance(row['tags_merged'], list) else row['tags_merged']} | "
        f"Servings: {int(row['servings'])}"
    ),
    axis=1
)

In [10]:
recipes["embedding_text"].to_csv("embedding_text.txt", index=False, header=False, sep='\n')

In [11]:
recipes["embedding_text"]

0         ID: 71247 | Name: Cherry Streusel Cobbler | In...
1         ID: 76133 | Name: Reuben and Swiss Casserole B...
2         ID: 503816 | Name: Yam-Pecan Recipe | Ingredie...
3         ID: 418749 | Name: Tropical  Orange Layer Cake...
4         ID: 392934 | Name: Safe to Eat Raw Chocolate C...
                                ...                        
491588    ID: 173790 | Name: Sausage Meatloaf OAMC | Ing...
491589    ID: 301838 | Name: Potato Salad With Olives Ca...
491590    ID: 130682 | Name: Chocolate Banana Pound Cake...
491591    ID: 353659 | Name: Cheesy Ground Beef and Rice...
491592    ID: 510848 | Name: Chicken and Dumplings &ndas...
Name: embedding_text, Length: 491593, dtype: object

In [12]:
from langchain.schema import Document

# load file directly
with open("embedding_text.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# convert each line into a Document
documents = [
    Document(page_content=line.strip())
    for line in lines if line.strip()
]

print(f"Created {len(documents)} documents")
print("Example:", documents[0].page_content[:200])

Created 491593 documents
Example: ID: 71247 | Name: Cherry Streusel Cobbler | Ingredients: cherry pie filling | condensed milk | melted margarine | cinnamon | nutmeg | light brown sugar | flour | margarine | chopped nuts | oats | butt


In [13]:
# from langchain_community.vectorstores import Chroma
# from langchain_openai import OpenAIEmbeddings
# from tqdm import tqdm

# embeddings = OpenAIEmbeddings()

# # init with persist_directory
# db = Chroma(persist_directory="recipes_db", embedding_function=embeddings)

# batch_size = 256
# for i in tqdm(range(0, len(documents), batch_size)):
#     batch = documents[i:i+batch_size]
#     db.add_documents(batch)
#     db.persist()  # works now 🚀


In [14]:
# from langchain.schema import Document

# documents = [
#     Document(
#         page_content=row["embedding_text"],  # the text to embed
#         metadata={"id": row["id"], "name": row["name"]}  # keep ID + Name for retrieval
#     )
#     for _, row in recipes.iterrows()
# ]

# # 2. Initialize embeddings (use your OpenAI key in env vars)
# embedding_function = OpenAIEmbeddings()
# # 
# # 3. Create Chroma DB
# db_recipes = Chroma.from_documents(
#     documents=documents,
#     embedding=embedding_function,
#     persist_directory="recipes_chroma"   # so you can reload later
# )

# # 4. Persist the DB (important if you want to reuse it later)
# db_recipes.persist()

In [15]:
from langchain.text_splitter import CharacterTextSplitter

raw = TextLoader("embedding_text.txt", encoding="utf-8").load()
text_splitter = CharacterTextSplitter(
    chunk_size=1000,  # instead of 1
    chunk_overlap=0,
    separator="\n"
)
documents = text_splitter.split_documents(raw)
print(len(documents))  # should be ~10–20k instead of 491k



Created a chunk of size 1033, which is longer than the specified 1000
Created a chunk of size 1044, which is longer than the specified 1000
Created a chunk of size 1097, which is longer than the specified 1000
Created a chunk of size 1003, which is longer than the specified 1000
Created a chunk of size 1118, which is longer than the specified 1000
Created a chunk of size 1113, which is longer than the specified 1000
Created a chunk of size 1025, which is longer than the specified 1000
Created a chunk of size 1010, which is longer than the specified 1000
Created a chunk of size 1009, which is longer than the specified 1000
Created a chunk of size 1008, which is longer than the specified 1000
Created a chunk of size 1111, which is longer than the specified 1000
Created a chunk of size 1121, which is longer than the specified 1000
Created a chunk of size 1017, which is longer than the specified 1000


212716


In [17]:
documents

[Document(metadata={'source': 'embedding_text.txt'}, page_content='ID: 71247 | Name: Cherry Streusel Cobbler | Ingredients: cherry pie filling | condensed milk | melted margarine | cinnamon | nutmeg | light brown sugar | flour | margarine | chopped nuts | oats | butter - flavored cooking spray | Tags: 60-minutes-or-less | cherries | cobblers-and-crisps | desserts | fruit | north-american | oven | pitted-fruit | Servings: 6\nID: 76133 | Name: Reuben and Swiss Casserole Bake | Ingredients: corned beef chopped | sauerkraut cold water | swiss cheese shredded | rye bread | butter | Tags: 60-minutes-or-less | casseroles | cheese | eggs-dairy | main-dish | oven | Servings: 4'),
 Document(metadata={'source': 'embedding_text.txt'}, page_content='ID: 503816 | Name: Yam-Pecan Recipe | Ingredients: unsalted butter | vegetable oil | all - purpose flour | baking soda | chopped pecans | mashed canned louisiana yams juice glaze | unsalted butter | juice yam | Tags: 3-steps-or-less | 4-hours-or-less | 

In [None]:
from langchain_core.documents import Document
import re

new_documents = []

for doc in documents:
    # Split by newline followed by ID:
    recipes = re.split(r'\n(?=ID:)', doc.page_content)
    for recipe in recipes:
        new_documents.append(Document(page_content=recipe, metadata=doc.metadata))

print(len(new_documents))
print(new_documents[1].page_content)  # preview first recipe


491407
ID: 71247 | Name: Cherry Streusel Cobbler | Ingredients: cherry pie filling | condensed milk | melted margarine | cinnamon | nutmeg | light brown sugar | flour | margarine | chopped nuts | oats | butter - flavored cooking spray | Tags: 60-minutes-or-less | cherries | cobblers-and-crisps | desserts | fruit | north-american | oven | pitted-fruit | Servings: 6


In [23]:
import pickle

with open("new_documents.pkl", "wb") as f:
    pickle.dump(new_documents, f)

In [19]:
print(new_documents[1].page_content)

ID: 76133 | Name: Reuben and Swiss Casserole Bake | Ingredients: corned beef chopped | sauerkraut cold water | swiss cheese shredded | rye bread | butter | Tags: 60-minutes-or-less | casseroles | cheese | eggs-dairy | main-dish | oven | Servings: 4


In [21]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Your documents
texts = [doc.page_content for doc in new_documents]

embeddings = OpenAIEmbeddings()  # defaults to text-embedding-3-small
db_recipes = Chroma(persist_directory="chroma_db", embedding_function=embeddings)

batch_size = 500  # adjust if needed
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i+batch_size]
    db_recipes.add_texts(batch_texts)
    print(f"Embedded batch {i} → {i + len(batch_texts)} / {len(texts)}")


  embeddings = OpenAIEmbeddings()  # defaults to text-embedding-3-small
  db_recipes = Chroma(persist_directory="chroma_db", embedding_function=embeddings)


Embedded batch 0 → 500 / 491407
Embedded batch 500 → 1000 / 491407
Embedded batch 1000 → 1500 / 491407
Embedded batch 1500 → 2000 / 491407
Embedded batch 2000 → 2500 / 491407
Embedded batch 2500 → 3000 / 491407
Embedded batch 3000 → 3500 / 491407
Embedded batch 3500 → 4000 / 491407
Embedded batch 4000 → 4500 / 491407
Embedded batch 4500 → 5000 / 491407
Embedded batch 5000 → 5500 / 491407
Embedded batch 5500 → 6000 / 491407
Embedded batch 6000 → 6500 / 491407
Embedded batch 6500 → 7000 / 491407
Embedded batch 7000 → 7500 / 491407
Embedded batch 7500 → 8000 / 491407
Embedded batch 8000 → 8500 / 491407
Embedded batch 8500 → 9000 / 491407
Embedded batch 9000 → 9500 / 491407
Embedded batch 9500 → 10000 / 491407
Embedded batch 10000 → 10500 / 491407
Embedded batch 10500 → 11000 / 491407
Embedded batch 11000 → 11500 / 491407
Embedded batch 11500 → 12000 / 491407
Embedded batch 12000 → 12500 / 491407
Embedded batch 12500 → 13000 / 491407
Embedded batch 13000 → 13500 / 491407
Embedded batch 135

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
from langchain_chroma import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
import os
import uuid

# Local paths
db1_path = "recipes_db_first_half"
db2_path = "recipes_db_2nd_half"
merged_path = "recipes_db_merged"

embeddings = OpenAIEmbeddings()

# Load both DBs
db1 = Chroma(embedding_function=embeddings, persist_directory=db1_path)
db2 = Chroma(embedding_function=embeddings, persist_directory=db2_path)

# Extract all documents and embeddings from db2
all_docs = db2._collection.get()["documents"]
all_embeddings = db2._collection.get()["embeddings"]

all_ids = [str(uuid.uuid4()) for _ in all_docs]

# Add to db1
db1._collection.add(documents=all_docs, embeddings=all_embeddings, ids=all_ids)

# Persist merged DB
if not os.path.exists(merged_path):
    os.makedirs(merged_path)
db1.persist(persist_directory=merged_path)

print("Merged DB saved at:", merged_path)


ValueError: You must provide an embedding function to compute embeddings.https://docs.trychroma.com/guides/embeddings in add.

In [10]:
!pip install "protobuf<=3.20.3"




In [1]:
!pip install chromadb langchain
!pip install huggingface-hub transformers sentence-transformers
!pip install langchain-community

Collecting protobuf (from onnxruntime>=1.14.1->chromadb)
  Downloading protobuf-6.32.0-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Downloading protobuf-6.32.0-cp310-abi3-win_amd64.whl (435 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
Successfully installed protobuf-6.32.0


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.11.0 requires keras<2.12,>=2.11.0, but you have keras 3.11.1 which is incompatible.
tensorflow-intel 2.11.0 requires protobuf<3.20,>=3.9.2, but you have protobuf 6.32.0 which is incompatible.
tensorflow-intel 2.11.0 requires tensorboard<2.12,>=2.11, but you have tensorboard 2.19.0 which is incompatible.
tensorflow 2.19.0 requires numpy<2.2.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.19.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.32.0 which is incompatible.




In [2]:
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"


In [4]:
!pip uninstall -y tensorflow keras
!pip install torch torchvision torchaudio
!pip install sentence-transformers --upgrade


Found existing installation: tensorflow 2.19.0
Uninstalling tensorflow-2.19.0:
  Successfully uninstalled tensorflow-2.19.0
Found existing installation: keras 3.11.1
Uninstalling keras-3.11.1:
  Successfully uninstalled keras-3.11.1

  You can safely remove it manually.



Collecting torchvision
  Downloading torchvision-0.23.0-cp310-cp310-win_amd64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.8.0-cp310-cp310-win_amd64.whl.metadata (7.2 kB)
Collecting torch
  Downloading torch-2.8.0-cp310-cp310-win_amd64.whl.metadata (30 kB)
Downloading torchvision-0.23.0-cp310-cp310-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.6 MB ? eta -:--:--
   ------------------- -------------------- 0.8/1.6 MB 2.0 MB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 2.0 MB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 2.0 MB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 2.0 MB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 2.0 MB/s eta 0:00:01
   -------------------------------- ------- 1.3/1.6 MB 2.0 MB/s

In [None]:
# from langchain.schema import Document

# class MergedChroma:
#     def __init__(self, db_list):
#         self.dbs = db_list

#     def similarity_search(self, query, k=5):
#         results = []
#         for db in self.dbs:
#             try:
#                 results.extend(db.similarity_search(query, k=k))
#             except Exception as e:
#                 print(f"Error searching DB: {e}")
#         # If scores available, sort; otherwise just truncate
#         return results[:k]

# # Usage
# from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")

# db1 = Chroma(embedding_function=embeddings, persist_directory="recipes_db_first_half")
# db2 = Chroma(embedding_function=embeddings, persist_directory="recipes_db_2nd_half")

# merged_db = MergedChroma([db1, db2])

# # Now use merged_db like a single DB
# results = merged_db.similarity_search("best chocolate cake", k=5)
# for r in results:
#     print(r.page_content[:100])

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")


TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates