In [2]:
pip install -qU langchain[groq] langchain-chroma

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
pip install --upgrade opentelemetry-api langchain-chroma chromadbpip install -U opentelemetry-api opentelemetry-sdk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement chromadbpip (from versions: none)

[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for chromadbpip


In [4]:
import getpass
import os
import pandas as pd
import io
from modules.document_processor import DocumentProcessor


In [5]:

if not os.environ.get("GROQ_API_KEY"):
  os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

from langchain.chat_models import init_chat_model

model = init_chat_model("llama3-8b-8192", model_provider="groq")

In [6]:
import pandas as pd
import io
from modules.document_processor import DocumentProcessor
 
with open("gita.csv", "r", encoding="utf-8") as f:
    csv_buffer = io.StringIO(f.read())

processor = DocumentProcessor()
final_docs, original_df = processor.process_csv_to_chunks(
    file_path=csv_buffer,          # now it's actual CSV content
    content_column="translation",
    filename="gita.csv"            # required if file_path is not a path
)

print(pd.read_csv("gita.csv", nrows=0).columns.tolist())


# --- 3. Inspect the Results ---
print(f"Original CSV had {len(original_df)} rows.")
print(f"Processed into {len(final_docs)} final document chunks.\n")

# Print the first chunk to see the result
print("--- Example of the First Chunk ---")
print(f"Content:\n{final_docs[0].page_content}")
print(f"\nMetadata:\n{final_docs[0].metadata}")
print("-" * 30)

# Print the second chunk to see another example
if len(final_docs) > 1:
    print("\n--- Example of the Second Chunk ---")
    print(f"Content:\n{final_docs[1].page_content}")
    print(f"\nMetadata:\n{final_docs[1].metadata}")
    print("-" * 30)

['chapter', 'verse', 'speaker', 'sanskrit', 'translation', 'question']
Original CSV had 700 rows.
Processed into 117 final document chunks.

--- Example of the First Chunk ---
Content:
Chapter 1 Verse 1 — Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"

Chapter 1 Verse 2 — Sanjaya said: Having seen the army of the Pandavas drawn up in battle array, King Duryodhana approached his teacher, Drona, and spoke these words.

Chapter 1 Verse 3 — Behold, O Teacher! This mighty army of the sons of Pandu, arrayed by the son of Drupada, thy wise disciple.

Chapter 1 Verse 4 — Here are heroes, mighty archers, equal in battle to Bhima and Arjuna, Yuyudhana (Satyaki), Virata, and Drupada—all mighty warriors.

Chapter 1 Verse 5 — Dhrishtaketu, Chekitana, the valiant king of Kasi, Purujit, Kuntibhoja, and Saibya—the best of men.

Chapter 1 Verse 6 — The strong Yudhamanyu and the brave Utta

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(final_docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 185 sub-documents.


In [8]:
 
class Config:
    """Configuration settings for the document processor."""
    CHUNK_SIZE = 1000  # The target size for each final text chunk in characters.
    CHUNK_OVERLAP = 150 # Number of characters to overlap between chunks.
    GROUP_SIZE = 5     # Number of CSV rows to group together before splitting.

In [9]:
import os
from langchain_chroma import Chroma
# from langchain_openai import OpenAIEmbeddings # <- REMOVE THIS
from langchain_community.embeddings import HuggingFaceEmbeddings # <- ADD THIS

# 1. Initialize your new embedding function
# embeddings = OpenAIEmbeddings() # <- REPLACE THIS

# Use a popular, lightweight model from Hugging Face
# The first time you run this, it will download the model which may take a minute.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'} # Use 'cuda' for GPU
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

print("Successfully imported Chroma and initialized HuggingFace embeddings.")

# 2. Your code to create or load the vector store
# THIS PART REMAINS EXACTLY THE SAME
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_huggingface_db", # Changed directory to avoid conflicts
)

print("Successfully created Chroma vector store instance with HuggingFace embeddings.")

# 3. Add some documents to verify it's working
vector_store.add_texts(
    texts=["This is a test document about ChromaDB.", "LangChain helps build LLM applications."],
    metadatas=[{"source": "test1"}, {"source": "test2"}],
    ids=["doc1", "doc2"]
)

print("Successfully added documents to the vector store.")

 

  embeddings = HuggingFaceEmbeddings(


Successfully imported Chroma and initialized HuggingFace embeddings.
Successfully created Chroma vector store instance with HuggingFace embeddings.
Successfully added documents to the vector store.


In [32]:
import os
from langchain_groq import ChatGroq
# Assume 'vector_store' and 'query' are already defined from the retrieval step
# vector_store = Chroma(...)

# --- Groq Integration ---

# Make sure your Groq API key is set
# os.environ["GROQ_API_KEY"] = "gsk_..."
query = "Is there any significance of time in the attainment of Yoga?"
retrieved_docs = vector_store.similarity_search(query, k=5)

# 1. Define the Groq LLM you want to use for generation
# Llama3 8b is a great, fast choice available on Groq
llm = ChatGroq(model_name="openai/gpt-oss-20b")

# 2. Create a prompt template (this part is unchanged)
prompt_template = """
"You are a helpful assistant. Use the provided context from the source"
             "the user's question accurately. "
             "Do not include verse text in your answer. The answer is is ashort as possible.
Context:
{context}

Question: {question}
"""

# 3. Format the retrieved documents into a single context string
context_string = "\n\n".join([doc.page_content for doc in retrieved_docs])

# 4. Fill the prompt with the context and question
formatted_prompt = prompt_template.format(
    context=context_string,
    question=query
)

# 5. Call the Groq LLM to get the fast, final answer
final_answer = llm.invoke(formatted_prompt)

 
print(final_answer.content)
 

Yes. Certain times of day—especially early morning (dawn) and twilight—are regarded as most auspicious for yoga practice, and regular, sustained practice over time is essential for progress.


In [11]:
# Notebook cell: setup embeddings and Chroma (no OpenAI)
from modules.config import Config
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# HuggingFace embeddings (same as Neo4j pipeline)
embeddings = HuggingFaceEmbeddings(
    model_name=Config.EMBEDDING_MODEL,
    model_kwargs={'device': Config.EMBEDDING_DEVICE},
    encode_kwargs={'normalize_embeddings': True},
)

vector_store = Chroma(
    collection_name="gita_idx",
    embedding_function=embeddings,
    persist_directory="./chroma_huggingface_db",
)
print("Chroma + HuggingFaceEmbeddings ready.")

Chroma + HuggingFaceEmbeddings ready.


In [12]:
# Notebook cell: build docs for Chroma (if not already built in this session)
from modules.document_processor import DocumentProcessor

csv_path = "gita.csv"  # adjust path if needed
processor = DocumentProcessor()
final_docs, original_df = processor.process_csv_to_chunks(
    file_path=csv_path,
    content_column="translation",
    filename="gita.csv"
)

# Populate Chroma with the same docs
vector_store.add_documents(final_docs)
print(f"Chroma populated with {len(final_docs)} chunks from {csv_path}.")

ValueError: Expected metadata value to be a str, int, float, bool, SparseVector, or None, got [1, 1, 1, 1, 1, 1] which is a list in upsert.

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.

In [None]:
# Notebook cell: compare function
from modules.llm_chain import LLMChain
from modules.neo4j_manager import Neo4jManager
from modules.config import Config

def compare_results(query: str, filename: str, top_k: int = 5):
    llm = LLMChain()

    # 1) Rewrite once for fairness (same rule as your app)
    rw = llm.rewrite_query(query, chat_history=[], source_name=filename, fallback_on_error=True)
    rewritten = rw.get("rewritten_query", query)

    # 2) Neo4j retrieval (connect to existing index for this file)
    idx_name = Config.get_index_name(filename)
    neo = Neo4jManager()                            # uses the same HF embeddings as Chroma
    neo.connect_to_existing_index(idx_name)         # index should already exist (created via app)
    neo_docs = neo.retrieve_with_filename_filter(rewritten, filename, top_k=top_k)
    neo_answer = llm.graph_qa_chain(
        question=query, docs=neo_docs, source_name=filename, memory_context=None, fallback_on_error=True
    )

    # 3) Chroma retrieval with the SAME rewritten query and SAME LLM
    chroma_docs = vector_store.similarity_search(rewritten, k=top_k)
    chroma_answer = llm.graph_qa_chain(
        question=query, docs=chroma_docs, source_name=filename, memory_context=None, fallback_on_error=True
    )

    # 4) Print side-by-side summaries
    print("=== Query ===")
    print("Original: ", query)
    print("Rewritten:", rewritten)
    print()

    print("=== Neo4j Answer ===")
    print(neo_answer.get("text", ""))
    print("Refs:", neo_answer.get("refrence", []))
    print()

    print("=== Chroma Answer ===")
    print(chroma_answer.get("text", ""))
    print("Refs:", chroma_answer.get("refrence", []))
    print()

    # Optional: top-1 snippet previews to see retrieved context difference
    if neo_docs:
        print("Neo4j top-1 snippet:", neo_docs[0].page_content[:250].replace("\n", " "), "| score:", neo_docs[0].metadata.get("score"))
    if chroma_docs:
        print("Chroma top-1 snippet:", chroma_docs[0].page_content[:250].replace("\n", " "))

# Example usage:
# compare_results("What is karma yoga?", filename="gita.csv", top_k=5)

In [None]:
# Create the Neo4j vector index from the same final_docs (one-time)
nm = Neo4jManager()
nm.create_vector_store(final_docs, Config.get_index_name("gita.csv"))
nm.create_file_relationships("gita.csv")