In [38]:
import os
import json
from langchain.schema import Document
from langchain_community.document_loaders import JSONLoader
from langchain_chroma import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

In [39]:
import os, json
from langchain.schema import Document

def flatten_json_for_rag(directory: str) -> "list[Document]":
    """Load every ``*.json`` file in ``directory`` as one ``Document`` per file.

    Each file is parsed and re-serialized as 2-space-indented JSON text, which
    becomes the document's ``page_content``. The bare file name is stored under
    the ``"source"`` metadata key.

    Args:
        directory: Path of the folder to scan. Sub-folders are not visited.

    Returns:
        A list of ``Document`` objects ordered by file name.

    Raises:
        OSError: If the directory or one of its files cannot be read.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    docs = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and everything derived from it) reproducible across runs.
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        # Skip non-JSON entries and anything that is not a regular file
        # (e.g. a directory whose name happens to end in ".json").
        if not filename.endswith(".json") or not os.path.isfile(filepath):
            continue

        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)

        # ensure_ascii=False keeps non-ASCII characters as real text instead of
        # \uXXXX escape sequences, so the text handed to the splitter and the
        # embedding model is the human-readable original.
        content = json.dumps(data, indent=2, ensure_ascii=False)
        docs.append(Document(page_content=content, metadata={"source": filename}))
    return docs

# Build one Document per JSON file found in the crop-cultivation data folder.
docs = flatten_json_for_rag(directory="crop_cultivation_json")
print(f"Loaded {len(docs)} documents")

Loaded 30 documents


In [40]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk each document into pieces of at most 800 characters with a
# 100-character overlap between neighbouring pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    add_start_index=True,
)

split_docs = []

for doc in docs:
    # split_documents() returns Document objects that inherit the parent's
    # metadata and honour add_start_index. split_text() only returns plain
    # strings, so the start_index requested above would never be recorded.
    # Splitting one parent at a time keeps the "chunk" counter per document.
    for i, chunk_doc in enumerate(text_splitter.split_documents([doc])):
        chunk_doc.metadata["chunk"] = i
        split_docs.append(chunk_doc)

In [41]:
# Sentence-transformer model used to embed every chunk and every query.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

In [42]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

# Embed all chunks and build the FAISS index from them.
vectorstore = FAISS.from_documents(
    documents=split_docs,
    embedding=embeddings,
)

In [33]:
# Persist the FAISS index to a local folder so it can be reloaded later.
vectorstore.save_local(folder_path="faiss_index_vectorstore")
print("Vector store saved to 'faiss_index_vectorstore'")

Vector store saved to 'faiss_index_vectorstore'


In [43]:
from langchain.chat_models import init_chat_model
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the Google API key the chat model needs — confirm).
load_dotenv()

# Chat model that answers questions over the retrieved context.
llm = init_chat_model(
    model="gemini-2.5-flash",
    model_provider="google_genai",
)


In [44]:
from langchain.chains import RetrievalQA

In [45]:
# Retriever over the FAISS index, using the library's default search settings.
retriever = vectorstore.as_retriever()

# Question-answering chain: retrieve chunks, then let the LLM answer from them.
# chain_type="stuff" is the library default, spelled out here for readers.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)

In [49]:
# Ask the QA chain a sample question; the chain returns a dict holding the
# original query and the generated answer.
query = "what are some diseases in ginger?"
result = qa.invoke({"query": query})
print(result)

{'query': 'what are some diseases in ginger?', 'result': 'Some diseases that affect ginger are:\n*   Red rot\n*   Damping off (Rhizoctonia solani)\n*   Collar rot (Pythium sp., Phytophthora sp. and Sclerotium rolfsii)\n*   Alternaria leaf spot (Alternaria tagetica, A. zinnia and A. alternata)'}


### Chroma DB method

In [6]:
# Sanity check: total length of the first document and a 300-character preview.
# NOTE(review): the recorded output for this cell is plain prose, not JSON, so
# `docs` was presumably produced by a different loader than
# flatten_json_for_rag — confirm which cell defines it before re-running.
print(f"Total characters: {len(docs[0].page_content)}")
print(docs[0].page_content[:300])

Total characters: 6181
Ginger Cultivation Guide

Introduction Ginger is an important spice and used in different forms such as raw ginger, dry ginger, bleached dry ginger, ginger powder oleoresin, ginger beer, ginger candy, ginger wine etc. Kerala is the major ginger growing state. Other major ginger growing states are Or


In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters: at most 800 characters per chunk, 100 characters of
# overlap, and the chunk's offset in its parent stored as "start_index".
# An alternative considered was chunk_size=1000 with chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100, add_start_index=True)

# split_documents() returns Documents that carry their parent's metadata.
all_splits = text_splitter.split_documents(docs)

print(f"Split docs into {len(all_splits)} sub-documents.")

Split docs into 47 sub-documents.


In [8]:
# Embedding model for the Chroma collection (all-MiniLM-L6-v2).
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [9]:
# Open (or create) a Chroma collection persisted in a local directory.
vector_store = Chroma(
    persist_directory="./chroma_langchain_db",
    collection_name="example_collection",
    embedding_function=embeddings,
)

In [10]:
# Index chunks
import uuid

# The collection is persisted on disk, so adding documents with auto-generated
# random ids appends a fresh copy of every chunk each time this cell runs.
# Deriving the id from the chunk's source, offset and text makes the ids stable,
# so a re-run writes to the same entries instead of duplicating them.
chunk_ids = [
    str(
        uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{doc.metadata.get('source')}:{doc.metadata.get('start_index')}:{doc.page_content}",
        )
    )
    for doc in all_splits
]
_ = vector_store.add_documents(documents=all_splits, ids=chunk_ids)

In [11]:
# Retriever over the Chroma collection, returning the 3 closest chunks.
# This rebinds `retriever`; the `qa` chain built earlier keeps the FAISS
# retriever object it was constructed with.
retriever = vector_store.as_retriever(search_kwargs=dict(k=3))

In [26]:
# Fetch the chunks closest to a sample query from the Chroma retriever.
query = "seed cultivation ginger"
# invoke() replaces the deprecated get_relevant_documents() and matches the
# qa.invoke(...) call used earlier in this notebook.
results = retriever.invoke(query)


In [27]:
# Display the retrieved chunks (the retriever is configured with k=3).
results

[Document(id='14633aee-a016-4d77-869e-5e8dcf8d1afb', metadata={'source': 'crop_cultivation_information\\ginger_info.txt', 'start_index': 1699}, page_content='Propagation Ginger is always propagated by rhizomes. Carefully preserved seeds rhizomes are cut into small pieces of 2.5-5cm length weighing 20-25g each having one or two good buds. The seed rate varies from region to region and with the method of cultivation adopted. The seed rhizomes are treated with 0.3% Dithane M-45 (3g in one litre of water) for 30 min, drained and planted at a spacing of 20-25cm along the rows and 20-25cm between the rows.'),
 Document(id='3b3badf8-2992-4256-a259-cdc596883ebf', metadata={'start_index': 1232, 'source': 'crop_cultivation_information\\ginger_info.txt'}, page_content='Varieties They are generally named after the localities or places where they are grown. Common cultivars are Maran, Assam and Rio-de-Janeiro, Suprabha, Suruchi, Suravi, Himagiri, China, Himachal, Nadia, HSR-Varada.\n\nSeason The be