In [None]:
# ===============================
# RAG INGESTION NOTEBOOK
# ===============================

!pip install -q langchain langchain-community faiss-cpu sentence-transformers

In [None]:
import os
import json
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [None]:
def flatten_json(data, prefix="", source=""):
    """Flatten a parsed JSON value into one Document per non-empty leaf.

    Dict keys are joined with "." to build a path to each leaf. List items
    inherit the path of the list itself (no index is appended), so all items
    of a list share the same path.

    Args:
        data: A parsed JSON value (dict, list, or scalar).
        prefix: Dotted key path accumulated so far; empty at the top level.
        source: Value stored under the "source" metadata key of every
            produced Document (the caller passes the file name).

    Returns:
        A list of Document objects whose page_content is "<path>: <value>",
        or just "<value>" when there is no path (a bare top-level scalar).
        JSON nulls and values that are empty after stripping whitespace
        produce no Document.
    """
    docs = []
    if isinstance(data, dict):
        for key, value in data.items():
            child_prefix = f"{prefix}.{key}" if prefix else key
            docs.extend(flatten_json(value, child_prefix, source))
    elif isinstance(data, list):
        for item in data:
            docs.extend(flatten_json(item, prefix, source))
    elif data is not None:
        # JSON null would otherwise be indexed as the literal text "None".
        text = str(data).strip()
        if text:
            # Avoid a dangling ": " when there is no key path to show.
            content = f"{prefix}: {text}" if prefix else text
            docs.append(
                Document(
                    page_content=content,
                    metadata={"source": source}
                )
            )
    return docs


In [None]:
DATA_DIR = "/content/drive/MyDrive/new_rag"

# Fail early with an actionable message instead of a bare listdir error.
# NOTE(review): the path looks like a Colab Drive mount — confirm Drive is
# mounted before this cell runs.
if not os.path.isdir(DATA_DIR):
    raise FileNotFoundError(f"DATA_DIR does not exist or is not a directory: {DATA_DIR!r}")

documents = []

# os.listdir returns entries in arbitrary order; sorting keeps the document
# (and therefore chunk/vector) order stable across runs.
for file in sorted(os.listdir(DATA_DIR)):
    # Case-insensitive so files named e.g. "data.JSON" are not silently skipped.
    if not file.lower().endswith(".json"):
        continue
    with open(os.path.join(DATA_DIR, file), "r", encoding="utf-8") as f:
        data = json.load(f)
    # The file name is recorded as each document's "source" metadata.
    documents.extend(flatten_json(data, source=file))

print("Raw documents:", len(documents))


In [None]:
# Chunking parameters handed to RecursiveCharacterTextSplitter; named here so
# they are easy to find and tune.
CHUNK_SIZE = 350
CHUNK_OVERLAP = 50

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split every flattened document into chunks; the count is reported below.
chunks = splitter.split_documents(documents)
print("Final chunks:", len(chunks))


In [None]:
# Embedding model identifier and the options forwarded to its encode step.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
encode_options = {"normalize_embeddings": True}

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL, encode_kwargs=encode_options)

# Embed all chunks and build the FAISS index from them.
vectorstore = FAISS.from_documents(chunks, embeddings)


In [None]:
# Directory the FAISS index is written to.
# NOTE(review): "/content" is presumably Colab's ephemeral runtime disk —
# confirm the index is copied somewhere persistent if it must outlive the session.
SAVE_PATH = "/content/vectorstore"
vectorstore.save_local(SAVE_PATH)
# The check-mark emoji was previously stored as mojibake ("âœ…").
print("✅ Vectorstore saved at:", SAVE_PATH)
