In [29]:
import re
from bs4 import SoupStrainer
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

def clean_text(text: str) -> str:
    """Collapse runs of newlines into one and trim surrounding whitespace.

    Args:
        text: Raw text, e.g. the extracted content of a scraped page.

    Returns:
        The text with every run of consecutive ``"\\n"`` characters reduced to
        a single ``"\\n"`` and with leading/trailing whitespace removed.
    """
    # Empty segments between separators are exactly the extra newlines, so
    # dropping them and re-joining collapses each run to a single newline.
    kept_segments = [segment for segment in text.split("\n") if segment]
    return "\n".join(kept_segments).strip()

def scrape_and_clean(url: str, div_class: str = "page-content-ab") -> list[Document]:
    """Scrape a webpage and return its main-content div as cleaned Documents.

    Parsing is restricted to ``<div>`` elements carrying ``div_class``. The
    extracted text is normalised with ``clean_text``; its first line becomes
    the ``page_title`` metadata entry and the remainder becomes the content.

    Args:
        url: Address of the page to load.
        div_class: CSS class of the ``<div>`` that holds the main content.

    Returns:
        A list with one ``Document`` per document produced by the loader.
        Each keeps the loader's metadata plus a ``"page_title"`` key. If the
        cleaned text has no newline, the whole text is used as the title and
        ``page_content`` is empty.
    """
    loader = WebBaseLoader(
        url,
        bs_kwargs={"parse_only": SoupStrainer("div", class_=div_class)},
    )

    cleaned_docs = []
    for doc in loader.load():
        cleaned = clean_text(doc.page_content)

        # First line is the title, everything after the first newline is the
        # body; partition yields an empty body when there is no newline.
        title, _, body = cleaned.partition("\n")

        cleaned_docs.append(
            Document(
                page_content=body.strip(),
                metadata={**doc.metadata, "page_title": title.strip()},
            )
        )
    return cleaned_docs



In [31]:
# Location of the newline-separated URL list. Kept relative, matching the
# "../data/..." convention used for the Chroma persist directory, instead of a
# machine-specific absolute path.
# NOTE(review): assumes the kernel's working directory is a sibling of data/
# (as the Chroma cell already does) — adjust if the notebook lives elsewhere.
LINKS_FILE = "../data/links.txt"

with open(LINKS_FILE, "r", encoding="utf-8") as f:
    # Strip each line once and drop the blank ones.
    urls = [url for url in (line.strip() for line in f) if url]

print(len(urls))

23


In [32]:

# Scrape every URL in order and accumulate the cleaned Documents in one flat
# list; each call returns a list, which is appended element-wise.
all_docs = []
for url in urls:
    all_docs += scrape_and_clean(url)

print(f"✅ Scraped {len(all_docs)} cleaned documents from {len(urls)} URLs")

✅ Scraped 23 cleaned documents from 23 URLs


In [33]:
import getpass
import os

# Ask for the OpenAI key interactively only when the environment variable is
# missing or empty, so the secret never has to be written into the notebook.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

# Embedding model used by the vector store created further down.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")


In [34]:
# Chroma collection for the scraped resources: stored under the given
# directory and embedded with the `embeddings` model defined above.
vector_store = Chroma(
    persist_directory="../data/chroma_langchain_db",
    embedding_function=embeddings,
    collection_name="sebi-resources",
)

In [35]:
# Display the first cleaned Document as a quick sanity check of the scrape
# (title in metadata["page_title"], remaining text in page_content).
all_docs[0]
# NOTE(review): ingestion into the vector store is disabled below, so running
# this cell only displays a document. Presumably it was commented out to avoid
# re-embedding the same documents on re-runs — confirm before relying on the
# collection's contents.
# vector_store.add_documents(all_docs)


Document(metadata={'source': 'https://www.nism.ac.in/understanding-needs-wants-and-desires/', 'page_title': 'Understanding needs, wants, and desires'}, page_content='Managing money is about taking care of income and expenses. The expenses include various regular expenses, financial goals, investments, tax payments, and payments towards insurance premiums, among others.\nThe regular expenses or even some of the large lump sum expenses could be towards basic needs or towards wants or desires. While the three terms are often used interchangeably, they are not, and hence, it is important to understand the difference between them.\nNeeds:\nThese are things that are essential for our survival and well-being. Food, a roof over the head, water, clothing, and healthcare are among some examples of needs. It is not possible to live without these, and hence the needs assume priority when one plans one’s spending budget.\nWants:\nOnce the needs are met, we strive for things beyond those, for things