In [None]:
# data_loader.py
import requests

# Endpoint queried by fetch_schemes. It has its own name because a later cell in
# this notebook rebinds BASE_URL to the HTML search page; reading BASE_URL at call
# time would silently send API requests to the wrong URL once that cell has run.
SCHEME_SEARCH_API_URL = "https://www.myscheme.gov.in/api/v2/scheme/search"
BASE_URL = SCHEME_SEARCH_API_URL  # kept so existing references to BASE_URL still resolve


def fetch_schemes(query: str, limit: int = 10, timeout: float = 10.0) -> list:
    """Query the scheme search API and return the ``schemes`` entry of its JSON reply.

    Args:
        query: Search text, sent as the ``q`` query parameter.
        limit: Sent as the ``limit`` query parameter.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        The value stored under ``"schemes"`` in the response body. An empty list
        is returned when the status code is not 200, the body is not valid JSON,
        the body is not a JSON object, or ``"schemes"`` is missing or empty.

    Raises:
        requests.RequestException: Connection failures and timeouts are not
            swallowed here; they propagate to the caller.
    """
    params = {"q": query, "limit": limit}
    response = requests.get(SCHEME_SEARCH_API_URL, params=params, timeout=timeout)
    if response.status_code != 200:
        return []

    try:
        payload = response.json()
    except ValueError:
        # A 200 reply with a non-JSON body is treated like any other failed lookup.
        return []

    if not isinstance(payload, dict):
        return []
    # `or []` also covers an explicit null, which .get() alone would return as None.
    return payload.get("schemes") or []


In [None]:
# retriever.py
from langchain.schema import Document

def get_scheme_docs(query: str):
    """Turn the schemes returned by ``fetch_schemes(query)`` into ``Document`` objects.

    Each document's ``page_content`` is the scheme title, its short description
    and a "More info" link on separate lines; the same link is stored under
    ``metadata["source"]``. A missing ``title`` falls back to ``"No Title"`` and
    a missing ``shortDesc`` to an empty string.
    """

    def _as_document(scheme):
        # NOTE(review): a scheme without "schemeId" yields a link ending in "None" — confirm this is acceptable.
        link = f"https://www.myscheme.gov.in/schemes/{scheme.get('schemeId')}"
        heading = scheme.get("title", "No Title")
        summary = scheme.get("shortDesc", "")
        return Document(
            page_content=f"{heading}\n{summary}\nMore info: {link}",
            metadata={"source": link},
        )

    return [_as_document(entry) for entry in fetch_schemes(query)]


In [None]:
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from transformers import pipeline
# from retriever import get_scheme_docs   # keep if you have retriever.py

def build_chatbot(query: str):
    """Assemble a ``RetrievalQA`` chain over the scheme documents found for ``query``.

    Steps, in order: fetch documents with ``get_scheme_docs``, index them in a
    Chroma store using MiniLM sentence embeddings, wrap a Flan-T5 generation
    pipeline as the LLM, and combine retriever and LLM into the returned chain.
    """
    scheme_docs = get_scheme_docs(query)

    # Embedding model used to vectorise the scheme documents.
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Vector store built from the documents; "./chroma_db" is passed as the persistence folder.
    # NOTE(review): repeated calls target the same folder, so earlier documents may
    # still be present in the store — confirm whether that is intended.
    store = Chroma.from_documents(
        documents=scheme_docs,
        embedding=embedder,
        persist_directory="./chroma_db",
    )
    scheme_retriever = store.as_retriever()

    # Local text2text generation pipeline (Flan-T5 base) that answers the questions.
    generator = pipeline("text2text-generation", model="google/flan-t5-base", max_length=512)

    return RetrievalQA.from_chain_type(
        llm=HuggingFacePipeline(pipeline=generator),
        retriever=scheme_retriever,
    )


In [None]:
# app.py
# from chatbot import build_chatbot

if __name__ == "__main__":
    # Build the chain from schemes matching this search text, then ask one question.
    query = "education loan schemes"
    chatbot = build_chatbot(query)
    # Chain.run() is deprecated in LangChain; invoke() is its replacement.
    # RetrievalQA reads the question from the "query" key and puts the answer
    # text under "result".
    response = chatbot.invoke({"query": "Tell me about available education loan schemes?"})
    result = response["result"]
    print(result)


In [None]:
import requests
from bs4 import BeautifulSoup

# Search page scraped by scrape_schemes. It has its own name because the
# data-loader cell of this notebook also binds BASE_URL (to the JSON API);
# reading BASE_URL at call time would make whichever cell ran last win.
SCHEME_SEARCH_PAGE_URL = "https://www.myscheme.gov.in/search"
BASE_URL = SCHEME_SEARCH_PAGE_URL  # kept so existing references to BASE_URL still resolve


def scrape_schemes(query: str, limit: int = 5, timeout: float = 10.0):
    """Scrape scheme cards from the myscheme.gov.in search page.

    Args:
        query: Search text, sent as the ``q`` query parameter.
        limit: Maximum number of matching anchors to convert into results.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        A list of dicts with keys ``title``, ``description`` and ``url``.
        ``title`` / ``description`` are ``None`` when the card has no matching
        heading / paragraph element. An empty list is returned (after printing
        the status code) when the response status is not 200.

    Raises:
        requests.RequestException: Connection failures and timeouts propagate.
    """
    params = {"q": query}
    response = requests.get(
        SCHEME_SEARCH_PAGE_URL,
        params=params,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    if response.status_code != 200:
        print("Error:", response.status_code)
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    schemes = []

    # Each scheme card is inside <a href="/schemes/...">
    for card in soup.select("a[href^='/schemes']")[:limit]:
        title = card.select_one("h2, h3")
        desc = card.select_one("p")
        schemes.append({
            "title": title.text.strip() if title else None,
            "description": desc.text.strip() if desc else None,
            # The selector guarantees href starts with "/schemes", so plain
            # concatenation with the site origin yields an absolute URL.
            "url": "https://www.myscheme.gov.in" + card["href"]
        })
    return schemes


if __name__ == "__main__":
    # Manual smoke test: print the first five cards scraped for "education".
    schemes = scrape_schemes("education", limit=5)
    for scheme in schemes:
        print(
            f"\nTitle: {scheme['title']}\nDescription: {scheme['description']}\nURL: {scheme['url']}"
        )


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --quiet -U langchain-scrapegraph



In [None]:
# Imported here because no earlier cell imports them; without these lines this
# cell raises NameError on a fresh kernel.
import getpass
import os

# Turn on LangSmith tracing for the LangChain calls made in this notebook.
os.environ["LANGSMITH_TRACING"] = "true"
# Only prompt when the key is not already configured, so "Run All" does not
# block on input in an environment that provides it.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("LangSmith API key: ")

In [None]:
from scrapegraph_py.logger import sgai_logger
import json

from langchain_scrapegraph.tools import (
    GetCreditsTool,
    
    MarkdownifyTool,
    SmartCrawlerTool,
    SmartScraperTool,
)

# Emit INFO-level log messages from the ScrapeGraph client.
sgai_logger.set_logging(level="INFO")

# One instance of each tool, created in the same order as before; later cells
# refer to them by these names.
smartscraper, smartcrawler, markdownify, credits = (
    SmartScraperTool(),
    SmartCrawlerTool(),
    MarkdownifyTool(),
    GetCreditsTool(),
)

In [None]:
# Every tool call in this cell targets the same category listing page.
url = "https://www.myscheme.gov.in/search/category/Housing%20&%20Shelter"

# SmartScraper: prompt-driven extraction from a single page.
# NOTE(review): both prompts in this cell talk about a "company", while the
# target is a scheme listing — they look like quickstart leftovers; confirm.
result = smartscraper.invoke(
    {"user_prompt": "Extract the company name and description", "website_url": url}
)
print("SmartScraper Result:", result)

# Markdownify: fetch the page as markdown and show only the first 200 characters.
markdown = markdownify.invoke({"website_url": url})
print("\nMarkdownify Result (first 200 chars):", markdown[:200])

# SmartCrawler: same idea as SmartScraper, with crawling parameters added.
prompt = "What does the company do? and I need text content from their privacy and terms"
crawl_request = {
    "url": url,
    "prompt": prompt,
    "cache_website": True,
    "depth": 2,
    "max_pages": 2,
    "same_domain_only": True,
}
result_crawler = smartcrawler.invoke(crawl_request)

print("\nSmartCrawler Result:")
print(json.dumps(result_crawler, indent=2))

# Report the credits information returned by the credits tool.
credits_info = credits.invoke({})
print("\nCredits Info:", credits_info)

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install crawl4ai

In [None]:
# crawl4ai-setup is a command-line program, not a Python expression: without the
# shell escape the kernel parses this line as `crawl4ai - setup` and raises NameError.
!crawl4ai-setup


In [None]:
import asyncio
from crawl4ai import AsyncWebCrawler

async def main(url: str = "https://crawl4ai.com"):
    """Crawl ``url`` with ``AsyncWebCrawler`` and print the result's markdown.

    Args:
        url: Page to crawl. Defaults to the address that was previously
            hard-coded, so calling ``main()`` with no argument is unchanged.
    """
    # The crawler is used as an async context manager so it is closed on exit.
    async with AsyncWebCrawler() as crawler:
        # Run the crawler on the requested URL
        result = await crawler.arun(url=url)

        # Print the extracted content
        print(result.markdown)

# Run the async main function.
# A Jupyter/IPython kernel already has an event loop running, and asyncio.run()
# raises RuntimeError when called from a running loop. Detect that case and
# drive main() on a fresh loop in a worker thread instead.
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No loop is running (e.g. executed as a plain script): asyncio.run() is safe.
    asyncio.run(main())
else:
    from concurrent.futures import ThreadPoolExecutor

    with ThreadPoolExecutor(max_workers=1) as _executor:
        # .result() blocks until the crawl finishes and re-raises any exception it hit.
        _executor.submit(asyncio.run, main()).result()
