In [1]:
# !pip install ddgs groq crawl4ai PyPDF2 chromadb sentence-transformers requests beautifulsoup4 python-magic
import hashlib
import json
import os


In [2]:
from ddgs import DDGS
from groq import Groq

# Shared Groq client used by groq_llm below. The key is read from the
# GROQ_API_KEY environment variable so it never has to be pasted into the
# notebook; when the variable is unset the previous empty-string value is used.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))

def groq_llm(prompt):
    """Send `prompt` as a single user message to the Groq chat API and return the reply text.

    Uses the module-level `client`. Failures are reported in-band rather than
    raised: on any exception the function returns the string "Error: <message>".
    """
    request = {
        "messages": [{"role": "user", "content": prompt}],
        "model": "llama-3.1-8b-instant",
        "temperature": 0.7,
        "max_tokens": 1024,
    }
    try:
        reply = client.chat.completions.create(**request)
        first_choice = reply.choices[0]
        return first_choice.message.content
    except Exception as err:
        return f"Error: {str(err)}"

In [3]:
from datetime import datetime, timedelta
import json
from ddgs import DDGS
from groq import Groq



def search_urls(query: str, max_results: int = 20):
    """Run a DuckDuckGo text search and return the hits in a normalised shape.

    Args:
        query: search string passed to DDGS().text.
        max_results: forwarded to DDGS().text as `max_results`.

    Returns:
        A list of dicts with 'title', 'url' and 'snippet' keys, taken from the
        hit's 'title', 'href' and 'body' fields (missing fields become '').
        On any exception the error is printed and an empty list is returned.
    """
    try:
        engine = DDGS()
        hits = list(engine.text(query, max_results=max_results))
        return [
            {
                'title': hit.get('title', ''),
                'url': hit.get('href', ''),
                'snippet': hit.get('body', ''),
            }
            for hit in hits
        ]
    except Exception as err:
        print(f"Search error for '{query}': {err}")
        return []

In [4]:
def validate_source_relevance(source, original_query):
    """Ask the LLM whether a search result is relevant to the original query.

    Args:
        source: dict with 'title', 'snippet' and 'url' keys.
        original_query: the query the source is judged against.

    Returns:
        True to keep the source, False to discard it. The source is kept when
        the LLM answers RELEVANT, and also when validation itself fails (an
        exception, or groq_llm's in-band "Error: ..." reply), so that an
        unavailable LLM does not silently drop every source.
    """
    validation_prompt = f"""
    Analyze if this source is relevant to the query: "{original_query}"

    Source Title: {source['title']}
    Source Snippet: {source['snippet']}
    Source URL: {source['url']}

    Answer ONLY with "RELEVANT" or "IRRELEVANT" based on whether this source contains useful information about the query topic.
    Consider it RELEVANT if it contains government data, reports, policies, or official information related to the query.
    Consider it IRRELEVANT if it's completely unrelated, spam, or doesn't contain substantive information.
    """

    try:
        response = groq_llm(validation_prompt)
        verdict = response.strip().upper()
    except Exception as e:
        print(f"Validation error: {e}")
        return True

    # groq_llm never raises; it returns "Error: <message>" instead. Handle that
    # reply the same way as the exception path above: report it and keep the source.
    if verdict.startswith("ERROR:"):
        print(f"Validation error: {response.strip()}")
        return True

    # "IRRELEVANT" contains "RELEVANT" as a substring, so the negative verdict
    # has to be checked first; a plain `"RELEVANT" in verdict` accepts everything.
    if "IRRELEVANT" in verdict:
        return False
    return "RELEVANT" in verdict

In [5]:

# Query variants tried for every topic; the empty suffix searches the bare topic.
_SEARCH_SUFFIXES = (
    "",
    " PDF",
    " document",
    " filetype:pdf",
    " report",
    " data",
    " statistics",
    " policy",
    " scheme",
    " guidelines",
    " official document",
    " government data",
    " annual report",
    " whitepaper",
    " research paper",
    " analysis report",
    " dataset",
)


def _extract_search_topics(analysis_result):
    """Turn the LLM's numbered/bulleted list into a list of clean topic strings."""
    topics = []
    for line in analysis_result.split('\n'):
        line = line.strip()
        if line and any(c.isalnum() for c in line):
            # Drop the list marker ("1. ", "- ", "• ") in front of the topic.
            clean_string = line.lstrip('1234567890. -•')
            if clean_string and len(clean_string) > 5:
                topics.append(clean_string)
    return topics


def _build_search_prompts(topics):
    """Expand each topic into one search query per entry of _SEARCH_SUFFIXES."""
    return [f"{topic}{suffix}" for topic in topics for suffix in _SEARCH_SUFFIXES]


def _dedupe_by_url(results):
    """Keep the first result seen for each 'url', preserving order."""
    unique_urls = []
    seen_urls = set()
    for result in results:
        if result['url'] not in seen_urls:
            unique_urls.append(result)
            seen_urls.add(result['url'])
    return unique_urls


def get_indian_gov_sources(user_query, max_topics=3, results_per_query=2,
                           output_file="indian_gov_sources.json"):
    """Find, validate and save web sources for a query.

    Steps: ask the LLM to split the query into search topics, expand the first
    `max_topics` topics into search queries, run every query through
    search_urls, de-duplicate by URL, keep only the results accepted by
    validate_source_relevance, and write them to `output_file` as JSON.

    Args:
        user_query: the user's question.
        max_topics: how many of the extracted topics are actually searched.
        results_per_query: `max_results` passed to search_urls for each query.
        output_file: path of the JSON file the kept sources are written to.

    Returns:
        The list of kept sources (dicts with 'title', 'url', 'snippet').
    """
    print("🇮🇳 Analyzing query for Indian government sources...")

    analysis_prompt = f"""
    Analyze this Indian government query and break it into 10 core search topics:
    User Query: {user_query}

    Return only the topics as a numbered list, one per line.
    Focus on government-related aspects, policies, schemes, data, and reports.
    """
    analysis_result = groq_llm(analysis_prompt) or ""
    print(f"Core search topics:\n{analysis_result}")

    # groq_llm reports failures as an "Error: ..." string; that text must not
    # be parsed as if it were a search topic.
    if analysis_result.startswith("Error:"):
        search_topics = []
    else:
        search_topics = _extract_search_topics(analysis_result)
    print(f"Extracted {len(search_topics)} search topics")

    if not search_topics:
        print("No usable topics returned; falling back to the original query")
        search_topics = [user_query]

    search_prompts = _build_search_prompts(search_topics[:max_topics])

    all_urls = []
    print(f"\nExecuting {len(search_prompts)} search queries...")

    for i, search_prompt in enumerate(search_prompts, 1):
        print(f"   {i}. Searching: '{search_prompt}'")
        urls = search_urls(search_prompt, results_per_query)
        print(f"      Found {len(urls)} URLs")
        all_urls.extend(urls)

    unique_urls = _dedupe_by_url(all_urls)
    print(f"\nFound {len(unique_urls)} unique URLs before validation")

    print(f"\nValidating source relevance...")
    relevant_urls = []

    for i, source in enumerate(unique_urls, 1):
        print(f"   {i}. Validating: {source['title'][:50]}...")

        if validate_source_relevance(source, user_query):
            relevant_urls.append(source)
            print(f"      RELEVANT - Keeping source")
        else:
            print(f"      IRRELEVANT - Discarding source")

    print(f"\nValidation Results:")
    print(f"   Total sources found: {len(unique_urls)}")
    print(f"   Relevant sources: {len(relevant_urls)}")
    print(f"   Irrelevant sources discarded: {len(unique_urls) - len(relevant_urls)}")

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(relevant_urls, f, ensure_ascii=False, indent=4)

    print(f"\nSaved {len(relevant_urls)} relevant URLs to '{output_file}'")

    return relevant_urls


In [6]:

# Demo run: discover sources for one example query, then list what was kept.
# get_indian_gov_sources also writes the kept sources to "indian_gov_sources.json",
# which the later download/crawl cells read.
if __name__ == "__main__":
    user_query = "Sewage Drainage Problems in Mumbai"
    results = get_indian_gov_sources(user_query)

    print(f"\nFinal Results:")
    for i, result in enumerate(results, 1):
        print(f"{i}. {result['title']}")
        print(f"   URL: {result['url']}")
        # Only the first 100 characters of the snippet are shown.
        print(f"   Snippet: {result['snippet'][:100]}...")
        print()

🇮🇳 Analyzing query for Indian government sources...
Core search topics:
1. Mumbai Sewage Drainage System
2. Government Schemes for Sewage Management in Mumbai
3. Mumbai Municipal Corporation's Sewage Drainage Policy
4. Sewage Drainage Data for Mumbai
5. Government Reports on Mumbai's Sewage Drainage Issues
6. Mumbai's Sewage Drainage Infrastructure Development
7. Water Conservation and Sewage Management in Mumbai
8. Mumbai's Sewage Drainage Budget Allocation
9. Government Initiatives to Prevent Sewage Pollution in Mumbai
10. Mumbai's Sewage Drainage Regulatory Framework
Extracted 10 search topics

Executing 51 search queries...
   1. Searching: 'Mumbai Sewage Drainage System'
      Found 2 URLs
   2. Searching: 'Mumbai Sewage Drainage System PDF'
      Found 2 URLs
   3. Searching: 'Mumbai Sewage Drainage System document'
      Found 2 URLs
   4. Searching: 'Mumbai Sewage Drainage System filetype:pdf'
      Found 2 URLs
   5. Searching: 'Mumbai Sewage Drainage System report'
      Foun

In [7]:
import os
import json
import asyncio
import requests
from datetime import datetime, timedelta
from ddgs import DDGS
from groq import Groq
import PyPDF2
import chromadb
from sentence_transformers import SentenceTransformer
from crawl4ai import AsyncWebCrawler
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import urllib.parse



# async def crawl_urls_from_json(json_file_path, output_file="crawled_results.json"):
#     with open(json_file_path, 'r', encoding='utf-8') as f:
#         data = json.load(f)

#     browser_config = BrowserConfig()

#     run_config = CrawlerRunConfig(
#         word_count_threshold=10,
#         remove_overlay_elements=True,
#         exclude_external_links=False
#     )

#     results = []

#     async with AsyncWebCrawler(config=browser_config) as crawler:
#         for item in data:
#             url = item["url"]
#             print(f"Crawling: {url}")

#             try:
#                 result = await crawler.arun(url=url, config=run_config)

#                 crawled_data = {
#                     "title": item["title"],
#                     "url": url,
#                     "snippet": item["snippet"],
#                     "success": result.success,
#                     "status_code": result.status_code,
#                     "markdown_content": result.markdown if result.success else None,
#                     "error_message": result.error_message if not result.success else None
#                 }
#                 results.append(crawled_data)

#                 if result.success:
#                     print(f"✓ Successfully crawled: {url}")
#                 else:
#                     print(f"✗ Failed to crawl: {url} - {result.error_message}")

#             except Exception as e:
#                 print(f"✗ Error crawling {url}: {str(e)}")
#                 results.append({
#                     "title": item["title"],
#                     "url": url,
#                     "snippet": item["snippet"],
#                     "success": False,
#                     "status_code": None,
#                     "markdown_content": None,
#                     "error_message": str(e)
#                 })

#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(results, f, indent=2, ensure_ascii=False)

#     print(f"\nCrawling completed! Results saved to: {output_file}")

#     successful_crawls = sum(1 for r in results if r["success"])
#     print(f"Successfully crawled: {successful_crawls}/{len(data)} URLs")



In [41]:
async def crawl_urls_from_json(json_file_path, output_file="crawled_results.json"):
    """Crawl every URL listed in a sources JSON file and save the outcome.

    Args:
        json_file_path: JSON file holding a list of dicts with 'title', 'url'
            and 'snippet' keys.
        output_file: where the list of crawl records is written as JSON.

    Returns:
        One record per input entry with the keys 'title', 'url', 'snippet',
        'success', 'markdown_content' and 'error_message'. URLs are crawled
        one after another; an exception while crawling a URL is recorded as a
        failed entry instead of stopping the run.
    """
    with open(json_file_path, 'r', encoding='utf-8') as src:
        sources = json.load(src)

    crawler_settings = BrowserConfig()
    page_settings = CrawlerRunConfig(
        word_count_threshold=10,
        remove_overlay_elements=True,
        exclude_external_links=False
    )

    results = []

    async with AsyncWebCrawler(config=crawler_settings) as crawler:
        for entry in sources:
            url = entry["url"]
            print(f"Crawling: {url}")

            try:
                page = await crawler.arun(url=url, config=page_settings)
                ok = page.success

                record = {
                    "title": entry["title"],
                    "url": url,
                    "snippet": entry["snippet"],
                    "success": ok,
                    "markdown_content": None,
                    "error_message": None,
                }
                # Exactly one of the two payload fields is filled in.
                if ok:
                    record["markdown_content"] = page.markdown
                else:
                    record["error_message"] = page.error_message
                results.append(record)

                print(f"✓ Successfully crawled: {url}" if ok else f"✗ Failed to crawl: {url}")

            except Exception as err:
                print(f"✗ Error crawling {url}: {str(err)}")
                results.append({
                    "title": entry["title"],
                    "url": url,
                    "snippet": entry["snippet"],
                    "success": False,
                    "markdown_content": None,
                    "error_message": str(err)
                })

    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(results, dst, indent=2, ensure_ascii=False)

    print(f"\nCrawling completed! Results saved to: {output_file}")
    return results

def run_crawler():
    """Start a crawl of "indian_gov_sources.json" and hand back the result of the call.

    This function is not async itself: it returns whatever calling
    crawl_urls_from_json returns (an un-awaited coroutine, since that function
    is declared `async`), so the caller has to await the returned value.
    """
    pending_crawl = crawl_urls_from_json("indian_gov_sources.json")
    return pending_crawl

In [28]:
# Groq client for the LLM helpers. The key is read from the GROQ_API_KEY
# environment variable instead of being written into the notebook; when the
# variable is unset the previous empty-string value is used.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", ""))

# Working directories. download_file saves into "files" by default;
# "pdfs" and "docs" are created here as well.
os.makedirs("files", exist_ok=True)
os.makedirs("pdfs", exist_ok=True)
os.makedirs("docs", exist_ok=True)

In [29]:
def download_file(url, folder="files"):
    """Download `url` into `folder` and return the saved file's path.

    The file name is the last component of the URL path. When the path has no
    file name (e.g. "https://host/"), a name derived from a SHA-1 digest of the
    URL is used, so the same URL maps to the same file on every run.

    Args:
        url: address to fetch (30 second timeout).
        folder: target directory; created if it does not exist.

    Returns:
        The path of the written file, or None if anything failed (the error is
        printed, not raised).
    """
    try:
        parsed_url = urllib.parse.urlparse(url)
        filename = os.path.basename(parsed_url.path)

        if not filename:
            # The built-in hash() is salted per process for strings, so it
            # would produce a different (possibly negative) name on every run.
            digest = hashlib.sha1(url.encode("utf-8")).hexdigest()[:12]
            filename = f"downloaded_file_{digest}"

        # Do not rely on another cell having created the folder.
        os.makedirs(folder, exist_ok=True)
        filepath = os.path.join(folder, filename)

        # Download the file
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        with open(filepath, 'wb') as f:
            f.write(response.content)

        print(f"Downloaded: {filename}")
        return filepath

    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return None


In [30]:

def download_files_from_sources(json_file):
    """Download every source whose URL points at an office/PDF document.

    A URL qualifies when either the full URL or its path component ends with
    one of the document extensions, so links such as ".../report.pdf?download=1"
    are picked up as well.

    Args:
        json_file: JSON file holding a list of dicts with 'title' and 'url' keys.

    Returns:
        A list of dicts ('source_title', 'url', 'filepath', 'file_type') for the
        files that were saved. The same list is written to "downloaded_files.json".
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        sources = json.load(f)

    document_exts = ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx')
    downloaded_files = []

    for source in sources:
        url = source['url']

        try:
            url_path = urllib.parse.urlparse(url).path
        except ValueError:
            # Malformed URL: fall back to the whole-URL check only.
            url_path = ""

        is_document = (
            url.lower().endswith(document_exts)
            or url_path.lower().endswith(document_exts)
        )
        if not is_document:
            continue

        filepath = download_file(url)
        if filepath:
            downloaded_files.append({
                'source_title': source['title'],
                'url': url,
                'filepath': filepath,
                'file_type': os.path.splitext(filepath)[1].lower()
            })

    with open("downloaded_files.json", "w", encoding="utf-8") as f:
        json.dump(downloaded_files, f, indent=2, ensure_ascii=False)

    print(f"\nDownloaded {len(downloaded_files)} files")
    return downloaded_files

In [15]:
# !pip install crawl4ai ddgs
# !playwright install
# !pip install groq

In [31]:
async def crawl_urls_from_json(json_file_path, output_file="crawled_results.json"):
    """Crawl each URL from a sources JSON file and write the records to `output_file`.

    NOTE(review): a function with this name is already defined in an earlier
    cell; when the cells run top to bottom, this later definition is the one
    that stays bound.

    Args:
        json_file_path: JSON list of dicts with 'title', 'url', 'snippet'.
        output_file: destination JSON file for the crawl records.

    Returns:
        A list with one record per entry ('title', 'url', 'snippet', 'success',
        'markdown_content', 'error_message'). Exceptions raised while crawling
        a URL are stored as failed records.
    """
    def make_record(item, url, success, markdown, error):
        # Single place that fixes the record's key set and key order.
        return {
            "title": item["title"],
            "url": url,
            "snippet": item["snippet"],
            "success": success,
            "markdown_content": markdown,
            "error_message": error
        }

    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig(
        word_count_threshold=10,
        remove_overlay_elements=True,
        exclude_external_links=False
    )

    results = []

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for item in data:
            url = item["url"]
            print(f"Crawling: {url}")

            try:
                outcome = await crawler.arun(url=url, config=run_config)

                if outcome.success:
                    results.append(make_record(item, url, outcome.success, outcome.markdown, None))
                    print(f"✓ Successfully crawled: {url}")
                else:
                    results.append(make_record(item, url, outcome.success, None, outcome.error_message))
                    print(f"✗ Failed to crawl: {url}")

            except Exception as e:
                print(f"✗ Error crawling {url}: {str(e)}")
                results.append(make_record(item, url, False, None, str(e)))

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nCrawling completed! Results saved to: {output_file}")
    return results



In [32]:
def run_crawler():
    """Crawl "indian_gov_sources.json", working both in scripts and in notebooks.

    asyncio.run() cannot be called while an event loop is already running,
    which is the case inside Jupyter and inside any `async def`. So:

    * no running loop (plain script): the crawl is run to completion with
      asyncio.run() and the list of crawl records is returned;
    * running loop: the un-awaited coroutine is returned and the caller must
      `await run_crawler()`.
    """
    crawl = crawl_urls_from_json("indian_gov_sources.json")
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop() raises RuntimeError when no loop is running.
        return asyncio.run(crawl)
    return crawl

In [33]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one page after another.

    Each page's extracted text is followed by a newline. If the file cannot be
    opened or anything goes wrong while reading it, the error is printed and an
    empty string is returned (text from pages read before the failure is not kept).
    """
    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            return "".join(page.extract_text() + "\n" for page in reader.pages)
    except Exception as err:
        print(f"Error extracting text from {pdf_path}: {err}")
        return ""


In [34]:

def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into chunks of at most `chunk_size` words that overlap by `overlap` words.

    Args:
        text: input string; it is split on whitespace.
        chunk_size: maximum number of words per chunk (must be > 0).
        overlap: number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of chunk strings with words joined by single spaces; an empty
        list for empty or whitespace-only text.

    Raises:
        ValueError: if chunk_size or overlap is out of range. Without this
            check a zero step fails with an unrelated range() message and a
            negative step silently produces no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))

        # This chunk already reached the end; stop so that no extra chunk made
        # only of overlap words is emitted.
        if start + chunk_size >= len(words):
            break

    return chunks


In [35]:

def process_pdf_folder(pdf_folder="files"):
    """Extract and chunk the text of every PDF found in `pdf_folder`.

    Files are selected by a case-insensitive ".pdf" suffix. Each file's text is
    split with chunk_text and every chunk becomes one record with the keys
    'filename', 'chunk_id', 'text' and 'total_chunks'. All records are written
    to "extracted_pdf_text.json" and also returned.
    """
    pdf_data = []

    pdf_names = [name for name in os.listdir(pdf_folder) if name.lower().endswith('.pdf')]
    for filename in pdf_names:
        print(f"Processing PDF: {filename}")

        chunks = chunk_text(extract_text_from_pdf(os.path.join(pdf_folder, filename)))
        total = len(chunks)
        pdf_data.extend(
            {
                "filename": filename,
                "chunk_id": index,
                "text": chunk,
                "total_chunks": total
            }
            for index, chunk in enumerate(chunks)
        )

    with open("extracted_pdf_text.json", "w", encoding="utf-8") as out:
        json.dump(pdf_data, out, indent=2, ensure_ascii=False)

    print(f"Processed {len(pdf_data)} chunks from PDFs")
    return pdf_data

In [36]:
def setup_vector_db():
    """Create the embedding model and a fresh, empty "documents" collection.

    Returns:
        (model, collection): the 'all-MiniLM-L6-v2' SentenceTransformer and a
        newly created ChromaDB collection named "documents".

    If a "documents" collection is already registered with the client (for
    example because this cell or the workflow was run before in the same
    kernel), it is deleted first. create_collection would otherwise fail on the
    duplicate name, and reusing the old collection would keep stale entries
    under the same "doc_<n>" ids.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    chroma_client = chromadb.Client()

    # list_collections() yields Collection objects or plain names depending on
    # the chromadb release, so accept both.
    existing_names = {
        entry if isinstance(entry, str) else entry.name
        for entry in chroma_client.list_collections()
    }
    if "documents" in existing_names:
        chroma_client.delete_collection(name="documents")

    collection = chroma_client.create_collection(name="documents")
    return model, collection


In [37]:

def add_to_vector_db(documents, metadatas, collection, model):
    """Embed `documents` with `model` and store them in `collection`.

    Entries get the ids "doc_0", "doc_1", ... in input order. When `documents`
    is empty a message is printed and nothing is added. Returns None.
    """
    if not documents:
        print("No documents to add to vector DB")
        return

    doc_count = len(documents)
    vectors = model.encode(documents).tolist()

    collection.add(
        embeddings=vectors,
        documents=documents,
        metadatas=metadatas,
        ids=[f"doc_{n}" for n in range(doc_count)]
    )

    print(f"Added {doc_count} documents to vector database")


In [38]:

def process_all_data_for_vector_db():
    """Load crawled pages and PDF chunks from disk into a new vector collection.

    Reads "crawled_results.json" (only successful entries with markdown
    content; each page is added as one document) and "extracted_pdf_text.json"
    (one document per chunk). Either file may be absent. Returns the collection
    created by setup_vector_db.
    """
    model, collection = setup_vector_db()

    all_documents, all_metadatas = [], []

    crawl_file = "crawled_results.json"
    if os.path.exists(crawl_file):
        with open(crawl_file, 'r', encoding='utf-8') as f:
            pages = json.load(f)

        for page in pages:
            if not (page.get('success') and page.get('markdown_content')):
                continue
            all_documents.append(page['markdown_content'])
            all_metadatas.append({
                "source": "web_crawl",
                "title": page.get('title', ''),
                "url": page.get('url', '')
            })

    pdf_file = "extracted_pdf_text.json"
    if os.path.exists(pdf_file):
        with open(pdf_file, 'r', encoding='utf-8') as f:
            pdf_chunks = json.load(f)

        for piece in pdf_chunks:
            all_documents.append(piece['text'])
            all_metadatas.append({
                "source": "pdf",
                "filename": piece['filename'],
                "chunk_id": piece['chunk_id']
            })

    add_to_vector_db(all_documents, all_metadatas, collection, model)
    print(f"Total documents in vector DB: {len(all_documents)}")

    return collection

In [45]:
async def main_workflow(user_query, sources_file="indian_gov_sources.json"):
    """Run download -> crawl -> PDF extraction -> vector DB on an existing sources file.

    Args:
        user_query: currently unused. Source discovery (get_indian_gov_sources)
            is not run here; the workflow starts from `sources_file`.
        sources_file: JSON list of sources to download and crawl.

    Returns:
        The vector DB collection built by process_all_data_for_vector_db.

    The crawl coroutine is awaited directly rather than through run_crawler():
    a run_crawler that wraps the crawl in asyncio.run() raises inside an
    already running event loop (as in Jupyter) and returns a list, which
    cannot be awaited.
    """
    print("Downloading files")
    download_files_from_sources(sources_file)

    print("Crawling web pages")
    await crawl_urls_from_json(sources_file)

    print("Processing PDFs")
    process_pdf_folder()

    print("Creating vector database")
    collection = process_all_data_for_vector_db()

    print("\nWorkflow completed successfully!")
    return collection

# Notebook entry point for the download/crawl/index workflow.
# NOTE(review): main_workflow does not use the query it is given; it works from
# the sources already saved in "indian_gov_sources.json".
if __name__ == "__main__":
    test_query = "Digital India initiative"
    # Top-level `await` only works in IPython/Jupyter; a plain Python script
    # would need asyncio.run(main_workflow(test_query)) instead.
    await main_workflow(test_query)

Downloading files
Downloaded: sustainable-underground-water-drainage-system-for-mumbai-city-IJERTV8IS060285.pdf
Downloaded: Terratec-Project-Reference-Mumbai-Sewer-Projects-PST-I-II.pdf
Failed to download https://www.pas.org.in/Portal/document/ResourcesFiles/WorkshopPDFs/Citywide+Sanitation+Workshop/7_CSP+Mumbai+(A+Kantak).pdf: 404 Client Error: /Portal/document/ResourcesFiles/WorkshopPDFs/Citywide+Sanitation+Workshop/7_CSP+Mumbai+(A+Kantak).pdf for url: https://www.pas.org.in/Portal/document/ResourcesFiles/WorkshopPDFs/Citywide+Sanitation+Workshop/7_CSP+Mumbai+(A+Kantak).pdf
Failed to download https://praja.org/praja_docs/praja_downloads/Status+of+Civic+Issues+in+Mumbai+2024.pdf: 404 Client Error: Not Found for url: https://praja.org/praja_docs/praja_downloads/Status+of+Civic+Issues+in+Mumbai+2024.pdf
Downloaded: City-Waste-Management-Profile-Mumbai-India.pdf
Downloaded: JETIR2104153.pdf
Downloaded: Mumbai-RfP-_Waste-Composition-2.pdf
Failed to download https://www.mcgm.gov.in/irj/go/

Crawling: https://en.wikipedia.org/wiki/Brihanmumbai_Storm_Water_Disposal_System


✓ Successfully crawled: https://en.wikipedia.org/wiki/Brihanmumbai_Storm_Water_Disposal_System
Crawling: https://citizenmatters.in/explainer-sewage-in-mumbai-treatment-disposal/


✓ Successfully crawled: https://citizenmatters.in/explainer-sewage-in-mumbai-treatment-disposal/
Crawling: https://www.ijert.org/research/sustainable-underground-water-drainage-system-for-mumbai-city-IJERTV8IS060285.pdf


✓ Successfully crawled: https://www.ijert.org/research/sustainable-underground-water-drainage-system-for-mumbai-city-IJERTV8IS060285.pdf
Crawling: https://terratec.co/wp-content/uploads/2024/05/Terratec-Project-Reference-Mumbai-Sewer-Projects-PST-I-II.pdf


✓ Successfully crawled: https://terratec.co/wp-content/uploads/2024/05/Terratec-Project-Reference-Mumbai-Sewer-Projects-PST-I-II.pdf
Crawling: https://www.pas.org.in/Portal/document/ResourcesFiles/WorkshopPDFs/Citywide+Sanitation+Workshop/7_CSP+Mumbai+(A+Kantak).pdf


✓ Successfully crawled: https://www.pas.org.in/Portal/document/ResourcesFiles/WorkshopPDFs/Citywide+Sanitation+Workshop/7_CSP+Mumbai+(A+Kantak).pdf
Crawling: https://libreriacad.com/en/dwg/sewage-system-design/


✓ Successfully crawled: https://libreriacad.com/en/dwg/sewage-system-design/
Crawling: https://timesofindia.indiatimes.com/25-of-mumbai-sewage-flows-untreated-into-sea/articleshow/94594233.cms


✓ Successfully crawled: https://timesofindia.indiatimes.com/25-of-mumbai-sewage-flows-untreated-into-sea/articleshow/94594233.cms
Crawling: https://www.india.com/travel/mumbai/


✓ Successfully crawled: https://www.india.com/travel/mumbai/
Crawling: https://www.india.com/business/us-bound-air-india-flight-returns-to-mumbai-pilot-suspects-malfunction-after-3-hours-ai-191-newark-usa-flight-radar-24-8143935/


✓ Successfully crawled: https://www.india.com/business/us-bound-air-india-flight-returns-to-mumbai-pilot-suspects-malfunction-after-3-hours-ai-191-newark-usa-flight-radar-24-8143935/
Crawling: https://journals.sagepub.com/doi/10.1177/25148486251315289


✓ Successfully crawled: https://journals.sagepub.com/doi/10.1177/25148486251315289
Crawling: https://praja.org/praja_docs/praja_downloads/Status+of+Civic+Issues+in+Mumbai+2024.pdf


✓ Successfully crawled: https://praja.org/praja_docs/praja_downloads/Status+of+Civic+Issues+in+Mumbai+2024.pdf
Crawling: https://indianexpress.com/article/cities/mumbai/mumbai-sewage-disposal-mithi-river-bmc-ngt-neeri-5599336/


✓ Successfully crawled: https://indianexpress.com/article/cities/mumbai/mumbai-sewage-disposal-mithi-river-bmc-ngt-neeri-5599336/
Crawling: https://earth5r.org/water-security-and-urban-rivers-lessons-from-mumbai-to-new-york/


✓ Successfully crawled: https://earth5r.org/water-security-and-urban-rivers-lessons-from-mumbai-to-new-york/
Crawling: https://www.researchgate.net/publication/319098696_Evaluation_of_the_costs_and_benefits_of_mumbai_sewage_disposal_project_India


✓ Successfully crawled: https://www.researchgate.net/publication/319098696_Evaluation_of_the_costs_and_benefits_of_mumbai_sewage_disposal_project_India
Crawling: https://indianexpress.com/article/cities/mumbai/cabinet-clears-mumbai-slum-cluster-redevelopment-plan-sewage-treatment-reuse-policy-10293782/


✓ Successfully crawled: https://indianexpress.com/article/cities/mumbai/cabinet-clears-mumbai-slum-cluster-redevelopment-plan-sewage-treatment-reuse-policy-10293782/
Crawling: https://mumbaisuburban.gov.in/en/schemes/


✓ Successfully crawled: https://mumbaisuburban.gov.in/en/schemes/
Crawling: https://www.usa.gov/


✓ Successfully crawled: https://www.usa.gov/
Crawling: https://www.usa.gov/about-the-us


✓ Successfully crawled: https://www.usa.gov/about-the-us
Crawling: https://www.thecirculateinitiative.org/wp-content/uploads/City-Waste-Management-Profile-Mumbai-India.pdf


✓ Successfully crawled: https://www.thecirculateinitiative.org/wp-content/uploads/City-Waste-Management-Profile-Mumbai-India.pdf
Crawling: https://www.jetir.org/papers/JETIR2104153.pdf


✓ Successfully crawled: https://www.jetir.org/papers/JETIR2104153.pdf
Crawling: https://indianexpress.com/article/cities/mumbai/cag-report-financial-irregularities-found-in-bmcs-multi-crore-sewage-contracts-4792750/


✓ Successfully crawled: https://indianexpress.com/article/cities/mumbai/cag-report-financial-irregularities-found-in-bmcs-multi-crore-sewage-contracts-4792750/
Crawling: https://diversedaily.com/understanding-mumbais-sewage-treatment-infrastructure-addressing-challenges-and-promoting-sanitation/


✓ Successfully crawled: https://diversedaily.com/understanding-mumbais-sewage-treatment-infrastructure-addressing-challenges-and-promoting-sanitation/
Crawling: https://www.indiawaterportal.org/questions/solution-exchange-discussion-best-practices-effective-sewage-disposal-experiences


✓ Successfully crawled: https://www.indiawaterportal.org/questions/solution-exchange-discussion-best-practices-effective-sewage-disposal-experiences
Crawling: https://www.linkedin.com/pulse/sewage-treatment-plants-mumbai-perspective-from-vikrant-bhalerao


✓ Successfully crawled: https://www.linkedin.com/pulse/sewage-treatment-plants-mumbai-perspective-from-vikrant-bhalerao
Crawling: https://earth5r.org/indias-sustainability-push-30-government-schemes-driving-green-growth/


✓ Successfully crawled: https://earth5r.org/indias-sustainability-push-30-government-schemes-driving-green-growth/
Crawling: https://www.data.gov.in/sector/Water+and+Sanitation


✓ Successfully crawled: https://www.data.gov.in/sector/Water+and+Sanitation
Crawling: https://www.c40.org/wp-content/uploads/2024/02/Mumbai-RfP-_Waste-Composition-2.pdf


✓ Successfully crawled: https://www.c40.org/wp-content/uploads/2024/02/Mumbai-RfP-_Waste-Composition-2.pdf
Crawling: https://citizenmatters.in/mumbai-bmc-sewage-treatment-water/


✓ Successfully crawled: https://citizenmatters.in/mumbai-bmc-sewage-treatment-water/
Crawling: https://www.mcgm.gov.in/irj/go/km/docs/documents/MCGM+Department+List/City+Engineer/Deputy+City+Engineer+(Planning+and+Design)/City+Development+Plan/Urban+Basic+Services.pdf


✓ Successfully crawled: https://www.mcgm.gov.in/irj/go/km/docs/documents/MCGM+Department+List/City+Engineer/Deputy+City+Engineer+(Planning+and+Design)/City+Development+Plan/Urban+Basic+Services.pdf
Crawling: https://scroll.in/article/886210/mumbai-floods-the-development-crazy-city-must-stop-blocking-natural-drainage-channels-with-concrete


✓ Successfully crawled: https://scroll.in/article/886210/mumbai-floods-the-development-crazy-city-must-stop-blocking-natural-drainage-channels-with-concrete
Crawling: https://www.mcgm.gov.in/irj/go/km/docs/documents/Tenders/ETH/VolumeII_7000009684.pdf


✓ Successfully crawled: https://www.mcgm.gov.in/irj/go/km/docs/documents/Tenders/ETH/VolumeII_7000009684.pdf
Crawling: https://www.scribd.com/document/888428917/Mumbai-Civic-Diary-2025


✓ Successfully crawled: https://www.scribd.com/document/888428917/Mumbai-Civic-Diary-2025
Crawling: https://indiankanoon.org/doc/60254496/


✓ Successfully crawled: https://indiankanoon.org/doc/60254496/
Crawling: https://qz.com/india/1071407/all-the-reasons-why-mumbai-will-be-under-water-again-next-year-and-the-year-after-that


✓ Successfully crawled: https://qz.com/india/1071407/all-the-reasons-why-mumbai-will-be-under-water-again-next-year-and-the-year-after-that
Crawling: https://en.wikipedia.org/wiki/Mumbai


✓ Successfully crawled: https://en.wikipedia.org/wiki/Mumbai
Crawling: https://data.opencity.in/dataset?organization=brihanmumbai-municipal-corporation-bmc-mcgm


✓ Successfully crawled: https://data.opencity.in/dataset?organization=brihanmumbai-municipal-corporation-bmc-mcgm
Crawling: https://www.deswater.com/DWT_articles/vol_116_papers/116_2018_329.pdf


✓ Successfully crawled: https://www.deswater.com/DWT_articles/vol_116_papers/116_2018_329.pdf

Crawling completed! Results saved to: crawled_results.json
Processing PDFs
Processing PDF: VolumeII_7000009684.pdf


  ret = name.decode(enc)
  ret = name.decode(enc)


Processing PDF: City-Waste-Management-Profile-Mumbai-India.pdf
Processing PDF: sustainable-underground-water-drainage-system-for-mumbai-city-IJERTV8IS060285.pdf
Processing PDF: Mumbai-RfP-_Waste-Composition-2.pdf
Processing PDF: JETIR2104153.pdf
Processing PDF: Terratec-Project-Reference-Mumbai-Sewer-Projects-PST-I-II.pdf
Processing PDF: 116_2018_329.pdf
Processed 291 chunks from PDFs
Creating vector database


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Added 328 documents to vector database
Total documents in vector DB: 328

Workflow completed successfully!
