In [1]:
%pip install requests beautifulsoup4 langchain sentence-transformers chromadb streamlit langchain_community selenium webdriver-manager openai lxml


Note: you may need to restart the kernel to use updated packages.


In [1]:

# Core
import os
import requests
from bs4 import BeautifulSoup

# LangChain & Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# For local saving
import pickle


In [2]:
# Create the working folders up front; exist_ok makes re-running this cell harmless.
for folder in ("data", "embeddings"):
    os.makedirs(folder, exist_ok=True)


In [9]:


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup

def get_rendered_html(url, wait_time=3):
    """Load ``url`` in headless Chrome and return the page source after rendering.

    Args:
        url: Address of the page to open.
        wait_time: Seconds to sleep after navigation so client-side
            JavaScript has a chance to populate the DOM.

    Returns:
        The browser's ``page_source`` string for the loaded page.

    Raises:
        Whatever Selenium / webdriver-manager raise when the driver cannot be
        started or the navigation fails. The browser is shut down either way.
    """
    options = Options()
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        options.add_argument(flag)

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        time.sleep(wait_time)  # Give time for JS to load
        return driver.page_source
    finally:
        # Always release the browser, even when navigation fails; otherwise
        # every failed URL would leave a Chrome process behind.
        driver.quit()


In [11]:
def extract_text_from_url(url):
    """Return the text of every <p>, <li> and <td> element on a rendered page.

    The page is rendered with ``get_rendered_html`` and parsed with the lxml
    parser. Each element contributes one line (its whitespace-stripped text);
    elements with no text are dropped.

    Args:
        url: Address of the page to scrape.

    Returns:
        The collected snippets joined with newlines, in the order the
        elements appear on the page. An empty string is returned (and the
        error is printed) if rendering or parsing fails, so one bad page does
        not abort a multi-page scrape.
    """
    try:
        html = get_rendered_html(url)
        soup = BeautifulSoup(html, "lxml")
        # A single find_all over all three tag names yields elements in
        # document order, so related content (e.g. a question and its answer)
        # stays adjacent instead of being regrouped by tag type.
        snippets = (el.get_text(strip=True) for el in soup.find_all(["p", "li", "td"]))
        return "\n".join(text for text in snippets if text)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

# ------------------------------
# Pages to scrape: label -> address
# ------------------------------
urls = {
    "weather reports": "https://mosdac.gov.in/weather-reports",
    "satellites catalog": "https://mosdac.gov.in/catalog/satellite.php",
    "faq": "https://mosdac.gov.in/faq-page#n1277",
    "about": "https://mosdac.gov.in/about-us"
}

# ------------------------------
# Scrape each page and build one banner-delimited section per page
# ------------------------------
sections = []
for name, url in urls.items():
    print(f"Scraping: {name} ({url})")
    page_text = extract_text_from_url(url)
    banner = f"\n\n====== {name.upper()} PAGE ======\n\n"
    sections.append(banner + page_text)
combined_data = "".join(sections)

# ------------------------------
# Write the combined text to disk
# ------------------------------
with open("mosdac_all_pages.txt", "w", encoding="utf-8") as out_file:
    out_file.write(combined_data)

print("\n✅ Done! Fully rendered content saved to 'mosdac_all_pages.txt'")

Scraping: weather reports (https://mosdac.gov.in/weather-reports)
Scraping: satellites catalog (https://mosdac.gov.in/catalog/satellite.php)
Scraping: faq (https://mosdac.gov.in/faq-page#n1277)
Scraping: about (https://mosdac.gov.in/about-us)

✅ Done! Fully rendered content saved to 'mosdac_all_pages.txt'


In [None]:
import re

# Load scraped file
with open("mosdac_all_pages.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Boilerplate to strip from the scraped text.
garbage_patterns = [
    # Site chrome phrases: removed wherever they occur.
    r"(?i)skip to main content",
    r"(?i)sitemap", r"(?i)privacy policy", r"(?i)terms of use",
    r"(?i)home\s*\|.*",  # Navigation bars
    # Form labels and button captions: removed only when a line consists of
    # nothing but these labels. Deleting them inside sentences would corrupt
    # real content (e.g. "No data is available" -> " data is available").
    r"(?im)^[ \t]*(?:(?:login|password|user id)[ \t:]*)+$",
    r"(?m)^[ \t]*(?:(?:OK|Yes|No)[ \t]*)+$",
]

# Remove garbage text
for pattern in garbage_patterns:
    raw_text = re.sub(pattern, '', raw_text)

with open("mosdac_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(raw_text)

print("✅ Cleaned data saved to 'mosdac_cleaned.txt'")


✅ Cleaned data saved to 'mosdac_cleaned.txt'


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# 1. Load Cleaned Text
with open("mosdac_cleaned.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# 2. Split into Chunks (500 characters each, 50 characters shared between neighbours)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
chunks = text_splitter.split_text(raw_text)
print(f"✅ Split into {len(chunks)} chunks")

# 3. Initialize Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 4. Store in Chroma
# Give every chunk a stable id so that re-running this cell rewrites the same
# records instead of appending a second copy of each chunk to the persisted
# collection. NOTE(review): relies on the LangChain Chroma wrapper upserting by
# id - confirm for the installed version. If a later run produces fewer chunks,
# delete the "mosdac_chroma" folder first to drop the stale tail.
chunk_ids = [f"mosdac-chunk-{i}" for i in range(len(chunks))]
vectorstore = Chroma.from_texts(
    chunks,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory="mosdac_chroma",
)
vectorstore.persist()
print("✅ Embeddings stored in 'mosdac_chroma'")


✅ Split into 29 chunks


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


✅ Embeddings stored in 'mosdac_chroma'


  vectorstore.persist()


In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub  # or use dummy if no LLM
from langchain.document_loaders import TextLoader
from langchain.chains import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with a clear message when the key is missing, instead of passing
# None to the client and hitting an obscure error on the first request.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Export it or add it to a .env file before running this cell."
    )

# 1. Load Chroma VectorStore (same embedding model that was used to build it)
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory="mosdac_chroma", embedding_function=embedding_model)

# 2. Prompt: answer strictly from the retrieved context
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful and precise assistant answering questions based only on the provided context.
Be concise and clear in your response. If the answer is not available in the context, reply with:
"I could not find that information in the current data."

Context:
{context}

Question:
{question}

Answer:
"""
)

# 3. Load OpenRouter-compatible LLM (DeepSeek)
llm = ChatOpenAI(
    model="deepseek/deepseek-chat:free",  # Model id as named on OpenRouter
    temperature=0,
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=openrouter_api_key
)

# 4. Create the document chain (StuffDocumentsChain expects LLMChain)
llm_chain = LLMChain(llm=llm, prompt=prompt)
stuff_chain = StuffDocumentsChain(llm_chain=llm_chain, document_variable_name="context")

# 5. Combine with RetrievalQA; the retriever passes the 3 closest chunks to the chain
qa_chain = RetrievalQA(
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    combine_documents_chain=stuff_chain,
    return_source_documents=False
)


# 6. Ask a question
query = "What is MOSDAC and what kind of data does it provide?"
response = qa_chain.run(query)

print("Q:", query)
print("A:", response)


Q: What is MOSDAC and what kind of data does it provide?
A: MOSDAC (Meteorological and Oceanographic Satellite Data Archival Centre) is a website owned and maintained by the Space Applications Centre, Indian Space Research Organisation (ISRO), Government of India. It provides various types of data, including:

- **Atmosphere**: Bayesian-based MT-SAPHIR rainfall, GPS-derived integrated water vapour, GSMap ISRO Rain, METEOSAT8 Cloud Properties.
- **Land**: 3D Volumetric TERLS DWR product, Inland Water Height, River Discharge, Soil Moisture.
- **Ocean**: Global Ocean Surface Current, High-Resolution Sea Surface Salinity, Indian Mainland Coastal Product, Ocean Subsurface, Oceanic Eddies Detection, Sea Ice Occurrence Probability, Wave-based Renewable Energy.
