In [None]:
%pip install requests beautifulsoup4 langchain sentence-transformers chromadb streamlit langchain_community selenium webdriver-manager openai lxml pymupdf unstructured pdfminer.six streamlit_audio_recorder SpeechRecognition pydub openai-whisper




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting unstructured
  Downloading unstructured-0.18.3-py3-none-any.whl.metadata (24 kB)
Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting chardet (from unstructured)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting nltk (from unstructured)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9

In [None]:

# Core
import os
import requests
from bs4 import BeautifulSoup

# LangChain & Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory


# For local saving
import pickle


In [2]:
# Set up directories
os.makedirs("data", exist_ok=True)
os.makedirs("embeddings", exist_ok=True)


In [9]:


from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
from bs4 import BeautifulSoup

def get_rendered_html(url, wait_time=3):
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    driver.get(url)
    time.sleep(wait_time)  # Give time for JS to load
    
    html = driver.page_source
    driver.quit()
    
    return html


In [11]:
def extract_text_from_url(url):
    try:
        html = get_rendered_html(url)
        soup = BeautifulSoup(html, "lxml")
        texts = []
        for tag in ["p", "li", "td"]:
            elements = soup.find_all(tag)
            texts.extend([el.get_text(strip=True) for el in elements if el.get_text(strip=True)])
        return "\n".join(texts)
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

# ------------------------------
# URL List
# ------------------------------
urls = {
    "weather reports": "https://mosdac.gov.in/weather-reports",
    "satellites catalog": "https://mosdac.gov.in/catalog/satellite.php",
    "faq": "https://mosdac.gov.in/faq-page#n1277",
    "about": "https://mosdac.gov.in/about-us"
}

# ------------------------------
# Scraping All Pages
# ------------------------------
combined_data = ""
for name, url in urls.items():
    print(f"Scraping: {name} ({url})")
    page_text = extract_text_from_url(url)
    combined_data += f"\n\n====== {name.upper()} PAGE ======\n\n" + page_text

# ------------------------------
# Save Output
# ------------------------------
with open("mosdac_all_pages.txt", "w", encoding="utf-8") as f:
    f.write(combined_data)

print("\n✅ Done! Fully rendered content saved to 'mosdac_all_pages.txt'")

Scraping: weather reports (https://mosdac.gov.in/weather-reports)
Scraping: satellites catalog (https://mosdac.gov.in/catalog/satellite.php)
Scraping: faq (https://mosdac.gov.in/faq-page#n1277)
Scraping: about (https://mosdac.gov.in/about-us)

✅ Done! Fully rendered content saved to 'mosdac_all_pages.txt'


In [12]:
import re

# Load scraped file
with open("mosdac_all_pages.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Define garbage phrases to remove
garbage_patterns = [
    r"(?i)skip to main content",
    r"(?i)login", r"(?i)password", r"(?i)user id",
    r"(?i)sitemap", r"(?i)privacy policy", r"(?i)terms of use",
    r"(?i)home\s*\|.*",  # Navigation bars
    r"\bOK\b", r"\bYes\b", r"\bNo\b"
]

# Remove garbage text
for pattern in garbage_patterns:
    raw_text = re.sub(pattern, '', raw_text)

with open("mosdac_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(raw_text)

print("✅ Cleaned data saved to 'mosdac_cleaned.txt'")


✅ Cleaned data saved to 'mosdac_cleaned.txt'


In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# 1. Load Cleaned Text
with open("mosdac_cleaned.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# 2. Split into Chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
chunks = text_splitter.split_text(raw_text)
print(f"✅ Split into {len(chunks)} chunks")

# 3. Initialize Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 4. Store in Chroma
vectorstore = Chroma.from_texts(chunks, embedding=embedding_model, persist_directory="mosdac_chroma")
vectorstore.persist()
print("✅ Embeddings stored in 'mosdac_chroma'")


✅ Split into 29 chunks


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


✅ Embeddings stored in 'mosdac_chroma'


  vectorstore.persist()


In [13]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import os

load_dotenv()

# 1. Load Chroma VectorStore
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory="mosdac_chroma", embedding_function=embedding_model)

# 2. Define custom prompt (Optional – LangChain uses a good default)
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful and precise assistant answering questions based only on the provided context and prior conversation.
Be concise and clear in your response. If the answer is not available in the context, reply:
"I could not find that information in the current data."

Context:
{context}

Question:
{question}

Answer:
"""
)

# 3. Load OpenRouter-compatible LLM (DeepSeek)
llm = ChatOpenAI(
    model="deepseek/deepseek-chat:free",
    temperature=0,
    openai_api_base="https://openrouter.ai/api/v1",
    openai_api_key=os.getenv("OPENROUTER_API_KEY")
)

# 4. Add memory for contextual chat
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# 5. Create ConversationalRetrievalChain with memory
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    memory=memory,
    combine_docs_chain_kwargs={"prompt": prompt}  # optional; overrides default prompt
)

# 6. Ask a question
query = "What is MOSDAC and what kind of data does it provide?"
response = qa_chain.run({"question": query})

print("Q:", query)
print("A:", response)

# Optional: Follow-up to test context
followup = "Does it have data for rainfall and temperature?"
followup_response = qa_chain.run({"question": followup})

print("Q:", followup)
print("A:", followup_response)


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm
  vectorstore = Chroma(persist_directory="mosdac_chroma", embedding_function=embedding_model)
  llm = ChatOpenAI(
  memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
  response = qa_chain.run({"question": query})


Q: What is MOSDAC and what kind of data does it provide?
A: MOSDAC (Meteorological & Oceanographic Satellite Data Archival Centre) is a website owned and maintained by the Space Applications Centre, Indian Space Research Organisation (ISRO), under the Government of India. It provides a variety of data related to atmosphere, land, ocean, and inland water, including:

- **Atmosphere**: Bayesian-based MT-SAPHIR rainfall, GPS-derived integrated water vapour, GSMap ISRO Rain, METEOSAT8 Cloud Properties.  
- **Land**: 3D Volumetric TERLS DWR product, inland water height, river discharge, soil moisture.  
- **Ocean**: Global ocean surface current, high-resolution sea surface salinity, Indian mainland coastal product, ocean subsurface, oceanic eddies detection, sea ice occurrence probability, wave-based renewable energy.
Q: Does it have data for rainfall and temperature?
A: I could not find that information in the current data. The context mentions rainfall-related data (e.g., Bayesian based M