In [1]:
import os
import re
import time
import json
import requests
from pathlib import Path
from datetime import datetime, timedelta
from urllib.parse import urlencode, quote
from dateutil import parser as dateparser
from bs4 import BeautifulSoup
import pdfplumber
from typing import List
from IPython.display import clear_output

In [2]:
states = {
  "Andaman and Nicobar": "Andaman and Nicobar",
  "Andhra Pradesh": "Andhra Pradesh",
  "Arunachal Pradesh": "Arunachal Pradesh",
  "Assam": "Assam",
  "Bihar": "Bihar",
  "Chandigarh": "Chandigarh",
  "Chhattisgarh": "Chhattisgarh",
  "Dadra And Nagar Haveli": "Dadra And Nagar Haveli",
  "Daman And Diu": "Daman And Diu",
  "Goa": "Goa",
  "Gujarat": "Gujarat",
  "Haryana": "Haryana",
  "Himachal Pradesh": "Himachal Pradesh",
  "Jammu and Kashmir": "Jammu and Kashmir",
  "Jharkhand": "Jharkhand",
  "Karnataka": "Karnataka",
  "Kerala": "Kerala",
  "Lakshadweep": "Lakshadweep",
  "Madhya Pradesh": "Madhya Pradesh",
  "Maharashtra": "Maharashtra",
  "Manipur": "Manipur",
  "Meghalaya": "Meghalaya",
  "Mizoram": "Mizoram",
  "Nagaland": "Nagaland",
  "New Delhi": "New Delhi",
  "Odisha": "Odisha",
  "Puducherry": "Puducherry",
  "Punjab": "Punjab",
  "Rajasthan": "Rajasthan",
  "Sikkim": "Sikkim",
  "Tamil Nadu": "Tamil Nadu",
  "Telangana": "Telangana",
  "Tripura": "Tripura",
  "Uttar Pradesh": "Uttar Pradesh",
  "Uttarakhand": "Uttarakhand",
  "West Bengal": "West Bengal"
}

In [3]:
def sanitize_filename(s: str) -> str:
    """Make a safe filename fragment with underscores for spaces."""
    s = re.sub(r"\s+", "_", s.strip())
    return re.sub(r"[^A-Za-z0-9._-]", "", s)

In [4]:
def looks_like_pdf(resp: requests.Response) -> bool:
    ct = (resp.headers.get("Content-Type") or "").lower()
    if "application/pdf" in ct:
        return True
    # Sometimes servers mislabel; quick heuristic:
    return resp.content[:5] == b"%PDF-"

In [5]:
def get_districts(state: str) -> List:
    BASE = "https://mausam.imd.gov.in/responsive/agrometinformation/district_past_en_get.php"
    params = {
        "s": state, 
        "d": "undefined",
        "step1": "true"
    }
    qs = urlencode(params, quote_via=quote)
    url = f"{BASE}?{qs}"
    # print(url)
    try:
        resp = requests.get(url, timeout=10)  # 10s timeout for safety
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            options = soup.find_all("option")
            districts = {}
            
            for opt in options:
                key = (opt.get("value") or "").strip()
                val = (opt.text or "").strip()
                # skip placeholder
                if not key or "select" in val.lower():
                    continue
                districts[key] = val
            return districts
        else:
            print(f"Request failed with status code {resp.status_code}")
    except Exception as e:
        print(f"Error: {e}")    

In [6]:
def get_bulletin(state: str, district: str, date_str: str, output_path: str, language: str = "English") -> str:
    BASE_URL = "https://imdagrimet.gov.in/Services/DistrictBulletin.php"
    params = {
        "state": state,
        "district": district,
        "date": date_str,
        "language": language,
    }
    qs = urlencode(params, quote_via=quote)
    url = f"{BASE_URL}?{qs}"

    # Parse/normalize date to get year/month folders
    dt = dateparser.parse(date_str).date()
    year_str = dt.strftime("%Y") 
    month_str = dt.strftime("%b") 

    # Make output folder: output_path/YYYY/MM
    out_dir = Path(output_path) / year_str / month_str / sanitize_filename(state)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Filename: state_district_date.pdf
    filename = f"{sanitize_filename(state)}_{sanitize_filename(district)}_{dt.isoformat()}.pdf"
    out_file = out_dir / filename.lower()

    headers = {"User-Agent": "Mozilla/5.0 (compatible; PDFFetcher/1.0)"}
    resp = requests.get(url, headers=headers, timeout=30)
    if resp.status_code == 200 and resp.content[:5] == b"%PDF-":
        with open(out_file, "wb") as f:
            f.write(resp.content)
        return out_file
    else:
        raise RuntimeError(f"Failed to download PDF (status {resp.status_code}) from {url}")

In [7]:
def run_bulletin_job(states: dict, date_start: str, date_end: str, output_root:str, pause=0.6):
    """
    Iterate states, fetch districts from IMD, iterate dates (inclusive),
    and call your existing download_bulletin(...).

    Args:
        states (list[str]): State names to process.
        date_start (str): Start date 'YYYY-MM-DD' (inclusive).
        date_end (str): End date 'YYYY-MM-DD' (inclusive).
        output_root (str|path): Base output folder used by download_bulletin.
        pause (float): Seconds to sleep between requests.
    """

    def _daterange(s, e):
        d0, d1 = dateparser.parse(s).date(), dateparser.parse(e).date()
        if d1 < d0:
            raise ValueError("date_end is earlier than date_start")
        d = d0
        one = timedelta(days=1)
        while d <= d1:
            yield d.isoformat()
            d += one

    for state in states:
        print(f" Getting district for {state}")
        districts = get_districts(state)
        
        for district in districts:
            for date_str in _daterange(date_start, date_end):
                try:
                    saved = get_bulletin(state, district, date_str, output_root)
                    print(f"[OK] {state} / {district} / {date_str} -> {saved}")
                except Exception as e:
                    print(f"[FAIL] {state} / {district} / {date_str}: {e}")
                time.sleep(pause)
        clear_output(wait=True)
    clear_output(wait=True)

In [8]:
start_date = "2025-06-01"
end_date = "2025-06-31"
out_root = "../data/mausam"


In [9]:
run_bulletin_job(states, start_date, end_date, out_root)

 Getting district for Andaman and Nicobar


ParserError: day is out of range for month: 2025-06-31

# Experimental

In [None]:
state = "Uttar Pradesh"
district = "Agra"
date_str = "2025-08-01"
out_root = "../data/mausam"

saved = get_bulletin(state, district, date_str, out_root)
print(f"Saved: {saved}")


In [None]:
get_districts("Uttar Pradesh")

In [None]:
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Accept-Language': 'en-US,en;q=0.5',
    'Connection': 'keep-alive',
    'Cookie': 'IMDSESSIONID=jgvt06svgthl88cj9esgh35jrbj2u2hi',
    'Host': 'imdagrimet.gov.in',
    'Priority': 'u=4',
    'Referer': 'https://imdagrimet.gov.in/weatherdata/DistStationList.php?token=72e7a12d0f15362c57a016859adac2f2',
    'Sec-Fetch-Dest': 'frame',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:142.0) Gecko/20100101 Firefox/142.0'
}

In [None]:
url = "https://imdagrimet.gov.in/weatherdata/PastWeekWeather.php?st=AGROMET&si=42745&sn=Arnej(A)"
response = requests.get(url, headers=headers)

In [None]:
from IPython.display import display, HTML
display(HTML(response.text))