In [38]:
import requests
import json
import pandas as pd
import re
from bs4 import BeautifulSoup
from typing import List, Dict, Any

In [15]:
states = [
      {"id": "1", "name": "A And N Islands"},
      {"id": "2", "name": "Andhra Pradesh"},
      {"id": "3", "name": "Arunachal Pradesh"},
      {"id": "4", "name": "Assam"},
      {"id": "5", "name": "Bihar"},
      {"id": "6", "name": "Chandigarh"},
      {"id": "7", "name": "Chhattisgarh"},
      {"id": "8", "name": "Dadra And Nagar Haveli"},
      {"id": "9", "name": "Daman And Diu"},
      {"id": "10", "name": "Delhi"},
      {"id": "11", "name": "Goa"},
      {"id": "12", "name": "Gujarat"},
      {"id": "13", "name": "Haryana"},
      {"id": "14", "name": "Himachal Pradesh"},
      {"id": "15", "name": "Jammu And Kashmir"},
      {"id": "16", "name": "Jharkand"},
      {"id": "17", "name": "Karnataka"},
      {"id": "18", "name": "Kerala"},
      {"id": "19", "name": "Ladakh"},
      {"id": "20", "name": "Lakshadweep"},
      {"id": "21", "name": "Madhya Pradesh"},
      {"id": "22", "name": "Maharashtra"},
      {"id": "23", "name": "Manipur"},
      {"id": "24", "name": "Meghalaya"},
      {"id": "25", "name": "Mizoram"},
      {"id": "26", "name": "Nagaland"},
      {"id": "27", "name": "Odisha"},
      {"id": "28", "name": "Puducherry"},
      {"id": "29", "name": "Punjab"},
      {"id": "30", "name": "Rajasthan"},
      {"id": "31", "name": "Sikkim"},
      {"id": "32", "name": "Tamil Nadu"},
      {"id": "33", "name": "Telangana"},
      {"id": "34", "name": "Tripura"},
      {"id": "35", "name": "Uttarakhand"},
      {"id": "36", "name": "Uttar Pradesh"},
      {"id": "37", "name": "West Bengal"}
]

In [16]:
headers = {
    "Host": "imdagrimet.gov.in",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:142.0) Gecko/20100101 Firefox/142.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Referer": "https://imdagrimet.gov.in/weatherdata/BlockList.php",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "frame",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
    "Priority": "u=4"
}

In [17]:
def map_district_keys(rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Rename keys in each dict: 'id' -> 'district_id', 'name' -> 'district_name'.
    Returns a NEW list (does not mutate the input).
    Non-dict items are ignored.
    """
    mapping = {"id": "district_id", "name": "district_name"}
    out = []
    for row in rows:
        if not isinstance(row, dict):
            continue
        out.append({mapping.get(k, k): v for k, v in row.items()})
    return out

In [18]:
def save_jsonl(data, file_path):
    """Save a list of dicts to JSONL."""
    with open(file_path, "w", encoding="utf-8") as f:
        for obj in data:
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")


In [19]:
def load_jsonl(file_path):
    """Load a JSONL file and return a list of locations (strings)."""
    locations = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # skip empty lines
                obj = json.loads(line)
                locations.append(obj)
    return locations

In [20]:
def extract_districts(html):
    soup = BeautifulSoup(html, "lxml")
    districts = [
        {"id": opt.get("value"), "name": opt.text.strip()}
        for opt in soup.select("select#seldist option")
    ]
    
    # Drop the placeholder "Select"
    districts = [d for d in districts if d["id"] != "0"]
    return districts

In [21]:
def extract_blocks(html):
    blocks = []
    soup = BeautifulSoup(html, "lxml")
    # In the table, the 2nd <a> per row holds the block name + forecast link (rf=E)
    for row in soup.select("table.viewtb tr")[1:]:  # skip header row
        links = row.find_all("a")
        if len(links) >= 2:
            a = links[1]
            href = urljoin(base_url, a.get("href", "").strip())
            name = a.get_text(strip=True)
            qs = parse_qs(urlparse(href).query)
            blocks.append({
                "name": name,
                "href": href,
                "params": {k: v[0] for k, v in qs.items()}  # e.g., {"sc":"04","dc":"0401","bc":"4049","rf":"E"}
            })
    return blocks

In [22]:
def get_district_json(states, user):
    districts = []
    params = {
      "frmuser": user,
      "seldist": "0"
    }
    for state in states:
        print(f"Processing {state['id']}:{state['name']} ...")
        data = params.copy()
        data["selstate"] = state['id']
        response = requests.post(url, headers=headers, data=data)
        if response.status_code == 200:
            ds = map_district_keys(ds)
            items = {district_mapping.get(k, k): v for k, v in d.items()}
            districts.extend(items)
        else:
            print (response.text)
    save_jsonl(districts, "imdagrimet_districts.jsonl")
    return districts

In [30]:
def get_block_json(states, user):
    districts = []
    blocks = []
    params = {
      "frmuser": user,
    }
    for state in states:
        print(f"Processing {state['id']}:{state['name']} ...")
        data = params.copy()
        data["seldist"] = "0"
        data["selstate"] = state['id']
        response = requests.post(url, headers=headers, data=data)
        if response.status_code == 200:
            recs = extract_districts(response.text)
            items = map_district_keys(recs)
            print(items)
            input("District pulled ...")
            districts.extend(items)
            for rec in recs:
                print(f"Processing {rec['id']}:{rec['name']} ...")
                data = params.copy()
                data["btvw"] = "View+Blocks"
                response = requests.post(url, headers=headers, data=data)
                if response.status_code == 200:
                    items = extract_blocks(response.text)
                    print(items)
                    blocks.extend(items)
                else:
                    print (response.text)        
                input("...")
        else:
            print (response.text)
    save_jsonl(districts, "imdagrimet_blocks.jsonl")
    return districts, blocks

# Session

In [28]:
token = "8374125c317e85a80177f8df0cf4a1ff"
cookie = "IMDSESSIONID=tobsrpjti6ikk9g7ldp4h3grj3hk0ndo"
user = "74792e1a6706ce32f21d73d2bb3d0903"

In [29]:
uri = "https://imdagrimet.gov.in/weatherdata/BlockList.php"
url= uri + "?" + token
headers["Cookie"] = cookie

# Run

In [31]:
districts, blocks = get_block_json(states, user)

Processing 1:A And N Islands ...
[{'district_id': '1', 'district_name': 'Nicobar'}, {'district_id': '2', 'district_name': 'North And Middle Andaman'}, {'district_id': '3', 'district_name': 'South Andaman'}]


District pulled ... 


Processing 1:Nicobar ...
[]


... 


Processing 2:North And Middle Andaman ...
[]


... 


Processing 3:South Andaman ...
[]


... 


Processing 2:Andhra Pradesh ...
[{'district_id': '1', 'district_name': 'Alluri Sitharama Raju'}, {'district_id': '2', 'district_name': 'Anakapalli'}, {'district_id': '3', 'district_name': 'Anantpur'}, {'district_id': '4', 'district_name': 'Annamayya'}, {'district_id': '5', 'district_name': 'Bapatla'}, {'district_id': '6', 'district_name': 'Chittoor'}, {'district_id': '7', 'district_name': 'East Godavari'}, {'district_id': '8', 'district_name': 'Eluru'}, {'district_id': '9', 'district_name': 'Guntur'}, {'district_id': '10', 'district_name': 'Kakinada'}, {'district_id': '11', 'district_name': 'Konaseema'}, {'district_id': '12', 'district_name': 'Krishna'}, {'district_id': '13', 'district_name': 'Kurnool'}, {'district_id': '14', 'district_name': 'Nandyal'}, {'district_id': '15', 'district_name': 'Nellore'}, {'district_id': '16', 'district_name': 'Ntr'}, {'district_id': '17', 'district_name': 'Palnadu'}, {'district_id': '18', 'district_name': 'Parvathipuram Manyam'}, {'district_id': '19', 

District pulled ... 


Processing 1:Alluri Sitharama Raju ...
[]


... 


Processing 2:Anakapalli ...
[]


KeyboardInterrupt: Interrupted by user

# Meteo

In [32]:
url = "https://imdagrimet.gov.in/weatherdata/DistStationList.php?token=f475d468ed2dae41eac12961b2e36926"

In [33]:
def _to_float(x):
    x = x.strip()
    if x in {"", "-", "--", "- - -"}:
        return None
    if re.fullmatch(r"[+-]?\d+(\.\d+)?", x):
        return float(x)
    try:
        return float(re.sub(r"[^0-9.+-]", "", x))
    except:
        return None

In [34]:
url = "https://imdagrimet.gov.in/weatherdata/PastWeekWeather.php?st=AGROMET&si=43238&sn=Anantpur%20Rekalakuntha"

In [35]:
resp = requests.get(url, headers = headers)
resp.status_code

200

In [36]:
soup = BeautifulSoup(resp.text, "lxml")
table = soup.select_one("table.viewtb")
if table is None:
    raise RuntimeError("Could not find table.viewtb in the HTML.")

In [37]:
# --- extract rows ---
rows = []
for tr in table.find_all("tr"):
    cells = [td.get_text(strip=True).replace("\xa0", " ") for td in tr.find_all("td")]
    if cells:
        rows.append(cells)

# header: "Date", then dates..., last col = "Total / Average"
dates = rows[0][1:-1]
agg_label = rows[0][-1]

# build long records
records = []
agg = []
for r in rows[1:]:
    metric = r[0]
    day_vals = r[1:-1]
    agg_val = r[-1]
    for d, v in zip(dates, day_vals):
        records.append({"Date": d, "Metric": metric, "Value": _to_float(v)})
    agg.append({"Metric": metric, agg_label: _to_float(agg_val)})

df_long = pd.DataFrame(records)
df_agg = pd.DataFrame(agg)          # optional: totals/averages column

# --- TRANSPOSE: one row per date, metrics as columns ---
df_daily = df_long.pivot(index="Date", columns="Metric", values="Value").reset_index()

# optional: sort dates (dd-mm-YYYY -> to datetime)
df_daily["Date"] = pd.to_datetime(df_daily["Date"], format="%d-%m-%Y", errors="coerce")
df_daily = df_daily.sort_values("Date").reset_index(drop=True)

print(df_daily)

NameError: name 're' is not defined