In [14]:
# pip install pdfplumber pandas python-dateutil
import pdfplumber, re, json, os
import pandas as pd
from dateutil import parser as dateparser
from glob import glob
from IPython.display import clear_output

In [2]:
def clean_cell(x:str) -> str:
    if x is None: return ""
    x = re.sub(r"\s+", " ", str(x).strip())
    return x.replace(r"\u00b0C", "°C").replace("° C", "°C")


In [3]:
CANON = {
    "date": ["date", "date (y-m-d)", "dt"],
    "rainfall": ["rainfall", "rainfall (mm)", "rain (mm)", "rf"],
    "tmax": ["tmax", "t max", "max temp", "maximum temp", "tmax (°c)", "tmax (c)"],
    "tmin": ["tmin", "t min", "min temp", "minimum temp", "tmin (°c)", "tmin (c)"],
    "rh i": ["rh i", "rh1", "rh-i", "rh morning", "rh i (%)"],
    "rh ii": ["rh ii", "rh2", "rh-ii", "rh evening", "rh ii (%)"],
    "wind speed": ["wind speed", "wind (kmph)", "wnd spd"],
    "wind direction": ["wind direction", "direction (degree)", "degree"],
    "cloud cover": ["cloud cover", "cloud (octa)", "cloud cover (octa)"],
    "warnings": ["warnings", "warning", "remarks", "note"],
}

In [4]:
def canonize(h: str) -> str:
    h0 = clean_cell(h).lower()
    h0 = re.sub(r"[^a-z0-9 %°()-]", " ", h0)
    h0 = re.sub(r"\s+", " ", h0).strip()
    for k, alist in CANON.items():
        if h0 in [re.sub(r"[^a-z0-9 %°()-]", " ", a) for a in alist]: return k
    if "rain" in h0: return "rainfall"
    if ("max" in h0 and "temp" in h0) or "tmax" in h0: return "tmax"
    if ("min" in h0 and "temp" in h0) or "tmin" in h0: return "tmin"
    if h0.startswith("date"): return "date"
    if "wind" in h0 and "speed" in h0: return "wind speed"
    if "wind" in h0 and ("dir" in h0 or "degree" in h0): return "wind direction"
    if "cloud" in h0: return "cloud cover"
    if "warn" in h0 or "remark" in h0 or "note" in h0: return "warnings"
    return h0 or "col"

In [5]:
def parse_date(val):
    s = clean_cell(val)
    if not s: return pd.NaT
    # common bulletins: dd-mm-yyyy / dd/mm/yyyy
    for fmt in ("%d-%m-%Y", "%d/%m/%Y", "%Y-%m-%d"):
        try: return pd.to_datetime(s, format=fmt)
        except: pass
    try: return pd.to_datetime(dateparser.parse(s, dayfirst=True))
    except: return pd.NaT

In [6]:
def parse_num(val):
    s = clean_cell(val)
    m = re.search(r"-?\d+(?:\.\d+)?", s)
    return float(m.group(0)) if m else None

In [7]:
def looks_like_weather(cols):
    c = [x.lower() for x in cols]
    have_date = any(x.startswith("date") for x in c)
    have_rain = any("rain" in x for x in c)
    have_temp = any(("tmax" in x) or ("tmin" in x) or ("temp" in x) for x in c)
    return have_date and have_rain and have_temp

In [8]:
def extract_weather_table(pdf_path: str) -> pd.DataFrame:
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(pdf_path)

    with pdfplumber.open(pdf_path) as pdf:
        page = pdf.pages[0]
        tables = page.extract_tables()
        if not tables:
            raise ValueError("No tables found on page 1.")

    # pick first weather-like table; else first table
    chosen = None
    for tbl in tables:
        # headers guess = first non-empty row
        non_empty = [row for row in tbl if any(clean_cell(c) for c in row)]
        if not non_empty: continue
        hdr = [clean_cell(c) for c in non_empty[0]]
        if looks_like_weather(hdr):
            chosen = tbl
            break
    if chosen is None:
        chosen = tables[0]

    # clean rows & derive header
    rows = [[clean_cell(c) for c in r] for r in chosen if any(clean_cell(c) for c in r)]
    if not rows: raise ValueError("Detected table is empty.")
    header = rows[0]
    data_rows = rows[1:]

    # handle multi-line/merged headers: collapse duplicates & canonize
    header = [canonize(h) for h in header]
    # pad/truncate rows to header length
    width = len(header)
    fixed = []
    for r in data_rows:
        rr = (r + [""] * width)[:width]
        fixed.append(rr)

    df = pd.DataFrame(fixed, columns=header)

    # if header is too generic, force a safe minimal schema if present
    wanted = ["date","rainfall","tmax","tmin","rh i","rh ii","wind speed","wind direction","cloud cover","warnings"]
    # drop all-empty cols; dedupe columns
    df = df.dropna(axis=1, how="all")
    # re-order if our wanted names exist
    present = [c for c in wanted if c in df.columns]
    others = [c for c in df.columns if c not in present]
    df = df[present + others]

    # type parsing
    if "date" in df.columns: df["date"] = df["date"].apply(parse_date)
    for c in ["rainfall","tmax","tmin","rh i","rh ii","wind speed","wind direction","cloud cover"]:
        if c in df.columns:
            df[c] = df[c].apply(parse_num)

    # remove empty rows
    df = df.dropna(how="all").reset_index(drop=True)
    return df

In [9]:
def list_pdfs(folder: str, recursive: bool = False):
    pattern = "**/*.pdf" if recursive else "*.pdf"
    return sorted(glob(os.path.join(folder, pattern), recursive=recursive))

In [10]:
pdf_path = "/Users/m/Sites/kkuvam.github/crop-ai/data/mausam/2025/Aug/Andhra_Pradesh"

In [11]:
pdfs = list_pdfs(pdf_path)

In [None]:
for pdf in pdfs:
    print(f"Processing {pdf} ...")
    df = extract_weather_table(pdf)
    print(df)
    input("...")
    clear_output(wait=True)
    

Processing /Users/m/Sites/kkuvam.github/crop-ai/data/mausam/2025/Aug/Andhra_Pradesh/andhra_pradesh_east_godavari_2025-08-01.pdf ...
Empty DataFrame
Columns: []
Index: []
