# Document Extractor

This notebook contains the document extraction pipeline used by the `document_processor` agent. It includes code to:

- Load PDFs and images.
- Preprocess documents (OCR, image correction, cropping).
- Extract structured fields and tables.
- Save extracted results to the project data directory.

Changes made

- Ensure any helper functions or configuration imports use the `src` package imports (e.g., `from src.config import ...`) so the notebook runs when the repository root is in `PYTHONPATH` or run from the project root.

How to run

1. Activate the project's virtual environment:

```bash
source venv/bin/activate
```

2. From the repository root run the notebook (e.g., via Jupyter Lab) or execute cells in order. Make sure `src/` is on `PYTHONPATH` if running cells from a different working directory.

Notes

- If you added or modified helper modules, confirm they are imported at the top of the notebook. If imports fail, run `export PYTHONPATH=. ` (or use the Python module runner) from the project root.
- This cell was added automatically to document the code changes; update it with more details if you add new extraction steps or dependencies.


In [4]:
# --- Optional: install (Colab) ---
#%pip install pdfplumber pandas openpyxl

#from tkinter import Tk, filedialog

#Tk().withdraw()  # hides the root window
#pdf_path = filedialog.askopenfilename(
#    title="Select a PDF file",
#    filetypes=[("PDF files", "*.pdf")]
#)

#print("Selected:", pdf_path)

import json, re, os
import pdfplumber
import pandas as pd

# ========= SET YOUR PDF PATH HERE =========
pdf_path = r"D:\git\utility-billing-ai\data\raw\National Grid Usage Statement-With Overcharge.pdf"

#pdf_path = "National Grid Usage Statement-With Overcharge.pdf"
#pdf_path = "National Grid Usage Statement-Without Overcharge.pdf"
assert os.path.exists(pdf_path), f"PDF not found at: {pdf_path}"

# ========= YOUR CHAIN OBJECTS =========
# Assumes you already defined: SchemaMain, llm, and create_extraction_chain(SchemaMain, llm)
# If these aren't defined yet, define/import them before running this cell.

# ---------- 1) Run your chain over each page ----------
final = []
with pdfplumber.open(pdf_path) as pdf:
    for page_num in range(len(pdf.pages)):
        print(f"on page {page_num + 1}")
        page = pdf.pages[page_num]
        text = page.extract_text() or ""
        try:
            inp = text
            chain = create_extraction_chain(SchemaMain, llm)
            output = chain.run(inp)
            final.append(output)
        except Exception as e:
            print(f"Error processing page {page_num+1}: {e}")
            # Keep the raw page text as a fallback record so we can inspect later
            final.append({"_error": str(e), "_page_index": page_num, "_raw_text": text})

# ---------- 2) Normalize outputs into Python objects ----------
def extract_json(obj):
    """
    Accepts dicts or strings. If string contains JSON with extra text,
    extract the outermost {...} or [...] and json.loads it.
    """
    if isinstance(obj, (dict, list)):
        return obj
    if isinstance(obj, str):
        s = obj.strip()
        # If already clean JSON:
        try:
            return json.loads(s)
        except Exception:
            pass
        # Try to extract the largest JSON object/array
        start_obj = s.find("{"); end_obj = s.rfind("}")
        start_arr = s.find("["); end_arr = s.rfind("]")
        cand = None
        if start_obj != -1 and end_obj != -1 and end_obj > start_obj:
            cand = s[start_obj:end_obj+1]
        elif start_arr != -1 and end_arr != -1 and end_arr > start_arr:
            cand = s[start_arr:end_arr+1]
        if cand:
            try:
                return json.loads(cand)
            except Exception:
                return {"_raw_response": s}
        return {"_raw_response": s}
    # Fallback
    return {"_raw_response": str(obj)}

objs = [extract_json(x) for x in final]

# Save raw NDJSON for debugging (optional)
with open("llm_page_outputs.ndjson", "w", encoding="utf-8") as f:
    for o in objs:
        f.write(json.dumps(o, ensure_ascii=False) + "\n")

# ---------- 3) Build the Summary sheet (top-level fields only) ----------
summary_df = pd.json_normalize(objs, max_level=1)

# Drop nested lists/dicts from the summary
nested_cols = [c for c in summary_df.columns
               if summary_df[c].apply(lambda v: isinstance(v, (list, dict))).any()]
summary_df = summary_df.drop(columns=nested_cols, errors="ignore")

# Try to parse common date/number columns (best guess)
date_like = [c for c in summary_df.columns if re.search(r"(date|period|start|end|due)", c, re.I)]
for c in date_like:
    try:
        summary_df[c] = pd.to_datetime(summary_df[c], errors="coerce")
    except Exception:
        pass

def to_num(x):
    if pd.isna(x): return pd.NA
    s = str(x).replace("$","").replace(",","").strip()
    return pd.to_numeric(s, errors="coerce")

num_like = [c for c in summary_df.columns if re.search(r"(kwh|amount|charge|tax|rate|demand|rkva|usage|total|balance)", c, re.I)]
for c in num_like:
    summary_df[c] = summary_df[c].apply(to_num)

# ---------- 4) Find repeated/nested list keys and make separate sheets ----------
# Collect all list-valued keys that appear in any object
list_keys = set()
for o in objs:
    if isinstance(o, dict):
        for k, v in o.items():
            if isinstance(v, list):
                list_keys.add(k)

sheets = {"Summary": summary_df}

for key in list_keys:
    frames = []
    for idx, o in enumerate(objs):
        if not isinstance(o, dict) or key not in o or not isinstance(o[key], list):
            continue
        # meta fields = the scalar (non-list/dict) fields we want to carry along
        meta = {mk: mv for mk, mv in o.items() if not isinstance(mv, (list, dict))}
        df_li = pd.json_normalize(o, record_path=[key])
        # annotate with page index + meta
        df_li["SourcePageIndex"] = idx
        for mk, mv in meta.items():
            df_li[mk] = mv
        frames.append(df_li)
    if frames:
        li_df = pd.concat(frames, ignore_index=True)
        # basic numeric cleanup on obvious amount/qty fields
        for c in li_df.columns:
            if re.search(r"(kwh|qty|amount|charge|tax|rate|demand|rkva|usage|total|price|unit)", str(c), re.I):
                li_df[c] = li_df[c].apply(lambda x: to_num(x))
        sheets[key[:31] or "Items"] = li_df  # Excel sheet name ≤ 31 chars

# ---------- 5) Save to Excel (multi-sheet) + CSV ----------
summary_df.to_csv("bill_extraction_summary.csv", index=False)

with pd.ExcelWriter("bill_extraction.xlsx", engine="openpyxl") as writer:
    for name, df in sheets.items():
        # Ensure valid, unique sheet names
        safe = re.sub(r"[:\\/?*\[\]]", "_", name)[:31] or "Sheet"
        df.to_excel(writer, sheet_name=safe, index=False)

print("Wrote:")
print(" - bill_extraction_summary.csv")
print(" - bill_extraction.xlsx (Summary + one sheet per nested list, e.g., charges/line_items)")
print(" - llm_page_outputs.ndjson (raw per-page LLM outputs for debugging)")


on page 1
Error processing page 1: name 'create_extraction_chain' is not defined
on page 2
Error processing page 2: name 'create_extraction_chain' is not defined
on page 3
Error processing page 3: name 'create_extraction_chain' is not defined
on page 4
Error processing page 4: name 'create_extraction_chain' is not defined
Wrote:
 - bill_extraction_summary.csv
 - bill_extraction.xlsx (Summary + one sheet per nested list, e.g., charges/line_items)
 - llm_page_outputs.ndjson (raw per-page LLM outputs for debugging)


In [5]:
# Streaming parser for "Monthly Electric History" -> CSV with PDF-matching headers.
# Handles generic account #s, glued dates, negative kWh, ghost $ amounts before/after "Retracted Amt",
# and trailing date / "Page X of Y" footers.

import pandas as pd, re, unicodedata
from datetime import datetime
from pathlib import Path

INPUT_TEXT_CSV = Path("bill_extraction_summary.csv")
OUTPUT_CSV     = Path("bill_extraction_summary_formatted.csv")

DEST_COLS = [
    "Bill Account","Customer","Bill Date","Read Date","Days Used","Billed Kwh",
    "Billed Demand","Load Factor","Billed Rkva","Bill Amount","Sales Tax Amt",
    "Bill Amount w/Sales Tax","Retracted Amt","Sales Tax Factor",
]

# ---------------- helpers ----------------
def normspace(s: str) -> str:
    if s is None: return ""
    s = unicodedata.normalize("NFKC", str(s)).replace("\u00A0"," ")
    return re.sub(r"\s+", " ", s).strip()

DATE_RE = re.compile(r"^\d{1,2}/\d{1,2}/\d{2,4}$")

def as_date(tok: str):
    tok = tok.strip()
    if not DATE_RE.match(tok): return None
    for fmt in ("%m/%d/%Y","%m/%d/%y","%Y-%m-%d","%Y/%m/%d"):
        try:
            dt = datetime.strptime(tok, fmt)
            return f"{dt.month}/{dt.day}/{dt.year}"
        except: pass
    return tok

def is_intlike(tok: str):
    return re.fullmatch(r"^-?[\d,]+$", tok) is not None

def parse_intlike(tok: str):
    m = re.sub(r"[^\d\-]", "", tok)
    if m in ("", "-",): return ""
    return f"{int(m):,}"

def is_num(tok: str):
    return re.fullmatch(r"^-?[\d,]*\.?\d+$", tok.replace(",", "")) is not None

def parse_float(tok: str, nd=2):
    s = tok.replace(",", "")
    try:
        v = float(s)
        out = f"{v:.{nd}f}".rstrip("0").rstrip(".")
        return out
    except:
        return tok

def is_money(tok: str):
    return re.fullmatch(r"^\$?-?[\d,]*\.?\d+$", tok) is not None

def parse_money(tok: str):
    s = tok.replace("$","").replace(",","")
    if s in ("", "-", ".", "-."): return ""
    try:
        return f"${float(s):,.2f}"
    except:
        return tok

def pull(text, pats, group=1, default=""):
    for pat in pats:
        m = re.search(pat, text, flags=re.IGNORECASE)
        if m:
            return normspace(m.group(group))
    return default

# ---------------- load text ----------------
df_raw = pd.read_csv(INPUT_TEXT_CSV, encoding="utf-8-sig", engine="python", dtype=str, keep_default_na=False)
full_text = "\n".join(df_raw.apply(lambda r: " ".join(r.astype(str)), axis=1))
full_text = normspace(full_text)

DATE    = r"\d{1,2}/\d{1,2}/\d{2,4}"
ACCOUNT = r"\b\d{7,12}\b"  # any 7–12 digit account #

# Split ANY adjacent dates (no lookbehind)
full_text = re.sub(rf"({DATE})(?={DATE})", r"\1 ", full_text)
# Ensure space between account and customer if merged (e.g., 464004117TOWN)
full_text = re.sub(rf"({ACCOUNT})(?=TOWN)", r"\1 ", full_text)
# Insert a row break BEFORE each account number
full_text = re.sub(rf"\s*(?={ACCOUNT})", "\n", full_text)

# Header fields
bill_account = pull(full_text, [rf"Bill Account[:\s]+({ACCOUNT})"])
customer     = pull(full_text, [r"Customer[:\s]+([A-Z0-9 ,'\-&/]+?)(?=\s+Post Office:|\s+Service Address:|$)"])

# Tokenize once
tokens = full_text.split()

rows = []
i = 0
N = len(tokens)

while i + 1 < N:
    d1 = as_date(tokens[i])
    d2 = as_date(tokens[i+1]) if d1 else None
    if not (d1 and d2):
        i += 1
        continue

    j = i + 2
    # Days Used
    if j >= N or not is_intlike(tokens[j]): i += 1; continue
    days = parse_intlike(tokens[j]); j += 1

    # Billed Kwh (allow negative)
    if j >= N or not is_intlike(tokens[j].lstrip("-")): i += 1; continue
    kwh = parse_intlike(tokens[j]); j += 1

    # Billed Demand (float)
    if j >= N or not is_num(tokens[j]): i += 1; continue
    demand = parse_float(tokens[j], nd=1); j += 1

    # Load Factor (float)
    if j >= N or not is_num(tokens[j]): i += 1; continue
    load_factor = parse_float(tokens[j], nd=2); j += 1

    # Billed Rkva (int)
    if j >= N or not is_intlike(tokens[j]): i += 1; continue
    rkva = parse_intlike(tokens[j]); j += 1

    # Bill Amount (money)
    if j >= N or not is_money(tokens[j]): i += 1; continue
    bill_amt = parse_money(tokens[j]); j += 1

    # Sales Tax Amt (money)
    if j >= N or not is_money(tokens[j]): i += 1; continue
    sales_tax_amt = parse_money(tokens[j]); j += 1

    # Bill Amount w/Sales Tax (money or plain number)
    if j >= N or not (is_money(tokens[j]) or is_num(tokens[j])): i += 1; continue
    bill_with_tax = parse_money(tokens[j]) if is_money(tokens[j]) else parse_money(tokens[j]); j += 1

    # OPTIONAL "ghost" money BEFORE Retracted (can be multiple in messy OCR)
    while (j + 2 < N and (is_money(tokens[j]) or is_num(tokens[j])) 
           and (is_money(tokens[j+1]) or is_num(tokens[j+1]))
           and (is_num(tokens[j+2]) or as_date(tokens[j+2]) or tokens[j+2].lower()=="page")):
        j += 1  # skip ghost

    # Retracted Amt
    if j >= N or not (is_money(tokens[j]) or is_num(tokens[j])): i += 1; continue
    retracted_amt = parse_money(tokens[j]); j += 1

    # OPTIONAL "ghost" money AFTER Retracted (support multiple), before factor/footer
    while j < N and is_money(tokens[j]):
        peek_next = tokens[j+1] if j+1 < N else ""
        if is_num(peek_next) or as_date(peek_next) or peek_next.lower() == "page":
            # stop if next looks like factor/date/footer
            break
        j += 1  # skip additional ghost $ tokens

    # Sales Tax Factor (small number) — allow empty if footer immediately follows
    factor = ""
    if j < N and is_num(tokens[j]):
        factor = parse_float(tokens[j], nd=2)
        j += 1

    # Swallow any trailing dates and/or "Page X of Y" footer bits
    while j < N:
        if as_date(tokens[j]):        # a trailing statement date
            j += 1
            continue
        # "Page X of Y"
        if (j + 3 < N and tokens[j].lower() == "page"
                and is_intlike(tokens[j+1]) and tokens[j+2].lower() == "of" and is_intlike(tokens[j+3])):
            j += 4
            continue
        break

    rows.append({
        "Bill Account": bill_account or "",
        "Customer":     customer or "",
        "Bill Date":    d1,
        "Read Date":    d2,
        "Days Used":    days,
        "Billed Kwh":   kwh,
        "Billed Demand":demand,
        "Load Factor":  load_factor,
        "Billed Rkva":  rkva,
        "Bill Amount":  bill_amt,
        "Sales Tax Amt":sales_tax_amt,
        "Bill Amount w/Sales Tax": bill_with_tax,
        "Retracted Amt":retracted_amt,
        "Sales Tax Factor": factor,
    })

    i = j  # move past the row we just consumed

# Build output
df_out = pd.DataFrame(rows)

# ensure columns present & ordered
for c in DEST_COLS:
    if c not in df_out.columns: df_out[c] = ""
df_out = df_out[DEST_COLS]

df_out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"Parsed {len(df_out)} rows -> {OUTPUT_CSV.resolve()}")
display(df_out.head(50))


Parsed 73 rows -> D:\git\utility-billing-ai\src\agents\document_processor\bill_extraction_summary_formatted.csv


Unnamed: 0,Bill Account,Customer,Bill Date,Read Date,Days Used,Billed Kwh,Billed Demand,Load Factor,Billed Rkva,Bill Amount,Sales Tax Amt,Bill Amount w/Sales Tax,Retracted Amt,Sales Tax Factor
0,3288390002,TOWN OF HALFMOON,7/9/2019,7/2/2019,28,15680,44.0,0.53,0,$702.11,$0.00,$702.11,$0.00,
1,3288390002,TOWN OF HALFMOON,8/8/2019,8/2/2019,31,19120,48.8,0.53,0,$806.48,$0.00,$806.48,$0.00,
2,3288390002,TOWN OF HALFMOON,9/10/2019,9/4/2019,33,18720,44.0,0.54,0,$694.54,$0.00,$694.54,$0.00,
3,3288390002,TOWN OF HALFMOON,10/8/2019,10/2/2019,28,15520,50.4,0.46,0,$740.16,$0.00,$740.16,$0.00,
4,3288390002,TOWN OF HALFMOON,11/6/2019,10/31/2019,29,16960,55.2,0.44,0,$831.72,$0.00,$831.72,$0.00,
5,3288390002,TOWN OF HALFMOON,12/9/2019,12/3/2019,33,29600,80.8,0.46,0,"$1,245.98",$0.00,"$1,245.98",$0.00,
6,3288390002,TOWN OF HALFMOON,1/9/2020,1/3/2020,31,34000,88.8,0.51,0,"$1,354.17",$0.00,"$1,354.17",$0.00,
7,3288390002,TOWN OF HALFMOON,2/7/2020,2/3/2020,31,33440,90.4,0.5,0,"$1,338.99",$0.00,"$1,338.99",$0.00,
8,3288390002,TOWN OF HALFMOON,3/9/2020,3/3/2020,29,33040,85.6,0.55,0,"$1,325.69",$0.00,"$1,325.69",$0.00,
9,3288390002,TOWN OF HALFMOON,4/8/2020,4/2/2020,30,25360,81.6,0.43,0,"$1,209.09",$0.00,"$1,209.09",$0.00,
