In [10]:
# --- INPUT / OUTPUT LOCATIONS ---
# CSV whose ID column drives which items get a report.
TEXT_PROMPT_CSV = "../zm_scraper/items-prompt.csv"

# GDINO side: rendered images and final JSON, each in one sub-folder per item ID.
GDINO_IMG = "../zm_scraper/auctions/gdino/output"
GDINO_FINAL = "../zm_scraper/auctions/gdino/final"

# SAM side: same per-item layout as GDINO.
SAM_IMG = "../zm_scraper/auctions/sam/postprocessed/images"
SAM_FINAL = "../zm_scraper/auctions/sam/final"

# Folder that receives the generated PDFs.
COMPILED = "./compiled"

## Create comparison PDFs between GDINO and SAM

In [15]:
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import csv, json
from PIL import Image
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4, landscape
from reportlab.lib.units import mm
from reportlab.lib.utils import ImageReader

# Output switches
WRITE_PER_ID_PDFS = True            # when True, one PDF per item ID is written as well
COMBINED_PDF_NAME = "combined.pdf"  # file name of the all-items PDF

# Page layout (ReportLab measures in points; `mm` converts millimetres to points)
PAGE_W, PAGE_H = landscape(A4)
MARGIN = 15 * mm        # outer page margin
GAP_COL = 10 * mm       # horizontal gap between the two columns
HEADER_GAP = 6 * mm
TEXT_IMG_GAP = 4 * mm   # vertical gap between the text block and the images
SECTION_GAP = 8 * mm    # space kept free below the images

# Fonts
FONT_REG = "Helvetica"
FONT_BOLD = "Helvetica-Bold"
SIZE_TITLE = 14
SIZE_SUB = 11
SIZE_TEXT = 9

# Lower-case file suffixes that count as images.
VALID_IMG_EXT = {".jpg", ".jpeg", ".png", ".webp"}

# --- HELPERS ---
def ensure_dir(p: Path) -> None:
    """Create directory *p* together with any missing parents.

    Does nothing when the directory already exists.
    """
    p.mkdir(exist_ok=True, parents=True)

def read_ids_from_csv(csv_path: Path) -> List[str]:
    """Return the unique, non-empty IDs stored in *csv_path*, in first-seen order.

    The first row is treated as a header and skipped (unless it is an empty
    row). The ID column is the first of ``id``, ``item_id``, ``listing_id``,
    ``sku`` (in that priority, case-insensitive) that appears in the header;
    when none appears, column 0 is used.

    Args:
        csv_path: Path of a UTF-8 encoded CSV file, with or without a BOM.

    Returns:
        Stripped ID strings with blanks and duplicates removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" drops a leading byte-order mark so it cannot stick to the
    # first header name and defeat the header match; newline="" is what the
    # csv module requires for correct handling of embedded line breaks.
    with csv_path.open("r", encoding="utf-8-sig", newline="") as f:
        rows = list(csv.reader(f))
    if not rows:
        return []

    headers = [h.strip().lower() for h in rows[0]]
    # An empty first row is not a header, so keep every row in that case.
    body = rows[1:] if headers else rows
    candidates = ("id", "item_id", "listing_id", "sku")
    id_col = next((headers.index(name) for name in candidates if name in headers), 0)

    # Rows too short to contain the ID column are ignored.
    values = (row[id_col].strip() for row in body if len(row) > id_col)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(value for value in values if value))

def collect_images(folder: Path) -> Dict[str, Path]:
    """Map file stem to path for every image file directly inside *folder*.

    A file counts as an image when its lower-cased suffix is in
    ``VALID_IMG_EXT``. Sub-folders are not searched.

    Args:
        folder: Directory to scan.

    Returns:
        ``{stem: path}`` in sorted path order; empty when *folder* is missing
        or is not a directory. If several files share a stem, the one that
        sorts last wins.
    """
    # is_dir() (rather than exists()) also rejects a path that points at a
    # regular file, which would make iterdir() raise.
    if not folder.is_dir():
        return {}
    # iterdir() yields entries in arbitrary order; sorting keeps the result
    # (and the winner among duplicate stems) reproducible across runs.
    return {
        entry.stem: entry
        for entry in sorted(folder.iterdir())
        if entry.suffix.lower() in VALID_IMG_EXT and entry.is_file()
    }

def collect_json(folder: Path) -> Dict[str, Path]:
    """Map file stem to path for every ``.json`` file directly inside *folder*.

    The suffix comparison is case-insensitive. Sub-folders are not searched.

    Args:
        folder: Directory to scan.

    Returns:
        ``{stem: path}`` in sorted path order; empty when *folder* is missing
        or is not a directory. If several files share a stem, the one that
        sorts last wins.
    """
    # is_dir() (rather than exists()) also rejects a path that points at a
    # regular file, which would make iterdir() raise.
    if not folder.is_dir():
        return {}
    # iterdir() yields entries in arbitrary order; sorting keeps the result
    # reproducible across runs and platforms.
    return {
        entry.stem: entry
        for entry in sorted(folder.iterdir())
        if entry.suffix.lower() == ".json" and entry.is_file()
    }

def load_json(p: Optional[Path]) -> dict:
    """Best-effort load of a JSON object from *p*.

    Args:
        p: Path of a UTF-8 JSON file, or ``None``.

    Returns:
        The decoded object when the file holds a JSON object; ``{}`` when *p*
        is ``None``, the file cannot be read, the content is not valid
        UTF-8/JSON, or the top-level value is not an object.
    """
    if not p:
        return {}
    try:
        data = json.loads(p.read_text(encoding="utf-8"))
    except (OSError, ValueError, RecursionError):
        # OSError: missing/unreadable path; ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError; RecursionError:
        # pathologically nested input. Anything else (e.g. KeyboardInterrupt)
        # is deliberately allowed to propagate.
        return {}
    # Callers use dict methods on the result, so never hand back a list/scalar.
    return data if isinstance(data, dict) else {}

def fit_image_box(img_path: Path, max_w: float, max_h: float) -> Tuple[float, float]:
    """Return the (width, height) of the image scaled to fit a bounding box.

    The image's pixel size is multiplied by the single largest factor that
    keeps it within *max_w* x *max_h*, so the aspect ratio is preserved. An
    image reporting a zero dimension is returned unscaled.
    """
    with Image.open(img_path) as picture:
        px_w, px_h = picture.size

    if px_w > 0 and px_h > 0:
        factor = min(max_w / px_w, max_h / px_h)
    else:
        factor = 1.0
    return (px_w * factor, px_h * factor)

def wrap_text(c: canvas.Canvas, text: str, max_width: float, font_name: str, font_size: int) -> List[str]:
    """Break *text* into lines that fit within *max_width* for the given font.

    Paragraphs are separated by ``"\\n"``; an empty paragraph yields one empty
    line. Words are packed greedily. A single word that is wider than
    *max_width* is split across as many lines as needed (at least one
    character per line) so it cannot spill outside its column.

    Args:
        c: Canvas used to measure string widths; its current font is set to
            *font_name*/*font_size* as a side effect.
        text: Text to wrap; ``None`` or ``""`` is treated as one empty paragraph.
        max_width: Available width in points.
        font_name: Font used for measuring.
        font_size: Font size used for measuring.

    Returns:
        The wrapped lines, in order.
    """
    c.setFont(font_name, font_size)

    def fits(candidate: str) -> bool:
        return c.stringWidth(candidate, font_name, font_size) <= max_width

    def split_long(word: str) -> List[str]:
        # Chop a word into pieces that each fit; the final piece is whatever remains.
        pieces = []
        while word and not fits(word):
            cut = 1  # always consume one character so the loop terminates
            while cut < len(word) and fits(word[:cut + 1]):
                cut += 1
            pieces.append(word[:cut])
            word = word[cut:]
        if word:
            pieces.append(word)
        return pieces

    lines: List[str] = []
    for para in (text or "").split("\n"):
        words = para.split()
        if not words:
            lines.append("")
            continue
        cur = ""
        for w in words:
            test = f"{cur} {w}" if cur else w
            if fits(test):
                cur = test
                continue
            if cur:
                lines.append(cur)
            # Start a new line with w; emit any leading pieces of an over-long
            # word immediately and keep its tail open for following words.
            *full, cur = split_long(w)
            lines.extend(full)
        lines.append(cur)
    return lines

def draw_block(c: canvas.Canvas, text: str, x: float, y_top: float, max_width: float,
               font_name: str = FONT_REG, font_size: int = SIZE_TEXT,
               leading: Optional[float] = None) -> float:
    """Draw *text* as a wrapped, left-aligned block and return the next baseline.

    The first line's baseline is at *y_top*; each following line is placed
    *leading* points lower (1.2 x *font_size* when *leading* is ``None``).

    Returns:
        The y coordinate one *leading* below the last drawn line.
    """
    line_gap = font_size * 1.2 if leading is None else leading
    wrapped = wrap_text(c, text, max_width, font_name, font_size)
    c.setFont(font_name, font_size)

    baseline = y_top
    for row in wrapped:
        c.drawString(x, baseline, row)
        baseline -= line_gap
    return baseline

def dict_to_lines(d: dict) -> str:
    """Format a mapping as newline-separated ``key: value`` lines.

    Keys are ordered by the length of their string form, then alphabetically
    by that string. Anything that is not a non-empty dict is rendered as an
    em dash.
    """
    if isinstance(d, dict) and d:
        ordered = sorted(d, key=lambda key: (len(str(key)), str(key)))
        return "\n".join(f"{key}: {d[key]}" for key in ordered)
    return "—"

# --- PAGE RENDERING ---
def _draw_image_column(c: canvas.Canvas, img: Optional[Path], x: float, y_top: float,
                       max_w: float, max_h: float,
                       link_text: str, missing_text: str) -> None:
    """Draw one column's image with its link line underneath, or a placeholder."""
    # Set the font explicitly so this text never depends on earlier canvas state.
    c.setFont(FONT_REG, SIZE_TEXT)
    if not (img and img.exists()):
        c.drawString(x, y_top - SIZE_TEXT, missing_text)
        return

    h = 0.0
    # With no vertical room left, scaling would turn negative and mirror the
    # image, so skip it and print only the link line.
    if max_h > 0:
        w, h = fit_image_box(img, max_w, max_h)
        c.drawImage(ImageReader(str(img)), x, y_top - h,
                    width=w, height=h, preserveAspectRatio=True, anchor='sw')
    c.drawString(x, (y_top - h) - SIZE_TEXT - 2, link_text)


def render_listing_page(c: canvas.Canvas,
                        item_id: str, stem: str,
                        gdino_img: Optional[Path], sam_img: Optional[Path],
                        gdino_json: dict, sam_json: dict):
    """Render one two-column comparison page (GDINO left, SAM right) and finish it.

    Layout from top to bottom: title, legend line, the ``gdino_readable`` /
    ``sam_readable`` entries of the two JSON payloads as text, then the two
    images scaled into the remaining height, each followed by a line showing
    the URL of its JSON file. A missing image is replaced by a placeholder.

    Args:
        c: Canvas to draw on; ``showPage()`` is called at the end.
        item_id: Item ID shown in the title and used in the link URLs.
        stem: File stem shown in the title and used in the link URLs.
        gdino_img: GDINO image path, or ``None``.
        sam_img: SAM image path, or ``None``.
        gdino_json: Decoded GDINO JSON; non-dict values are treated as empty.
        sam_json: Decoded SAM JSON; non-dict values are treated as empty.
    """
    x = MARGIN
    y = PAGE_H - MARGIN

    # Title and legend
    c.setFont(FONT_BOLD, SIZE_TITLE)
    c.drawString(x, y, f"Item {item_id} — Auction {stem}")
    y -= (SIZE_TITLE * 1.4)

    c.setFont(FONT_REG, SIZE_SUB)
    c.drawString(x, y, "Left: GDINO • Right: SAM")
    y -= (SIZE_SUB * 1.6)

    # Two equal columns separated by GAP_COL
    col_w = (PAGE_W - 2*MARGIN - GAP_COL) / 2
    col_left_x = MARGIN
    col_right_x = MARGIN + col_w + GAP_COL

    # Text blocks
    y_text_top = y - (SIZE_SUB * 1.2)
    gdino_info = gdino_json.get("gdino_readable", {}) if isinstance(gdino_json, dict) else {}
    sam_info = sam_json.get("sam_readable", {}) if isinstance(sam_json, dict) else {}

    y_left_bottom = draw_block(c, dict_to_lines(gdino_info),
                               col_left_x, y_text_top, col_w, FONT_REG, SIZE_TEXT)
    y_right_bottom = draw_block(c, dict_to_lines(sam_info),
                                col_right_x, y_text_top, col_w, FONT_REG, SIZE_TEXT)
    # Both images start below the taller of the two text blocks.
    y_after_text = min(y_left_bottom, y_right_bottom) - TEXT_IMG_GAP

    img_max_h = y_after_text - MARGIN - SECTION_GAP

    # Images
    # NOTE(review): these URLs point under ".../listing/..." while the data
    # folders configured for this notebook sit under ".../auctions/..." —
    # confirm which path the Jupyter server actually exposes.
    _draw_image_column(
        c, gdino_img, col_left_x, y_after_text, col_w, img_max_h,
        f"http://127.0.0.1:8888/edit/zm_scraper/listing/gdino/final/{item_id}/{stem}.json",
        "No GDINO image",
    )
    _draw_image_column(
        c, sam_img, col_right_x, y_after_text, col_w, img_max_h,
        f"http://127.0.0.1:8888/edit/zm_scraper/listing/sam/final/{item_id}/{stem}.json",
        "No SAM image",
    )

    c.showPage()

# --- MAIN LOOP ---
# Build one PDF per item ID (optional) plus one combined PDF. The GDINO final
# JSONs are the baseline: every GDINO JSON stem becomes one page, and the
# matching SAM JSON and images are looked up by the same stem.
compiled_root = Path(COMPILED)
ensure_dir(compiled_root)

ids = read_ids_from_csv(Path(TEXT_PROMPT_CSV))
print(f"Found {len(ids)} IDs from CSV")

all_pages_data = []
for item_id in ids:
    gdino_jsons = collect_json(Path(GDINO_FINAL) / item_id)  # BASELINE
    if not gdino_jsons:
        print(f"[skip] {item_id}: no GDINO_FINAL JSONs found")
        continue

    gdino_imgs = collect_images(Path(GDINO_IMG) / item_id)
    sam_imgs = collect_images(Path(SAM_IMG) / item_id)
    sam_jsons = collect_json(Path(SAM_FINAL) / item_id)

    # Sort the stems so page order does not depend on directory listing order.
    pages_data = [
        (
            stem,
            load_json(gdino_jsons[stem]),
            load_json(sam_jsons.get(stem)),
            gdino_imgs.get(stem),
            sam_imgs.get(stem),
        )
        for stem in sorted(gdino_jsons)
    ]

    # per-ID PDF
    if WRITE_PER_ID_PDFS:
        pdf_path = compiled_root / f"{item_id}.pdf"
        # Use the same page size the layout constants were derived from.
        c = canvas.Canvas(str(pdf_path), pagesize=(PAGE_W, PAGE_H))
        for stem, gj, sj, gi, si in pages_data:
            render_listing_page(c, item_id, stem, gi, si, gj, sj)
        c.save()
        print(f"Wrote {pdf_path} ({len(pages_data)} pages)")

    all_pages_data.append((item_id, pages_data))

# combined PDF
if all_pages_data:
    combined_path = compiled_root / COMBINED_PDF_NAME
    c = canvas.Canvas(str(combined_path), pagesize=(PAGE_W, PAGE_H))
    for item_id, pages_data in all_pages_data:
        for stem, gj, sj, gi, si in pages_data:
            render_listing_page(c, item_id, stem, gi, si, gj, sj)
    c.save()
    print(f"Wrote {combined_path}")
else:
    print("No pages to combine.")


Found 5 IDs from CSV
Wrote compiled/1.pdf (60 pages)
Wrote compiled/2.pdf (24 pages)
Wrote compiled/3.pdf (36 pages)
Wrote compiled/4.pdf (30 pages)
Wrote compiled/5.pdf (56 pages)
Wrote compiled/combined.pdf
