# Slidedit

This notebook is a demo showing how to use an **instructed LLM** to automatically rearrange the format of texts & images in `.pptx` slides. 

This pipeline demonstrates multimodal understanding: the model receives slides with __Misaligned & Random-Layout__ content and outputs layout corrections. In this demo the correction consists of two stages: __Rematch & Relayout__.

This pipeline was designed by a human; GPT-5 was used to assist with the Python programming.


**Part I: Setup LLM API**

   - In this demo, we use **GPT-5** as the LLM API.  
   
   - Please set your API key by creating a `.env` file next to this notebook. A sample can be found in the root of this repo.


In [1]:
import os
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process environment.
_ = load_dotenv()

# Connection settings: the key is mandatory, a blank base URL becomes None,
# and the model name falls back to "gpt-5".
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
OPENAI_BASE = os.environ.get("OPENAI_BASE", "").strip() or None
MODEL = os.environ.get("OPENAI_MODEL", "gpt-5")

# Stop early when no key is configured; the print below confirms what was loaded.
assert OPENAI_API_KEY, "Please set OPENAI_API_KEY in a .env file next to this notebook."
print("API key loaded. Model:", MODEL, "| Base:", OPENAI_BASE or "default")

API key loaded. Model: gpt-5 | Base: default


In [2]:
import io, json, re, base64
from typing import Dict, Any, List, Tuple
from io import BytesIO
from pptx import Presentation
from pptx.util import Pt, Inches
from pptx.enum.shapes import MSO_SHAPE_TYPE
from PIL import Image, ImageDraw, ImageFont

# OpenAI-compatible SDK
from openai import OpenAI
# Pass base_url only when a custom endpoint is configured; otherwise rely on the SDK default.
_client_kwargs = {"api_key": OPENAI_API_KEY}
if OPENAI_BASE:
    _client_kwargs["base_url"] = OPENAI_BASE
client = OpenAI(**_client_kwargs)

# EMU <-> px (914400 EMU per inch, 96 px per inch)
EMU_PER_IN, DPI = 914400, 96


def emu2px(v):
    """Convert a length in EMU to whole pixels, rounded to the nearest pixel."""
    inches = float(v) / EMU_PER_IN
    return int(round(inches * DPI))


def px2emu(v):
    """Convert a length in pixels to whole EMU, rounded to the nearest EMU."""
    inches = float(v) / DPI
    return int(round(inches * EMU_PER_IN))

**Part II: Rematch**

   - In this part, we define utility functions that extract the information GPT-5 needs for image–text rematching.

In [3]:
# Dump the shapes of a .pptx into a plain dict so that later stages can refer to shapes by id.
def ppt_to_json(pptx_path: str, max_slides: int = 3) -> Dict[str, Any]:
    """
    Describe the first ``max_slides`` slides of ``pptx_path`` as JSON-serializable data.

    Returns ``{"slides": [...]}``. Every slide entry holds its ``index``, its ``size_px``
    and a ``shapes`` list. Each shape gets the id ``s<slide>_sh<shape>`` (both numbers are
    enumeration positions), a pixel ``bbox_px`` and a ``type`` of ``"image"``, ``"text"``
    or ``"other"``; text shapes additionally carry their stripped ``text``.
    """
    def _describe(si, i, shape):
        # Geometry is converted first, then the shape is classified.
        record = {
            "id": f"s{si}_sh{i}",
            "bbox_px": {
                "x": emu2px(shape.left),
                "y": emu2px(shape.top),
                "w": emu2px(shape.width),
                "h": emu2px(shape.height),
            },
            "type": "other",
        }
        if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            record["type"] = "image"
        elif getattr(shape, "has_text_frame", False) and shape.has_text_frame:
            record["type"] = "text"
            # One string per paragraph: joined run texts, or the paragraph text when it has no runs.
            paragraphs = [
                "".join([run.text for run in para.runs]) if para.runs else para.text
                for para in shape.text_frame.paragraphs
            ]
            record["text"] = "\n".join(paragraphs).strip()
        return record

    deck = Presentation(pptx_path)
    described = []
    for si, slide in enumerate(deck.slides):
        if si >= max_slides:
            break
        described.append({
            "index": si,
            "size_px": {"w": emu2px(deck.slide_width), "h": emu2px(deck.slide_height)},
            "shapes": [_describe(si, i, shape) for i, shape in enumerate(slide.shapes)],
        })
    return {"slides": described}

In [4]:
# This function processes images extracted from slides into url, which will appear in instruction used by GPT5 in rematch.
def _thumb_b64_from_picture_shape(pic_shape, max_side=256) -> str:
    """
    Extract embedded image from PPTX picture shape, make a small PNG thumbnail, return base64 data URL.
    """
    try:
        blob = pic_shape.image.blob  # python-pptx Picture.image -> Image object with .blob
    except Exception:
        return ""
    img = Image.open(BytesIO(blob)).convert("RGB")
    w, h = img.size
    if max(w, h) > max_side:
        ratio = max_side / max(w, h)
        img = img.resize((int(w*ratio), int(h*ratio)))
    bio = BytesIO()
    img.save(bio, format="PNG")
    b64 = base64.b64encode(bio.getvalue()).decode("utf-8")
    return f"data:image/png;base64,{b64}"

# Gather everything the Rematch stage needs: the slide description plus image and caption lists.
def collect_rematch_catalog(pptx_path: str, max_slides: int = 6):
    """
    Scan the first ``max_slides`` slides and return a 3-tuple:
      - slides_json: result of ``ppt_to_json`` for the same deck (for later planning)
      - images_info: [{"slide": si, "id": sid, "image_b64": data_url}] for picture shapes
      - captions_info: [{"slide": si, "id": sid, "text": text}] for non-empty text shapes
    """
    deck = Presentation(pptx_path)
    slides_json = ppt_to_json(pptx_path, max_slides=max_slides)

    def _shape_text(shape):
        # Joined run texts per paragraph; falls back to paragraph.text when that join is empty.
        pieces = [("".join(r.text for r in p.runs) or p.text) for p in shape.text_frame.paragraphs]
        return "\n".join(pieces).strip()

    images_info, captions_info = [], []
    for si, slide in enumerate(deck.slides):
        if si >= max_slides:
            break
        for i, shape in enumerate(slide.shapes):
            sid = f"s{si}_sh{i}"
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                images_info.append({
                    "slide": si,
                    "id": sid,
                    "image_b64": _thumb_b64_from_picture_shape(shape),
                })
                continue
            if not (getattr(shape, "has_text_frame", False) and shape.has_text_frame):
                continue
            txt = _shape_text(shape)
            if txt:
                captions_info.append({"slide": si, "id": sid, "text": txt})
    return slides_json, images_info, captions_info

In [5]:
# System prompt for the Rematch stage: asks the model to pick one caption id per image
# and to answer with a JSON object holding a 'pairs' array (consumed by
# synthesize_set_text_ops_for_rematch via mapping["pairs"]).
REMATCH_SYSTEM_PROMPT = (
    "You are an image–caption matcher. For each slide image, pick the single best caption "
    "from the provided caption pool. Return ONLY JSON with an array 'pairs': "
    "[{'slide': <int>, 'image': '<shape_id>', 'caption': '<caption_shape_id>'}, ...]. "
    "No markdown, no extra text."
)

# Build the multimodal user message content for the Rematch request.
def build_rematch_messages(images_info: List[Dict[str,Any]], captions_info: List[Dict[str,Any]]):
    """
    Return the ``content`` list of one user message: a text item listing the caption pool,
    followed by, for every image, a text item naming its id and either an ``image_url``
    item or a "(no preview available)" placeholder.
    """
    def _pool_line(c):
        # Captions longer than 300 characters are clipped to 297 plus an ellipsis to save tokens.
        text = c["text"]
        text = text if len(text) <= 300 else text[:297] + "…"
        return f"- {c['id']} (slide {c['slide']}): {text}"

    pool_txt = "Caption pool:\n" + "\n".join(_pool_line(c) for c in captions_info)
    content = [{"type":"text","text": pool_txt + "\n\nNow match each image below to exactly one caption ID."}]

    for img in images_info:
        # The id label always precedes the picture so the model can tie them together.
        content.append({"type":"text","text": f"Image ID: {img['id']} (slide {img['slide']})"})
        if not img.get("image_b64"):
            content.append({"type":"text","text":"(no preview available)"})
        else:
            content.append({"type":"image_url","image_url":{"url": img["image_b64"]}})
    return content

# Rematch: ask the model which caption belongs to which image.
def call_llm_for_rematch(images_info: List[Dict[str,Any]], captions_info: List[Dict[str,Any]]) -> Dict[str, Any]:
    """
    Send the image/caption catalog to the model and return the parsed JSON answer.

    A request with ``response_format={"type":"json_object"}`` is tried first; if anything
    in that attempt fails, a plain request is made and the first ``{...}`` span of the
    reply is parsed. The result is also written to ``rematch_result.json`` (best effort).

    Raises:
        ValueError: when the fallback reply contains no JSON object.
    """
    save_path = "rematch_result.json"
    messages = [
        {"role":"system","content": REMATCH_SYSTEM_PROMPT},
        {"role":"user","content": build_rematch_messages(images_info, captions_info)},
    ]

    try:
        # Preferred path: structured JSON response.
        structured = client.chat.completions.create(
            model=MODEL,
            response_format={"type":"json_object"},
            messages=messages,
        )
        result = json.loads(structured.choices[0].message.content)
    except Exception:
        # Fallback: no response_format, pull the JSON object out of free text.
        plain = client.chat.completions.create(model=MODEL, messages=messages)
        found = re.search(r"\{.*\}", plain.choices[0].message.content, flags=re.S)
        if found is None:
            raise ValueError("Rematch model did not return JSON.")
        result = json.loads(found.group(0))

    # Keep a copy on disk for inspection; a failed write only produces a warning.
    try:
        with open(save_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"[OK] Rematch result saved to {save_path}")
    except Exception as e:
        print(f"[WARN] Could not save rematch result: {e}")

    return result

**Part III: Relayout**

   - In this part, we define utility functions that give GPT-5 the information it needs for slide relayout and that apply the plan it returns.

In [6]:
def _load_font(size: int):
    """
    Return a PIL font of the requested ``size`` for text measurement.

    Well-known TrueType files on Windows, Linux and macOS are tried in order. If none
    can be loaded, Pillow's built-in default font is used: first at the requested size
    (supported by newer Pillow releases), then the fixed-size bitmap default as a last
    resort. With that last resort the measured widths do not scale with ``size``.
    """
    candidate_paths = [
        "C:/Windows/Fonts/arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/System/Library/Fonts/Supplemental/Arial.ttf",
        "/Library/Fonts/Arial.ttf",
    ]
    for p in candidate_paths:
        try:
            return ImageFont.truetype(p, size)
        except Exception:
            continue  # missing or unreadable font file: try the next candidate
    try:
        # A scalable default keeps measurements proportional to the requested size.
        return ImageFont.load_default(size=size)
    except Exception:
        # Older Pillow (no `size` argument) or no FreeType support.
        return ImageFont.load_default()

def measure_and_wrap(text, size, max_w, line_spacing=1.25):
    """
    Greedily word-wrap ``text`` to ``max_w`` pixels at font ``size`` and measure the result.

    Paragraphs are split on newlines and words on single spaces; a word is moved to a new
    line when appending it would exceed ``max_w`` (a single over-long word is not broken).

    Returns:
        ``(width, height, lines, font)`` where ``width`` is the widest line in pixels and
        ``height`` is ``int(size*line_spacing)`` times the number of lines.
    """
    font = _load_font(size)
    canvas = ImageDraw.Draw(Image.new("RGB",(10,10)))

    def _text_width(s):
        # Right edge of the rendered bounding box.
        return canvas.textbbox((0,0), s, font=font)[2]

    lines = []
    for para in text.split("\n"):
        if para == "":
            # Blank paragraphs are kept as empty lines.
            lines.append("")
            continue
        current = ""
        for word in para.split(" "):
            candidate = (current + " " + word).strip() if current else word
            if _text_width(candidate) > max_w and current:
                lines.append(current)
                current = word
            else:
                current = candidate
        lines.append(current)

    height = int(size*line_spacing)*len(lines)
    width = max([0] + [_text_width(ln) for ln in lines])
    return width, height, lines, font

def fit_textbox(tb, min_pt=10, max_pt=32, line_spacing=1.25, margin_px=24):
    """
    Shrink and re-wrap the text of ``tb`` until it fits inside the shape minus margins.

    Font sizes are tried from ``max_pt`` downwards in steps of 2, with ``min_pt`` always
    tried last. On the first size that fits, the text frame is rewritten with one
    paragraph per wrapped line at that size.

    Args:
        tb: shape exposing ``text_frame``, ``width`` and ``height`` (EMU).
        min_pt, max_pt: font size bounds in points; coerced to ``int``.
        line_spacing: line-height multiplier used for the height estimate.
        margin_px: margin in pixels kept on every side of the box.

    Returns:
        True if a fitting size was found; False otherwise, in which case the existing
        runs are only set to ``min_pt`` without re-wrapping.
    """
    # Sizes may arrive as floats (e.g. 12.0) from an LLM-written policy; range() needs ints.
    min_pt, max_pt = int(min_pt), int(max_pt)
    text = "\n".join(("".join(r.text for r in p.runs) or p.text) for p in tb.text_frame.paragraphs).strip()
    W, H = emu2px(tb.width), emu2px(tb.height)
    inner_w, inner_h = W - 2*margin_px, H - 2*margin_px

    candidate_sizes = list(range(max_pt, min_pt-1, -2))
    # With an odd span the step of 2 would skip min_pt; make sure it is tried too.
    if candidate_sizes and candidate_sizes[-1] != min_pt:
        candidate_sizes.append(min_pt)

    for s in candidate_sizes:
        # Wrap against at least 10 px so tiny boxes still produce a measurement.
        w, h, lines, _font = measure_and_wrap(text, s, max(10, inner_w), line_spacing)
        if w <= inner_w and h <= inner_h:
            tf = tb.text_frame
            tf.clear()
            for i, ln in enumerate(lines):
                p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
                p.text = ln
                for r in p.runs:
                    r.font.size = Pt(s)
            return True

    # Nothing fits: fall back to the minimum size on the existing runs.
    for p in tb.text_frame.paragraphs:
        for r in p.runs:
            r.font.size = Pt(min_pt)
    return False

In [7]:
def _normalize_id(item):
    if isinstance(item, str): return item
    if isinstance(item, dict):
        for k in ("id","shape_id","shape","target","sid"):
            v = item.get(k)
            if isinstance(v, str): return v
    return None

def _to_list_ids(x):
    """
    Normalize a "targets" value into a list of shape-id strings.

    ``None`` gives ``[]``. A list or tuple is mapped through ``_normalize_id`` and
    entries that do not resolve to an id are dropped. Any other value is treated as a
    single candidate id.
    """
    if x is None:
        return []
    if isinstance(x, (list, tuple)):
        # _normalize_id already returns plain strings unchanged, so no extra string case is needed.
        resolved = (_normalize_id(it) for it in x)
        return [sid for sid in resolved if sid is not None]
    sid = _normalize_id(x)
    return [sid] if sid is not None else []

# unwrap raw contents GPT5 returns.
def _unwrap_plan(raw_plan):
    if isinstance(raw_plan, str):
        try: raw_plan = json.loads(raw_plan)
        except Exception: return {}
    if not isinstance(raw_plan, dict): return {}
    for k in ("plan","result","data","output"):
        if k in raw_plan and isinstance(raw_plan[k], dict):
            return raw_plan[k]
    return raw_plan

def _to_float(v):
    if v is None: return None
    if isinstance(v, (int,float)): return float(v)
    if isinstance(v, str):
        s = v.strip()
        try:
            if s.endswith("%"): return float(s[:-1]) / 100.0
            return float(s)
        except Exception:
            return None
    return None

def _sanitize_bbox(b):
    """
    Normalize a bounding box into a dict with float "x", "y", "w", "h" entries.

    Accepts a list/tuple of at least four values (x, y, w, h in that order) or a dict
    keyed by "x"/"y"/"w"/"h" with "left"/"top"/"width"/"height" as fallbacks for missing
    keys. Components that cannot be converted by ``_to_float`` are omitted. Any other
    input gives ``{}``.
    """
    if isinstance(b, (list,tuple)) and len(b) >= 4:
        raw = {"x": b[0], "y": b[1], "w": b[2], "h": b[3]}
    elif isinstance(b, dict):
        raw = {
            "x": b.get("x", b.get("left")),
            "y": b.get("y", b.get("top")),
            "w": b.get("w", b.get("width")),
            "h": b.get("h", b.get("height")),
        }
    else:
        return {}

    cleaned = {}
    for key in ("x", "y", "w", "h"):
        number = _to_float(raw[key])
        if number is not None:
            cleaned[key] = number
    return cleaned

def _extract_angle(obj):
    for k in ("angle","rotation","rotation_deg","deg"):
        if k in obj:
            try: return float(obj[k])
            except Exception: pass
    return None

def normalize_operations(plan):
    """
    Flatten the operations of a plan into a uniform list of op dicts.

    Operations are read from ``plan["operations"]`` and from every
    ``plan["slides"][i]["operations"]``. Each container may be a list of op objects
    (named by their "op" or "type" key) or a dict mapping op name -> payload. Dict
    payloads are expanded into ``{"op", "targets", "bbox", "style", "text", "angle",
    "slide"}``; other payloads are treated as a bare target list. Ops of kind swap,
    set_style, fit_text, set_text or set_rotation without any target are dropped.

    Returns:
        list of normalized op dicts (empty when ``plan`` is not a dict).
    """
    if not isinstance(plan, dict):
        return []
    collected = []

    def _append_op(op_name, obj, slide=None):
        if isinstance(obj, dict):
            collected.append({
                "op": op_name,
                "targets": _to_list_ids(obj.get("targets", obj.get("target"))),
                "bbox": _sanitize_bbox(obj.get("bbox")),
                "style": obj.get("style", {}) or {},
                "text": obj.get("text"),
                "angle": _extract_angle(obj),
                # An explicit "slide" on the op wins over the enclosing slide's index.
                "slide": obj.get("slide", slide),
            })
        else:
            collected.append({"op": op_name, "targets": _to_list_ids(obj), "slide": slide})

    def _collect(ops, default_slide):
        # One walker for both the top-level and the per-slide containers.
        if isinstance(ops, dict):
            for name, payload in ops.items():
                _append_op(name, payload, slide=default_slide)
        elif isinstance(ops, list):
            for item in ops:
                if not isinstance(item, dict):
                    continue
                name = item.get("op") or item.get("type")
                if not name:
                    continue
                _append_op(name, item, slide=default_slide)

    _collect(plan.get("operations", []), None)

    slide_plans = plan.get("slides")
    if isinstance(slide_plans, list):
        for splan in slide_plans:
            if not isinstance(splan, dict):
                # Malformed entry (e.g. null or a bare number): skip it instead of crashing.
                continue
            _collect(splan.get("operations", []), splan.get("index"))

    needs_targets = ("swap","set_style","fit_text","set_text","set_rotation")
    return [
        op for op in collected
        if op.get("op") and not (op["op"] in needs_targets and not op.get("targets"))
    ]

# Apply a plan (hand-built or produced by GPT-5) to a deck and write the result to a new .pptx.
def execute_plan(pptx_in: str, plan_raw: Dict[str, Any], pptx_out: str):
    """
    Load ``pptx_in``, apply every operation of ``plan_raw`` and save to ``pptx_out``.

    ``plan_raw`` is unwrapped with ``_unwrap_plan`` and flattened with
    ``normalize_operations``. Shapes are addressed as ``s<slide>_sh<shape>``
    (enumeration positions within the loaded deck). Supported ops:

      - ``set_rotation``: set the rotation of the targets to ``angle`` (0 if missing/invalid)
      - ``swap``: exchange position and size of exactly two targets
      - ``move_resize``: apply ``bbox`` in the units given by ``policy["units"]``
      - ``set_style``: set ``style["font_size_pt"]`` on every run of the targets
      - ``fit_text``: call ``fit_textbox`` with the policy's font/margin settings
      - ``set_text``: replace the text of the targets (skipped when the op has no text)

    Unknown ops and target ids that do not exist on the slide are ignored.
    """
    prs = Presentation(pptx_in)
    plan = _unwrap_plan(plan_raw)
    # Defaults apply only when the plan carries no usable "policy" object.
    default_policy = {
        "margins_px": 24,
        "font_min_pt": 10,
        "font_max_pt": 32,
        "line_spacing": 1.25,
        "units": "px",
        "zero_rotation_first": True
    }
    policy = plan.get("policy", default_policy)
    if not isinstance(policy, dict):
        # e.g. "policy": null in the model output; policy.get(...) below needs a dict.
        policy = default_policy
    ops_all = normalize_operations(plan) or []

    SW, SH = emu2px(prs.slide_width), emu2px(prs.slide_height)

    def _apply_bbox(s, bbox, units):
        """Move/resize shape ``s``; only the bbox components that are present are applied."""
        vals = [bbox.get(k) for k in ("x","y","w","h") if k in bbox]
        if units not in ("px","fraction"):
            # Units not declared: treat the box as fractional when every value lies in [0, 1].
            units = "fraction" if (vals and all(isinstance(v,(int,float)) and 0<=v<=1 for v in vals)) else "px"
        if units == "fraction":
            if "x" in bbox: s.left   = px2emu(int(float(bbox["x"]) * SW))
            if "y" in bbox: s.top    = px2emu(int(float(bbox["y"]) * SH))
            if "w" in bbox: s.width  = px2emu(int(float(bbox["w"]) * SW))
            if "h" in bbox: s.height = px2emu(int(float(bbox["h"]) * SH))
        else:
            if "x" in bbox: s.left   = px2emu(int(float(bbox["x"])))
            if "y" in bbox: s.top    = px2emu(int(float(bbox["y"])))
            if "w" in bbox: s.width  = px2emu(int(float(bbox["w"])))
            if "h" in bbox: s.height = px2emu(int(float(bbox["h"])))

    def _op_applies_to(op, si):
        """True when the op has no slide index or its index refers to slide ``si``."""
        ref = op.get("slide")
        if ref is None or ref == si:
            return True
        # Models sometimes emit the index as a string ("3"); accept plain digit strings.
        return isinstance(ref, str) and ref.strip().isdigit() and int(ref.strip()) == si

    def _text_shapes(sid_map, targets):
        """Yield the existing target shapes that have a text frame."""
        for tid in targets:
            s = sid_map.get(tid)
            if s is None or not getattr(s, "has_text_frame", False):
                continue
            yield s

    for si, slide in enumerate(prs.slides):
        if policy.get("zero_rotation_first"):
            for s in slide.shapes:
                # Best effort: not every shape type accepts a rotation.
                try: s.rotation = 0
                except Exception: pass

        sid_map = {f"s{si}_sh{i}": s for i, s in enumerate(slide.shapes)}
        ops = [op for op in ops_all if _op_applies_to(op, si)]

        for op in ops:
            name = op.get("op")
            targets = _to_list_ids(op.get("targets"))

            if name == "set_rotation":
                angle = op.get("angle")
                if angle is None: angle = 0
                try: angle = float(angle)
                except Exception: angle = 0
                for tid in targets:
                    s = sid_map.get(tid)
                    if s is None: continue
                    try: s.rotation = angle
                    except Exception: pass

            elif name == "swap" and len(targets) == 2:
                a, b = sid_map.get(targets[0]), sid_map.get(targets[1])
                if a is None or b is None: continue
                a.left, b.left = b.left, a.left
                a.top,  b.top  = b.top,  a.top
                a.width, b.width = b.width, a.width
                a.height, b.height = b.height, a.height

            elif name == "move_resize":
                bbox = op.get("bbox", {}) or {}
                units = policy.get("units", "px")
                for tid in targets:
                    s = sid_map.get(tid)
                    if s is None: continue
                    _apply_bbox(s, bbox, units)

            elif name == "set_style":
                st = op.get("style", {}) or {}
                fs = st.get("font_size_pt")
                if not fs:
                    continue  # font size is the only style attribute handled here
                for s in _text_shapes(sid_map, targets):
                    for p in s.text_frame.paragraphs:
                        for r in p.runs:
                            try: r.font.size = Pt(int(float(fs)))
                            except Exception: pass

            elif name == "fit_text":
                for s in _text_shapes(sid_map, targets):
                    fit_textbox(
                        s,
                        min_pt=policy.get("font_min_pt") or 10,
                        max_pt=policy.get("font_max_pt") or 32,
                        line_spacing=policy.get("line_spacing") or 1.25,
                        margin_px=policy.get("margins_px") or 24,
                    )

            elif name == "set_text":
                new_text = op.get("text")
                if new_text is None:
                    # Normalized ops carry "text": None when the model omitted it; writing
                    # str(None) would put the literal "None" on the slide, so skip instead.
                    continue
                for s in _text_shapes(sid_map, targets):
                    tf = s.text_frame; tf.clear()
                    p = tf.paragraphs[0]; p.text = str(new_text)

    prs.save(pptx_out)

In [None]:
''' 
Instruction I used that contains explicit numbers
'''
# PLAN_SCHEMA = {
#   "version": "1.0",
#   "policy": {
#     "margins_px": 24,
#     "font_min_pt": 10,
#     "font_max_pt": 32,
#     "line_spacing": 1.25,
#     "units": "fraction",      # normalized [0..1]
#     "avoid_overlap": True,
#     "respect_margins": True,
#     "zero_rotation_first": True
#   },
#   "operations": [
#     # examples:
#     # {"op":"set_rotation","targets":["s0_sh1"],"angle":0,"slide":0}
#     # {"op":"move_resize","targets":["s0_sh1"],"bbox":{"x":0.06,"y":0.06,"w":0.54,"h":0.88}}
#     # {"op":"fit_text","targets":["s0_sh2"]}
#   ]
# }

# SYSTEM_PROMPT = (
#     "You are a layout planner. Return ONLY a valid JSON object following the given schema. "
#     "No markdown, no prose. Every operation MUST include an 'op' and a 'targets' array (even if single target). "
#     "Use fractional coordinates [0..1] for 'bbox'. Keep ~24px margins and avoid overlaps. "
#     "If any shape is rotated, add 'set_rotation' with angle=0 before other ops."
# )

# def build_user_prompt(instruction: str, slides_json: Dict[str, Any]) -> str:
#     canonical_slots = {
#         "two_column": {
#             "image": {"x":0.06,"y":0.06,"w":0.54,"h":0.88},
#             "text":  {"x":0.62,"y":0.06,"w":0.32,"h":0.88}
#         },
#         "title_content": {
#             "title": {"x":0.06,"y":0.06,"w":0.88,"h":0.14},
#             "image": {"x":0.06,"y":0.24,"w":0.56,"h":0.70},
#             "text":  {"x":0.64,"y":0.24,"w":0.30,"h":0.70}
#         }
#     }
#     return (
#         "Instruction:\n" + instruction + "\n\n" +
#         "Slides JSON:\n" + json.dumps(slides_json, ensure_ascii=False) + "\n\n" +
#         "Schema:\n" + json.dumps(PLAN_SCHEMA, ensure_ascii=False) + "\n\n" +
#         "Canonical slots:\n" + json.dumps(canonical_slots) + "\n\n" +
#         "Tasks:\n"
#         "1) If any image/text is rotated, add 'set_rotation' to 0 degrees first.\n"
#         "2) Choose a clean layout (prefer two_column) and place image/text via 'move_resize' with fractional bbox.\n"
#         "3) Add 'fit_text' for all text; respect ~24px margins; avoid overlaps.\n"
#         "Return JSON only."
#     )

''' 
Instruction with no explicit number as parameters. The rearranged .pptx in the repo is generated based on this instruction.
'''
# Skeleton of the plan the model is asked to fill in. It is serialized with json.dumps
# into the user prompt, so the model only sees the keys below with null values and an
# empty "operations" list -- the inline # comments are NOT part of what is sent.
PLAN_SCHEMA = {
  "version": "1.0",
  "policy": {
    "margins_px": None,          # model may set a sensible pixel margin internally
    "font_min_pt": None,         # model may choose a reasonable minimum font
    "font_max_pt": None,         # model may choose a reasonable maximum font
    "line_spacing": None,        # model may set a readable line spacing multiplier
    "units": None,               # should be "fraction" (normalized [0..1]) or "px"
    "avoid_overlap": None,       # model should ensure shapes do not overlap
    "respect_margins": None,     # model should keep clear margins to slide edges
    "zero_rotation_first": None  # model may normalize rotation before placement
  },
  "operations": [
    # each operation object should include:
    # {
    #   "op": "<operation_name>",                  # e.g., "move_resize", "fit_text", "set_rotation", "set_text", "set_style", "swap"
    #   "targets": ["<shape_id>", ...],            # e.g., ["s0_sh1"]
    #   "bbox": {"x": <float>, "y": <float>, "w": <float>, "h": <float>},   # if applicable (fractional [0..1] or px per policy.units)
    #   "angle": <float>,                          # if applicable (e.g., 0 for upright orientation)
    #   "text": "<string>",                        # if applicable
    #   "style": {"font_size_pt": <int>, ...},     # if applicable
    #   "slide": <int>                             # slide index if operation targets a specific slide
    # }
  ]
}

# System prompt for the Relayout stage: qualitative layout guidance plus a self-check
# list, deliberately without concrete coordinates or sizes. Sent as the "system" message
# by call_llm_for_plan.
SYSTEM_PROMPT = (
    "You are a layout planner for slide decks. Your job is to output ONLY a valid JSON object that conforms to the "
    "provided schema keys. Do not include markdown or explanations. Every operation MUST include an 'op' field and a "
    "'targets' array (even if there is a single target).\n\n"
    "General guidance (no numeric hints):\n"
    "- Operate per slide. Identify the main image(s) (picture shapes) and main caption(s) (text shapes). Resolve ties by larger area and centrality.\n"
    "- Use normalized fractional coordinates [0..1] for any 'bbox' you output when policy.units is 'fraction'.\n"
    "- Keep all shapes fully inside slide bounds. Do not overlap shapes. Keep visually clear margins around edges and between blocks.\n"
    "- Prefer a clean, balanced composition. A common pattern is a two-block layout where the image region is larger than the text region, with aligned edges and a clear gap between them.\n"
    "- Preserve visual plausibility of images (avoid extreme stretch). If needed, place the image within its rectangle in a visually centered way.\n"
    "- If rotation prevents clean layout, include a 'set_rotation' operation to normalize to an upright orientation before placement.\n"
    "- Always include 'fit_text' for text after placement so it remains readable within its box.\n"
    "- Keep the overall composition consistent across slides, but adapt to content density (longer captions deserve more area than very short ones).\n"
    "- Do not invent new shapes; only operate on provided shape ids.\n\n"
    "Validation checklist (self-check before returning JSON):\n"
    "1) All bbox values you output are within [0..1] when using fractional units; each placed shape satisfies x>=0, y>=0, x+w<=1, y+h<=1.\n"
    "2) No two placed shapes overlap; there is a visible gap between blocks and from slide edges.\n"
    "3) Image region is appropriately larger than text region if using a two-block composition; edges appear aligned and orderly.\n"
    "4) Include 'fit_text' for every text target you place or resize.\n"
    "5) If rotation was an issue, you included 'set_rotation' before other placement ops.\n\n"
    "Output JSON only—no prose."
)

def build_user_prompt(instruction: str, slides_json: Dict[str, Any]) -> str:
    """
    Assemble the Relayout user prompt.

    The prompt is the concatenation of four sections: the free-text instruction, the
    slide description as JSON, the plan schema as JSON, and a fixed task list.
    """
    instruction_section = "Instruction:\n" + instruction + "\n\n"
    slides_section = "Slides JSON:\n" + json.dumps(slides_json, ensure_ascii=False) + "\n\n"
    schema_section = "Schema (structure only; no numeric hints):\n" + json.dumps(PLAN_SCHEMA, ensure_ascii=False) + "\n\n"
    tasks_section = (
        "Tasks:\n"
        "1) For each slide, produce a coherent layout plan using operations over the provided shape ids.\n"
        "2) If any shape (image or text) is rotated at a non-zero angle, you MUST include a 'set_rotation' operation with angle=0 BEFORE any other placement ops.\n"
        "3) Use 'move_resize' to place shapes with fractional [0..1] coordinates (or px if you set policy.units accordingly).\n"
        "4) Always ensure images are upright and not skewed. Correct any rotation before resizing.\n"
        "5) Add 'fit_text' for all text to ensure readability inside its assigned box while avoiding overlaps and maintaining margins.\n"
        "6) Keep slides clean, balanced, and consistent.\n"
        "Return JSON only."
    )
    return instruction_section + slides_section + schema_section + tasks_section

In [9]:
def _extract_json(text: str) -> Dict[str, Any]:
    try: return json.loads(text)
    except Exception: pass
    m = re.search(r"\{.*\}", text, flags=re.S)
    if m:
        return json.loads(m.group(0))
    raise ValueError("Model did not return valid JSON.")

# Relayout: ask the model for a layout plan.
def call_llm_for_plan(instruction: str, slides_json: Dict[str, Any]) -> Dict[str, Any]:
    """
    Request a layout plan for ``slides_json`` and return it as parsed JSON.

    A request with ``response_format={"type":"json_object"}`` is tried first; if anything
    in that attempt fails, a plain request is sent and its reply is parsed with
    ``_extract_json``.
    """
    messages = [
        {"role":"system","content": SYSTEM_PROMPT},
        {"role":"user","content": build_user_prompt(instruction, slides_json)},
    ]
    try:
        structured = client.chat.completions.create(
            model=MODEL,
            response_format={"type":"json_object"},
            messages=messages,
        )
        plan = json.loads(structured.choices[0].message.content)
    except Exception:
        # Fallback: same messages without response_format, tolerant JSON extraction.
        plain = client.chat.completions.create(model=MODEL, messages=messages)
        plan = _extract_json(plain.choices[0].message.content)
    return plan

In [10]:
def synthesize_set_text_ops_for_rematch(pptx_path: str, mapping: Dict[str, Any], captions_info: List[Dict[str,Any]]) -> Dict[str, Any]:
    """
    Turn the model's image->caption mapping into a plan of ``set_text`` operations.

    mapping JSON (from model) looks like:
      {"pairs":[{"slide":0,"image":"s0_sh1","caption":"s3_sh5"}, ...]}
    For every pair, the largest text shape (by area) on the pair's slide is set to the
    text of the caption id. Pairs that are malformed, name an unknown caption id, or
    point at a slide index outside the deck are skipped.

    Returns:
        A plan dict ``{"version", "policy", "operations"}`` suitable for ``execute_plan``.
    """
    # Build lookup: caption_id -> text
    cap_text = {c["id"]: c["text"] for c in captions_info}

    prs = Presentation(pptx_path)
    n_slides = len(prs.slides)

    # The mapping is model output: tolerate a missing or ill-typed "pairs" value.
    pairs = mapping.get("pairs", []) if isinstance(mapping, dict) else []
    if not isinstance(pairs, list):
        pairs = []

    ops = []
    for pair in pairs:
        if not isinstance(pair, dict):
            continue
        tgt_slide = pair.get("slide")
        cap_id = pair.get("caption")
        if not isinstance(cap_id, str) or cap_id not in cap_text:
            continue
        # Only genuine in-range integer indices are usable (bool is an int subclass).
        if not isinstance(tgt_slide, int) or isinstance(tgt_slide, bool) or not (0 <= tgt_slide < n_slides):
            continue

        # find largest text shape on tgt_slide
        slide = prs.slides[tgt_slide]
        txt_shapes = [(i, s) for i, s in enumerate(slide.shapes) if getattr(s,"has_text_frame",False) and s.has_text_frame]
        if not txt_shapes:
            # if no text shape exists, create one occupying a reasonable area (right column)
            # NOTE(review): this textbox is added to the in-memory deck only; the deck is
            # never saved here, so a consumer that reloads pptx_path will not find the id
            # below. Confirm whether the box should be created by the plan executor instead.
            SW, SH = emu2px(prs.slide_width), emu2px(prs.slide_height)
            x, y, w, h = int(0.62*SW), int(0.06*SH), int(0.32*SW), int(0.88*SH)
            slide.shapes.add_textbox(px2emu(x), px2emu(y), px2emu(w), px2emu(h))
            # assign a synthetic id: use last index assumption
            tid = f"s{tgt_slide}_sh{len(slide.shapes)-1}"
        else:
            # choose the largest by area
            idx, _largest = max(txt_shapes, key=lambda t: emu2px(t[1].width)*emu2px(t[1].height))
            tid = f"s{tgt_slide}_sh{idx}"

        ops.append({"op":"set_text","targets":[tid],"text":cap_text[cap_id], "slide": tgt_slide})

    return {"version":"1.0","policy":{"margins_px":24,"font_min_pt":10,"font_max_pt":32,"line_spacing":1.25,"units":"px"}, "operations": ops}

**Part IV: Load Slides & Rearrange**

   - In this part, we load the input slides and run the two stages in order: Rematch, then Relayout.

In [11]:
# Paths to .pptx files: the input deck, the intermediate deck written after Rematch,
# and the final deck written after Relayout.
INPUT_PPTX      = "coco_raw_20.pptx"   # your mismatched, dirty deck
REMATCHED_PPTX  = "coco_rematched.pptx"
OUTPUT_PPTX     = "coco_relayout_out.pptx"

MAX_SLIDES = 20  # keep prompts reasonable for vision + planning

# Build catalogs for rematch: slide description, image thumbnails and caption texts
slides_json_dirty, images_info, captions_info = collect_rematch_catalog(INPUT_PPTX, max_slides=MAX_SLIDES)
print("Slides in prompt:", len(slides_json_dirty["slides"]), "| images:", len(images_info), "| captions:", len(captions_info))

# Ask GPT to rematch (image -> caption_id); this makes an API call and also writes
# rematch_result.json. Only the first 1000 characters of the answer are printed.
rematch_map = call_llm_for_rematch(images_info, captions_info)
print("Rematch map (truncated):")
print(json.dumps(rematch_map, indent=2, ensure_ascii=False)[:1000], "...")

# Synthesize and apply set_text ops to produce 'rematched' deck (INPUT_PPTX itself is not modified)
plan_settext = synthesize_set_text_ops_for_rematch(INPUT_PPTX, rematch_map, captions_info)
execute_plan(INPUT_PPTX, plan_settext, REMATCHED_PPTX)
print("Rematched deck saved:", REMATCHED_PPTX)

Slides in prompt: 20 | images: 20 | captions: 20
[OK] Rematch result saved to rematch_result.json
Rematch map (truncated):
{
  "pairs": [
    {
      "slide": 0,
      "image": "s0_sh0",
      "caption": "s13_sh1"
    },
    {
      "slide": 1,
      "image": "s1_sh0",
      "caption": "s4_sh1"
    },
    {
      "slide": 2,
      "image": "s2_sh0",
      "caption": "s16_sh1"
    },
    {
      "slide": 3,
      "image": "s3_sh0",
      "caption": "s8_sh1"
    },
    {
      "slide": 4,
      "image": "s4_sh0",
      "caption": "s9_sh1"
    },
    {
      "slide": 5,
      "image": "s5_sh0",
      "caption": "s12_sh1"
    },
    {
      "slide": 6,
      "image": "s6_sh0",
      "caption": "s10_sh1"
    },
    {
      "slide": 7,
      "image": "s7_sh0",
      "caption": "s5_sh1"
    },
    {
      "slide": 8,
      "image": "s8_sh0",
      "caption": "s14_sh1"
    },
    {
      "slide": 9,
      "image": "s9_sh0",
      "caption": "s3_sh1"
    },
    {
      "slide": 10,
      "image

In [12]:
# Plan a clean layout (two-column, margins, no overlap) on the rematched deck.
# Requires REMATCHED_PPTX to have been written by the previous cell.
slides_json_after = ppt_to_json(REMATCHED_PPTX, max_slides=MAX_SLIDES)
instruction = (
    "Arrange each slide into a clean two-column layout with the image on the left and its caption on the right. "
    "Use fractional coordinates and ensure: no overlaps; keep ~24px margins; text fully readable. "
    "Add fit_text for all text blocks."
)
# API call: the model returns the layout plan as JSON. The first 1000 characters are
# printed and the full plan is stored in layout_plan.json for inspection.
layout_plan = call_llm_for_plan(instruction, slides_json_after)
print("Layout plan (truncated):")
print(json.dumps(layout_plan, indent=2, ensure_ascii=False)[:1000], "...")
with open("layout_plan.json", "w", encoding="utf-8") as f:
    json.dump(layout_plan, f, indent=2, ensure_ascii=False)
print("Layout plan saved to layout_plan.json")

# Execute final layout plan: reads the rematched deck and writes the final output deck
execute_plan(REMATCHED_PPTX, layout_plan, OUTPUT_PPTX)
print("Final fixed deck saved:", OUTPUT_PPTX)

Layout plan (truncated):
{
  "version": "1.0",
  "policy": {
    "margins_px": null,
    "font_min_pt": null,
    "font_max_pt": null,
    "line_spacing": null,
    "units": "fraction",
    "avoid_overlap": true,
    "respect_margins": true,
    "zero_rotation_first": true
  },
  "operations": [
    {
      "op": "move_resize",
      "targets": [
        "s0_sh0"
      ],
      "bbox": {
        "x": 0.01875,
        "y": 0.0333,
        "w": 0.58,
        "h": 0.9334
      }
    },
    {
      "op": "move_resize",
      "targets": [
        "s0_sh1"
      ],
      "bbox": {
        "x": 0.61875,
        "y": 0.0333,
        "w": 0.3625,
        "h": 0.9334
      }
    },
    {
      "op": "fit_text",
      "targets": [
        "s0_sh1"
      ]
    },
    {
      "op": "move_resize",
      "targets": [
        "s1_sh0"
      ],
      "bbox": {
        "x": 0.01875,
        "y": 0.0333,
        "w": 0.58,
        "h": 0.9334
      }
    },
    {
      "op": "move_resize",
      "targets