In [1]:
import os
from dotenv import load_dotenv

# Load settings from a .env file (when one is found) into the process environment.
_ = load_dotenv()

# Connection settings are read from the environment so no secret lives in the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# A blank or whitespace-only OPENAI_BASE collapses to None.
OPENAI_BASE = os.environ.get("OPENAI_BASE", "").strip() or None
MODEL = os.environ.get("OPENAI_MODEL", "gpt-5")

# Stop right here when the key is missing; later cells cannot work without it.
assert OPENAI_API_KEY, "Please set OPENAI_API_KEY in a .env file next to this notebook."
print(f"API key loaded. Model: {MODEL} | Base: {OPENAI_BASE or 'default'}")

API key loaded. Model: gpt-5 | Base: default


In [None]:
import io, json, re
from typing import Dict, Any, List
from io import BytesIO

from pptx import Presentation
from pptx.util import Pt
from pptx.enum.shapes import MSO_SHAPE_TYPE

from PIL import Image, ImageDraw, ImageFont

# OpenAI-compatible SDK
from openai import OpenAI
# Pass base_url only when a custom endpoint was configured.
if OPENAI_BASE:
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE)
else:
    client = OpenAI(api_key=OPENAI_API_KEY)

# EMU <-> px conversion at a fixed screen resolution.
EMU_PER_IN = 914400
DPI = 96


def emu2px(v):
    """Convert an EMU length to an integer pixel count at ``DPI`` (via ``round``)."""
    inches = float(v) / EMU_PER_IN
    return int(round(inches * DPI))


def px2emu(v):
    """Convert a pixel length at ``DPI`` to an integer EMU count (via ``round``)."""
    inches = float(v) / DPI
    return int(round(inches * EMU_PER_IN))

In [None]:
def ppt_to_json(pptx_path: str, max_slides: int = 3) -> Dict[str, Any]:
    """Describe the first ``max_slides`` slides of a deck as a plain dict.

    Args:
        pptx_path: Path handed to ``Presentation``.
        max_slides: Number of leading slides to include.

    Returns:
        ``{"slides": [...]}``. Each slide entry has its ``index``, the deck
        ``size_px`` and a ``shapes`` list. Each shape has an ``id`` of the
        form ``s<slide>_sh<shape>``, a pixel ``bbox_px`` and a ``type`` of
        ``"image"``, ``"text"`` or ``"other"``; text shapes also carry
        their stripped ``text``.
    """
    deck = Presentation(pptx_path)
    slide_records = []

    for slide_no, slide in enumerate(deck.slides):
        if slide_no >= max_slides:
            break

        shape_records = []
        for shape_no, shape in enumerate(slide.shapes):
            bbox = {
                "x": emu2px(shape.left),
                "y": emu2px(shape.top),
                "w": emu2px(shape.width),
                "h": emu2px(shape.height),
            }
            record = {"id": f"s{slide_no}_sh{shape_no}", "bbox_px": bbox, "type": "other"}

            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                record["type"] = "image"
            elif getattr(shape, "has_text_frame", False) and shape.has_text_frame:
                record["type"] = "text"
                para_texts = []
                for para in shape.text_frame.paragraphs:
                    # Prefer the concatenated run texts; a paragraph without
                    # runs falls back to its own .text.
                    if para.runs:
                        para_texts.append("".join([run.text for run in para.runs]))
                    else:
                        para_texts.append(para.text)
                record["text"] = "\n".join(para_texts).strip()

            shape_records.append(record)

        slide_size = {"w": emu2px(deck.slide_width), "h": emu2px(deck.slide_height)}
        slide_records.append({"index": slide_no, "size_px": slide_size, "shapes": shape_records})

    return {"slides": slide_records}

In [None]:
# Skeleton plan embedded in the user prompt under the "Schema:" heading.
PLAN_SCHEMA = dict(
    version="1.0",
    policy=dict(
        margins_px=24,
        font_min_pt=10,
        font_max_pt=32,
        line_spacing=1.25,
        units="px",
    ),
    operations=[],
)

# System message sent with every planning request; the sentences are joined
# with single spaces.
SYSTEM_PROMPT = " ".join([
    "You are a layout planner. Return ONLY a valid JSON object following the given schema.",
    "No markdown, no prose. Every operation MUST include an 'op' and a 'targets' array (even if single target).",
    'If you have nothing to change, return {"version":"1.0","policy":{...},"operations":[]}.',
])

def build_user_prompt(instruction: str, slides_json: Dict[str, Any]) -> str:
    """Assemble the user message for the planner.

    The message has four sections separated by blank lines: the free-text
    instruction, the slides description as JSON, ``PLAN_SCHEMA`` as JSON,
    and a fixed task list.
    """
    slides_blob = json.dumps(slides_json, ensure_ascii=False)
    schema_blob = json.dumps(PLAN_SCHEMA, ensure_ascii=False)
    task_list = "\n".join([
        "Your tasks:",
        "1) Identify the main image and main text on each slide (largest area if unclear).",
        "2) Add a 'swap' between them.",
        "3) Add 'fit_text' for every text shape; if overflow likely, add 'move_resize' before 'fit_text'.",
        "Return JSON only.",
    ])
    sections = [
        "Instruction:\n" + instruction,
        "Slides JSON:\n" + slides_blob,
        "Schema:\n" + schema_blob,
        task_list,
    ]
    return "\n\n".join(sections)

In [5]:
def _extract_json(text: str) -> Dict[str, Any]:
    """
    Best-effort: find the first top-level JSON object in text and parse it.
    """
    # Quick path
    try:
        return json.loads(text)
    except Exception:
        pass
    # Fallback: regex find {...}
    m = re.search(r"\{.*\}", text, flags=re.S)
    if m:
        try:
            return json.loads(m.group(0))
        except Exception:
            pass
    raise ValueError("Model did not return valid JSON.")

def call_llm_for_plan(instruction: str, slides_json: Dict[str, Any]) -> Dict[str, Any]:
    """Ask the chat model for a layout plan and return it parsed.

    A first request asks for a JSON-object response and parses the reply with
    ``json.loads``. If anything in that attempt raises, a second request is
    sent without ``response_format`` and the reply goes through
    ``_extract_json``.
    """
    user_prompt = build_user_prompt(instruction, slides_json)

    def _messages():
        # Fresh list per request, mirroring two independent calls.
        return [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]

    try:
        strict_reply = client.chat.completions.create(
            model=MODEL,
            response_format={"type": "json_object"},  # some endpoints fully support this
            messages=_messages(),
        )
        return json.loads(strict_reply.choices[0].message.content)
    except Exception:
        # Fallback: plain completion, then pull the JSON out of the text.
        relaxed_reply = client.chat.completions.create(
            model=MODEL,
            messages=_messages(),
        )
        return _extract_json(relaxed_reply.choices[0].message.content)

In [6]:
def _load_font(size: int):
    """Return a Pillow font object for measuring text at ``size``.

    A few well-known TrueType files are tried first. If none can be loaded,
    Pillow's built-in default font is used. The default is requested at
    ``size`` where the installed Pillow accepts a size argument; otherwise
    measurements would be the same for every candidate size. Older Pillow
    versions fall back to the fixed-size default font.
    """
    # Try common fonts on Windows/Linux; fallback to default
    try_paths = [
        "C:/Windows/Fonts/arial.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    ]
    for p in try_paths:
        try:
            return ImageFont.truetype(p, size)
        except Exception:
            # Deliberate best-effort: this candidate is missing or unusable,
            # move on to the next one.
            continue

    try:
        # Pillow versions that accept a size here return a scalable default font.
        return ImageFont.load_default(size)
    except (TypeError, ValueError, OSError, ImportError):
        # Older Pillow (no size parameter) or no scalable default available.
        return ImageFont.load_default()

def measure_and_wrap(text, size, max_w, line_spacing=1.25):
    """Greedily word-wrap ``text`` to ``max_w`` and measure the result.

    Each ``"\\n"``-separated paragraph is wrapped on single spaces using the
    font from ``_load_font(size)``. A single word wider than ``max_w`` stays
    on its own line unbroken.

    Returns:
        ``(width, height, lines, font)``: the widest line's right edge, the
        block height as ``int(size * line_spacing)`` per line, the wrapped
        lines, and the font used for measuring.
    """
    font = _load_font(size)
    # One scratch canvas is enough; it is only used for text metrics.
    canvas = ImageDraw.Draw(Image.new("RGB", (10, 10)))

    def right_edge(s):
        return canvas.textbbox((0, 0), s, font=font)[2]

    wrapped = []
    for paragraph in text.split("\n"):
        if paragraph == "":
            # Keep blank lines as empty entries.
            wrapped.append("")
            continue

        current = ""
        for word in paragraph.split(" "):
            candidate = (current + " " + word).strip() if current else word
            if right_edge(candidate) > max_w and current:
                # Adding the word would overflow: close the line, start a new one.
                wrapped.append(current)
                current = word
            else:
                current = candidate
        wrapped.append(current)

    height = int(size * line_spacing) * len(wrapped)
    width = max([0] + [right_edge(line) for line in wrapped])
    return width, height, wrapped, font

def fit_textbox(tb, min_pt=10, max_pt=32, line_spacing=1.25, margin_px=24):
    """Shrink and re-wrap the text of ``tb`` until it fits inside the box.

    Candidate sizes run from ``max_pt`` down to ``min_pt`` in 2pt steps, and
    ``min_pt`` itself is always tried last. For the first size whose wrapped
    text fits within the box minus ``margin_px`` on every side, the text
    frame is cleared and refilled with one paragraph per wrapped line at that
    size.

    Args:
        tb: Shape exposing ``text_frame``, ``width`` and ``height``.
        min_pt: Smallest font size to try, in points.
        max_pt: Largest font size to try, in points.
        line_spacing: Line-height multiplier used when measuring.
        margin_px: Margin kept free on each side, in pixels.

    Returns:
        True if a fitting size was applied. False if nothing fit, in which
        case existing runs are set to ``min_pt`` and the text is left as is.
    """
    # Policy values may arrive as floats (e.g. from JSON); range() needs ints.
    min_pt, max_pt = int(min_pt), int(max_pt)

    tf = tb.text_frame
    text = "\n".join(
        ("".join(r.text for r in p.runs) or p.text) for p in tf.paragraphs
    ).strip()

    W, H = emu2px(tb.width), emu2px(tb.height)
    avail_w, avail_h = W - 2 * margin_px, H - 2 * margin_px

    sizes = list(range(max_pt, min_pt - 1, -2))
    if sizes and sizes[-1] != min_pt:
        # An odd (max_pt - min_pt) gap would otherwise skip the minimum size.
        sizes.append(min_pt)

    for s in sizes:
        # The box is measured in pixels and Pillow sizes fonts in pixels, but
        # `s` is in points: convert through EMU so both use the same scale.
        size_px = emu2px(Pt(s))
        w, h, lines, _font = measure_and_wrap(text, size_px, max(10, avail_w), line_spacing)
        if w <= avail_w and h <= avail_h:
            tf.clear()
            for i, ln in enumerate(lines):
                p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
                p.text = ln
                for r in p.runs:
                    r.font.size = Pt(s)
            return True

    # Nothing fit: settle for the minimum size without reflowing.
    for p in tf.paragraphs:
        for r in p.runs:
            r.font.size = Pt(min_pt)
    return False

In [7]:
def _to_list(x):
    if x is None:
        return []
    if isinstance(x, list):
        return x
    return [x]

def _unwrap_plan(raw_plan):
    # Allow {"plan": {...}}, {"result": {...}}, {"data": {...}}, {"output": {...}} or plain dict
    if isinstance(raw_plan, str):
        try:
            raw_plan = json.loads(raw_plan)
        except Exception:
            return {}
    if not isinstance(raw_plan, dict):
        return {}
    for k in ("plan", "result", "data", "output"):
        if k in raw_plan and isinstance(raw_plan[k], dict):
            return raw_plan[k]
    return raw_plan

def _collect_operations(ops, slide=None, use_item_slide=False):
    """Flatten one ``operations`` container (dict form or list form) into op dicts.

    ``slide`` is stamped on every produced op. When ``use_item_slide`` is
    true, list items and dict-form ``move_resize`` entries supply their own
    ``"slide"`` value instead.
    """
    collected = []
    if isinstance(ops, dict):
        # {"swap": [...], "fit_text": [...]} or {"move_resize": {"targets":[...],"bbox":{...}}}
        for name, value in ops.items():
            if name == "move_resize" and isinstance(value, dict):
                collected.append({
                    "op": "move_resize",
                    "targets": _to_list(value.get("targets")),
                    "bbox": value.get("bbox", {}),
                    "slide": value.get("slide") if use_item_slide else slide,
                })
            else:
                collected.append({"op": name, "targets": _to_list(value), "slide": slide})
    elif isinstance(ops, list):
        for item in ops:
            if not isinstance(item, dict):
                continue
            name = item.get("op") or item.get("type")
            if not name:
                continue
            # "targets" wins when present; a singular "target" is accepted too.
            targets = item.get("targets") if "targets" in item else item.get("target")
            collected.append({
                "op": name,
                "targets": _to_list(targets),
                "bbox": item.get("bbox", {}),
                "style": item.get("style", {}),
                "slide": item.get("slide") if use_item_slide else slide,
            })
    return collected


def normalize_operations(plan):
    """Turn the accepted plan layouts into one flat list of operation dicts.

    Operations are read from the top-level ``"operations"`` entry and from
    each ``plan["slides"][i]["operations"]`` (stamped with that entry's
    ``"index"``). Both the list-of-dicts form and the ``{op_name: ...}`` dict
    form are accepted.

    Args:
        plan: Plan dict; any other type yields ``[]``.

    Returns:
        List of dicts with at least ``"op"``, ``"targets"`` (always a list)
        and ``"slide"``. Ops of type ``swap``, ``set_style`` and ``fit_text``
        without targets are dropped, as are entries without an op name.
    """
    if not isinstance(plan, dict):
        return []

    out = _collect_operations(plan.get("operations", []), use_item_slide=True)

    # per-slide
    per_slide = plan.get("slides")
    if isinstance(per_slide, list):
        for splan in per_slide:
            if not isinstance(splan, dict):
                # Malformed slide entry (e.g. a bare string): nothing to read.
                continue
            out.extend(_collect_operations(splan.get("operations", []), slide=splan.get("index")))

    # filter malformed
    needs_targets = ("swap", "set_style", "fit_text")
    normalized = []
    for it in out:
        opn = it.get("op")
        if not opn:
            continue
        if opn in needs_targets and not it.get("targets"):
            continue
        normalized.append(it)
    return normalized

In [8]:
def _largest_by_area(shapes, pred):
    """Return the shape accepted by ``pred`` with the largest pixel area.

    Ties go to the earliest shape; ``None`` is returned when no shape
    satisfies ``pred``.
    """
    matching = list(filter(pred, shapes))
    if len(matching) == 0:
        return None

    winner = matching[0]
    winner_area = emu2px(winner.width) * emu2px(winner.height)
    for shape in matching[1:]:
        area = emu2px(shape.width) * emu2px(shape.height)
        if area > winner_area:
            winner, winner_area = shape, area
    return winner

def _swap_geometry(a, b):
    """Exchange position and size between two shapes."""
    a.left, b.left = b.left, a.left
    a.top, b.top = b.top, a.top
    a.width, b.width = b.width, a.width
    a.height, b.height = b.height, a.height


def _apply_bbox(shape, bbox, scale_x=None, scale_y=None):
    """Set the geometry keys present in ``bbox`` (x, y, w, h) on ``shape``.

    Values are pixels; when a scale is given the value is multiplied by it
    first (used for bbox values expressed as fractions of the slide size).
    """
    fields = (("x", "left", scale_x), ("y", "top", scale_y),
              ("w", "width", scale_x), ("h", "height", scale_y))
    for key, attr, scale in fields:
        if key in bbox:
            value = bbox[key] if scale is None else bbox[key] * scale
            setattr(shape, attr, px2emu(int(value)))


def _op_applies_to_slide(op, si, sid_map):
    """Tell whether ``op`` belongs to slide ``si``.

    An explicit ``"slide"`` must match. Without one, the op belongs to the
    slide on which at least one of its target ids resolves.
    """
    slide = op.get("slide")
    if slide is not None:
        return slide == si
    return any(isinstance(t, str) and t in sid_map for t in _to_list(op.get("targets")))


def execute_plan(pptx_in: str, plan_raw: Dict[str, Any], pptx_out: str):
    """Apply a layout plan to a deck and save the result.

    Supported operations are ``swap`` (exactly two targets), ``move_resize``,
    ``set_style`` (``font_size_pt`` only) and ``fit_text``. Shape ids have the
    form ``s<slide>_sh<shape>``, indexed in iteration order. A slide with no
    applicable operation gets the fallback: swap the largest picture with the
    largest text shape, then fit every text shape.

    Operations without a ``"slide"`` value are attributed to the slide their
    targets live on, so they no longer suppress the fallback on slides the
    plan does not mention.

    Args:
        pptx_in: Path of the deck to read.
        plan_raw: Plan dict (optionally wrapped) or JSON string.
        pptx_out: Path the modified deck is saved to.
    """
    prs = Presentation(pptx_in)
    plan = _unwrap_plan(plan_raw)

    policy = plan.get("policy")
    if not isinstance(policy, dict):
        # Missing or malformed policy: every read below falls back to its default.
        policy = {}
    fit_kwargs = dict(
        min_pt=policy.get("font_min_pt", 10),
        max_pt=policy.get("font_max_pt", 32),
        line_spacing=policy.get("line_spacing", 1.25),
        margin_px=policy.get("margins_px", 24),
    )
    units = policy.get("units", "px")
    ops_all = normalize_operations(plan) or []

    for si, slide in enumerate(prs.slides):
        sid_map = {f"s{si}_sh{i}": s for i, s in enumerate(slide.shapes)}
        ops = [op for op in ops_all if _op_applies_to_slide(op, si, sid_map)]

        if not ops:
            # Fallback: swap largest image & text, then fit all texts
            pic = _largest_by_area(slide.shapes, lambda s: s.shape_type == MSO_SHAPE_TYPE.PICTURE)
            txt = _largest_by_area(slide.shapes, lambda s: getattr(s, "has_text_frame", False))
            if pic is not None and txt is not None:
                _swap_geometry(pic, txt)
            for s in slide.shapes:
                if getattr(s, "has_text_frame", False):
                    fit_textbox(s, **fit_kwargs)
            continue

        for op in ops:
            name = op.get("op")
            targets = _to_list(op.get("targets"))

            if name == "swap" and len(targets) == 2:
                a, b = sid_map.get(targets[0]), sid_map.get(targets[1])
                if a is None or b is None:
                    continue
                _swap_geometry(a, b)

            elif name == "move_resize":
                bbox = op.get("bbox")
                if not isinstance(bbox, dict):
                    bbox = {}
                if units == "px":
                    scale_x = scale_y = None
                else:
                    # Any other unit: bbox values are scaled by the slide size in pixels.
                    scale_x, scale_y = emu2px(prs.slide_width), emu2px(prs.slide_height)
                for tid in targets:
                    s = sid_map.get(tid)
                    if s is None:
                        continue
                    _apply_bbox(s, bbox, scale_x, scale_y)

            elif name == "set_style":
                st = op.get("style", {}) or {}
                font_size = st.get("font_size_pt")
                if not font_size:
                    continue
                for tid in targets:
                    s = sid_map.get(tid)
                    if s is None or not getattr(s, "has_text_frame", False):
                        continue
                    for p in s.text_frame.paragraphs:
                        for r in p.runs:
                            r.font.size = Pt(int(font_size))

            elif name == "fit_text":
                for tid in targets:
                    s = sid_map.get(tid)
                    if s is None or not getattr(s, "has_text_frame", False):
                        continue
                    fit_textbox(s, **fit_kwargs)

    prs.save(pptx_out)

In [10]:
# Input deck, output deck, and how many leading slides are described to the model.
INPUT_PPTX = "coco_raw_100.pptx"
OUTPUT_PPTX = "coco_relayout_out.pptx"
MAX_SLIDES = 20

# Natural-language request placed at the top of the user prompt.
instruction = " ".join([
    "Swap the main image and the main text on each slide.",
    "Ensure all text fits inside boxes by shrinking font size down to 10pt if needed,",
    "reflowing lines, and only enlarge text boxes if still overflowing; keep ~24px margins.",
])

slides_json = ppt_to_json(INPUT_PPTX, max_slides=MAX_SLIDES)
print(f"Slides included in prompt: {len(slides_json['slides'])}")

plan = call_llm_for_plan(instruction, slides_json)
print("Plan preview (truncated):")
print(json.dumps(plan, indent=2, ensure_ascii=False)[:1200] + " ...")

# Note: execute_plan re-opens INPUT_PPTX and walks every slide in it, while
# only the first MAX_SLIDES slides were described to the model above.
execute_plan(INPUT_PPTX, plan, OUTPUT_PPTX)
print(f"Saved: {OUTPUT_PPTX}")

Slides included in prompt: 20
Plan preview (truncated):
{
  "version": "1.0",
  "policy": {
    "margins_px": 24,
    "font_min_pt": 10,
    "font_max_pt": 32,
    "line_spacing": 1.25,
    "units": "px"
  },
  "operations": [
    {
      "op": "swap",
      "targets": [
        "s0_sh0",
        "s0_sh1"
      ]
    },
    {
      "op": "fit_text",
      "targets": [
        "s0_sh1"
      ],
      "params": {
        "min_pt": 10,
        "max_pt": 32,
        "line_spacing": 1.25,
        "padding_px": 24,
        "reflow": true
      }
    },
    {
      "op": "swap",
      "targets": [
        "s1_sh0",
        "s1_sh1"
      ]
    },
    {
      "op": "fit_text",
      "targets": [
        "s1_sh1"
      ],
      "params": {
        "min_pt": 10,
        "max_pt": 32,
        "line_spacing": 1.25,
        "padding_px": 24,
        "reflow": true
      }
    },
    {
      "op": "swap",
      "targets": [
        "s2_sh0",
        "s2_sh1"
      ]
    },
    {
      "op": "fit_tex