<a href="https://colab.research.google.com/github/jgstern/reference-free-scalable-llm-benchmarking/blob/main/absurd_drawings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [105]:
import re, json, random, itertools
from collections import Counter, defaultdict
from typing import Dict, Any, List, Optional, Tuple

# ----------------------------
# 1) Seed prompts
# ----------------------------
# Hand-written seed prompts. Every entry starts with the literal prefix
# "Generate an SVG of", which PREFIX_RE strips before parsing.
SEED_PROMPTS = [
    "Generate an SVG of a giraffe playing the violin on a tightrope",
    "Generate an SVG of an octopus painting a self-portrait with all eight arms",
    "Generate an SVG of a hedgehog piloting a hot air balloon made of teacups",
    "Generate an SVG of a snail racing a cheetah in roller skates",
    "Generate an SVG of a walrus conducting an orchestra of penguins",
    "Generate an SVG of a kangaroo delivering pizza on a unicycle",
    "Generate an SVG of a frog DJing at a nightclub made of mushrooms",
    "Generate an SVG of a raccoon juggling watermelons while surfing",
    "Generate an SVG of a flamingo practicing karate in a dojo",
    "Generate an SVG of a sloth hanging from a chandelier in an opera house",
    "Generate an SVG of a llama driving a bumper car through a library",
    "Generate an SVG of a crab building a sandcastle shaped like a skyscraper",
    "Generate an SVG of a fox reading a newspaper inside a subway car",
    "Generate an SVG of a parrot steering a pirate ship made of spaghetti",
    "Generate an SVG of a bear playing chess against a robot waiter",
    "Generate an SVG of a goat skydiving with a parachute shaped like a banana",
    "Generate an SVG of a duck teaching yoga to a class of turtles",
    "Generate an SVG of a hedgehog surfing a giant slice of pizza",
    "Generate an SVG of a giraffe wearing a tuxedo and playing a grand piano",
    "Generate an SVG of a kangaroo piloting a hot air balloon made of jellybeans",
    "Generate an SVG of a penguin skateboarding down a rainbow",
    "Generate an SVG of an octopus conducting an orchestra underwater",
    "Generate an SVG of a squirrel in a spacesuit juggling acorns on the moon",
    "Generate an SVG of a llama painting a self-portrait with a paintbrush in its mouth",
    "Generate an SVG of a walrus driving a convertible through the desert",
    "Generate an SVG of a raccoon wearing goggles and operating a steam engine",
    "Generate an SVG of a flamingo ice-skating on a frozen lake with elegance",
    "Generate an SVG of a crocodile sipping tea in a Victorian parlor",
    "Generate an SVG of a fox flying a kite shaped like a dragon",
    "Generate an SVG of a snail carrying a miniature castle on its shell",
    "Generate an SVG of a toucan playing chess with a robot",
    "Generate an SVG of a bear riding a unicycle while balancing plates",
    "Generate an SVG of a cat in a wizard robe casting spells from a book",
    "Generate an SVG of a dolphin jumping through hoops made of clouds",
]


# =========================
# Regex patterns & lexicons
# =========================
# All patterns below are case-insensitive. Unless noted, capture group 1 holds
# the clause payload that extract_once() returns.

# Leading boilerplate removed from every prompt before parsing.
PREFIX_RE = re.compile(r"^\s*Generate an SVG of\s+", re.I)

# Clause-level (extract these before generic PPs)
# "made of X" / "shaped like X": X is captured lazily and stops at the end of
# the string, at . , ; or just before the next listed preposition.
MADE_OF_RE     = re.compile(
    r"\bmade of\s+([^,]+?)(?=$|[.,;]|(?:\s+(?:in|on|inside|through|with|while|to|against|of|at|down|over|across)\b))",
    re.I,
)
SHAPED_LIKE_RE = re.compile(
    r"\bshaped like\s+([^,]+?)(?=$|[.,;]|(?:\s+(?:in|on|inside|through|with|while|to|against|of|at|down|over|across)\b))",
    re.I,
)
# exclude stylistic WITH tails like "with elegance|style|grace"
# Only quantified tails are captured here: "all <w> <w>", "<digits> <w> [<w>]",
# or "[<w>] <digits> <w> ...".
WITH_RE        = re.compile(
    r"\bwith\s+(?!(?:elegance|style|grace|flair|panache|poise|finesse)\b)"
    r"(all\s+\w+\s+\w+|\d+\s+\w+(?:\s+\w+)?|(?:\w+\s+)?\d+\s+\w+.*?)(?=$|[.,;]|(?:\s+(?:in|on|inside|through|with|while|to|against|of|at|down|over|across)\b))",
    re.I,
)

# Non-quantified, non-stylistic "with ..." tail; an optional article is skipped
# and the capture runs up to the next comma (or end of string).
WITH_COMPANION_RE = re.compile(
    r"\bwith\s+(?!all\b|\d+\b)(?!(?:elegance|style|grace|flair|panache|poise|finesse)\b)"
    r"(?:a|an|the)?\s*([^,]+)", re.I
)
# The next three capture everything after the keyword up to the next comma.
WHILE_RE       = re.compile(r"\bwhile\s+([^,]+)", re.I)
AGAINST_RE     = re.compile(r"\bagainst\s+([^,]+)", re.I)
TO_RE          = re.compile(r"\bto\s+(?:a|an|the)?\s*([^,]+)", re.I)

# Ensemble handling split into head + "of ..." tail
ENSEMBLE_HEAD_RE = re.compile(
    r"\b(orchestra|band|choir|class|crowd|parade|flock|school|herd|team|crew|cast|council|ensemble)\b",
    re.I,
)
# "of X" that is not part of "made of" / "shaped of" (negative lookbehinds).
OF_TAIL_RE       = re.compile(
    r"(?<!made )(?<!shaped )\bof\s+([^,]+?)(?=$|[.,;]|(?:\s+(?:in|on|inside|through|with|while|to|against|at|down|over|across)\b))",
    re.I
)

# Stand-alone location adverbs that need no preposition.
FREE_LOC_RE = re.compile(r"\b(underwater|underground|indoors|outdoors|overhead)\b", re.I)

# Locations/supports
# Each pattern skips an optional article and captures greedily up to the next
# comma, so a match swallows any later clauses that follow without a comma.
INSIDE_RE  = re.compile(r"\binside\s+(?:of\s+)?([^,]+)", re.I)
IN_RE      = re.compile(r"\bin\s+(?:a|an|the)?\s*([^,]+)", re.I)
ON_RE      = re.compile(r"\bon\s+(?:a|an|the)?\s*([^,]+)", re.I)
THROUGH_RE = re.compile(r"\bthrough\s+(?:a|an|the)?\s*([^,]+)", re.I)
AT_RE      = re.compile(r"\bat\s+(?:a|an|the)?\s*([^,]+)", re.I)
DOWN_RE    = re.compile(r"\bdown\s+(?:a|an|the)?\s*([^,]+)", re.I)
OVER_RE    = re.compile(r"\bover\s+(?:a|an|the)?\s*([^,]+)", re.I)
ACROSS_RE  = re.compile(r"\bacross\s+(?:a|an|the)?\s*([^,]+)", re.I)

# Action cues
# Verb heads recognised as the prompt's main action. Multi-word and hyphenated
# variants ("ice-skating", "ice skating") are listed before their shorter
# suffix ("skating").
ACTION_CUES = [
    "playing","painting","piloting","racing","conducting","delivering","djing","juggling",
    "practicing","hanging","driving","building","reading","steering","skydiving","teaching",
    "surfing","skateboarding","ice-skating","ice skating", "skating", "riding","operating","sipping","flying","carrying",
    "casting","jumping","rollerblading"
]
# Group 1 = the action head, group 2 = everything after it up to the next comma.
ACTION_HEADS_RE = re.compile(
    r"\b(" + "|".join(re.escape(w) for w in ACTION_CUES) + r")\b[-\s]*([^,]*)",
    re.I,
)

# Subject boundary includes attire-introducing "in"
PRE_ACTION_BOUNDARIES = ACTION_CUES + ["wearing", "in"]
# Group 1 = subject noun phrase including its article/quantifier,
# group 2 = the same phrase without that leading article/quantifier.
# The lookahead ends in \b so a boundary word must be a whole word: without it
# the bare "in" matched the start of words such as "insect" and cut the
# subject short.
SUBJECT_RE = re.compile(
    r"^((?:(?:an?|the)\s+|(?:\d+|one|two|three|four|five|six|seven|eight|nine|ten|several|many)\s+)?([^,]+?))\s+"
    r"(?=(?:[a-z]+ing|" + "|".join(map(re.escape, PRE_ACTION_BOUNDARIES)) + r")\b)",
    re.I,
)

# Vehicles vs. Surfaces
# Things the subject rides, drives or is carried by.
VEHICLE_RE = re.compile(
    r"\b(unicycle|bicycle|skateboard|surfboard|bumper car|pirate ship|ship|hot air balloon|hot-air balloon|balloon|"
    r"convertible|car|subway car|roller skates?|blimp|parachute)\b",
    re.I,
)
# Things the subject stands or balances on.
SURFACE_RE = re.compile(
    r"\b(tightrope|rainbow|clouds?|lily pad|frozen lake|moon|ice|rope bridge|catwalk|waterfall|sand dune)\b",
    re.I,
)

# Equipment/Apparel
# WEARING_RE needs an introducing "wearing"/"in"; EQUIPMENT_RE matches the bare
# item name anywhere in the text.
WEARING_RE   = re.compile(
    r"\b(?:wearing|in)\s+(?:a|an|the)?\s*(tuxedo|goggles|spacesuit|wizard robe|roller skates?|helmet|cape|bowtie|backpack|gloves?)\b",
    re.I,
)
EQUIPMENT_RE = re.compile(
    r"\b(tuxedo|goggles|spacesuit|wizard robe|roller skates?|helmet|cape|bowtie|backpack|gloves?)\b",
    re.I,
)

# Instrument/tool and animal head
# "with <instrument> [trailing words]" -- the optional tail runs to the next
# comma or semicolon, so it can include a placement such as "in its mouth".
INSTRUMENT_WITH_RE = re.compile(
    r"\bwith\s+(?:a|an|the)?\s*((?:paintbrush|violin(?:\s+bow)?|drumsticks?|baton|megaphone|wrench|hammer|spatula|"
    r"teacup|lantern|umbrella|saxophone|trumpet|microphone|palette|quill|pen|pencil)(?:\s+[^,;]+)?)\b",
    re.I,
)
# Body-part placement phrase, e.g. "in its mouth"; callers use the whole match.
IN_MOUTH_HAND_RE = re.compile(r"\b(in|between|on)\s+(?:its|their|the)\s+(?:mouth|beak|teeth|hands?|paws?|head|back|shoulder)\b", re.I)
# First word after a leading article in the subject noun phrase.
ANIMAL_HEAD_RE   = re.compile(r"^(?:an?|the)\s+([a-z-]+)", re.I)

# Verb-specific extraction
# (pattern, kind) pairs consumed in order by extract_verb_specific():
#   "object"               -> group 1 is the object
#   "object_literal_chess" -> the object is the literal string "chess"
#   "vehicle"              -> group 2 is the vehicle (group 1 is the verb)
VERB_OBJECT_PATTERNS = [
    (re.compile(r"\bconducting\s+(?:a|an|the)\s+([^,]+?)(?=\s+(?:underwater|in|on|inside|through|with|to|at|down|over|across)\b|$)", re.I), "object"),
    (re.compile(r"\bplaying\s+chess\b", re.I), "object_literal_chess"),
    (re.compile(r"\bplaying\s+(?:the|a|an)\s+([^,]+)", re.I), "object"),
    (re.compile(r"\bdelivering\s+([^,]+)", re.I), "object"),
    (re.compile(r"\bjuggling\s+([^,]+)", re.I), "object"),
    (re.compile(r"\breading\s+(?:the|a|an)\s+([^,]+)", re.I), "object"),
    (re.compile(r"\bpracticing\s+([^,]+)", re.I), "object"),
    (re.compile(r"\bbuilding\s+(?:a|an|the)\s+([^,]+)", re.I), "object"),
    (re.compile(r"\b(driving|steering|piloting|riding)\s+(?:a|an|the)\s+([^,]+)", re.I), "vehicle"),
    (re.compile(r"\boperating\s+(?:a|an|the)\s+([^,]+)", re.I), "object"),
]
# Vehicle implied by a verb when the prompt names none explicitly.
VERB_DEFAULT_VEHICLE = {
    "skateboarding": "skateboard",
    "surfing": "surfboard",
    "ice-skating": "ice skates",
    "skating": "ice skates",
    "rollerblading": "rollerblades",
    "skydiving": "parachute",
}

# Action heads for which no generic object is extracted or generated.
INTRANSITIVE = {"jumping","skydiving","djing","conducting","ice-skating","skateboarding","surfing","rollerblading"}

# Words that mark a "with ..." phrase as an instrument rather than a companion.
INSTRUMENT_TOKENS = {
    "paintbrush","violin","bow","saxophone","trumpet","baton","microphone","palette",
    "quill","pen","pencil","drumstick","drumsticks","piano","grand piano"
}

# =========================
# Helpers
# =========================
def norm_spaces(s: str) -> str:
    """Collapse each run of whitespace in *s* to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def normalize_punct(s: str) -> str:
    """Replace typographic dashes and curly quotes in *s* with ASCII equivalents.

    En and em dashes become "-", curly double quotes become a straight double
    quote, and the right single curly quote becomes an apostrophe.
    """
    ascii_equivalents = str.maketrans({
        "–": "-",
        "—": "-",
        "“": "\"",
        "”": "\"",
        "’": "'",
    })
    return s.translate(ascii_equivalents)

def extract_once(text: str, pattern: re.Pattern) -> Tuple[str, Optional[str]]:
    """Pull the first match of *pattern* out of *text*.

    Returns ``(new_text, value)``. ``value`` is capture group 1 when the
    pattern has capture groups, otherwise the whole match, with whitespace
    normalised. In ``new_text`` the matched span is overwritten with spaces of
    equal length, so later patterns cannot re-match it and character offsets
    stay stable. With no match the text is returned unchanged and ``value`` is
    ``None``.
    """
    hit = pattern.search(text)
    if hit is None:
        return text, None
    group_no = 1 if hit.lastindex and hit.lastindex >= 1 else 0
    value = norm_spaces(hit.group(group_no))
    lo, hi = hit.span()
    blanked = "".join((text[:lo], " " * (hi - lo), text[hi:]))
    return blanked, value

def noun_phrase_after_action(text: str, action_span: Optional[Tuple[int,int]]) -> Optional[str]:
    """Return the words right after the action match, up to the next clause keyword.

    *action_span* is the ``(start, end)`` span of the action match inside
    *text*. The text after ``end`` is cut at the first preposition or clause
    keyword. ``None`` is returned when there is no span, or when what remains
    is empty or only an article.
    """
    if not action_span:
        return None
    remainder = text[action_span[1]:]
    head, *_ = re.split(
        r"\b(?:on|in|inside|through|with|while|to|against|at|made of|shaped like|down|over|across)\b",
        remainder, maxsplit=1, flags=re.I,
    )
    phrase = norm_spaces(head)
    if phrase and phrase.lower() not in {"a","an","the"}:
        return phrase
    return None

def extract_verb_specific(text: str) -> Tuple[Optional[str], Optional[str]]:
    """Scan *text* with VERB_OBJECT_PATTERNS and return ``(object, vehicle)``.

    Patterns are tried in list order. The first hit of each kind wins and
    later hits of the same kind are ignored. Either element is ``None`` when
    no pattern of that kind matched.
    """
    found_obj = None
    found_veh = None
    for pattern, kind in VERB_OBJECT_PATTERNS:
        hit = pattern.search(text)
        if hit is None:
            continue
        if kind == "vehicle":
            # Group 1 is the driving verb; the vehicle phrase is group 2.
            if not found_veh:
                found_veh = norm_spaces(hit.group(2))
        elif not found_obj:
            if kind == "object_literal_chess":
                found_obj = "chess"
            elif kind == "object":
                found_obj = norm_spaces(hit.group(1))
    return found_obj, found_veh

def clean_object_head(obj: Optional[str]) -> Optional[str]:
    """Reduce an object phrase to its head: no leading article, no trailing clause.

    Returns ``None`` for ``None`` or an empty string.
    """
    if not obj:
        return None
    without_article = re.sub(r"^(?:a|an|the)\s+", "", obj, flags=re.I)
    # Keep only what precedes the first preposition / location word.
    head = re.split(
        r"\b(?:on|in|inside|through|with|to|against|at|made of|shaped like|underwater|underground|indoors|outdoors|overhead|down|over|across)\b",
        without_article, maxsplit=1, flags=re.I,
    )[0]
    return norm_spaces(head)

def extract_equipment(text: str) -> Optional[List[str]]:
    """Return the sorted, de-duplicated apparel items matched by WEARING_RE, or ``None`` if none."""
    found = {norm_spaces(hit.group(1)) for hit in WEARING_RE.finditer(text)}
    return sorted(found) or None

def classify_vehicle(v: str) -> str:
    """Map a vehicle/surface phrase to a coarse category label.

    Rules are tried in order and the first match wins. Returns one of
    "land_vehicle", "sea_vehicle", "air_vehicle", "board_or_unicycle",
    "support_surface" or "unknown". ``None`` or an empty string gives
    "unknown".
    """
    name = (v or "").lower()
    keyword_rules = (
        (r"\b(steam engine|locomotive|train)\b", "land_vehicle"),
        (r"\b(ship|submarine)\b", "sea_vehicle"),
        (r"\b(balloon|blimp|parachute)\b", "air_vehicle"),
        (r"\b(car|convertible|subway car|bumper car)\b", "land_vehicle"),
        (r"\b(surfboard|skateboard|unicycle)\b", "board_or_unicycle"),
    )
    for pattern, label in keyword_rules:
        if re.search(pattern, name):
            return label
    # Support surfaces must equal the whole (lower-cased) phrase.
    surfaces = {"tightrope","rainbow","cloud","clouds","lily pad","frozen lake","ice","moon","catwalk","rope bridge"}
    return "support_surface" if name in surfaces else "unknown"

def detect_vehicle_or_surface(on_surface, in_loc, through_loc, at_loc, inside_loc, action_phrase, core, down_loc=None, over_loc=None, across_loc=None) -> Tuple[Optional[str], Optional[str]]:
    """Find the first vehicle or support surface mentioned in the prompt.

    Sources are probed in priority order: the action phrase, then the
    on/down/over/across clauses (vehicle first, then surface), then the
    in/inside/through/at clauses (vehicle only), and finally a vehicle anywhere
    in *core*. Returns ``(matched_text, origin)``, where origin names the
    source that produced the hit, or ``(None, None)`` when nothing matches.
    """
    vehicle_then_surface = (VEHICLE_RE, SURFACE_RE)
    vehicle_only = (VEHICLE_RE,)
    probes = (
        (action_phrase, "in_action", vehicle_then_surface),
        (on_surface, "on", vehicle_then_surface),
        (down_loc, "down", vehicle_then_surface),
        (over_loc, "over", vehicle_then_surface),
        (across_loc, "across", vehicle_then_surface),
        (in_loc, "in", vehicle_only),
        (inside_loc, "inside", vehicle_only),
        (through_loc, "through", vehicle_only),
        (at_loc, "at", vehicle_only),
    )
    for chunk, origin, patterns in probes:
        if not chunk:
            continue
        for pattern in patterns:
            hit = pattern.search(chunk)
            if hit:
                return hit.group(0), origin

    # Last resort: a vehicle word anywhere in the prefix-stripped prompt.
    hit = VEHICLE_RE.search(core)
    if hit:
        return hit.group(0), "anywhere"
    return None, None

def derive_companions(of_clause, to_clause, against_clause, with_companion=None) -> Optional[str]:
    """Join the clauses that name other participants into one comma-separated string.

    Args:
        of_clause: payload of an "of ..." tail (e.g. "penguins"), or None.
        to_clause: payload of a "to ..." clause, or None.
        against_clause: payload of an "against ..." clause, or None.
        with_companion: payload of a non-quantified "with ..." clause, or None.
            It is included only when it names no instrument and looks like a
            participant noun.

    Returns:
        The non-empty clauses joined with ", " in the order of, to, against,
        with; or None when nothing qualifies.
    """
    bits = [clause for clause in (of_clause, to_clause, against_clause) if clause]

    # instrument-vs-companion disambiguation
    if with_companion:
        lowered = with_companion.lower()
        # Match instrument tokens as whole words (with an optional plural
        # ending). A plain substring test made "pen" reject "penguins" and
        # "bow" reject "bowl".
        names_instrument = any(
            re.search(r"\b" + re.escape(tok) + r"(?:e?s)?\b", lowered)
            for tok in INSTRUMENT_TOKENS
        )
        if not names_instrument and re.search(
            r"\b(\w+(?:s|er|or|man|men|people|class|team|crew|robot|penguin)s?)\b",
            with_companion, re.I,
        ):
            bits.append(with_companion)
    return ", ".join(bits) if bits else None

def derive_tags(subject_head, action, vehicle_or_surface, made_of, shaped_like, locations, equipment, instrument=None) -> List[str]:
    """Compute the sorted list of category tags for one parsed prompt.

    Each tag is switched on by an independent rule over the parsed slots:
    the action phrase, the detected vehicle/surface, the material and shape
    clauses, the location slots, and the equipment and instrument fields.
    """
    def action_has(pattern: str) -> bool:
        # True when an action phrase exists and contains *pattern*.
        return bool(action and re.search(pattern, action, re.I))

    is_vehicle = bool(vehicle_or_surface and VEHICLE_RE.search(vehicle_or_surface))
    is_surface = bool(vehicle_or_surface and SURFACE_RE.search(vehicle_or_surface))
    at_venue = any(
        loc and re.search(r"\b(nightclub|club|stage|opera house)\b", loc, re.I)
        for loc in locations
    )
    many_limbed = bool(
        subject_head and re.search(r"\b(octopus|spider|centipede)\b", subject_head, re.I)
    )
    locomotion_verb = action_has(
        r"\b(driving|steering|piloting|riding|skateboarding|surfing|ice[-\s]?skating|rollerblading|racing|operating)\b"
    )
    # A board/skate/riding verb on a support surface counts as both
    # locomotion and balance.
    rides_surface = (
        action_has(r"\b(skateboarding|surfing|ice[-\s]?skating|riding)\b") and is_surface
    )

    rules = (
        ("vehicle/locomotion", is_vehicle or locomotion_verb or rides_surface),
        ("balance/support", is_surface or rides_surface),
        ("performance/arts",
         action_has(r"\b(conducting|playing|painting|djing|practicing|reading|casting)\b")
         or action_has(r"\bdjing\b")
         or at_venue),
        ("construction/design", action_has(r"\b(building|carrying)\b") or bool(shaped_like)),
        ("occupation/service", action_has(r"\b(teaching|delivering|sipping)\b")),
        ("competition/duel", action_has(r"\b(racing|playing\s+chess|against)\b")),
        ("material_absurdity", bool(made_of)),
        ("shape_absurdity", bool(shaped_like)),
        ("setting/location", any(locations)),
        ("equipment/apparel", bool(equipment)),
        ("instrument/tool", bool(instrument)),
        ("anatomy_constraint", many_limbed or action_has(r"\bwith all\b")),
    )
    return sorted({tag for tag, enabled in rules if enabled})

def primary_family(tags: List[str]) -> str:
    """Return the highest-priority tag present in *tags*, or "misc" if none is."""
    ranking = (
        "vehicle/locomotion",
        "performance/arts",
        "construction/design",
        "material_absurdity",
        "shape_absurdity",
        "competition/duel",
        "occupation/service",
        "setting/location",
        "anatomy_constraint",
        "balance/support",
        "equipment/apparel",
        "instrument/tool",
    )
    return next((family for family in ranking if family in tags), "misc")

# =========================
# Core parser
# =========================
def parse_prompt(prompt: str) -> Dict[str, Any]:
    """Parse one "Generate an SVG of ..." prompt into a flat record of slots.

    The prompt is normalised and stripped of its prefix, then clauses are
    peeled off a working copy one at a time with ``extract_once``. Each
    extracted span is blanked so later patterns cannot re-match it. The order
    of extraction is significant: high-signal clauses (made of, shaped like,
    instrument, quantified "with") come first, then the action, then the
    generic location prepositions.

    Args:
        prompt: the raw prompt text.

    Returns:
        A dict with the normalised prompt ("raw"), subject fields, action,
        object, instrument, placement, vehicle/surface and its origin, one key
        per location preposition, material/shape/constraint clauses,
        companions, equipment, the raw auxiliary clauses, the derived "tags"
        and the "primary_family". Slots that were not found are None.
    """
    raw = normalize_punct(prompt.strip())
    core = PREFIX_RE.sub("", raw)
    w = core  # working string

    # high-signal clauses
    w, made_of        = extract_once(w, MADE_OF_RE)
    w, shaped_like    = extract_once(w, SHAPED_LIKE_RE)
    # instrument/tool FIRST to avoid misclassifying as companion
    w, instrument     = extract_once(w, INSTRUMENT_WITH_RE)
    w, constraints    = extract_once(w, WITH_RE)
    # companions/targets after instrument
    w, with_companion = extract_once(w, WITH_COMPANION_RE)
    w, while_clause   = extract_once(w, WHILE_RE)
    w, against_clause = extract_once(w, AGAINST_RE)
    w, to_clause      = extract_once(w, TO_RE)
    # additional instrument-vs-companion guard

    # ---- ACTION (skip 'wearing' / 'in' heads) ----
    candidates = list(ACTION_HEADS_RE.finditer(w))
    action_phrase, action_span = None, None
    for am in candidates:
        head = (am.group(1) or "").lower()
        if head in {"wearing", "in"}:
            continue
        rest = am.group(2) or ""
        action_phrase = norm_spaces(f"{head} {rest}".strip())
        action_phrase = re.sub(r"\s+and\s+.*$", "", action_phrase, flags=re.I)  # trim long tails
        action_phrase = action_phrase.replace("ice skating", "ice-skating")     # normalize variant
        action_span = am.span()
        break

    # locations/support
    w, inside_loc  = extract_once(w, INSIDE_RE)
    w, in_loc      = extract_once(w, IN_RE)
    w, on_surface  = extract_once(w, ON_RE)
    w, through_loc = extract_once(w, THROUGH_RE)
    w, at_loc      = extract_once(w, AT_RE)
    w, free_loc    = extract_once(w, FREE_LOC_RE)
    w, down_loc    = extract_once(w, DOWN_RE)
    w, over_loc    = extract_once(w, OVER_RE)
    w, across_loc  = extract_once(w, ACROSS_RE)

    # subject (NP + head)
    sm = SUBJECT_RE.search(w)
    subject_np = norm_spaces(sm.group(1)) if sm else None
    subject_head = norm_spaces(sm.group(2)) if sm else None
    # animal head
    animal_head = None
    if subject_np:
        m = ANIMAL_HEAD_RE.search(subject_np)
        if m:
            animal_head = m.group(1).lower()

    # verb-specific extraction
    obj_from_verb, veh_from_verb = extract_verb_specific(w)

    # ensemble support
    ensemble_obj, ensemble_of = None, None
    if action_phrase and re.search(r"\bconducting\b", action_phrase, re.I):
        mh = ENSEMBLE_HEAD_RE.search(w)
        if mh:
            ensemble_obj = mh.group(1)
            mt = OF_TAIL_RE.search(w[mh.end():])
            if mt:
                ensemble_of = norm_spaces(mt.group(1))

    # fallback: "conducting ... of X" without explicit orchestra/band/choir
    if action_phrase and re.search(r"\bconducting\b", action_phrase, re.I):
        if not ensemble_obj:
            # Look for any remaining "of X" tail in the working string. This
            # name used to be read without ever being assigned, which raised
            # NameError for prompts with no listed ensemble head.
            mt = OF_TAIL_RE.search(w)
            of_clause = norm_spaces(mt.group(1)) if mt else None
            if of_clause:
                ensemble_obj = "ensemble"
                ensemble_of = of_clause

    # generic object fallback (skip for intransitives)
    obj_generic = None
    if not (action_phrase and action_phrase.split()[0].lower() in INTRANSITIVE):
        obj_generic = noun_phrase_after_action(w, action_span)

    chosen_obj = obj_from_verb or ensemble_obj or obj_generic
    obj = clean_object_head(chosen_obj)

    # equipment/apparel from the *original* core
    equipment = extract_equipment(core)

    # Promote equipment-in-loc (e.g., "in roller skates") to vehicle for locomotion verbs, then clear it
    is_equipment_in_loc = bool(in_loc and re.search(EQUIPMENT_RE, in_loc.strip()))
    if is_equipment_in_loc and action_phrase and re.search(
        r"\b(racing|riding|skateboarding|surfing|ice[-\s]?skating|rollerblading)\b",
        action_phrase, re.I
    ):
        veh_from_verb = veh_from_verb or in_loc  # treat skates as vehicle/surface

    # now clear so it doesn't linger as a location
    if is_equipment_in_loc:
        in_loc = None


    # vehicle/surface promotion
    vehicle_or_surface, v_origin = detect_vehicle_or_surface(
        on_surface, in_loc, through_loc, at_loc, inside_loc, action_phrase, core,
        down_loc=down_loc, over_loc=over_loc, across_loc=across_loc
    )
    if not vehicle_or_surface and veh_from_verb:
        vehicle_or_surface, v_origin = veh_from_verb, "in_action"

    # default vehicle for locomotion verbs
    if action_phrase and not vehicle_or_surface:
        head = action_phrase.split()[0].lower()
        if head == "ice" and action_phrase.lower().startswith("ice skating"):
            vehicle_or_surface, v_origin = "ice skates", "verb_default"
        elif head in VERB_DEFAULT_VEHICLE and VERB_DEFAULT_VEHICLE[head]:
            vehicle_or_surface, v_origin = VERB_DEFAULT_VEHICLE[head], "verb_default"

    # companions
    companions = derive_companions(
        ensemble_of, to_clause, against_clause, with_companion
    )

    # placement (optional)
    placement = None
    m_place = IN_MOUTH_HAND_RE.search(core)
    if m_place:
        placement = m_place.group(0)

    locations = [inside_loc, in_loc, on_surface, through_loc, at_loc, free_loc, down_loc, over_loc, across_loc]
    tags = derive_tags(subject_head, action_phrase, vehicle_or_surface, made_of, shaped_like, locations, equipment, instrument=instrument)

    return {
        "raw": raw,
        "animal_head": animal_head,
        "subject_np": subject_np,
        "subject_head": subject_head,
        "action": action_phrase,
        "object": obj,
        "instrument": instrument,
        "placement": placement,
        "vehicle_or_surface": vehicle_or_surface,
        "vehicle_or_surface_origin": v_origin,
        "location_inside": inside_loc,
        "location_in": in_loc,
        "location_on": on_surface,
        "location_through": through_loc,
        "location_at": at_loc,
        "location_free": free_loc,
        "location_down": down_loc,
        "location_over": over_loc,
        "location_across": across_loc,
        "made_of": made_of,
        "shaped_like": shaped_like,
        "constraints": constraints,
        "companions": companions,
        "equipment": equipment,
        "raw_clauses": {
            "while": while_clause,
            "against": against_clause,
            "to": to_clause,
            "with_companion": with_companion,
            "ensemble_of": ensemble_of,
        },
        "tags": tags,
        "primary_family": primary_family(tags),
    }

# =========================
# Reporting & QA
# =========================
def histogram(records: List[Dict[str,Any]], field: str):
    """Count the values of *field* across *records*, most common first.

    Missing or falsy values are counted under "(none)". List values add one
    count per element. Returns ``[(value, count), ...]``.
    """
    tally = Counter()
    for rec in records:
        value = rec.get(field)
        if not value:
            keys = ["(none)"]
        elif isinstance(value, list):
            keys = value
        else:
            keys = [value]
        tally.update(keys)
    return tally.most_common()

def group_by(records: List[Dict[str,Any]], field: str):
    """Bucket each record's "raw" prompt by its value of *field*.

    Missing or falsy values go to the "(none)" bucket. A list value files the
    prompt under every element. Returns a ``defaultdict(list)``.
    """
    buckets = defaultdict(list)
    for rec in records:
        key = rec.get(field)
        if not key:
            labels = ("(none)",)
        elif isinstance(key, list):
            labels = key
        else:
            labels = (key,)
        for label in labels:
            buckets[label].append(rec["raw"])
    return buckets

def group_by_primary_family(records: List[Dict[str,Any]]):
    """Group raw prompts by "primary_family", largest family first (ties by name)."""
    families = {}
    for rec in records:
        families.setdefault(rec["primary_family"], []).append(rec["raw"])

    def by_size_then_name(item):
        name, prompts = item
        return (-len(prompts), name)

    ordered = sorted(families.items(), key=by_size_then_name)
    return dict(ordered)

def quality_checks(r: Dict[str,Any]) -> List[str]:
    """Return human-readable warnings for suspicious slot combinations in record *r*.

    An empty list means no rule fired. The rules are independent and are
    reported in a fixed order.
    """
    problems = []
    action = r.get("action")
    surface_or_vehicle = r.get("vehicle_or_surface")

    def action_has(pattern: str) -> bool:
        # True when the record has an action that contains *pattern*.
        return bool(action and re.search(pattern, action, re.I))

    # object expected for certain verbs (EXCEPT "playing chess")
    wants_object = action_has(r"\b(playing|delivering|juggling|reading|practicing|building)\b")
    if wants_object and not action_has(r"\bplaying\s+chess\b") and not r.get("object"):
        problems.append("Missing object for action that usually takes one")

    # vehicle/surface expected for locomotion verbs
    is_locomotion = action_has(
        r"\b(driving|steering|piloting|riding|skateboarding|surfing|ice[-\s]?skating|rollerblading|skydiving|racing)\b"
    )
    if is_locomotion and not surface_or_vehicle:
        problems.append("Expected vehicle_or_surface for locomotion action")

    # equipment leakage into location_in
    loc_in = r.get("location_in")
    if loc_in and re.search(EQUIPMENT_RE, loc_in.strip()):
        problems.append("Equipment parsed as location_in (should be in equipment)")

    # surface implies balance/support tag
    if (surface_or_vehicle and re.search(SURFACE_RE, surface_or_vehicle)
            and "balance/support" not in r.get("tags", [])):
        problems.append("Surface detected but missing balance/support tag")

    # conducting should have an ensemble (companions or object head)
    if action_has(r"\bconducting\b"):
        obj = r.get("object")
        has_ensemble = r.get("companions") or (
            obj and re.search(r"\b(orchestra|band|choir|ensemble)\b", obj, re.I)
        )
        if not has_ensemble:
            problems.append("Conducting without clear ensemble (expect 'orchestra of ...' or similar)")

    # stylistic WITH clause
    with_clause = r.get("raw_clauses", {}).get("with_companion")
    if with_clause and with_clause.strip().lower() in {"elegance", "style", "grace"}:
        problems.append("WITH clause looks stylistic, not a companion")

    # locomotion on surface: ensure vehicle + support tags both appear
    if action_has(r"\b(skateboarding|surfing|ice[-\s]?skating|riding)\b"):
        if surface_or_vehicle and re.search(SURFACE_RE, surface_or_vehicle):
            for required in ("balance/support", "vehicle/locomotion"):
                if required not in r.get("tags", []):
                    problems.append(f"Locomotion on surface but missing {required} tag")
    return problems

def dump_json(records: List[Dict[str,Any]], path: str = "parsed_prompts.json") -> None:
    """Write *records* to *path* as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as out:
        out.write(payload)

In [106]:
# =========================
# 6) Run parsing
# =========================
# Driver cell: parse every seed prompt, print summary histograms and QA
# warnings, write the records to parsed_prompts.json, then show a DataFrame
# preview when pandas and display() are available.
parsed = [parse_prompt(p) for p in SEED_PROMPTS]

print("=== Primary families ===")
for fam, items in group_by_primary_family(parsed).items():
    print(f"- {fam}: {len(items)}")

print("\n=== Vehicle/Surface histogram ===")
for k, c in histogram(parsed, "vehicle_or_surface"):
    print(f"{k}: {c}")

print("\n=== Instruments histogram ===")
for k, c in histogram(parsed, "instrument"):
    print(f"{k}: {c}")

print("\n=== Equipment histogram ===")
for k, c in histogram(parsed, "equipment"):
    print(f"{k}: {c}")

print("\n=== Sample parsed rows (first 5) ===")
for r in parsed[:5]:
    print({k: r.get(k) for k in [
        "raw","animal_head","subject_head","action","object","instrument","vehicle_or_surface","vehicle_or_surface_origin",
        "location_on","location_in","location_inside","location_through","location_at","location_down",
        "made_of","shaped_like","constraints","companions","equipment","tags","primary_family"
    ]})

print("\n=== Quality checks (flagged only) ===")
# Count records with at least one warning so an all-clear line can be printed.
flagged = 0
for r in parsed:
    issues = quality_checks(r)
    if issues:
        flagged += 1
        print(f"- {r['raw']}")
        for msg in issues:
            print(f"    · {msg}")
if not flagged:
    print("No obvious issues detected.")

# JSON dump
dump_json(parsed, "parsed_prompts.json")
print("\nWrote parsed_prompts.json")

# Optional DataFrame preview
# Best-effort: any failure (pandas missing, display() undefined outside a
# notebook) is reported and the cell still finishes.
try:
    import pandas as pd
    cols = ["raw","animal_head","subject_head","action","object","instrument","vehicle_or_surface","made_of","shaped_like","companions","tags","primary_family"]
    df = pd.DataFrame([{k:r.get(k) for k in cols} for r in parsed])
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    display(df)   # display() comes from IPython / Jupyter
except Exception as e:
    print("Pandas display skipped:", e)

=== Primary families ===
- vehicle/locomotion: 14
- performance/arts: 12
- construction/design: 3
- occupation/service: 2
- balance/support: 1
- material_absurdity: 1
- misc: 1

=== Vehicle/Surface histogram ===
(none): 19
hot air balloon: 2
unicycle: 2
tightrope: 1
roller skates: 1
bumper car: 1
subway car: 1
pirate ship: 1
parachute: 1
surfboard: 1
rainbow: 1
moon: 1
convertible: 1
ice: 1

=== Instruments histogram ===
(none): 33
paintbrush in its mouth: 1

=== Equipment histogram ===
(none): 29
roller skates: 1
tuxedo: 1
spacesuit: 1
goggles: 1
wizard robe: 1

=== Sample parsed rows (first 5) ===
{'raw': 'Generate an SVG of a giraffe playing the violin on a tightrope', 'animal_head': 'giraffe', 'subject_head': 'giraffe', 'action': 'playing the violin on a tightrope', 'object': 'violin', 'instrument': None, 'vehicle_or_surface': 'tightrope', 'vehicle_or_surface_origin': 'in_action', 'location_on': 'tightrope', 'location_in': None, 'location_inside': None, 'location_through': None, 'l

Unnamed: 0,raw,animal_head,subject_head,action,object,instrument,vehicle_or_surface,made_of,shaped_like,companions,tags,primary_family
0,Generate an SVG of a giraffe playing the violi...,giraffe,giraffe,playing the violin on a tightrope,violin,,tightrope,,,,"[balance/support, performance/arts, setting/lo...",performance/arts
1,Generate an SVG of an octopus painting a self-...,octopus,octopus,painting a self-portrait,,,,,,,"[anatomy_constraint, performance/arts]",performance/arts
2,Generate an SVG of a hedgehog piloting a hot a...,hedgehog,hedgehog,piloting a hot air balloon,,,hot air balloon,teacups,,,"[material_absurdity, vehicle/locomotion]",vehicle/locomotion
3,Generate an SVG of a snail racing a cheetah in...,snail,snail,racing a cheetah in roller skates,,,roller skates,,,,"[competition/duel, equipment/apparel, vehicle/...",vehicle/locomotion
4,Generate an SVG of a walrus conducting an orch...,walrus,walrus,conducting an orchestra of penguins,orchestra of penguins,,,,,penguins,[performance/arts],performance/arts
5,Generate an SVG of a kangaroo delivering pizza...,kangaroo,kangaroo,delivering pizza on a unicycle,pizza,,unicycle,,,,"[occupation/service, setting/location, vehicle...",vehicle/locomotion
6,Generate an SVG of a frog DJing at a nightclub...,frog,frog,djing at a nightclub,,,,mushrooms,,,"[material_absurdity, performance/arts, setting...",performance/arts
7,Generate an SVG of a raccoon juggling watermel...,raccoon,raccoon,juggling watermelons,watermelons,,,,,,[],misc
8,Generate an SVG of a flamingo practicing karat...,flamingo,flamingo,practicing karate in a dojo,karate,,,,,,"[performance/arts, setting/location]",performance/arts
9,Generate an SVG of a sloth hanging from a chan...,sloth,sloth,hanging from a chandelier in an opera house,,,,,,,"[performance/arts, setting/location]",performance/arts


In [107]:
# =========================
# 7) Generator (mad-libs)
# =========================

# --- preposition/article helpers ---
# Lexicons for the generator. None of the three is referenced in the visible
# part of the file; presumably they choose the preposition ("on" vs "in") for a
# vehicle/surface -- confirm against the code that uses them.
# Nouns that take "on" (support surfaces plus boards).
ON_SURFACES = {"tightrope","rainbow","skateboard","surfboard","frozen lake","moon","cloud","clouds","catwalk","rope bridge","lily pad","ice"}
# Pure support surfaces: ON_SURFACES without skateboard and surfboard.
SUPPORT_WORDS = {"tightrope","rainbow","cloud","clouds","lily pad","frozen lake","ice","moon","catwalk","rope bridge"}
# Enclosing or carrying vehicles.
IN_VEHICLES   = {"car","convertible","subway car","balloon","hot air balloon","ship","pirate ship","bumper car","blimp","parachute"}

def action_head(action: Optional[str]) -> str:
    """Return the lower-cased first word of *action*, or "" when it is None or empty."""
    return action.split()[0].lower() if action else ""

def compatible_object_for_action(action: Optional[str], slots: Dict[str, List[str]]) -> Optional[str]:
    """Pick an object that fits the verb heading *action*, drawing from *slots*.

    *slots* maps slot names (e.g. "OBJ_PROP", "OBJ_FOOD") to candidate lists.
    Returns ``None`` for verbs that take no object, and when no candidate pool
    is available for an unrecognised verb. Selection from a pool is delegated
    to ``choose``.
    """
    head = action_head(action)

    # Verbs that take no object at all.
    if head in INTRANSITIVE or head in {"conducting","djing"}:
        return None

    if head == "playing":
        instruments = slots.get("OBJ_INSTRUMENT", []) or []
        return choose(instruments) if instruments else "chess"
    if head == "painting":
        return choose(slots.get("OBJ_PROP", []) or [], "a self-portrait")
    if head in {"building","carrying"}:
        return choose(slots.get("OBJ_STRUCTURE", []) or [], "a sandcastle")
    if head == "delivering":
        return choose(slots.get("OBJ_FOOD", []) or slots.get("OBJ_PROP", []), "pizza")
    if head == "sipping":
        return "tea"
    if head == "reading":
        return "newspaper"  # articleless; normalized later
    if head == "practicing":
        return choose(slots.get("TOPIC", []), "karate")
    if head == "casting":
        return choose(slots.get("OBJ_PROP", []), "a spell")

    # fallback: first non-empty pool, in a fixed preference order
    for slot_name in ("OBJ_PROP","OBJ_MISC","OBJ_FOOD","OBJ_STRUCTURE","OBJ_INSTRUMENT","OBJ_GAME"):
        candidates = slots.get(slot_name, [])
        if candidates:
            return choose(candidates)
    return None

def veto(action: Optional[str], obj: Optional[str]) -> bool:
    """Return True when the action/object pairing should be rejected.

    A missing action or object is never vetoed.
    """
    if not (action and obj):
        return False

    verb = action_head(action)
    target = obj.lower()

    if verb == "building":
        return target in {"chess", "karate", "yoga"}
    if verb in {"djing", "conducting"}:
        return re.search(r"\b(grand piano|violin)\b", target) is not None
    if verb in {"reading", "sipping"}:
        return re.search(r"\b(sandcastle|steam engine)\b", target) is not None
    return False

def needs_article(s: str) -> bool:
    """Return True unless the (stripped) text already starts with a/an/the."""
    text = (s or "").strip()
    leading_article = re.match(r"^(?:a|an|the)\b", text, flags=re.I)
    return leading_article is None

def with_article(s: Optional[str]) -> Optional[str]:
    """Prefix a noun phrase with "a"/"an" unless it has an article or looks plural.

    Falsy input is returned unchanged.
    """
    if not s:
        return s
    phrase = s.strip()

    if re.match(r"^(?:a|an|the)\b", phrase, re.I):
        return phrase

    # Crude plural test: a trailing "s" means no article, except for the
    # listed words which still receive one.
    ends_in_s = re.search(r"(s|ies)\b$", phrase)
    if ends_in_s and not re.search(r"\b(glasses|chess)\b$", phrase):
        return phrase

    # Choose the article by sound, not by spelling.
    if re.match(r"^(hour|honest|heir|herb)\b", phrase, re.I):
        article = "an"  # silent h
    elif re.match(r"^(uni|use|eu|one-?|u[sn]\b)", phrase, re.I):
        article = "a"   # vowel letter, consonant sound
    elif re.match(r"^[aeiou]", phrase, re.I):
        article = "an"
    else:
        article = "a"
    return article + " " + phrase

def normalize_np_with_article(s: Optional[str]) -> Optional[str]:
    """Give a noun phrase a sensible article.

    Rules, in order:
      * a phrase that already starts with a/an/the is returned as-is;
      * a phrase containing a group noun ("class", "orchestra", ...) always
        gets an indefinite article, even when its last word is plural;
      * an obvious plural (listed word, or any phrase ending in "s") gets none;
      * everything else gets "a" or "an".

    The indefinite article is chosen by sound, so "orchestra of penguins"
    becomes "an orchestra of penguins" and "unicycle" becomes "a unicycle".

    Args:
        s: Noun phrase, possibly None or empty.

    Returns:
        The normalized phrase; falsy input is returned unchanged.
    """
    if not s:
        return s
    s = s.strip()

    def indefinite(np: str) -> str:
        # Silent-h words take "an"; vowel-letter words that start with a
        # consonant sound ("uni-", "eu-", "one") take "a".
        if re.match(r"^(hour|honest|heir|herb)\b", np, re.I):
            return "an " + np
        if re.match(r"^(uni|use|eu|one-?|u[sn]\b)", np, re.I):
            return "a " + np
        return ("an " if re.match(r"^[aeiou]", np, re.I) else "a ") + np

    # keep given articles
    if re.match(r"^(?:a|an|the)\b", s, re.I):
        return s
    # group nouns take an article even if the trailing word is plural
    if re.search(r"\b(class|team|crew|flock|school|herd|crowd|parade|choir|orchestra|band|council|ensemble)\b", s, re.I):
        return indefinite(s)
    # obvious plurals (no article)
    if re.search(r"\b(people|men|women|penguins|turtles|robots|waiters|plates|hoops|jellybeans|teacups|acorns|clouds|skates)\b", s, re.I) or s.endswith('s'):
        return s
    return indefinite(s)

def starts_with_det(s: str) -> bool:
    """Return True when the text begins with an article, demonstrative or possessive.

    The text is matched as given (no stripping); None counts as empty.
    """
    leading_determiner = re.match(
        r"^(?:a|an|the|this|that|these|those|my|your|his|her|its|our|their)\b",
        s or "",
        re.I,
    )
    return leading_determiner is not None

def ensure_article_np(s: Optional[str]) -> Optional[str]:
    """Add "a"/"an" to a noun phrase that has no determiner and is not plural-looking.

    Falsy input is returned unchanged.
    """
    if not s:
        return s
    phrase = s.strip()

    if starts_with_det(phrase):
        return phrase

    # Very light plural/mass guard: a trailing "s" means "leave it bare",
    # except for the listed words.
    trailing_s = re.search(r"(?:s|ies)\b$", phrase)
    if trailing_s and not re.search(r"\b(glasses|chess)\b$", phrase, re.I):
        return phrase

    article = "an " if re.match(r"^[aeiou]", phrase, re.I) else "a "
    return article + phrase

def strip_article(s: Optional[str]) -> str:
    """Remove one leading "a ", "an " or "the " (any case); None becomes ""."""
    text = s or ""
    without_article = re.sub(r"^(?:a|an|the)\s+", "", text, flags=re.I)
    return without_article

def sanitize_action(action: str) -> str:
    """Clean an action phrase before it is combined with a separate object.

    Collapses a doubled "chess", removes inline "self-portrait" and
    "newspaper" tokens, then squeezes runs of whitespace.
    """
    cleaned = (action or "").strip()

    # (pattern, replacement) pairs, applied case-insensitively in order.
    rewrites = (
        (r"\bplaying\s+chess\s+chess\b", "playing chess"),
        (r"\bself-portrait\b", ""),   # avoid doubling when the object is re-added
        (r"\bnewspaper\b", ""),       # spill token
    )
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned, flags=re.I)

    return re.sub(r"\s{2,}", " ", cleaned).strip()

def veh_phrase(veh: Optional[str]) -> str:
    """Render a vehicle/surface as a leading-space prepositional phrase.

    Returns "" for a falsy vehicle. Ice and clouds are rendered without an
    article; anything else gets "in" when classify_vehicle() reports a sea,
    air or land vehicle and "on" otherwise.
    """
    if not veh:
        return ""
    raw = (veh or "").strip()
    key = raw.lower()

    # Surfaces that read better without an article.
    if key in {"ice", "the ice"}:
        return " on ice"
    if key in {"cloud", "clouds"}:
        return " on " + key

    enclosed_classes = {"sea_vehicle", "air_vehicle", "land_vehicle"}
    prep = "in" if classify_vehicle(key) in enclosed_classes else "on"
    return f" {prep} {with_article(raw)}"

def loc_phrase(loc: Optional[str]) -> str:
    """Render a location as a leading-space phrase with a preposition.

    Returns "" for a falsy location. Bare adverbs such as "underwater" are
    emitted as-is; ice and clouds take "on" with no article; otherwise the
    preposition is "on" when the text contains a SUPPORT_WORDS entry and
    "in" when it does not.
    """
    if not loc:
        return ""
    place = loc.strip()

    if re.fullmatch(r"(underwater|indoors|outdoors|overhead|underground)", place, re.I):
        return " " + place

    lowered = place.lower()
    on_support = any(word in lowered for word in SUPPORT_WORDS)

    # normalize certain well-known surfaces
    if lowered == "ice":
        return " on ice"
    if lowered in {"cloud", "clouds"}:
        return " on " + lowered

    prep = "on" if on_support else "in"
    return f" {prep} {with_article(place)}"

def polish(s: str) -> str:
    """Apply regex clean-ups to a generated prompt.

    Fixes, in order: stacked prepositions, split or mismatched articles,
    doubled articles, article use with a few special surfaces and topics,
    repeated venue prepositions, and runs of whitespace.

    Args:
        s: The prompt text.

    Returns:
        The cleaned prompt, stripped of leading/trailing whitespace.
    """
    # Stacked prepositions: keep the second one ("in on X" -> "on X").
    s = re.sub(r"\b(in|on)\s+(in|on)\b", r"\2", s, flags=re.I)

    # Repair a split article: "a n apple" -> "an apple".
    s = re.sub(r"\ba\s+n\s+(?=[aeiou])", "an ", s, flags=re.I)

    # Use "an" before all-caps acronyms whose first letter is spoken with a
    # leading vowel sound (A, E, F, H, I, L, M, N, O, R, S, X).
    s = re.sub(r"\b(?:a|an)\s+([AEFHILMNORSX])([A-Z]{1,})\b", r"an \1\2", s)

    # Demote "an" -> "a" before a lowercase consonant, but leave silent-h
    # words alone so "an hour" / "an honest ..." are not turned into "a hour".
    s = re.sub(
        r"\ban\s+(?!hour|honest|hono(?:u)?r|heir|herbs?\b)([b-df-hj-np-tv-z])(?=[a-z])",
        r"a \1",
        s,
    )

    # Collapse doubled articles, keeping the second.
    s = re.sub(r"\b(a|an)\s+(a|an)\b", r"\2", s, flags=re.I)

    # Special surfaces.
    s = re.sub(r"\bon an?\s+ice\b", "on ice", s, flags=re.I)
    s = re.sub(r"\bon a moon\b", "on the moon", s, flags=re.I)

    # Topics that take no article.
    s = re.sub(r"\b(?:a|an)\s+(yoga|karate|chess)\b", r"\1", s, flags=re.I)

    # Venue followed by a repeat of the same preposition.
    s = re.sub(r"\b(in|on)\s+(?:an?\s+)?(desert|dojo|library|nightclub|opera house)\s+\1\b",
               r" \1 \2", s, flags=re.I)

    # Generic whitespace collapse.
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s


def force_group_article(s: Optional[str]) -> Optional[str]:
    """Make sure a group phrase ("class of turtles") carries an article.

    A phrase that already starts with a/an/the, or that contains no group
    noun, is returned stripped but otherwise unchanged. The article is
    chosen by sound, so "orchestra of penguins" becomes
    "an orchestra of penguins" rather than "a orchestra of penguins".

    Args:
        s: Group phrase, possibly None or empty.

    Returns:
        The phrase with an article where needed; falsy input is returned
        unchanged.
    """
    if not s:
        return s
    s = s.strip()
    if re.search(r"^(?:a|an|the)\b", s, re.I):
        return s
    if re.search(r"\b(class|choir|orchestra|band|flock|school|herd|parade|ensemble)\b", s, re.I):
        # Vowel letter => "an", unless it is spoken with a consonant sound
        # ("uni-", "eu-", "one").
        vowel_sound = re.match(r"[aeiou]", s, re.I) and not re.match(
            r"(?:uni|use|eu|one-?|u[sn]\b)", s, re.I
        )
        return ("an " if vowel_sound else "a ") + s
    return s


# --- typed slot builder ---
def build_slot_tables(records: List[Dict[str, Any]]) -> Dict[str, List[str]]:
    """Collect the fillers found in parsed seed records into typed slot pools.

    Args:
        records: Parsed seed prompts. Only keys read via ``r.get(...)`` below
            are used; every one of them is optional.

    Returns:
        A dict mapping slot name to a sorted list of distinct fillers.
        "VEHICLE_SURFACE", "LOCATION", "TOPIC", "GROUP", "OPPONENT" and
        "COUNTABLE_OBJECT" are always present (the last four fall back to
        built-in defaults when nothing was harvested); the other keys appear
        only when at least one record contributed to them.
    """
    slots = defaultdict(set)
    groups, opponents, countables = set(), set(), set()
    vehicles_surfaces, locations = set(), set()

    location_keys = (
        "location_on", "location_in", "location_inside", "location_through",
        "location_at", "location_free", "location_down", "location_over",
        "location_across",
    )

    for r in records:
        # subjects
        if r.get("subject_head"):
            subj_np = r.get("subject_np") or f"a {r['subject_head']}"
            slots["SUBJECT"].add(subj_np.strip())

        # actions: keep only the part before the first prepositional tail.
        # maxsplit/flags are passed by keyword; positional use is deprecated
        # since Python 3.13.
        if r.get("action"):
            ap = re.split(
                r"\b(on|in|inside|through|with|to|against|at|down|over|across)\b",
                r["action"],
                maxsplit=1,
                flags=re.I,
            )[0]
            slots["ACTION"].add(norm_spaces(ap))

        # objects -> typed pools (first matching category wins)
        obj = r.get("object")
        if obj:
            lo = obj.lower()
            if re.search(r"\b(violin|piano|grand piano|saxophone|trumpet|drum|baton|microphone|palette)\b", lo):
                slots["OBJ_INSTRUMENT"].add(obj)
            elif lo in {"chess"}:
                slots["OBJ_GAME"].add(obj)
            elif re.search(r"\b(sandcastle|castle|skyscraper|kite|steam engine)\b", lo):
                slots["OBJ_STRUCTURE"].add(obj)
            elif re.search(r"\b(pizza|watermelons?|acorns?|jellybeans?|teacups?)\b", lo):
                slots["OBJ_FOOD"].add(obj)
            elif re.search(r"\b(newspaper|book|paintbrush)\b", lo):
                slots["OBJ_PROP"].add(obj)
            else:
                slots["OBJ_MISC"].add(obj)

            # Any word ending in "s" marks the object as countable/plural.
            if re.search(r"s\b", lo):
                countables.add(obj)

        # companions -> raw pool plus group/opponent buckets
        companions = r.get("companions")
        if companions:
            c = companions.lower()
            if "class" in c or " of " in c:
                groups.add(companions)
            if re.search(r"\b(robot|cheetah|waiter)\b", c):
                opponents.add(companions)
            slots["COMPANIONS"].add(companions)

        # vehicles/surfaces/locations
        if r.get("vehicle_or_surface"):
            vehicles_surfaces.add(r["vehicle_or_surface"])
        for k in location_keys:
            if r.get(k):
                locations.add(r[k])

        if r.get("made_of"):
            slots["MADE_OF"].add(r["made_of"])
        if r.get("shaped_like"):
            slots["SHAPED_LIKE"].add(r["shaped_like"])
        if r.get("instrument"):
            slots["INSTRUMENT_PHRASE"].add(r["instrument"])
        if r.get("equipment"):
            # NOTE(review): assumes "equipment" is a list of strings; a bare
            # string would be added character by character.
            for e in r["equipment"]:
                slots["EQUIPMENT"].add(e)
        if r.get("constraints"):
            slots["CONSTRAINT"].add(r["constraints"])

    # defaults/fallbacks
    slots["VEHICLE_SURFACE"] = vehicles_surfaces
    slots["LOCATION"] = locations
    slots["TOPIC"] = slots.get("TOPIC", set()) or {"yoga", "karate"}
    slots["GROUP"] = groups or {"a class of turtles"}
    slots["OPPONENT"] = opponents or {"a robot"}
    slots["COUNTABLE_OBJECT"] = countables or {"kites", "plates", "acorns", "umbrellas"}

    return {k: sorted(v) for k, v in slots.items()}

# Prompt skeletons filled by generate_one() via str.format. The list order
# matters: generate_one() branches on the template's index.
TEMPLATES = [
    # T1: performance on an improbable surface/vehicle
    (
        "Generate an SVG of {SUBJECT} {ACTION}{OBJ_OPT}{ON_OR_USING_OPT}"
        "{LOC_OPT}{COMPANIONS_OPT}{MADE_OF_OPT}{CONSTRAINT_OPT}"
    ),
    # T2: construction/design with shape/material
    (
        "Generate an SVG of {SUBJECT} building {OBJECT}"
        "{SHAPED_OPT}{MADE_OF_OPT}{LOC_OPT}"
    ),
    # T3: vehicle/locomotion mismatch
    (
        "Generate an SVG of {SUBJECT} {VEH_VERB} {VEHICLE}"
        "{LOC_OPT}{COMPANIONS_OPT}{MADE_OF_OPT}"
    ),
    # T4: occupation/service in an odd venue
    (
        "Generate an SVG of {SUBJECT} {OCCUPATION_VERB} {OBJECT_OR_ROLE}"
        "{LOC_OPT}{COMPANIONS_OPT}"
    ),
    # T5: competition/duel
    (
        "Generate an SVG of {SUBJECT} {COMPETE_VERB} {OPPONENT}"
        "{WITH_EQUIP_OPT}{LOC_OPT}{CONSTRAINT_OPT}"
    ),
    # T6: teaching/leading a group
    (
        "Generate an SVG of {SUBJECT} teaching {TOPIC} to {GROUP}"
        "{LOC_OPT}"
    ),
]

# Verb pools (order is kept because items are drawn with random.choice).
# generate_one() replaces the drawn vehicle verb according to vehicle class.
VEH_VERBS = [
    "driving",
    "steering",
    "piloting",
    "riding",
]
OCCUPATION_VERBS = [
    "delivering",
    "reading",
    "conducting",
    "djing",
    "practicing",
    "sipping",
    "painting",
    "casting",
]
COMPETE_VERBS = [
    "racing",
    "playing chess against",
    "competing with",
]

# Decoration vocab for counted objects and stylistic tails.
COLOR_ADJS = [
    "red",
    "blue",
    "green",
    "golden",
    "striped",
]
COUNT_PHRASES = [
    "exactly three",
    "five",
    "a pair of",
    "seven",
]
STYLE_TAILS = [
    "under starlight",
    "during a snowstorm",
    "at dawn",
]

def choose(lst: List[str], default: Optional[str]=None) -> Optional[str]:
    """Return a random element of ``lst``, or ``default`` when it is empty/None."""
    return random.choice(lst) if lst else default

def ngrams(s: str, n: int) -> set:
    """Return the set of lower-cased, whitespace-tokenised n-grams of ``s``."""
    words = s.lower().split()
    # A non-positive range is empty, so short inputs yield an empty set.
    window_starts = range(len(words) - n + 1)
    return {tuple(words[start:start + n]) for start in window_starts}

# Possessive "shell" phrases; pick_one_location() rejects locations matching this.
BAD_LOCATION_RE = re.compile(
    r"\b(its shell|his shell|her shell|their shell)\b",
    flags=re.I,
)

# Nouns that usually appear with "through" and shouldn't be rendered as "in".
THROUGH_ONLY = {"hoops"}

def pick_one_location(slots) -> Optional[str]:
    """Pick one random usable entry from ``slots["LOCATION"]`` (None if there is none).

    An entry is skipped when it is empty, matches VEHICLE_RE, is exactly a
    support surface, matches BAD_LOCATION_RE, or contains a THROUGH_ONLY noun.
    """
    surfaces = {word.lower() for word in SUPPORT_WORDS}

    def usable(candidate) -> bool:
        if not candidate:
            return False
        lowered = candidate.lower().strip()
        if VEHICLE_RE.search(lowered):
            return False
        if lowered in surfaces:
            return False
        if BAD_LOCATION_RE.search(candidate):
            return False
        return not any(noun in lowered for noun in THROUGH_ONLY)

    return choose([place for place in slots.get("LOCATION", []) if usable(place)])

def maybe_decorate_object_for_counting(obj: Optional[str], slots) -> Optional[str]:
    """Occasionally prefix a countable object with a count phrase and a colour.

    Returns None for a falsy object. Objects not listed under
    ``slots["COUNTABLE_OBJECT"]`` are returned unchanged; listed ones are
    decorated with probability 0.35.
    """
    if not obj:
        return None
    is_countable = obj in slots.get("COUNTABLE_OBJECT", [])
    # The random draw happens only for countable objects.
    if is_countable and random.random() < 0.35:
        return f"{choose(COUNT_PHRASES)} {choose(COLOR_ADJS)} {obj}"
    return obj

def ensure_absurdity(prompt: str) -> bool:
    """Return True when the prompt contains at least one "absurdity" marker.

    Markers are plain substrings matched against the lower-cased prompt.
    """
    text = prompt.lower()
    markers = (
        # material / shape twists
        "made of", "shaped like",
        # improbable places
        "tightrope", "rainbow", "moon", "opera house", "subway car",
        # human-style activities
        "playing", "conducting", "djing", "teaching", "reading", "delivering", "skydiving",
    )
    return any(marker in text for marker in markers)

def too_close_to_seed(prompt: str, seeds: List[str], max_trigram_overlap: int = 2) -> bool:
    """Return True when the prompt shares too many word trigrams with any seed.

    The "Generate an SVG of" boilerplate is removed from both sides before
    comparing; "too many" means more than ``max_trigram_overlap``.
    """
    def body_trigrams(text: str) -> set:
        return ngrams(text.replace("Generate an SVG of", "").strip(), 3)

    candidate = body_trigrams(prompt)
    return any(
        len(candidate & body_trigrams(seed)) > max_trigram_overlap
        for seed in seeds
    )

def dedupe(prompts: List[str]) -> List[str]:
    """Drop case-insensitive duplicates, keeping each first occurrence in order."""
    first_seen = {}
    for prompt in prompts:
        # setdefault keeps the earliest spelling; dicts preserve insertion order.
        first_seen.setdefault(prompt.lower(), prompt)
    return list(first_seen.values())

# Head verbs that count as a "performance" for template T1.
PERFORMANCE_HEADS = {"playing","painting","conducting","djing","practicing","reading","casting","building"}

def pick_performance_action(slots):
    """Pick a random action from ``slots["ACTION"]`` whose first word is a performance verb.

    Empty or whitespace-only actions are ignored rather than raising, since
    trimming a prepositional tail can leave an empty action string.

    Returns:
        One matching action phrase, or "playing the saxophone" when none match.
    """
    pool = []
    for candidate in slots.get("ACTION", []):
        words = candidate.split()
        if words and words[0].lower() in PERFORMANCE_HEADS:
            pool.append(candidate)
    return choose(pool, "playing the saxophone")


# =========================
# Article & preposition policy (drop-in patch)
# =========================

# 1) Word policy sets
# Activities/subjects used without an article ("teaching yoga").
ARTICLELESS_TOPICS = {
    "astronomy",
    "ballet",
    "calculus",
    "calligraphy",
    "chess",
    "fencing",
    "go",
    "juggling",
    "karate",
    "origami",
    "yoga",
}

# NPs that generally take no article when used alone as a surface or environment.
ZERO_ARTICLE_SURFACES = {
    "cloud",
    "clouds",
    "ice",
    "indoors",
    "outdoors",
    "overhead",
    "space",
    "underwater",
}

# Bare locations that typically take "the" (when used as a place, not as
# part of a longer NP that has its own article).
DEFINITE_LOCS = {
    "desert",
    "library roof",
    "moon",
    "ocean",
    "rooftop garden",
    "sea",
    "sky",
    "subterranean station",
}

# Materials treated as mass/uncountable: no "a/an", and no "the" after "made of".
MASS_MATERIALS = {
    "clockwork gears",
    "crystals",
    "feathers",
    "gingerbread",
    "glass",
    "glitter",
    "leaves",
    "origami cranes",
    "paperclips",
    "playing cards",
    "soap bubbles",
    "stained glass",
    "starlight",
}

# a/an exceptions based on pronunciation (leading tokens).
# Letters whose spoken name starts with a vowel sound.
A_AN_VOWEL_SOUND_INITIALISMS = set("AEFHILMNORSX")
# Regexes for vowel-letter words that start with a consonant sound:
# unicorn, European, one-, US/UN.
A_AN_CONSONANT_SOUND_EXCEPTIONS = {
    r"uni[^ ]*",
    r"euro[^ ]*",
    r"one[^ ]*",
    r"u[sn]\b",
}

def _needs_an(word: str) -> bool:
    """Decide if we should use 'an' before a token."""
    w = word.strip()
    if not w:
        return False
    # Acronyms / initialisms in ALL CAPS (e.g., MRI, MBA) -> 'an'
    if re.match(r"^[A-Z]{2,}$", w) and w[0] in A_AN_VOWEL_SOUND_INITIALISMS:
        return True
    # Vowel letter start: candidates for 'an'
    if re.match(r"^[aeiouAEIOU]", w):
        # But carve out consonant-sound exceptions
        for pat in A_AN_CONSONANT_SOUND_EXCEPTIONS:
            if re.match(pat, w, flags=re.I):
                return False
        return True
    return False

def is_plural_like(s: str) -> bool:
    """Very light plural guess, used to avoid adding articles to plurals."""
    text = (s or "").strip().lower()
    if not text:
        return False

    # Words that are always treated as plural.
    if text in {"scissors", "pants", "glasses", "species", "series"}:
        return True

    # Phrases headed by a group classifier get their article elsewhere.
    classifier = re.search(
        r"\b(class|flock|school|herd|choir|orchestra|band|parade|council|troupe|brigade|platoon|league|gaggle)\b",
        text,
    )
    if classifier:
        return False

    # 'chess' ends in "s" but is a game, not a plural.
    if text in {"chess"}:
        return False
    return bool(re.search(r"(?:s|ies)\b$", text))

def is_mass_noun(s: str) -> bool:
    """Return True when the normalized text is a listed mass material or a common mass noun."""
    word = (s or "").strip().lower()
    if word in MASS_MATERIALS:
        return True
    return word in {"music", "water", "sand", "light", "air"}

def wants_zero_article(s: str) -> bool:
    """Return True when the phrase should stand without any article.

    That is the case for zero-article surfaces, articleless topics and
    mass nouns (compared after stripping and lower-casing).
    """
    word = (s or "").strip().lower()
    if word in ZERO_ARTICLE_SURFACES:
        return True
    if word in ARTICLELESS_TOPICS:
        return True
    return is_mass_noun(word)

def wants_definite_article(s: str) -> bool:
    """Return True when the whole phrase is a bare location that takes "the".

    Only exact matches count (e.g. "moon"); a longer NP is never forced.
    """
    bare = (s or "").strip().lower()
    return bare in DEFINITE_LOCS

# 3) Safer article functions (override your earlier versions)

def with_article(s: Optional[str]) -> Optional[str]:
    """Return the noun phrase with the article the policy sets call for.

    Existing determiners are respected; zero-article and plural-looking
    phrases stay bare; bare definite locations get "the"; everything else
    gets "a"/"an" based on its first word. Falsy input is returned
    unchanged and whitespace-only input becomes "".
    """
    phrase = s.strip() if s else s
    if not phrase:
        return phrase

    has_determiner = re.match(
        r"^(?:a|an|the|this|that|these|those|my|your|his|her|its|our|their)\b",
        phrase,
        re.I,
    )
    if has_determiner:
        return phrase
    if wants_zero_article(phrase) or is_plural_like(phrase):
        return phrase
    if wants_definite_article(phrase):
        return "the " + phrase

    first_word = phrase.split()[0]
    article = "an " if _needs_an(first_word) else "a "
    return article + phrase

def normalize_np_with_article(s: Optional[str]) -> Optional[str]:
    """Normalize whitespace in a noun phrase and give it a suitable article.

    Rules, in order:
      * a phrase that already has a determiner is kept;
      * a phrase containing a group noun ("class", "orchestra", ...) gets an
        indefinite article chosen by sound ("an orchestra of penguins");
      * zero-article and plural-looking phrases stay bare;
      * bare definite locations get "the";
      * everything else gets "a"/"an".

    Args:
        s: Noun phrase, possibly None or empty.

    Returns:
        The normalized phrase. Falsy input is returned unchanged and
        whitespace-only input becomes "".
    """
    if not s:
        return s
    s = re.sub(r"\s+", " ", s).strip()
    if not s:
        return s
    # keep given determiners
    if re.match(r"^(?:a|an|the|this|that|these|those|my|your|his|her|its|our|their)\b", s, re.I):
        return s

    head = s.split()[0]
    indefinite = ("an " if _needs_an(head) else "a ") + s

    # group nouns (class/flock/... of N) want an article
    if re.search(r"\b(class|flock|school|herd|choir|orchestra|band|parade|council|troupe|brigade|platoon|league|gaggle)\b", s, re.I):
        return indefinite
    # bare zero-article cases
    if wants_zero_article(s) or is_plural_like(s):
        return s
    if wants_definite_article(s):
        return "the " + s
    return indefinite

# 4) Vehicle/location phrasing (small override)

# Prefer "with/using" for jetpack; "in" for cars/ships/balloons; "on" for boards/unicycle/surfaces.
# Preposition per vehicle noun. Every noun is registered both bare and with
# a leading "a " so a lookup works with or without the article.
VEHICLE_PREP = {
    key: prep
    for noun, prep in (
        ("jetpack", "with"),
        ("skateboard", "on"),
        ("surfboard", "on"),
        ("unicycle", "on"),
        ("bumper car", "in"),
        ("convertible", "in"),
        ("car", "in"),
        ("subway car", "in"),
        ("hot air balloon", "in"),
        ("balloon", "in"),
        ("blimp", "in"),
        ("zeppelin", "in"),
        ("pirate ship", "in"),
        ("ship", "in"),
        ("canoe", "in"),
        ("gondola", "in"),
        ("snowmobile", "on"),  # both are used; 'on' reads okay
        ("hovercraft", "in"),
        ("bathysphere", "in"),
        ("submersible", "in"),
    )
    for key in (noun, f"a {noun}")
}

def veh_phrase(veh: Optional[str]) -> str:
    """Render a vehicle/surface as a leading-space prepositional phrase.

    Returns "" for a falsy vehicle. Zero-article surfaces become
    " on <surface>"; otherwise the preposition comes from VEHICLE_PREP, or
    from classify_vehicle() when the vehicle is not listed there.
    """
    if not veh:
        return ""
    raw = veh.strip()
    key = raw.lower()

    # Surfaces short-circuit: always "on", never an article.
    if key in ZERO_ARTICLE_SURFACES or re.fullmatch(r"(cloud|clouds|ice)", key):
        return " on " + key

    prep = VEHICLE_PREP.get(key, None)
    if not prep:
        on_classes = {"board_or_unicycle", "support_surface"}
        prep = "on" if classify_vehicle(key) in on_classes else "in"

    # "with" (jetpack) needs no special casing: it formats like any other prep.
    return f" {prep} {with_article(raw)}"

def loc_phrase(loc: Optional[str]) -> str:
    """Render a location as a leading-space phrase.

    Returns "" for falsy or whitespace-only input. Free adverbs
    ("underwater", "indoors", ...) are emitted bare and lower-cased;
    zero-article surfaces and the moon take "on"; everything else takes
    "in" plus an article from with_article().
    """
    if not loc:
        return ""
    place = re.sub(r"\s+", " ", loc.strip())
    if not place:
        return ""
    key = place.lower()

    # free adverbial environments
    if re.fullmatch(r"(underwater|indoors|outdoors|overhead|underground)", key):
        return " " + key

    # surfaces prefer "on"
    if key in ZERO_ARTICLE_SURFACES or re.fullmatch(r"(cloud|clouds|ice|the moon|moon)", key):
        return " on the moon" if key == "moon" else " on " + key

    # default: 'in' with normalized article
    return f" in {with_article(place)}"

# 5) Minor polish hooks (keep your existing polish() but add two fixes)
def _polish_articles_extra(s: str) -> str:
    # avoid "with with"
    s = re.sub(r"\bwith\s+with\b", "with", s, flags=re.I)
    # collapse "in in"
    s = re.sub(r"\b(in|on)\s+\1\b", r"\1", s, flags=re.I)
    # ensure 'shaped like'/'made of' never gain articles
    s = re.sub(r"\b(shaped like|made of)\s+(?:a|an|the)\s+", r"\1 ", s, flags=re.I)
    return s


def generate_one(slots, seeds: List[str]) -> str:
    """Generate one prompt by filling a randomly chosen template.

    The function draws a template, draws fillers for every slot (the order of
    the random draws is significant for reproducibility under a fixed seed),
    applies template-specific action/object rules, formats the template, and
    then runs polish, absurdity and novelty passes.

    Args:
        slots: Slot pools as produced by build_slot_tables().
        seeds: The raw seed prompts; used only by the novelty guard.

    Returns:
        The finished prompt string.
    """
    idx = random.randrange(len(TEMPLATES))
    t = TEMPLATES[idx]
    subj = choose(slots.get("SUBJECT", []), "a marmot")

    # Pick vehicle/location once, then allow at most one location phrase
    veh = choose(slots.get("VEHICLE_SURFACE", []))
    loc = pick_one_location(slots)

    # Common side slots
    opponent   = choose(slots.get("OPPONENT", []), "a robot")
    companions = normalize_np_with_article(choose(slots.get("COMPANIONS", [])))
    group      = force_group_article(choose(slots.get("GROUP", []), "a class of turtles"))
    made       = choose(slots.get("MADE_OF", []))
    shape      = choose(slots.get("SHAPED_LIKE", []))
    constraint = choose(slots.get("CONSTRAINT", []))
    # An "eight arms" constraint only makes sense for an octopus or a spider.
    # The alternation is grouped so the word boundary applies to both names.
    if constraint and re.search(r"\beight arms?\b", constraint, re.I):
        if not re.search(r"\b(?:octopus|spider)", (subj or ""), re.I):
            constraint = None
    if constraint and not re.search(r"^\s*with\b", constraint or "", re.I):
        constraint = "with " + constraint

    # Template-scoped action/object logic
    action, obj = None, None
    veh_verb = choose(VEH_VERBS, "riding")
    comp_verb = choose(COMPETE_VERBS, "racing")

    if idx == 0:  # T1: performance on surface/vehicle
        action = sanitize_action(pick_performance_action(slots))
        head = action_head(action)
        if head == "painting":
            obj = "a self-portrait"  # ensure single instance with article
        elif head == "playing":
            contains_instr = any(tok in action.lower() for tok in INSTRUMENT_TOKENS)
            obj = None if contains_instr else compatible_object_for_action(action, slots)
        else:
            obj = compatible_object_for_action(action, slots)
        # if the action already includes the object token (e.g., "practicing karate"), don't add OBJ_OPT
        if obj:
            obj_core = strip_article(obj).lower()
            if re.search(rf"\b{re.escape(obj_core)}\b", action.lower()):
                obj = None
        if veto(action, obj):
            obj = None

    elif idx == 1:  # T2: construction/design
        action = "building"
        obj = choose(slots.get("OBJ_STRUCTURE", []), "a sandcastle")

    elif idx == 2:  # T3: vehicle/locomotion mismatch
        action = None  # verb is VEH_VERB
        obj = None
        # Replace the randomly drawn verb with one that suits the vehicle class.
        klass = classify_vehicle(veh or "")
        if klass == "sea_vehicle":
            veh_verb = "steering"
        elif klass == "air_vehicle":
            veh_verb = "piloting"
        elif klass == "land_vehicle":
            veh_verb = "driving"
        elif klass in {"board_or_unicycle"}:
            veh_verb = "riding"
        elif klass == "support_surface":
            veh_verb = "balancing on"
        else:
            veh_verb = "riding"

    elif idx == 3:  # T4: occupation/service
        occ = choose(OCCUPATION_VERBS, "delivering")
        action = occ
        if occ == "delivering":
            obj = choose(slots.get("OBJ_FOOD", []) or slots.get("OBJ_PROP", []), "pizza")
        elif occ == "reading":
            obj = "newspaper"
        elif occ == "sipping":
            obj = "tea"
        elif occ == "painting":
            obj = choose(slots.get("OBJ_PROP", []), "a self-portrait")
        elif occ == "practicing":
            obj = choose(slots.get("TOPIC", []), "karate")
        elif occ in {"conducting","djing"}:
            obj = None  # no object for these
        elif occ == "casting":
            obj = choose(slots.get("OBJ_PROP", []), "a spell")

    elif idx == 4:  # T5: competition/duel
        action = None
        obj = None

    elif idx == 5:  # T6: teaching/leading group
        action = "teaching"
        obj = choose(slots.get("TOPIC", []), "yoga")

    # If the selected action is locomotion, don't attach a direct object
    if action and re.match(r"^(riding|driving|steering|piloting|skateboarding|surfing|ice[-\s]?skating|rollerblading|racing)\b", action, re.I):
        obj = None

    # normalize object article usage (single pass); topics in the
    # module-level ARTICLELESS_TOPICS set stay bare.
    if obj:
        if obj.lower() in ARTICLELESS_TOPICS:
            pass  # leave as-is (no article)
        else:
            obj = ensure_article_np(obj)

    # Build optional pieces with safer rules
    OBJ_OPT = f" {obj}" if (action and obj and not re.search(r"\bplaying\s+chess\b", action or "", re.I)) else ""

    # vehicle phrase (kept separate from location)
    ON_OR_USING_OPT = veh_phrase(veh)

    # single location phrase (don't stack multiple)
    LOC_OPT = ""
    if loc and (not veh or (loc.lower() not in (veh or '').lower())):
        tl = (loc or "").lower()
        # Match whole words only, so "office" or "moonlit garden" are not
        # mistaken for ice / the moon.
        if re.search(r"\bmoon\b", tl):
            LOC_OPT = " on the moon"
        elif re.search(r"\bice\b", tl) and not re.search(r"ice skates?", tl):
            LOC_OPT = " on ice"
        else:
            LOC_OPT = loc_phrase(loc)

    # avoid repeats or conflicting preps
    if ON_OR_USING_OPT and LOC_OPT and ON_OR_USING_OPT.split()[-1].lower() == LOC_OPT.split()[-1].lower():
        LOC_OPT = ""
    if veh and classify_vehicle(veh) in {"land_vehicle","air_vehicle","sea_vehicle"} and LOC_OPT.startswith(" in "):
        LOC_OPT = ""

    # material/shape (attach MADE_OF to the nearest salient head via templates)
    MADE_OF_OPT = f" made of {made}" if made else ""
    shape_val = None
    if shape:
        shape_val = strip_article(shape)
        shape_val = normalize_np_with_article(shape_val)
    SHAPED_OPT  = f" shaped like {shape_val}" if shape_val else ""

    # companions/groups with articles (normalize BEFORE using them)
    if companions:
        companions = normalize_np_with_article(companions)
    if group:
        group = ensure_article_np(group)

    # companions: "teaching -> to GROUP", else "with COMPANIONS" if present
    COMPANIONS_OPT = ""
    if action == "teaching" and group:
        COMPANIONS_OPT = f" to {group}"
    elif companions and action != "teaching":
        COMPANIONS_OPT = f" with {companions}"

    CONSTRAINT_OPT = f" {constraint}" if constraint else ""

    # Fallback role text for T4 when no object was chosen.
    role_default = {"delivering": "mail", "reading": "newspaper", "sipping": "tea"}
    role = None
    if action in role_default:
        role = role_default[action]
    elif action in {"conducting","djing"}:
        role = ""  # no role/object string
    elif action == "practicing" and not obj:
        role = choose(slots.get("TOPIC", []), "karate")
    elif action == "painting" and not obj:
        role = choose(slots.get("OBJ_PROP", []), "self-portrait")

    OBJECT_OR_ROLE = (obj or role or "").strip()

    # Fill. Note that a TOPIC is drawn here whenever the action is not
    # "teaching", even for templates that do not use {TOPIC}.
    TOPIC_VAL = strip_article(obj if (action=="teaching" and obj) else choose(slots.get("TOPIC", []), "yoga"))
    filled = t.format(
        SUBJECT=subj,
        ACTION=action or "",
        OBJ_OPT=OBJ_OPT,
        ON_OR_USING_OPT=ON_OR_USING_OPT,
        LOC_OPT=LOC_OPT,
        MADE_OF_OPT=MADE_OF_OPT,
        SHAPED_OPT=SHAPED_OPT,
        COMPANIONS_OPT=COMPANIONS_OPT,
        CONSTRAINT_OPT=CONSTRAINT_OPT,
        OBJECT=obj or "a kite",
        VEH_VERB=veh_verb,
        VEHICLE=with_article(veh) or "a bumper car",
        OCCUPATION_VERB=action or "delivering",
        OBJECT_OR_ROLE=OBJECT_OR_ROLE,  # <-- no default "mail" here
        COMPETE_VERB=comp_verb,
        OPPONENT=opponent,
        WITH_EQUIP_OPT="",
        TOPIC=TOPIC_VAL,
        GROUP=group or "a class of turtles"
    )

    # Light polish
    filled = re.sub(r"\b(in|on)\s+(in|on)\b", r"\2", filled)
    filled = polish(filled)
    # collapse repeated venue preps
    filled = re.sub(
        r"\b(in|on)\s+(?:an?\s+)?(desert|dojo|library|nightclub|opera house)\s+\1\b",
        r" \1 \2", filled, flags=re.I
    )
    # NOTE(review): the trailing \1 refers to the preposition group, not the
    # venue; \2 may have been intended. Left unchanged pending confirmation.
    filled = re.sub(r"\b(in|on)\s+(?:an?\s+)?(desert|dojo|library|nightclub|opera house)\s+in\s+(?:an?\s+)?\1\b", r"in \2", filled, flags=re.I)
    filled = re.sub(r"\bmade of\s+(?:a|an|the)\s+", "made of ", filled, flags=re.I)
    filled = re.sub(r"\bshaped like\s+(?:a|an|the)\s+", "shaped like ", filled, flags=re.I)
    filled = re.sub(r"\s{2,}", " ", filled).strip()

    # Absurdity guard
    if " made of " not in filled.lower() and " shaped like " not in filled.lower() and not ensure_absurdity(filled):
        if made:
            filled += f" made of {made}"
        else:
            filled += " at dawn"

    # Novelty guard (resample only ONE feature per retry)
    tries = 0
    while too_close_to_seed(filled, seeds) and tries < 4:
        choice_key = random.choice(["loc","veh","mat","shape"])
        made2  = choose(slots.get("MADE_OF", []))
        shape2 = choose(slots.get("SHAPED_LIKE", []))
        loc2   = pick_one_location(slots)
        veh2   = choose(slots.get("VEHICLE_SURFACE", []))

        # wipe trailing heads and re-attach only one
        # NOTE(review): the " in " / " on " alternatives carry their own
        # spaces in addition to the surrounding \s+, so they only match when
        # multiple spaces are present; confirm whether location/vehicle tails
        # were meant to be wiped here as well.
        filled = re.sub(r"\s+(shaped like|made of| in | on )\s+[^,]+(?=$)", "", filled)

        # avoid stacking extra loc/veh tails if we already have one
        if choice_key in {"loc","veh"} and re.search(r"(?:\s(on|in)\s)", filled[-40:], re.I):
            tries += 1
            continue

        append = ""
        if choice_key == "loc" and loc2:
            append += loc_phrase(loc2)
        elif choice_key == "veh" and veh2:
            append += veh_phrase(veh2)
        elif choice_key == "mat" and made2 and " made of " not in filled.lower():
            append += f" made of {made2}"
        elif choice_key == "shape" and shape2 and " shaped like " not in filled.lower():
            append += f" shaped like {shape2}"
        filled = norm_spaces(f"{filled}{append}")
        tries += 1

    # Last-ditch novelty head swap (competition verbs)
    if too_close_to_seed(filled, seeds):
        comp_verb2 = choose([v for v in COMPETE_VERBS if v not in filled], "competing with")
        filled = re.sub(r"\b(racing|playing chess against|competing with)\b", comp_verb2, filled)

    # Final clean-up passes, just before returning.
    filled = polish(filled)
    filled = _polish_articles_extra(filled)
    return filled


def generate_prompts(records: List[Dict[str, Any]], n: int = 40, seed: int = 42) -> List[str]:
    """Generate at most ``n`` prompts from the parsed seed ``records``.

    The module-level ``random`` generator is reseeded with ``seed`` so a run
    is repeatable.  ``n * 3`` candidates are drawn from ``generate_one`` and
    passed through ``dedupe``; any candidate whose lower-cased text equals a
    lower-cased seed prompt (the ``"raw"`` field of a record) is discarded.
    Fewer than ``n`` prompts are returned when not enough candidates survive.
    """
    random.seed(seed)
    slot_tables = build_slot_tables(records)
    seed_texts = [rec["raw"] for rec in records]

    # Oversample by a factor of three; the surplus absorbs the candidates
    # that are dropped by deduplication and by the seed filter below.
    candidates = [generate_one(slot_tables, seed_texts) for _ in range(n * 3)]
    unique_candidates = dedupe(candidates)

    known_seeds = {text.lower() for text in seed_texts}
    fresh = [cand for cand in unique_candidates if cand.lower() not in known_seeds]
    return fresh[:n]

# ---- Demo run ----
# Generates 40 prompts with a fixed seed, saves them, prints them, and runs a
# few regression checks on the text.
# NOTE(review): `parsed` is not defined in this cell; presumably it is the list
# of parsed seed records produced by an earlier cell - confirm.
generated = generate_prompts(parsed, n=40, seed=42)

# Save to relative paths (Notebook-friendly)
out_json = "generated_prompts.json"
out_csv  = "generated_prompts.csv"
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(generated, f, ensure_ascii=False, indent=2)

# The CSV copy is best-effort: any failure (for example pandas not being
# importable) is printed and the run continues.
try:
    import pandas as pd
    pd.DataFrame({"prompt": generated}).to_csv(out_csv, index=False)
except Exception as e:
    print("CSV save skipped:", e)

print(f"\nGenerated {len(generated)} prompts.")
# Number of prompts to echo; this rebinds the notebook-global name `n`.
n=40
print(f"\nSample {n} of the generated prompts:")
for i, p in enumerate(generated[:n], 1):
    print(f"{i:2d}. {p}")

# --- Minimal sanity checks + micro regression tests ---
# Each assert targets one specific malformed phrase; the checks are substring
# or regex tests on the lower-cased prompt text.
assert all(p.startswith("Generate an SVG of ") for p in generated)
assert not any(" in a n " in p.lower() for p in generated)
assert not any(" with a penguins" in p.lower() for p in generated)
assert not any("playing chess chess" in p.lower() for p in generated)
for x in generated:
    # "steering"/"driving" must not take one of these surface nouns as its
    # object; the offending prompt is used as the assertion message.
    assert not re.search(r"\b(steering|driving)\s+(?:a|an)\s+(surfboard|skateboard|ice|rainbow|tightrope)\b", x.lower()), x
print("\nSanity checks passed.")
print()
# Softer check: these patterns are only counted and listed (first 10), they do
# not stop the run.
bad = [p for p in generated if re.search(r"\b(a a|grand piano grand piano|conducting mail|djing mail|teaching a (yoga|karate))\b", p.lower())]
print("Bad patterns found:", len(bad))
for b in bad[:10]: print(" -", b)
# ======================================================================================


Generated 40 prompts.

Sample 40 of the generated prompts:
 1. Generate an SVG of a dolphin teaching karate to a class of turtles in a Victorian parlor made of jellybeans
 2. Generate an SVG of a dolphin practicing yoga with a robot waiter made of teacups
 3. Generate an SVG of a crocodile teaching yoga to a class of turtles on a surfboard
 4. Generate an SVG of a giraffe conducting with a robot waiter
 5. Generate an SVG of a snail teaching karate to a class of turtles in a library
 6. Generate an SVG of a flamingo sipping a tea in a Victorian parlor with penguins shaped like skyscraper
 7. Generate an SVG of a snail casting a newspaper with a robot waiter made of mushrooms
 8. Generate an SVG of a crocodile building a sandcastle in a library
 9. Generate an SVG of a fox painting a newspaper in a dojo with a robot waiter made of spaghetti
10. Generate an SVG of a penguin casting a newspaper in a Victorian parlor with a robot waiter made of mushrooms
11. Generate an SVG of a giraffe b

In [108]:
# =========================
# De-novo lexicon (supplementary)
# =========================
# NOTE: This merges into your parsed slots. It does not replace them.

# Supplementary vocabulary, keyed by slot name.  Every value is a list of
# ready-to-insert strings; noun phrases carry their own article.  Entries
# within a list are unique.
LEXICON = {
    # Subjects (animals/creatures)
    "SUBJECT": [
        "a meerkat","a badger","a capybara","a red panda","an armadillo","a platypus","a koala","a ferret",
        "a wombat","a beaver","a hyena","a chameleon","a pangolin","a lynx","a mongoose","a manatee",
        "a moose","a toucan","a jackalope","a weasel","an ibex","a tapir","a lemur","a hare","a crow",
        "a magpie","a heron","a stork","a porcupine","a coyote","a bison","a yak","a kiwi","a dingo",
        "a salamander","a gecko","an axolotl","a narwhal","a seahorse","a starling","a crane","a puffin",
        "an otter","a vole","a marmot","a hedgehog","a civet","a dik-dik"
    ],

    # Objects: instruments/tools/props/foods/structures/games/topics
    "OBJ_INSTRUMENT": [
        "a saxophone","a trumpet","a banjo","a harp","a theremin","a tuba","a cello","a clarinet",
        "a marimba","a tambourine","a bagpipe","a mandolin","a keytar"
    ],
    "OBJ_PROP": [
        "a kite","a lantern","a typewriter","a telescope","a paintbrush","a newspaper","a blueprint",
        "a map","a scroll","a banner","a quill pen","a megaphone","a palette","an umbrella"
    ],
    "OBJ_FOOD": [
        "pizza","watermelons","cupcakes","marshmallows","bagels","honeycombs","doughnuts","macarons",
        "pumpkins","ice-cream cones","lollipops","gingerbread cookies","pretzels"
    ],
    "OBJ_STRUCTURE": [
        "a sandcastle","a lighthouse","a drawbridge","a windmill","a wooden roller coaster","a treehouse",
        "a castle gate","a marble fountain","a paper bridge","a clock tower","a spiral staircase"
    ],
    "OBJ_GAME": ["chess","checkers","go","backgammon"],
    "TOPIC": ["yoga","karate","ballet","origami","calligraphy","calculus","astronomy","juggling","fencing"],

    # Vehicles and supports/surfaces
    "VEHICLE": [
        "a unicycle","a skateboard","a surfboard","a bumper car","a convertible","a hot air balloon","a blimp",
        "a pirate ship","a subway car","a scooter","a tandem bicycle","a hovercraft","a gondola","a snowmobile",
        "a canoe","a zeppelin","a jetpack","a bathysphere","a submersible"
    ],
    "SURFACE": [
        "a tightrope","a rainbow","clouds","a lily pad","ice","the moon","a rope bridge","a sand dune",
        "a piano keyboard","a stack of books","a beam of light","a glacier","a giant leaf"
    ],

    # Locations / venues / environments
    "LOCATION": [
        "a library","an opera house","a dojo","a greenhouse","a museum","a crystal cave","a moonlit pier",
        "a Victorian parlor","a glass greenhouse","a moon base","a coral reef","a bamboo forest","a candy factory",
        "a floating market","a clockwork workshop","a volcano observatory","a space station",
        "a bioluminescent lagoon","a snowy village square","a lighthouse balcony","a canyon overlook",
        "a subterranean station","a rooftop garden","a foggy harbor","a mirrored ballroom","a neon arcade"
    ],

    # Materials / shapes
    "MADE_OF": [
        "spaghetti","teacups","jellybeans","newspapers","origami cranes","soap bubbles","gummy bears",
        "glass","crystals","gingerbread","clockwork gears","playing cards","stained glass","glitter","paperclips",
        "candy canes","rubber ducks","feathers","leaves","starlight"
    ],
    "SHAPED_LIKE": [
        "a dragon","a comet","a pineapple","a pretzel","a teapot","a zeppelin","a spiral","a snowflake",
        "a labyrinth","a lighthouse","a top hat","a Möbius strip","a musical note","a fractal tree","a keyhole"
    ],

    # Apparel / equipment
    "EQUIPMENT": [
        "goggles","a spacesuit","a wizard robe","a bowtie","a cape","roller skates","a helmet",
        "a backpack","mittens","a monocle","a diving suit","a chef hat"
    ],

    # Group companions / opponents / countables / colors / style tails
    "COMPANIONS": [
        "a class of turtles","a flock of parrots","a band of robots","a council of owls","a parade of crabs",
        "a troupe of raccoons","a choir of penguins","a brigade of beetles","a platoon of ants",
        "a league of squirrels","a school of goldfish","a herd of goats","a gaggle of geese"
    ],
    "OPPONENT": ["a robot waiter","a cheetah","a chess automaton","a mime","a magician","a giant puppet","a scarecrow"],
    "COUNTABLE_OBJECT": ["kites","plates","umbrellas","balloons","lanterns","marbles","flags","paper planes","paintbrushes"],
    "COLOR": ["red","blue","green","golden","violet","striped","polka-dotted","turquoise","scarlet","amber","cerulean"],
    "STYLE_TAIL": ["under starlight","during a snowstorm","at dawn","in a gentle drizzle","under lantern light","at sunset"],
}

In [109]:
# Extended supplementary vocabulary, keyed by slot name.  Every value is a
# list of ready-to-insert strings; noun phrases carry their own article.
# Each list starts with the base entries and continues with themed additions.
LEXICON = {
    # Subjects (animals/creatures)
    "SUBJECT": [
        "a meerkat","a badger","a capybara","a red panda","an armadillo","a platypus","a koala","a ferret",
        "a wombat","a beaver","a hyena","a chameleon","a pangolin","a lynx","a mongoose","a manatee",
        "a moose","a toucan","a jackalope","a weasel","an ibex","a tapir","a lemur","a hare","a crow",
        "a magpie","a heron","a stork","a porcupine","a coyote","a bison","a yak","a kiwi","a dingo",
        "a salamander","a gecko","an axolotl","a narwhal","a seahorse","a starling","a crane","a puffin",
        "an otter","a vole","a marmot","a hedgehog","a civet","a dik-dik",
        # New entries (pumpkin spice videodrome inspired)
        "a pumpkin-headed crow",
        "a scarecrow made of VHS tape",
        "a cinnamon moth",
        "a spectral raccoon",
        "a pumpkin spice fox",
        "a harvest beetle",
        "a static-laced owl",
        "a velvet bat",
        "a digital jack-o-lantern",
        "a glitching deer",
        "a spectral hare",
        "a pumpkin spice salamander",
        "a vine-wrapped boar",
        "a cinnamon toad",
        "a haunted turkey",
        "a VHS-eyed wolf",
        "a pumpkin rat",
        "a phantom squirrel",
        "a harvest raven",
        "a cinnamon-dusted hedgehog",
        "a pumpkin spice spider",
        "a scarecrow crow",
        "a VHS jackrabbit",
        "a vine-entangled stag",
        "a pumpkin golem",
        "a spectral opossum",
        "a static fox",
        "a cinnamon sparrow",
        "a jack-o’-lantern frog",
        "a pumpkin spice moth",
        "a glitchy heron",
        "a spectral beaver",
        "a harvest locust",
        "a pumpkin-eyed goat",
        "a cinnamon raccoon",
        "a VHS-headed magpie",
        "a pumpkin spice eel"
    ],
    # Objects: instruments/tools/props/foods/structures/games/topics
    "OBJ_INSTRUMENT": [
        "a saxophone","a trumpet","a banjo","a harp","a theremin","a tuba","a cello","a clarinet",
        "a marimba","a tambourine","a bagpipe","a mandolin","a keytar",
        # New additions
        "a washboard",
        "a kazoo",
        "a slide whistle",
        "a conch shell horn",
        "a didgeridoo",
        "a steel drum",
        "a jaw harp",
        "a toy piano",
        "a melodica",
        "a gong",
        "a vibraphone",
        "a cajón",
        "a djembe",
        "a hurdy-gurdy",
        "a glass harmonica",
        "an ocarina",
        "a slapstick",
        "a güiro",
        "a flexatone",
        "a musical saw",
        "a stompbox",
        "a street drum kit made of buckets",
        "a cigar-box guitar",
        "a bassoon",
        "an oboe",
        "a sousaphone",
        "a penny whistle",
        "a bass clarinet",
        "a talking drum",
        "a cuíca",
        "a bullroarer",
        "a kalimba",
        "a zither",
        "a harmonium",
        "a prepared piano",
        "a circuit-bent synthesizer",
        "a waterphone",
        "a shoebox shaker"
    ],
    "OBJ_PROP": [
        "a kite","a lantern","a typewriter","a telescope","a paintbrush","a newspaper","a blueprint",
        "a map","a scroll","a banner","a quill pen","a megaphone","a palette","an umbrella",
        # New additions
        "a shopping cart with a broken wheel",
        "a liquidation sign",
        "a boarded-up storefront",
        "a cracked escalator",
        "a flickering fluorescent light",
        "a pawned diamond ring",
        "a hollow mannequin",
        "a padlocked entrance gate",
        "a dusty cash register",
        "a foreclosed deed",
        "a pile of unsold appliances",
        "a coupon book with missing pages",
        "a corporate balance sheet",
        "a hedge maze of red tape",
        "a golden parachute",
        "a moth-eaten catalog",
        "a shuttered mall directory",
        "a revolving door",
        "a half-empty parking lot",
        "a clearance sticker",
        "a crumpled pink slip",
        "a ghost logo",
        "a corporate raider’s briefcase",
        "a shattered display case",
        "a hollow safe",
        "a neon ‘Open’ sign that won’t light",
        "a banker’s pen with no ink",
        "a bankruptcy filing folder",
        "a collection of IOUs",
        "a rusted delivery truck",
        "a demolished warehouse blueprint",
        "a broken shopping bag handle",
        "a loyalty card with expired points",
        "a trophy from a bygone merger",
        "a skeleton key to empty offices",
        "a maze of empty shelves",
        "a ledger of vanished pensions",
        "a gavel from a bankruptcy court"
    ],
    "OBJ_FOOD": [
        "pizza","watermelons","cupcakes","marshmallows","bagels","honeycombs","doughnuts","macarons",
        "pumpkins","ice-cream cones","lollipops","gingerbread cookies","pretzels",
        "bananas","cucumbers","zucchinis","eggplants","carrots","radishes","asparagus","green beans",
        "hot dogs","corn dogs","pickles","leeks","chili peppers","okra pods","parsnips","lotus roots",
        "squash blossoms","artichokes","mushrooms","figs","pomegranates","starfruit","cherries",
        "grapes","passionfruit","dragonfruit","plantains","turnips","beets","sweet potatoes",
        "cassava","sea cucumbers","ginger roots","taro roots","salsify","ramps","fiddlehead ferns",
        "cactus paddles","jackfruit"
    ],
    "OBJ_STRUCTURE": [
        "a sandcastle","a lighthouse","a drawbridge","a windmill","a wooden roller coaster","a treehouse",
        "a castle gate","a marble fountain","a paper bridge","a clock tower","a spiral staircase",
        # New objects (plate tectonics inspired)
        "a volcanic caldera",
        "a lava tube",
        "a basalt cliff",
        "a magma chamber",
        "a geyser cone",
        "a fumarole vent",
        "a rift valley",
        "a fault scarp",
        "a tilted strata wall",
        "a subduction trench",
        "a folded mountain ridge",
        "a stratovolcano",
        "a shield volcano",
        "a volcanic island arc",
        "a tectonic plateau",
        "a seamount",
        "a hydrothermal vent",
        "a pillow lava mound",
        "a hot spring terrace",
        "a metamorphic outcrop",
        "a granite dome",
        "a basalt column formation",
        "a transform fault bridge",
        "a drifted continental shelf",
        "an uplifted coral atoll",
        "a collapsed sinkhole",
        "a pumice field",
        "a tephra cone",
        "an igneous dike wall",
        "a horst ridge",
        "a graben basin",
        "a tilted fault block",
        "a tectonic escarpment",
        "a mountain pass carved by glaciers",
        "a subduction accretionary wedge",
        "a craton shield",
        "a mid-ocean ridge"
    ],
    # NOTE(review): the added game names are underscore-joined identifiers,
    # unlike every other entry in this lexicon; presumably they are turned
    # into plain words before reaching a prompt - confirm against the
    # generator.
    "OBJ_GAME": [
        "chess",
        "checkers",
        "go",
        "backgammon",
        "rainbow_dominoes",
        "flag_puzzle",
        "unity_cards",
        "identity_quiz",
        "parade_maze",
        "color_harmony",
        "allyship_bingo",
        "freedom_dice",
        "community_tower",
        "solidarity_tiles",
        "celebration_memory",
        "equality_ladders",
        "visibility_puzzle",
        "inclusion_rings",
        "love_letters",
        "pride_pathways",
        "acceptance_orchard",
        "heritage_match",
        "safe_space_game",
        "march_simulator",
        "queer_chess",
        "storytelling_circle",
        "rights_relay",
        "respect_dominoes",
        "joy_tangle",
        "support_networks",
        "color_guardians",
        "courage_climbers",
        "empathy_expedition",
        "truth_tokens",
        "liberation_labyrinth",
        "hope_hunt",
        "sparkle_stack",
        "dignity_duel",
        "celebration_chain",
        "spectrum_sprint",
        "identity_journey"
    ],
    "TOPIC": [
        "yoga","karate","ballet","origami","calligraphy","calculus","astronomy","juggling","fencing",
        "spherification","foams","emulsification","fermentation","dehydration","distillation","cryogenics",
        "caramelization","gelification","aeration","infusion","sous-vide","centrifugation","smoking",
        "encapsulation","maceration","distortion","flavor-pairing","deconstruction","powderization",
        "nitrogen-freezing","aromatization","curing","enzymology","lamination","syneresis","maillard-reaction",
        "texturization","alginates","isomalt-sculpting","vacuum-infusion","carbonation","transglutaminase",
        "edible-films","chromatography","flash-freezing","hydrocolloids","rotovapping","umami-mapping","zest-extraction"
    ],
    # Vehicles and supports/surfaces
    "VEHICLE": [
        "a unicycle","a skateboard","a surfboard","a bumper car","a convertible","a hot air balloon","a blimp",
        "a pirate ship","a subway car","a scooter","a tandem bicycle","a hovercraft","a gondola","a snowmobile",
        "a canoe","a zeppelin","a jetpack","a bathysphere","a submersible",
        # New additions
        "a parade float shaped like a balloon dog",
        "a tractor coated in chrome",
        "a limousine full of disco balls",
        "a tractor-trailer hauling giant tulips",
        "a dune buggy gilded in gold leaf",
        "a mechanical bull on roller skates",
        "a solar-powered combine harvester",
        "a yield-optimizing drone swarm",
        "a futuristic armored golf cart",
        "a segway with mirrored panels",
        "a rocket-powered shopping cart",
        "a crystal-encrusted tow truck",
        "a pedal-powered Ferris wheel car",
        "a blockchain mining dump truck",
        "a chariot pulled by holographic horses",
        "a chrome-plated forklift",
        "a hay baler with neon underglow",
        "a koi fish–shaped submarine",
        "a gilded armored personnel carrier",
        "a tractor disguised as a Trojan horse",
        "a mirrored carnival bumper boat",
        "a velvet-upholstered ambulance",
        "a crop-dusting drone shaped like a butterfly",
        "a robotic ox cart",
        "a levitating taxi cab",
        "a transparent glass bulldozer",
        "a carousel horse that actually gallops",
        "a hover-limo with disco lighting",
        "a platinum rickshaw",
        "a hay wagon pulled by robotic deer",
        "a neon-lit ice cream truck",
        "a ride-on Roomba convoy",
        "a double-decker tank",
        "a marbleized Vespa",
        "a tractor sprayer turned art installation",
        "a floating koi pond raft",
        "a zero-gravity pogo stick",
    ],
    "SURFACE": [
        "a tightrope","a rainbow","clouds","a lily pad","ice","the moon","a rope bridge","a sand dune",
        "a piano keyboard","a stack of books","a beam of light","a glacier","a giant leaf",
        # New additions
        "a field of giant sunflowers",
        "a floor of glass marbles",
        "a polished chrome runway",
        "a patchwork quilt stretched across the sky",
        "a kaleidoscope lens plane",
        "a surface of dollar bills",
        "a layer of bubble wrap",
        "a mound of golden hay bales",
        "a surface of holographic panels",
        "a network of mycelium threads",
        "a bed of neon moss",
        "a rotating vinyl record",
        "a shimmering pool of milk",
        "a checkerboard made of LEDs",
        "a field of QR codes",
        "a surface of candy-coated peanuts",
        "a holographic diploma scroll",
        "a chessboard of wheat stalks",
        "a stage made of tractor tires",
        "a bed of holographic confetti",
        "a glimmering pile of tokens",
        "a spinning disco floor",
        "a mound of popcorn kernels",
        "a mirror-polished yield chart",
        "a solar panel array",
        "a surface of ice cream sundaes",
        "a scaffolding of bamboo shoots",
        "a runway of circuit boards",
        "a bridge of glowing fiber optics",
        "a river of liquid chrome",
        "a meadow of inflatable flowers",
        "a spiral staircase of textbooks",
        "a pyramid of hay bales",
        "a golden wheat field under neon sky",
        "a rainbow-colored conveyor belt",
        "a stack of piggy banks",
        "a giant balance sheet scroll",
        "a mirrored cornfield labyrinth",
    ],
    # Locations / venues / environments
    "LOCATION": [
        "a library","an opera house","a dojo","a greenhouse","a museum","a crystal cave","a moonlit pier",
        "a Victorian parlor","a glass greenhouse","a moon base","a coral reef","a bamboo forest","a candy factory",
        "a floating market","a clockwork workshop","a volcano observatory","a space station",
        "a bioluminescent lagoon","a snowy village square","a lighthouse balcony","a canyon overlook",
        "a subterranean station","a rooftop garden","a foggy harbor","a mirrored ballroom","a neon arcade",
        # New dinosaur-friendly Shoney’s style locations
        "a prehistoric buffet hall",
        "a fossil-flecked diner booth",
        "a swamp-side Shoney’s patio",
        "a Jurassic pancake house",
        "a lava-lamp lit salad bar",
        "a neon-brontosaurus lounge",
        "a Triassic truck stop café",
        "a pterodactyl-friendly drive-in",
        "a fern-filled breakfast nook",
        "a Cretaceous pie counter",
        "a stegosaurus-sized soda fountain",
        "a dino-bone BBQ pit",
        "a dino-egg omelet station",
        "a tar-pit themed ice cream bar",
        "a caveman karaoke corner",
        "a prehistoric root beer stand",
        "a meteor-motif pizza parlor",
        "a Jurassic jukebox diner",
        "a dinosaur-themed salad buffet",
        "a Bronto-burger grill shack",
        "a volcanic milkshake bar",
        "a fossil-framed pie case",
        "a Shoney’s gift shop with dino plushies",
        "a time-traveling breakfast bar",
        "a raptor-run fried chicken counter",
        "a cave-painted pancake griddle",
        "a prehistoric drive-thru window",
        "a brontosaurus banquet hall",
        "a mossy Shoney’s atrium",
        "a dinosaur topiary garden café",
        "a meteor crater picnic zone",
        "a fossil-etched dessert bar",
        "a swampy iced tea fountain",
        "a Jurassic jukebox booth",
        "a T-Rex taco stand",
        "a Dino Disco dance floor",
        "a time-warped Shoney’s buffet line"
    ],
    # Materials / shapes
    "MADE_OF": [
        "spaghetti","teacups","jellybeans","newspapers","origami cranes","soap bubbles","gummy bears",
        "glass","crystals","gingerbread","clockwork gears","playing cards","stained glass","glitter","paperclips",
        "candy canes","rubber ducks","feathers","leaves","starlight",
        # --- new additions ---
        "sesame seeds","caraway seeds","pumpernickel crumbs","pickles","mustard packets","pastrami slices",
        "matzo shards","bagel chips","deli tickets","menorah wax","Torah scrolls","kiddush cups","prayer shawls",
        "shofar horns","latke shreds","dreidels","hamentashen","seltzer bubbles","rye flour dust","onion skins",
        "garlic cloves","horseradish roots","brass samovars","smoked salmon","knishes","egg yolks","braided challah",
        "gilded candlesticks","soup noodles","chopped liver","rugelach spirals","mustard seeds","black bread crusts",
        "crumbled kugel","old recipe cards","pickling jars","lox roses","sour cream dollops"
    ],
    "SHAPED_LIKE": [
        "a dragon","a comet","a pineapple","a pretzel","a teapot","a zeppelin","a spiral","a snowflake",
        "a labyrinth","a lighthouse","a top hat","a Möbius strip","a musical note","a fractal tree","a keyhole",
        # --- new additions ---
        "a braided challah","a dreidel","a Star of David","a menorah","a matzo cracker","a deli pickle spear",
        "a pastrami tower","a rye loaf","a sandwich stack","a kugel spiral","a soup ladle","a knish pocket",
        "a hamentashen triangle","a deli ticket stub","a bagel hoop","a challah knot","a shofar horn curl",
        "a prayer shawl fold","a kiddush cup","a deli counter scale","a mustard swirl","a rugelach crescent",
        "a gefilte fish oval","a seltzer bubble","an onion ring","a brass candlestick","a kugel square",
        "a pickling barrel","a recipe book","a deli awning","a chopping board","a synagogue arch","a breadbasket",
        "a deli sandwich pyramid","a rye seed swirl","a challah crown","a deli menu board","a kugel lattice"
    ],
    # Apparel / equipment
    "EQUIPMENT": [
        "goggles","a spacesuit","a wizard robe","a bowtie","a cape","roller skates","a helmet",
        "a backpack","mittens","a monocle","a diving suit","a chef hat",
        # New additions
        "a lawn dart set",
        "a gasoline leaf blower",
        "an oversized foam cowboy hat",
        "a ferret harness",
        "a jetpack",
        "a sharkskin wetsuit",
        "stilts",
        "a laser pointer stronger than 5mW",
        "a hoverboard with flames",
        "a chainsaw purse",
        "a nunchuck belt",
        "a flame-thrower guitar",
        "a plastic grocery bag cloak",
        "a vending machine full of sodas",
        "a brass knuckle keychain",
        "an unvented indoor fireplace",
        "a gasoline-powered pogo stick",
        "a crossbow umbrella",
        "a rooster saddle",
        "a bear-fur coat",
        "a medieval flail",
        "a pirate hook hand",
        "a pair of spiked shoulder pads",
        "a shark-tooth necklace the size of your head",
        "a two-stroke dirt bike",
        "a neon underglow kit",
        "a hover-shoes prototype",
        "a gas-powered blender",
        "a glow-in-the-dark shuriken",
        "a flamethrower backpack",
        "a whale-bone corset",
        "a giant inflatable pool toy shaped like a swan",
        "a retractable ladder cane"
    ],
    # Group companions / opponents / countables / colors / style tails
    "COMPANIONS": [
        "a class of turtles","a flock of parrots","a band of robots","a council of owls","a parade of crabs",
        "a troupe of raccoons","a choir of penguins","a brigade of beetles","a platoon of ants",
        "a league of squirrels","a school of goldfish","a herd of goats","a gaggle of geese",
        # New additions
        "a duo of llamas","a squad of frogs","a cluster of hedgehogs","a posse of pigeons",
        "a guild of hamsters","a convoy of ducks","a fellowship of moths","a regiment of lizards",
        "a circus of ferrets","a symposium of cats","a union of flamingos","a gallery of chameleons",
        "a caravan of donkeys","a committee of porcupines","a seminar of pigeons","a conclave of bats",
        "a procession of snails","a jamboree of monkeys","a symposium of beetles","a gathering of turkeys",
        "a drumline of seals","a flash mob of rabbits","a cluster of jellyfish","a coalition of toads",
        "a choir of cicadas","a platoon of meerkats","a panel of dogs","a dynasty of pigeons",
        "a parliament of ravens","a cabal of hedgehogs","a syndicate of moths","a conclave of turtles",
        "a regiment of crabs","a band of hyenas","a rehearsal of parrots","a festival of otters",
        "a phalanx of bees"
    ],
    "OPPONENT": [
        "a robot waiter","a cheetah","a chess automaton","a mime","a magician","a giant puppet","a scarecrow",
        # New additions
        "a rollerblading clown","a fortune teller","a yodeler","a trivia champion","a sumo wrestler",
        "a beekeeper","a karaoke host","a sword swallower","a ventriloquist","a plate spinner",
        "a marching tuba player","a swordfighter reenactor","a falconer","a tightrope walker",
        "a mime trapped in a box","a professional hopscotch player","a hotdog eating champion",
        "a hypnotist","a balloon animal artist","a fire breather","a unicyclist","a dunk tank clown",
        "a quiz show buzzer","a carnival barker","a breakdancer","a salsa instructor","a jousting knight",
        "a puppeteer","a tarot reader","a pie-thrower","a magician’s rabbit","a frisbee trickster",
        "a mime orchestra","a potato sack racer","a paddleboat competitor","a karaoke rival",
        "a pancake flipper"
    ],
    "COUNTABLE_OBJECT": [
        "kites","plates","umbrellas","balloons","lanterns","marbles","flags","paper planes","paintbrushes",
        # New additions
        "fortune cookies","yo-yos","sandwiches","sketchpads","souvenir mugs","postcards",
        "candles","straw hats","rubber ducks","bouquets","goggles","ping pong paddles",
        "ice cream cones","board game pieces","menus","souvenir keychains","water pistols",
        "trivia cards","roller skates","fortune wheels","scented candles","magnifying glasses",
        "disposable cameras","snow globes","sparklers","beach balls","compasses","puzzles",
        "fortune scrolls","juggling pins","playing cards","tea cups","souvenir buttons",
        "temporary tattoos","karaoke song slips","plastic swords","cupcakes","carnival tickets"
    ],
    "COLOR": [
        "red","blue","green","golden","violet","striped","polka-dotted","turquoise","scarlet","amber","cerulean",
        # New additions
        "chartreuse","fuchsia","lavender","plaid","checkered","maroon","teal","peach","lime",
        "crimson","bronze","silver","obsidian","pastel pink","indigo","periwinkle","khaki","mossy",
        "neon yellow","mahogany","pearl","mustard","charcoal","ivory","bubblegum","taupe",
        "gingham","ombre","glittery","rainbow","sepia","denim","frosted white","sunset orange",
        "speckled","mint"
    ],
    "STYLE_TAIL": [
        "under starlight","during a snowstorm","at dawn","in a gentle drizzle","under lantern light","at sunset",
        # New additions
        "on a rickety rollercoaster","inside a haunted house","at a petting zoo","while lost in a hedge maze",
        "on a swan boat","at a karaoke bar","under a disco ball","during a picnic gone wrong",
        "inside a planetarium","at a midnight pancake breakfast","at the county fair","during a scavenger hunt",
        "inside an escape room","on a tandem bicycle","while waiting for a bus","on bumper cars",
        "at a silent disco","while flying kites in heavy wind","at a miniature golf course",
        "during a pottery class","on a paddleboat","at a candle-making workshop","while lost on a hike",
        "in a laser tag arena","on a Ferris wheel","at a trivia night","while juggling oranges",
        "in a hot air balloon","at a bowling alley","during a flash mob","in a photo booth",
        "on a hayride","at a Renaissance fair","while salsa dancing","on a fishing pier",
        "at a cat café","in a science museum"
    ]
}

In [115]:
# =========================
# Merge into existing slots (supplement-only, no duplication)
# =========================

def _norm_article_list(xs):
    """Return the distinct items of ``xs`` in sorted order.

    String items are first passed through ``normalize_np_with_article``;
    items of any other type are kept unchanged.
    """
    distinct = set()
    for item in xs:
        if isinstance(item, str):
            item = normalize_np_with_article(item)
        distinct.add(item)
    return sorted(distinct)

def augment_slots_with_lexicon(slots: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """Return a deep copy of ``slots`` extended with the entries of ``LEXICON``.

    ``slots`` itself is never modified.  For every lexicon key handled here
    that has a non-empty list, the matching slot in the copy is replaced by
    the sorted union of its current entries and the lexicon entries.  Slots
    whose lexicon list is missing or empty are left exactly as they were.

    For the noun-phrase keys both sides go through ``_norm_article_list``
    before the union, so existing entries of those slots end up in their
    normalized form.  ``VEHICLE`` and ``SURFACE`` both feed the single
    ``VEHICLE_SURFACE`` slot.
    """
    import copy

    augmented = copy.deepcopy(slots)

    # Lexicon keys that land in the slot of the same name ...
    same_name_keys = (
        "SUBJECT", "OBJ_INSTRUMENT", "OBJ_STRUCTURE", "OBJ_FOOD", "OBJ_PROP",
        "OBJ_GAME", "TOPIC", "LOCATION", "MADE_OF", "SHAPED_LIKE", "EQUIPMENT",
        "COMPANIONS", "OPPONENT", "COUNTABLE_OBJECT", "COLOR", "STYLE_TAIL",
    )
    routing = {key: key for key in same_name_keys}
    # ... and the two that share the combined vehicle/surface pool.
    routing["VEHICLE"] = "VEHICLE_SURFACE"
    routing["SURFACE"] = "VEHICLE_SURFACE"

    # Keys whose entries are noun phrases and need article normalization.
    noun_phrase_keys = {
        "SUBJECT", "OBJ_PROP", "OBJ_STRUCTURE", "OBJ_INSTRUMENT",
        "OBJ_GAME", "LOCATION", "COMPANIONS", "OPPONENT",
    }

    for lex_key, slot_key in routing.items():
        additions = LEXICON.get(lex_key, [])
        if not additions:
            continue

        current = set(augmented.get(slot_key, []))
        if lex_key in noun_phrase_keys:
            # Normalize both sides so the same phrase written two ways
            # collapses to one entry.
            pool = set(_norm_article_list(current))
            pool.update(_norm_article_list(additions))
        else:
            pool = current.union(additions)

        augmented[slot_key] = sorted(pool)

    return augmented

# =========================
# Optional environment sanity (reuse if you want)
# =========================

# Whole-word, case-insensitive keyword patterns for the two vehicle families
# checked by env_incompatible().
AIR_VEHICLES = re.compile(r"\b(balloon|blimp|zeppelin|parachute|jetpack)\b", re.I)
SEA_VEHICLES = re.compile(r"\b(submarine|submersible|bathysphere|gondola)\b", re.I)


def env_incompatible(veh: Optional[str], loc: Optional[str]) -> bool:
    """Tell whether a vehicle phrase clashes with a location phrase.

    Two combinations count as a clash:

    * the location mentions "underwater", "lagoon" or "reef" and the vehicle
      matches ``AIR_VEHICLES``;
    * the location mentions "space station" and the vehicle matches
      ``SEA_VEHICLES``.

    Both arguments are compared in lower case; ``None`` is treated as an
    empty string, so a missing vehicle or location never clashes.
    """
    vehicle = (veh or "").lower()
    place = (loc or "").lower()

    watery = any(word in place for word in ("underwater", "lagoon", "reef"))
    in_orbit = "space station" in place

    air_clash = watery and AIR_VEHICLES.search(vehicle) is not None
    sea_clash = in_orbit and SEA_VEHICLES.search(vehicle) is not None
    return air_clash or sea_clash

# =========================
# Optional wrapper to use the augmented pools
# =========================

def generate_prompts_with_lexicon(records: List[Dict[str, Any]], n: int = 40, seed: int = 42):
    """Generate up to ``n`` new prompts from lexicon-augmented slot pools.

    The slot tables built from ``records`` are supplemented via
    ``augment_slots_with_lexicon``; ``generate_one`` is then called ``3 * n``
    times, the results are passed through ``dedupe``, and any prompt that
    equals one of the seed prompts (case-insensitively) is dropped.

    Args:
        records: Parsed prompt records; each must provide a ``"raw"`` entry
            holding the original prompt text.
        n: Maximum number of prompts to return.
        seed: Value passed to ``random.seed``. This reseeds the module-level
            RNG, so it also affects later ``random`` calls in the process.

    Returns:
        A list of at most ``n`` prompts. Fewer are returned when the
        oversampled batch does not contain ``n`` distinct non-seed prompts.

    Note:
        No vehicle/location sanity filtering is applied here; the prompts are
        exactly what ``generate_one`` emits from the augmented pools.
    """
    import random
    random.seed(seed)
    base_slots = build_slot_tables(records)          # from your parser
    slots = augment_slots_with_lexicon(base_slots)   # supplement with de-novo items
    seeds = [r["raw"] for r in records]

    # Oversample 3x so that enough candidates survive deduplication and the
    # seed filter below.
    out = [generate_one(slots, seeds) for _ in range(n * 3)]

    out = dedupe(out)
    seed_set = {s.lower() for s in seeds}
    out = [p for p in out if p.lower() not in seed_set]
    return out[:n]

# ---- Example use: generate 40 prompts and print them as a numbered list
# (index right-aligned to two columns).
demo = generate_prompts_with_lexicon(parsed, n=40, seed=673232)
for i, p in enumerate(demo, start=1):
    print(f"{i:2d}. {p}")

 1. Generate an SVG of a lynx practicing a curing in a raptor-run fried chicken counter with a council of owls made of stained glass
 2. Generate an SVG of a pumpkin golem building a folded mountain ridge shaped like top hat made of teacups
 3. Generate an SVG of a pumpkin spice salamander casting spells from a book a hollow safe in a giant balance sheet scroll in a bioluminescent lagoon with a dynasty of pigeons in a carousel horse that actually gallops made of pickling jars
 4. Generate an SVG of a duck competing with a mime trapped in a box in a fossil-flecked diner booth made of Torah scrolls
 5. Generate an SVG of a beaver playing chess in a glacier in a canyon overlook with penguins made of sesame seeds
 6. Generate an SVG of a seahorse building a tectonic escarpment shaped like synagogue arch made of rugelach spirals in a Dino Disco dance floor
 7. Generate an SVG of a narwhal building a volcanic caldera shaped like dreidel made of braided challah in a fossil-framed pie case
 8.

In [111]:
# Smoke-test prompts run through the parser below.
tests = [
    "Generate an SVG of a snail racing a cheetah in roller skates",
    "Generate an SVG of a raccoon wearing goggles and operating a steam engine",
    "Generate an SVG of a cat in a wizard robe casting spells from a book",
    "Generate an SVG of a penguin skateboarding down a rainbow",
    "Generate an SVG of a walrus conducting an orchestra of penguins",
]

# (printed label, record key) pairs, in display order.
_REPORT_FIELDS = (
    ("\n  action:", "action"),
    ("\n  object:", "object"),
    ("\n  veh/surface:", "vehicle_or_surface"),
    ("\n  companions:", "companions"),
    ("\n  tags:", "tags"),
)

for t in tests:
    r = parse_prompt(t)
    # Collect every argument first so a missing key fails before anything is printed.
    pieces = [t]
    for label, key in _REPORT_FIELDS:
        pieces.extend((label, r[key]))
    print(*pieces, "\n")

Generate an SVG of a snail racing a cheetah in roller skates 
  action: racing a cheetah in roller skates 
  object: None 
  veh/surface: roller skates 
  companions: None 
  tags: ['competition/duel', 'equipment/apparel', 'vehicle/locomotion'] 

Generate an SVG of a raccoon wearing goggles and operating a steam engine 
  action: operating a steam engine 
  object: steam engine 
  veh/surface: None 
  companions: None 
  tags: ['equipment/apparel', 'vehicle/locomotion'] 

Generate an SVG of a cat in a wizard robe casting spells from a book 
  action: casting spells from a book 
  object: None 
  veh/surface: None 
  companions: None 
  tags: ['equipment/apparel', 'performance/arts'] 

Generate an SVG of a penguin skateboarding down a rainbow 
  action: skateboarding down a rainbow 
  object: None 
  veh/surface: rainbow 
  companions: None 
  tags: ['balance/support', 'setting/location', 'vehicle/locomotion'] 

Generate an SVG of a walrus conducting an orchestra of penguins 
  action: 

In [112]:
def quality_checks(r: Dict[str,Any]) -> List[str]:
    """Return human-readable descriptions of parse-quality problems in ``r``.

    Args:
        r: A parsed prompt record. The keys read here are ``"object"``,
            ``"location_in"``, ``"vehicle_or_surface"`` and
            ``"vehicle_or_surface_origin"``; any of them may be missing or
            falsy, in which case the corresponding check is skipped.

    Returns:
        A list of issue messages; empty when no check fires.
    """
    issues = []
    # (keep your existing checks) ...

    # articleless topics shouldn't be prefixed
    # Strip the leading article first and look up the remainder: testing the
    # full string for membership can never match an object such as "a karate",
    # so the check would stay silent in exactly the case it targets.
    # Assumes ARTICLELESS_TOPICS holds lowercase phrases -- TODO confirm.
    obj = r.get("object")
    if obj:
        prefixed = re.match(r"^(?:a|an|the)\s+(.+)$", obj.strip(), flags=re.I)
        if prefixed and prefixed.group(1).strip().lower() in ARTICLELESS_TOPICS:
            issues.append("Article before articleless topic")

    # vehicles shouldn't sneak into LOCATION via 'in a parachute' etc.
    if r.get("location_in") and VEHICLE_RE.search(r["location_in"]):
        issues.append("Vehicle parsed as location_in")

    # jetpack should read 'with/using', not 'in'
    if r.get("vehicle_or_surface") and re.search(r"\bjetpack\b", r["vehicle_or_surface"], re.I):
        if r.get("vehicle_or_surface_origin") in {"in","in_action"}:
            issues.append("Jetpack should use 'with/using', not 'in'")
    return issues


In [113]:
# After you've built `parsed` and slot tables:
slots = augment_slots_with_lexicon(build_slot_tables(parsed))

# Pool sizes for each slot; a missing slot counts as 0.
S  = len(slots.get("SUBJECT", []))
V  = len(slots.get("VEHICLE_SURFACE", []))
L  = len(slots.get("LOCATION", []))
C  = len(slots.get("COMPANIONS", []))
M  = len(slots.get("MADE_OF", []))
H  = len(slots.get("SHAPED_LIKE", []))
STR= len(slots.get("OBJ_STRUCTURE", []))
TOP= len(slots.get("TOPIC", []))
OPP= len(slots.get("OPPONENT", []))

# Performance actions only (matches your PERFORMANCE_HEADS filter):
PERF_HEADS = {"playing","painting","conducting","djing","practicing","reading","casting","building"}
# Count actions whose first word is a performance head. Blank or
# whitespace-only actions are skipped: "".split() is [], so indexing [0]
# on it would raise IndexError.
A_perf = sum(
    1 for a in slots.get("ACTION", [])
    if a.split() and a.split()[0].lower() in PERF_HEADS
)

# Object “mass” for T4 (delivering/reading/sipping/painting/practicing/conducting/djing/casting)
mass_T4 = (
    len(slots.get("OBJ_FOOD", [])) +  # delivering
    1 +  # reading -> newspaper
    1 +  # sipping -> tea
    len(slots.get("OBJ_PROP", [])) +  # painting
    len(slots.get("TOPIC", [])) +     # practicing
    0 + 0 +                            # conducting/djing no object
    len(slots.get("OBJ_PROP", []))    # casting
)

# Products of pool sizes, T1..T6 -- presumably one per prompt template in
# generate_one; confirm against it. max(x, 1) stops an empty pool from
# zeroing a product; T1, T3 and T4 do not apply that guard.
T1 = S * A_perf * V * L * C * M
T2 = S * max(STR,1) * max(H,1) * max(M,1) * L
T3 = S * V * L * C * M
T4 = S * mass_T4 * L * C
T5 = S * 3 * max(OPP,1) * L
T6 = S * max(TOP,1) * max(len(slots.get("GROUP", [])), 1) * L

print({ "T1":T1, "T2":T2, "T3":T3, "T4":T4, "T5":T5, "T6":T6, "sum": T1+T2+T3+T4+T5+T6 })


{'T1': 31431079680, 'T2': 1244443200, 'T3': 2857370880, 'T4': 82839744, 'T5': 1015872, 'T6': 377104, 'sum': 35617126480}


In [114]:
import hashlib, random
from tqdm import trange  # progress bar

def sample_hashes(n, seed=0):
    """Draw ``n`` generated prompts and return their 16-byte BLAKE2b digests.

    Args:
        n: Number of prompts to generate (duplicates are kept).
        seed: Value passed to ``random.seed``; this reseeds the module-level
            RNG before sampling.

    Returns:
        A list of ``n`` ``bytes`` digests, one per generated prompt, in
        generation order.
    """
    random.seed(seed)
    # Build the augmented slot pools and the seed list once, as
    # generate_prompts_with_lexicon does, instead of on every iteration.
    # Assumes generate_one does not mutate its arguments and that building
    # the tables draws nothing from `random` -- TODO confirm.
    slot_pools = augment_slots_with_lexicon(build_slot_tables(parsed))
    seed_prompts = [r["raw"] for r in parsed]

    hs = []
    for _ in trange(n, desc=f"Sampling {n} prompts", leave=False):
        s = generate_one(slot_pools, seed_prompts)
        hs.append(
            hashlib.blake2b(s.encode("utf-8"), digest_size=16).digest()
        )
    return hs

# two independent samples
n1 = 100_000
n2 = 100_000

print("Building sample A…")
A = set(sample_hashes(n1, seed=123))

print("Building sample B…")
B = set(sample_hashes(n2, seed=456))

# True number of prompts seen in both samples. Reported as observed, so a
# run with no overlap prints 0 rather than a fabricated 1.
m = len(A & B)

# Lincoln–Petersen estimate: N ≈ |A|·|B| / m. With m == 0 the estimate is
# undefined; clamping the denominator to 1 avoids division by zero and
# yields only a lower bound, which the printout flags below.
N_hat = (len(A) * len(B)) / max(m, 1)
coverage = len(A)/n1, len(B)/n2   # fraction of each sample that was unique

print(f"\nUnique in sample A: {len(A)} / {n1}")
print(f"Unique in sample B: {len(B)} / {n2}")
print(f"Overlap m: {m}")
if m:
    print(f"Estimated support size N≈ {N_hat:,.0f}")
else:
    print(f"Estimated support size N≥ {N_hat:,.0f} (no overlap observed; lower bound only)")
print(f"Per-sample uniqueness ratios: {coverage}")


Building sample A…




Building sample B…


                                                                                


Unique in sample A: 99723 / 100000
Unique in sample B: 99709 / 100000
Overlap m: 500
Estimated support size N≈ 19,886,561
Per-sample uniqueness ratios: (0.99723, 0.99709)


