In [None]:
import re
from humor_classifier.openAi_judger import select_best_joke

def load_records_from_jokes_txt(path: str) -> list[dict]:
    """
    Read a jokes txt file and return one dict per ``ID:`` block.

    Expected layout of each block:
    ID: 1
    INPUT: ...
    PATTERN: ...
    JOKES:
    1. ...
    2. ...
    =======

    Each returned dict has the shape
    {"id": int, "input": str, "pattern": str, "jokes": [str, ...]}.

    Blank lines are skipped, anything before the first ID line is ignored,
    and an unnumbered line inside the JOKES section is glued onto the
    previous joke (separated by a single space).
    """
    rx_id = re.compile(r"^ID:\s*(\d+)\s*$")
    # Single-line fields, tried in this order; the key is where the value is stored.
    rx_fields = (
        ("input", re.compile(r"^INPUT:\s*(.*)\s*$")),
        ("pattern", re.compile(r"^PATTERN:\s*(.*)\s*$")),
    )
    rx_jokes_header = re.compile(r"^JOKES:\s*$")
    rx_numbered = re.compile(r"^\s*(\d+)\.\s*(.*)\s*$")
    rx_separator = re.compile(r"^=+\s*$")

    parsed = []

    def commit(block):
        # Store a finished block after discarding blank joke entries.
        if block and block.get("id") is not None:
            block["jokes"] = [
                entry.strip() for entry in block.get("jokes", []) if entry.strip()
            ]
            parsed.append(block)

    block = None        # record currently being filled, None outside a block
    collecting = False  # True once the JOKES: header of the current block was seen

    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.rstrip("\n").strip()
            if not text:
                continue

            at_separator = bool(rx_separator.match(text))
            id_hit = None if at_separator else rx_id.match(text)
            if at_separator or id_hit:
                # Both markers close the block in progress; an ID line also opens a new one.
                commit(block)
                block = None
                collecting = False
                if id_hit:
                    block = {
                        "id": int(id_hit.group(1)),
                        "input": "",
                        "pattern": "",
                        "jokes": [],
                    }
                continue

            if block is None:
                # Nothing to attach this line to yet.
                continue

            for key, rx in rx_fields:
                hit = rx.match(text)
                if hit:
                    block[key] = hit.group(1).strip()
                    break
            else:
                if rx_jokes_header.match(text):
                    collecting = True
                elif collecting:
                    numbered = rx_numbered.match(text)
                    if numbered:
                        block["jokes"].append(numbered.group(2).strip())
                    elif block["jokes"]:
                        # Continuation of a joke that wrapped onto a new line.
                        block["jokes"][-1] += " " + text

    # The last block may not be followed by a separator.
    commit(block)
    return parsed


In [8]:
def pick_best_joke_from_file(txt_path: str, model: str = "gpt-5"):
    """
    Pool every joke found in a jokes txt file and return the single best one.

    Args:
        txt_path: Path to a file in the format parsed by
            ``load_records_from_jokes_txt``.
        model: Model name forwarded to ``select_best_joke``.

    Returns:
        dict with the winning joke and its score (``best_joke``,
        ``best_score``), the record it came from (``best_id``,
        ``best_input``, ``best_pattern``), the totals ``num_records`` and
        ``num_jokes``, and ``all_scores`` exactly as returned by
        ``select_best_joke``.

    Raises:
        ValueError: if the file contains no jokes, or if the joke reported
            as best is not one of the jokes that were submitted.

    Note:
        When the same joke text occurs in several records, the metadata of
        its first occurrence (file order) is reported.
    """
    records = load_records_from_jokes_txt(txt_path)

    # Flatten to (joke, origin) pairs so a joke and its metadata share one index.
    pairs = [
        (joke, {"id": record["id"], "input": record["input"], "pattern": record["pattern"]})
        for record in records
        for joke in record["jokes"]
    ]
    if not pairs:
        # Fail before calling the judge instead of sending it an empty list.
        raise ValueError("No jokes found in the txt file.")

    all_jokes = [joke for joke, _ in pairs]

    best_joke, best_score, scored = select_best_joke(all_jokes, model=model)

    # Look up the metadata of the winning joke.
    try:
        win_idx = all_jokes.index(best_joke)
    except ValueError:
        raise ValueError(
            "select_best_joke returned a joke that is not in the input list."
        ) from None
    win_meta = pairs[win_idx][1]

    return {
        "best_joke": best_joke,
        "best_score": best_score,
        "best_id": win_meta["id"],
        "best_input": win_meta["input"],
        "best_pattern": win_meta["pattern"],
        "num_records": len(records),
        "num_jokes": len(all_jokes),
        "all_scores": scored,  # list[(score, joke)]
    }


In [9]:
def pick_best_joke_per_id(txt_path: str, model: str = "gpt-5"):
    """
    Choose the best joke separately for every record of a jokes txt file.

    Records are read with ``load_records_from_jokes_txt``; each record's
    jokes are passed to ``select_best_joke`` on their own. Records without
    any jokes are left out of the result.

    Returns a list (file order) of dicts with the keys ``id``, ``input``,
    ``pattern``, ``best_joke``, ``best_score`` and ``all_scores``.
    """
    winners = []

    for record in load_records_from_jokes_txt(txt_path):
        candidates = record["jokes"]
        if candidates:
            top_joke, top_score, score_list = select_best_joke(candidates, model=model)
            winners.append(
                dict(
                    id=record["id"],
                    input=record["input"],
                    pattern=record["pattern"],
                    best_joke=top_joke,
                    best_score=top_score,
                    all_scores=score_list,
                )
            )

    return winners


In [10]:
if __name__ == "__main__":
    import os

    # Default dataset location. Set the JOKES_TXT_PATH environment variable to
    # point at the file on another machine instead of editing this cell.
    path = os.environ.get(
        "JOKES_TXT_PATH",
        r"C:\Users\Anwender\humor-project\src\generation\jokes_dataset_pip.txt",
    )

    # Score every joke in the file together and report the overall winner.
    res = pick_best_joke_from_file(path, model="gpt-5")
    print(f"\nTotal records: {res['num_records']}, total jokes: {res['num_jokes']}")
    print("\n--- WINNER OVERALL ---")
    print(f"{res['best_score']:.2f}/5  |  ID={res['best_id']}  |  {res['best_pattern']}")
    print(res["best_joke"])


NameError: name 'select_best_joke' is not defined

In [None]:
def pick_best_across_n_jokes(txt_path: str, n: int = 30, model: str = "gpt-5"):
    """
    Score the first ``n`` jokes of a jokes txt file (file order) and rank them.

    Args:
        txt_path: Path to a file in the format parsed by
            ``load_records_from_jokes_txt``.
        n: Maximum number of jokes to submit; must be at least 1.
        model: Model name forwarded to ``select_best_joke``.

    Returns:
        dict with ``n_requested``, ``n_used``, ``best_joke``, ``best_score``,
        ``best_meta`` (``{"id", "input", "pattern"}`` of the winner) and
        ``ranked``, a list of ``(score, joke, meta)`` sorted by descending
        score.

    Raises:
        ValueError: if ``n`` is smaller than 1, if the file contains no
            jokes, or if ``select_best_joke`` reports a joke that was not
            submitted.

    Note:
        Metadata is matched to scored jokes by joke text. When the same text
        was submitted several times, the k-th scored occurrence is paired
        with the k-th submitted occurrence.
    """
    if n < 1:
        # Without this check a non-positive n would still submit one joke.
        raise ValueError(f"n must be a positive integer, got {n}.")

    records = load_records_from_jokes_txt(txt_path)

    jokes = []
    meta = []

    # collect jokes in file order until we have n
    for r in records:
        remaining = n - len(jokes)
        if remaining <= 0:
            break
        for j in r["jokes"][:remaining]:
            jokes.append(j)
            meta.append({"id": r["id"], "input": r["input"], "pattern": r["pattern"]})

    if not jokes:
        raise ValueError("No jokes found in the txt file.")
    if len(jokes) < n:
        print(f"Warning: only found {len(jokes)} jokes, less than requested {n}.")

    best_joke, best_score, scored = select_best_joke(jokes, model=model)

    # joke text -> every position it was submitted at, in file order
    positions = {}
    for idx, joke in enumerate(jokes):
        positions.setdefault(joke, []).append(idx)

    # attach metadata to scored list (one dict lookup per entry instead of a list scan)
    scored_with_meta = []
    times_seen = {}
    for (score, joke) in scored:
        if joke not in positions:
            raise ValueError(
                "select_best_joke returned a joke that is not in the input list."
            )
        slots = positions[joke]
        k = times_seen.get(joke, 0)
        times_seen[joke] = k + 1
        # Clamp so surplus scored duplicates reuse the last known origin.
        scored_with_meta.append((score, joke, meta[slots[min(k, len(slots) - 1)]]))

    ranked = sorted(scored_with_meta, key=lambda x: x[0], reverse=True)

    # winner meta: prefer the scored entry that carries the winning score, so a
    # duplicated joke text does not report the wrong record
    win_meta = next(
        (m for s, j, m in scored_with_meta if j == best_joke and s == best_score),
        None,
    )
    if win_meta is None:
        if best_joke not in positions:
            raise ValueError(
                "select_best_joke returned a joke that is not in the input list."
            )
        win_meta = meta[positions[best_joke][0]]

    return {
        "n_requested": n,
        "n_used": len(jokes),
        "best_joke": best_joke,
        "best_score": best_score,
        "best_meta": win_meta,
        "ranked": ranked,  # list of (score, joke, meta)
    }
