In [None]:
import json, random, re
from pathlib import Path
from typing import Any, Dict, List, Optional
from datasets import load_dataset

def first_non_empty(*vals, cast=str) -> Optional[str]:
    """Return the first candidate that yields non-blank text, passed through ``cast``.

    Candidates are examined in order:

    * ``None`` and empty lists/tuples are skipped.
    * A list/tuple is rendered by joining ``str()`` of each element with
      newlines and stripping the result; if that rendering raises, the
      unstripped ``str()`` of the whole container is used instead.
    * Anything else is rendered with ``str()`` and stripped.

    Args:
        *vals: Candidate values, highest priority first.
        cast: Callable applied to the winning text before it is returned.

    Returns:
        ``cast(text)`` for the first candidate whose text is non-empty,
        or ``None`` when no candidate qualifies.
    """
    for candidate in vals:
        if candidate is None:
            continue
        if not isinstance(candidate, (list, tuple)):
            text = str(candidate).strip()
        elif not candidate:
            continue
        else:
            try:
                text = "\n".join(str(part) for part in candidate).strip()
            except Exception:
                # Best-effort fallback: render the container as a whole.
                text = str(candidate)
        if text:
            return cast(text)
    return None

# Captures (group 1) the name of the first ``def`` found in a snippet.
ENTRY_RE = re.compile(r"def\s+([A-Za-z_]\w*)\s*\(", re.MULTILINE)
# Matches a function header from ``def`` through the trailing colon.
# An optional ``-> annotation`` and optional whitespace before the colon are
# accepted, so annotated headers such as ``def f(a) -> int:`` are recognised.
# The parameter list itself must not contain a ``)`` (e.g. tuple defaults).
SIG_RE   = re.compile(
    r"def\s+[A-Za-z_]\w*\s*\([^)]*\)\s*(?:->\s*[^:\n]+)?:",
    re.MULTILINE,
)

def infer_entry_point(item: Dict[str, Any], prompt_text: str) -> Optional[str]:
    """Work out the name of the function a task expects to be implemented.

    An explicit, non-blank string under one of the keys ``entry_point``,
    ``entrypoint``, ``function_name`` or ``func_name`` wins. Otherwise the
    name of the first ``def`` found in ``prompt_text`` is used, and failing
    that the first ``def`` in the item's ``solution`` /
    ``canonical_solution`` / ``code`` text.

    Args:
        item: Dataset record.
        prompt_text: Prompt text to scan for a ``def`` before the solution.

    Returns:
        The function name, or ``None`` when nothing could be inferred.
    """
    explicit_keys = ("entry_point", "entrypoint", "function_name", "func_name")
    for key in explicit_keys:
        value = item.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()

    reference_code = first_non_empty(
        item.get("solution"), item.get("canonical_solution"), item.get("code")
    )
    for source in (prompt_text, reference_code):
        if not source:
            continue
        found = ENTRY_RE.search(source)
        if found is not None:
            return found.group(1)
    return None

def build_cot_prompt(base_prompt: str, entry_point: Optional[str]) -> str:
    """Append the chain-of-thought instruction block to a task prompt.

    Args:
        base_prompt: Task prompt; trailing whitespace is removed before the
            instructions are appended after one blank line.
        entry_point: Function name to mention in the final instruction;
            omitted from the text when falsy.

    Returns:
        The prompt followed by three instruction lines.
    """
    target = ""
    if entry_point:
        target = f" implementing `{entry_point}`"
    lines = [
        # The trailing newline here plus the join newline gives the blank line.
        f"{base_prompt.rstrip()}\n",
        "You are a careful Python developer.",
        "First, reason step by step privately about the algorithm and tricky cases.",
        f"Then, output ONLY valid Python code{target} — no comments, no prints, no tests.",
    ]
    return "\n".join(lines)

def build_self_planning_prompt(base_prompt: str, entry_point: Optional[str]) -> str:
    """Append the self-planning instruction block to a task prompt.

    Args:
        base_prompt: Task prompt; trailing whitespace is removed before the
            instructions are appended after one blank line.
        entry_point: Function name to mention in the final instruction;
            omitted from the text when falsy.

    Returns:
        The prompt followed by three instruction lines.
    """
    target = f" implementing `{entry_point}`" if entry_point else ""
    header = f"{base_prompt.rstrip()}\n\n"
    closing = f"Finally, output ONLY the final Python code{target} — no comments, no prints, no tests."
    return (
        header
        + "You are a methodical Python engineer.\n"
        + "Before coding, make a brief plan in your head: inputs/outputs, edge cases, approach, and complexity.\n"
        + closing
    )

def extract_prompt_text_only(item: Dict[str, Any]) -> str:
    """Return the item's natural-language prompt, or ``""`` if it has none.

    The keys ``prompt``, ``text``, ``question`` and ``instruction`` are
    tried in that order; the first one with non-blank content is used.
    """
    candidates = [item.get(key) for key in ("prompt", "text", "question", "instruction")]
    text = first_non_empty(*candidates)
    return text if text else ""

def extract_prompt_with_signature(item: Dict[str, Any]) -> str:
    """
    Build a prompt that carries a function signature whenever one is available.

    If the natural-language prompt already contains a ``def ...:`` header it
    is returned (stripped) unchanged. Otherwise the first header found in the
    item's ``code`` / ``solution`` / ``canonical_solution`` text is placed on
    its own line ahead of the prompt. With no header anywhere, the stripped
    prompt is returned as-is.

    NOTE(review): the key priority here (``code`` first) differs from
    ``extract_solution`` (``solution`` first) -- confirm this is intended.
    """
    nl_prompt = extract_prompt_text_only(item)
    if SIG_RE.search(nl_prompt) is None:
        reference = first_non_empty(
            item.get("code"), item.get("solution"), item.get("canonical_solution")
        )
        header = SIG_RE.search(reference) if reference else None
        if header is not None:
            # Signature line first, original prompt text right below it.
            return (header.group(0) + "\n" + nl_prompt).strip()
    return nl_prompt.strip()

def extract_tests(item: Dict[str, Any]) -> str:
    """Collect the test code attached to a dataset item as one string.

    Lookup order:

    1. ``tests`` / ``test`` / ``test_code`` -- first non-blank value wins.
    2. Otherwise ``test_setup_code`` followed by the entries of
       ``test_list`` and ``challenge_test_list`` (one entry per line),
       with the sections separated by a blank line.
    3. Otherwise ``unittest`` / ``asserts``.

    Returns:
        The assembled test text, or ``""`` when the item has no tests.
    """
    direct = first_non_empty(item.get("tests"), item.get("test"), item.get("test_code"))
    if direct:
        return direct

    chunks = []
    setup_code = first_non_empty(item.get("test_setup_code"))
    if setup_code:
        chunks.append(setup_code)
    for list_key in ("test_list", "challenge_test_list"):
        cases = item.get(list_key)
        if isinstance(cases, list) and cases:
            chunks.append("\n".join(map(str, cases)))

    if not chunks:
        return first_non_empty(item.get("unittest"), item.get("asserts")) or ""
    return "\n\n".join(chunks)

def extract_solution(item: Dict[str, Any]) -> str:
    """Return the item's reference solution text, or ``""`` if it has none.

    The keys ``solution``, ``canonical_solution``, ``code`` and
    ``reference_solution`` are tried in that order.
    """
    solution_keys = ("solution", "canonical_solution", "code", "reference_solution")
    found = first_non_empty(*(item.get(key) for key in solution_keys))
    return found if found else ""

def pick_split(ds_dict) -> str:
    """Choose which split of a dataset mapping to sample from.

    The names ``test``, ``validation``, ``val``, ``dev`` and ``train`` are
    preferred in that order; if none is present, the mapping's first key
    is returned.
    """
    preferred = ("test", "validation", "val", "dev", "train")
    chosen = next((name for name in preferred if name in ds_dict), None)
    if chosen is not None:
        return chosen
    return list(ds_dict.keys())[0]

def sample_indices(n: int, total: int, seed: int) -> List[int]:
    """Pick up to ``n`` distinct indices from ``range(total)``, reproducibly.

    All indices are shuffled with a ``random.Random(seed)`` generator and the
    leading ``n`` are kept, so a smaller ``n`` always yields a prefix of the
    selection made with a larger ``n`` under the same seed.

    Args:
        n: Number of indices wanted. Values above ``total`` are capped;
            zero or negative values yield an empty list.
        total: Size of the index range to draw from.
        seed: Seed for the private random generator.

    Returns:
        A list of ``min(n, total)`` indices (never negative in length).
    """
    # Clamp at zero: a negative count would otherwise turn the slice below
    # into "everything except the last |n| items".
    count = max(0, min(n, total))
    order = list(range(total))
    random.Random(seed).shuffle(order)
    return order[:count]

def sample_mbpp(dataset="dz1/CodeScore-MBPP-ET", n=10, seed=42, outdir=Path("/content/drive/MyDrive/CS520/selected_mbpp_seed42"), include_signature=True):
    """Sample tasks from a dataset and write prompts, tests and solutions to disk.

    The dataset is loaded with ``load_dataset(dataset)``, a split is chosen
    with ``pick_split`` and up to ``n`` items are selected with
    ``sample_indices``. For every selected item this writes:

    * one JSON line in ``<outdir>/selected_mbpp.jsonl`` with the keys
      ``task_id``, ``split``, ``entry_point``, ``prompt``, ``prompt_cot``,
      ``prompt_self_planning``, ``tests`` and ``solution``;
    * a directory ``<outdir>/<task_id>`` (``/`` replaced by ``_``) holding
      ``prompt.txt``, ``prompt_cot.txt``, ``prompt_self_planning.txt``,
      ``tests.py`` and ``solution.py``.

    A short summary is printed at the end.

    Args:
        dataset: Dataset identifier passed to ``load_dataset``.
        n: Maximum number of tasks to sample.
        seed: Seed used for the index shuffle.
        outdir: Output directory; a ``Path`` or anything ``Path()`` accepts
            (e.g. a plain string). Created if missing.
        include_signature: When true, the base prompt comes from
            ``extract_prompt_with_signature``; otherwise from
            ``extract_prompt_text_only``.
    """
    # Accept plain strings / os.PathLike values as well as Path objects.
    outdir = Path(outdir)

    ds_all = load_dataset(dataset)
    split_name = pick_split(ds_all)
    ds = ds_all[split_name]

    idxs = sample_indices(n, len(ds), seed)
    outdir.mkdir(parents=True, exist_ok=True)
    manifest_path = outdir / "selected_mbpp.jsonl"

    with manifest_path.open("w", encoding="utf-8") as fw:
        for i in idxs:
            item = ds[i]
            # Fall back to a synthetic id when the record carries none.
            task_id = first_non_empty(item.get("task_id"), item.get("id"), item.get("problem_id")) or f"{split_name}-{i}"

            prompt_base = extract_prompt_with_signature(item) if include_signature else extract_prompt_text_only(item)
            tests       = extract_tests(item)
            solution    = extract_solution(item)
            entry_point = infer_entry_point(item, prompt_base)

            prompt_cot  = build_cot_prompt(prompt_base, entry_point)
            prompt_plan = build_self_planning_prompt(prompt_base, entry_point)

            record = {
                "task_id": task_id,
                "split": split_name,
                "entry_point": entry_point,
                "prompt": prompt_base,
                "prompt_cot": prompt_cot,
                "prompt_self_planning": prompt_plan,
                "tests": tests,
                "solution": solution,
            }
            fw.write(json.dumps(record, ensure_ascii=False) + "\n")

            # "/" in an id would otherwise create nested directories.
            tdir = outdir / str(task_id).replace("/", "_")
            tdir.mkdir(parents=True, exist_ok=True)
            (tdir / "prompt.txt").write_text(prompt_base, encoding="utf-8")
            (tdir / "prompt_cot.txt").write_text(prompt_cot, encoding="utf-8")
            (tdir / "prompt_self_planning.txt").write_text(prompt_plan, encoding="utf-8")
            (tdir / "tests.py").write_text(tests, encoding="utf-8")
            (tdir / "solution.py").write_text(solution, encoding="utf-8")

    print(f"✅ Saved {len(idxs)} tasks to {manifest_path}")
    print(f"   Per-task files under: {outdir.resolve()}")
    print(f"   Split: {split_name} | include_signature={include_signature}")

# Run with all defaults: include_signature=True, so each base prompt gets a
# function signature prepended whenever one can be found for the task.
sample_mbpp()


✅ Saved 10 tasks to /content/drive/MyDrive/CS520/selected_mbpp_seed42/selected_mbpp.jsonl
   Per-task files under: /content/drive/MyDrive/CS520/selected_mbpp_seed42
   Split: train | include_signature=True
