<a href="https://colab.research.google.com/github/john-d-noble/callcenter/blob/main/py_to_ipynb_batch_converter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧰 Batch-convert Python "notebooks" (.py) → real Jupyter notebooks (.ipynb)

This notebook scans a directory for `.py` files, **splits them into digestible cells using your house-style section headings**, and writes a proper `.ipynb` next to each source file. Optionally, it can also write back a tidy `.py` with `# %%` cell markers.

**How to use**  
1. Edit the config cell below (set `SOURCE_DIR`).  
2. Run the cells top-to-bottom.  
3. Open your new notebooks (`.ipynb`) in the same folder.


In [1]:

# === Configuration ===
SOURCE_DIR = "/content"   # change me
INCLUDE_GLOB = "*.py"      # e.g., "**/*.py" for recursive
MAX_LINES = 160
WRITE_BACK_PY = True
WRITE_IPYNB = True
MARKDOWN_HEADINGS = True

# House-style section regex
SECTION_RE = r"^\s*#\s*(?:##+\s+(.*)|###\s*(?:Step|Phase)\s*:\s*(.*)|[-]{3,}\s+(.*?)\s*-*|[=]{3,}\s+(.*?)\s*=*|(?:SECTION|STEP|PHASE)\s*:\s*(.*))\s*$"


In [2]:
import os, re, glob, json, shutil
from typing import List, Tuple

CHUNK_HEADER_RE = re.compile(r"^\s*#\s*%%")
JUPYTER_IN_RE = re.compile(r"^\s*#\s*In\[\d*\]:")
TOP_LEVEL_DEF_RE = re.compile(r"^(def |class |@)", re.ASCII)
DOUBLE_BLANK_RE = re.compile(r"\n\s*\n\s*\n")

def split_into_cells(lines: List[str]) -> List[Tuple[str, List[str]]]:
    indices = []
    for i, line in enumerate(lines):
        if CHUNK_HEADER_RE.match(line) or JUPYTER_IN_RE.match(line):
            indices.append(i)
    if not indices:
        return [("NO_MARKER", lines[:])]
    cells = []
    for idx, start in enumerate(indices):
        end = indices[idx + 1] if idx + 1 < len(indices) else len(lines)
        header = lines[start].rstrip("\n")
        body = lines[start + 1:end]
        cells.append((header, body))
    return cells

def _extract_title(m: re.Match) -> str:
    gd = m.groupdict() if hasattr(m, "groupdict") else {}
    title = (gd.get("title") or "").strip() if gd else ""
    if title:
        return title
    for g in m.groups():
        if g and str(g).strip():
            return str(g).strip()
    return ""

def find_section_indices(text: str, section_re: re.Pattern):
    sections = []
    for i, ln in enumerate(text.splitlines()):
        m = section_re.match(ln)
        if m:
            title = _extract_title(m)
            sections.append((i, title))
    return sections

def secondary_splits(text: str, max_lines: int):
    lines = text.splitlines()
    if len(lines) <= max_lines or not text.strip():
        return [len(lines)]
    candidates = set()
    col0_indices = [i for i, ln in enumerate(lines) if ln and not ln.startswith((" ", "\t"))]
    for i in col0_indices:
        if TOP_LEVEL_DEF_RE.match(lines[i]):
            candidates.add(i)
    for m in DOUBLE_BLANK_RE.finditer("\n".join(lines)):
        upto = m.start()
        before = text[:upto].splitlines()
        candidates.add(len(before))
    anchors = sorted([0] + [i for i in candidates if 0 < i < len(lines)] + [len(lines)])
    cuts, cur = [], 0
    while cur < len(lines):
        next_idx = cur + max_lines
        valid = [a for a in anchors if cur < a <= min(len(lines), next_idx)]
        cut = max(valid) if valid else min(len(lines), cur + max_lines)
        cuts.append(cut)
        cur = cut
    final, prev = [], 0
    for cut in cuts:
        seg_len = cut - prev
        if seg_len <= max_lines:
            final.append(cut)
        else:
            while prev + max_lines < cut:
                prev += max_lines
                final.append(prev)
            final.append(cut)
        prev = cut
    return final

def reassemble_with_sections(py_text: str, max_lines: int, section_re: re.Pattern):
    lines = py_text.splitlines()
    cells = split_into_cells(lines)
    out = []
    for (header, body) in cells:
        body_text = "".join(body)
        blines = body_text.splitlines()
        sects = find_section_indices(body_text, section_re)
        if not sects:
            seg_bounds = [(0, len(blines), None)]
        else:
            anchors = [0] + [i for (i, _t) in sects] + [len(blines)]
            seg_bounds = []
            for i in range(len(anchors)-1):
                start, end = anchors[i], anchors[i+1]
                title = None
                if i > 0:
                    title = sects[i-1][1] if sects[i-1][0] == start else None
                if start < len(blines) and section_re.match(blines[start]):
                    start += 1
                seg_bounds.append((start, end, title))
        for (s, e, title) in seg_bounds:
            seg_text = "\n".join(blines[s:e]).rstrip()
            if not seg_text.strip():
                if title:
                    out.append(f"# %% {title} [part 1/1]\n")
                continue
            cuts = secondary_splits(seg_text, max_lines)
            prev, parts = 0, []
            s_lines = seg_text.splitlines()
            for c in cuts:
                chunk = "\n".join(s_lines[prev:c]).rstrip()
                if chunk.strip():
                    parts.append(chunk)
                prev = c
            total = len(parts) if parts else 1
            if not parts:
                label = title or "Auto-split cell"
                out.append(f"# %% {label} [part 1/1]\n")
                continue
            for i, chunk in enumerate(parts):
                label = title or "Auto-split cell"
                out.append(f"# %% {label} [part {i+1}/{total}]\n")
                out.append(chunk + ("\n" if not chunk.endswith("\n") else ""))
    return out

def to_ipynb(chunks_text, markdown_headings: bool):
    nb = {
        "cells": [],
        "metadata": {
            "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
            "language_info": {"name": "python", "pygments_lexer": "ipython3"},
        },
        "nbformat": 4,
        "nbformat_minor": 5,
    }
    header_re = re.compile(r"^\s*#\s*%%\s*(.*)$")
    current_title = None
    current_code = []
    def flush_segment():
        nonlocal current_title, current_code
        if current_title is not None:
            if markdown_headings and current_title.strip():
                nb["cells"].append({
                    "cell_type": "markdown",
                    "metadata": {},
                    "source": f"## {current_title.strip()}\n",
                })
            nb["cells"].append({
                "cell_type": "code",
                "metadata": {},
                "execution_count": None,
                "outputs": [],
                "source": "\n".join(current_code).rstrip() + ("\n" if current_code else ""),
            })
            current_title, current_code = None, []
    for line in "".join(chunks_text).splitlines():
        m = header_re.match(line)
        if m:
            flush_segment()
            title = m.group(1)
            title = re.sub(r"\s*\[part\s+\d+/\d+\]\s*", "", title).strip()
            current_title = title
            current_code = []
        else:
            current_code.append(line)
    if current_title is not None or current_code:
        flush_segment()
    return nb

def process_one_file(py_path: str, max_lines: int, section_re: re.Pattern, write_back: bool, make_ipynb: bool, markdown_headings: bool):
    with open(py_path, "r", encoding="utf-8") as f:
        original_text = f.read()
    new_lines = reassemble_with_sections(original_text, max_lines=max_lines, section_re=section_re)

    # Backup original .py once
    if write_back:
        backup_path = py_path + ".bak"
        if not os.path.exists(backup_path):
            with open(backup_path, "w", encoding="utf-8") as bf:
                bf.write(original_text)
        with open(py_path, "w", encoding="utf-8") as f:
            f.write("".join(new_lines))

    ipynb_path = None
    if make_ipynb:
        nb = to_ipynb(new_lines, markdown_headings)
        ipynb_path = os.path.splitext(py_path)[0] + ".ipynb"
        with open(ipynb_path, "w", encoding="utf-8") as nf:
            json.dump(nb, nf, ensure_ascii=False, indent=2)
    return {
        "source_py": py_path,
        "wrote_py": write_back,
        "ipynb_path": ipynb_path,
        "chunks": sum(1 for ln in new_lines if ln.startswith("# %%")),
    }

In [3]:

# === Run conversion over the directory ===
import os, glob, re

section_re = re.compile(SECTION_RE)
paths = glob.glob(os.path.join(SOURCE_DIR, INCLUDE_GLOB), recursive=True)
paths = [p for p in paths if p.lower().endswith(".py")]

print(f"Found {len(paths)} .py file(s) under {SOURCE_DIR!r} matching {INCLUDE_GLOB!r}.")

results = []
for p in sorted(paths):
    res = process_one_file(
        py_path=p,
        max_lines=MAX_LINES,
        section_re=section_re,
        write_back=WRITE_BACK_PY,
        make_ipynb=WRITE_IPYNB,
        markdown_headings=MARKDOWN_HEADINGS,
    )
    results.append(res)
    print(f"- {os.path.basename(p)} -> chunks={res['chunks']}, ipynb={'yes' if res['ipynb_path'] else 'no'}")

print("\nDone.")


Found 5 .py file(s) under '/content' matching '*.py'.
- cx_basic_model_exploration_run_1.py -> chunks=1, ipynb=yes
- cx_basic_model_exploration_run_2.py -> chunks=1, ipynb=yes
- cx_basic_model_exploration_run_3.py -> chunks=1, ipynb=yes
- cx_basic_model_exploration_run_4.py -> chunks=1, ipynb=yes
- cx_basic_model_exploration_run_5.py -> chunks=1, ipynb=yes

Done.


In [None]:
# === Download all generated .ipynb files as a zip ===
import shutil
from google.colab import files
import glob
import os

# Zip file name
zip_path = "/content/converted_notebooks.zip"

# Find all ipynb files in the source dir
ipynb_files = glob.glob(os.path.join(SOURCE_DIR, "*.ipynb"))

# Create zip archive
shutil.make_archive(zip_path.replace(".zip",""), 'zip', SOURCE_DIR)

# Download
files.download(zip_path)


**Tip:** If you want to run only for certain files, change `INCLUDE_GLOB`.  
To tighten your headings (e.g., only `# SECTION: ...`), narrow the `SECTION_RE`.
