In [2]:
#!/usr/bin/env python3
"""
excel_to_jsonl.py
Convert an Excel sheet containing text-classification data
to JSON Lines for fine-tuning.

Best practices implemented
--------------------------
• Uses pathlib for path-safe I/O
• Validates expected columns (“Content”, “Manual Label”)
• Normalises whitespace and strips new-lines
• Drops rows with missing content or label
• Makes label case-consistent
• Writes UTF-8 encoded .jsonl, one object per line
• CLI with sensible defaults
"""

import argparse
import json
import sys
from pathlib import Path

import pandas as pd


def clean_text(text: str) -> str:
    """Return *text* as a single line with runs of whitespace collapsed.

    The value is converted with ``str()`` first, so non-string cells
    (numbers, ``None``, NaN) come back as their string representation.
    """
    # A bare split() discards leading/trailing whitespace and treats any run
    # of spaces, tabs or line-breaks as one separator.
    tokens = str(text).split()
    return " ".join(tokens)


def to_jsonl(df: pd.DataFrame, out_path: Path) -> None:
    """Dump every row of *df* to *out_path* as one JSON object per line.

    Column names become the JSON keys (``main`` passes a frame whose columns
    are ``prompt`` and ``completion``). The file is written as UTF-8 and
    non-ASCII characters are emitted verbatim rather than ``\\u``-escaped.
    """
    with out_path.open("w", encoding="utf-8") as sink:
        rows = df.to_dict(orient="records")
        sink.writelines(
            f"{json.dumps(row, ensure_ascii=False)}\n" for row in rows
        )

def _sheet_spec(value: str) -> "int | str":
    """Turn a ``--sheet`` CLI value into an index when it is an integer.

    argparse delivers command-line values as ``str``; handing "1" straight to
    ``pd.read_excel`` would look for a sheet *named* "1" instead of selecting
    the sheet at zero-based position 1.
    """
    try:
        return int(value)
    except ValueError:
        return value


def main() -> None:
    """Parse CLI arguments, convert the Excel sheet, and write the jsonl file.

    Reads the "Content" and "Manual Label" columns, drops rows where either
    is missing or blank, normalises the text, and writes the result with
    keys ``prompt`` / ``completion``.

    Exits via ``sys.exit`` with a message when the workbook cannot be read
    or a required column is absent.
    """
    parser = argparse.ArgumentParser(
        description="Convert Excel classification data to jsonl for fine-tuning"
    )
    parser.add_argument(
        "excel_path",
        type=Path,
        help="Path to Excel workbook (e.g. Checked.xlsx)",
    )
    parser.add_argument(
        "--sheet",
        type=_sheet_spec,  # only applied to CLI strings; the int default is kept as-is
        default=0,
        help="Sheet name or zero-based index (default: first sheet)",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("checked.jsonl"),
        help="Destination jsonl path (default: checked.jsonl)",
    )

    args = parser.parse_args()

    # Load workbook. This is the top-level boundary of the script, so any
    # failure (missing file, bad sheet, missing engine) becomes a clean exit.
    try:
        df = pd.read_excel(args.excel_path, sheet_name=args.sheet)
    except Exception as exc:  # noqa: BLE001
        sys.exit(f"Failed to read Excel: {exc}")

    # Expected columns
    required_cols = {"Content", "Manual Label"}
    missing = required_cols - set(df.columns)
    if missing:
        # Sorted so the message is stable from run to run.
        sys.exit(f"Missing required column(s): {', '.join(sorted(missing))}")

    # Select & rename for clarity
    df = df.loc[:, ["Content", "Manual Label"]].rename(
        columns={"Content": "prompt", "Manual Label": "completion"}
    )
    total_rows = len(df)

    # Drop missing cells BEFORE cleaning: clean_text() stringifies its input,
    # so a NaN cell would otherwise survive as the literal text "nan" and
    # never be removed.
    df = df.dropna().copy()

    # Clean text fields
    df["prompt"] = df["prompt"].apply(clean_text)
    df["completion"] = df["completion"].apply(lambda x: clean_text(x).lower())

    # Whitespace-only cells clean down to "", which is just as incomplete.
    df = df[(df["prompt"] != "") & (df["completion"] != "")]

    dropped = total_rows - len(df)
    if dropped:
        print(f"Dropped {dropped} incomplete rows.", file=sys.stderr)

    # Write jsonl
    to_jsonl(df, args.output)
    print(f"Wrote {len(df):,} records → {args.output.resolve()}")


In [None]:
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()