In [None]:
import ast
import json
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_from_disk

In [3]:
# Input locations for the three splits; each one is passed to
# datasets.load_from_disk() by load_clean_save() further down.
# NOTE(review): the names end in ".csv" but are read with load_from_disk, not
# a CSV reader — confirm these are saved-dataset directories.
MASTER_TRAIN_DATASET = "sft_train_dataset.csv"
MASTER_EVAL_DATASET  = "sft_eval_dataset.csv"
MASTER_TEST_DATASET  = "sft_test_dataset.csv"

# Output directory for the exported flat CSV files; created up front
# (including parents) and left untouched if it already exists.
EXPORT_DIR = Path("sft_split_dataset")
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Exact set and order of columns kept by ensure_schema() for every exported split.
DESIRED_COLS = ["uid", "original", "simplified", "applied_rules"]

def coerce_to_list(x):
    """Return ``x`` as a plain Python list (used for the ``applied_rules`` column).

    Conversion rules, applied in order:

    * ``list``            -> returned unchanged.
    * ``numpy.ndarray``   -> ``tolist()`` (0-d arrays become a 1-element list).
    * ``tuple``           -> ``list(x)``.
    * missing (``None``/``NaN``/``pd.NA``) -> ``[]``.
    * ``str``             -> parsed as JSON, then as a Python literal; a parsed
      list/tuple becomes a list, a parsed ``None`` becomes ``[]``, any other
      parsed value is wrapped in a one-element list. If neither parser
      accepts the text it is split on commas, dropping empty pieces.
    * anything else       -> wrapped in a one-element list.

    Args:
        x: A single cell value of arbitrary type.

    Returns:
        list: The list representation of ``x``.
    """
    if isinstance(x, list):
        return x
    # Array-like cells must be handled before pd.isna(): on an array,
    # pd.isna() returns an element-wise array whose truth value is ambiguous
    # and raises ValueError for more than one element.
    if isinstance(x, np.ndarray):
        return np.atleast_1d(x).tolist()
    if isinstance(x, tuple):
        return list(x)
    if pd.isna(x):
        return []
    if isinstance(x, str):
        text = x.strip()
        if not text:
            return []
        # JSON first, then Python-literal syntax such as "['a', 'b']".
        for parse in (json.loads, ast.literal_eval):
            try:
                parsed = parse(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not valid in this syntax; try the next parser.
                continue
            if parsed is None:
                return []
            if isinstance(parsed, (list, tuple)):
                return list(parsed)
            return [parsed]
        # Fallback: treat the text as a comma-separated list of items.
        return [part.strip() for part in text.split(",") if part.strip()]
    # Fallback: wrap any other singleton value.
    return [x]

def ensure_schema(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize a split to the ``DESIRED_COLS`` schema.

    Steps performed on a copy of ``df`` (the input frame is not modified):

    * drop a ``text`` column if present;
    * make sure ``uid`` exists: reuse the first ``__index_level_*`` column if
      there is one, otherwise number the rows 1..n in their current order;
    * create ``original`` / ``simplified`` as empty strings and
      ``applied_rules`` as empty lists when those columns are absent;
    * coerce ``uid`` to int (non-numeric values become ``-1``), the text
      columns to ``str`` (missing values become ``""``), and every
      ``applied_rules`` cell to a list via ``coerce_to_list``;
    * keep only ``DESIRED_COLS``, in that order, sorted by ``uid``.

    Args:
        df: Input frame in any of the layouts described above.

    Returns:
        pd.DataFrame: A new frame with exactly ``DESIRED_COLS`` and a fresh
        0..n-1 index.
    """
    # Work on a copy so column assignments below never leak into the caller's frame.
    df = df.copy()

    # Drop 'text' if present
    if "text" in df.columns:
        df = df.drop(columns=["text"])

    # Recover/construct uid
    if "uid" not in df.columns:
        # Some round-trips leave an index column like "__index_level_0__".
        # Column labels are not guaranteed to be strings, hence the isinstance guard.
        idx_cols = [
            c for c in df.columns
            if isinstance(c, str) and c.startswith("__index_level_")
        ]
        if idx_cols:
            df = df.rename(columns={idx_cols[0]: "uid"})
        else:
            # If nothing present, create a uid from current row order
            df["uid"] = np.arange(1, len(df) + 1, dtype=int)

    # Ensure required text columns exist (fill empty if missing)
    for col in ["original", "simplified"]:
        if col not in df.columns:
            df[col] = ""

    # Ensure applied_rules exists
    if "applied_rules" not in df.columns:
        df["applied_rules"] = [[] for _ in range(len(df))]

    # Type coercions
    df["uid"] = pd.to_numeric(df["uid"], errors="coerce").fillna(-1).astype(int)
    for col in ["original", "simplified"]:
        # astype(str) alone would turn missing values into the literal strings
        # "nan"/"None"; blank them first, matching the "" used for absent columns.
        values = df[col].astype(object)
        df[col] = values.where(values.notna(), "").astype(str)
    df["applied_rules"] = df["applied_rules"].apply(coerce_to_list)

    # Keep exactly these columns, in this order. A stable sort keeps rows that
    # share a uid (e.g. the -1 placeholder) in their input order.
    df = df[DESIRED_COLS].sort_values("uid", kind="stable").reset_index(drop=True)
    return df

def save_all(df: pd.DataFrame, stem: str):
    """Export ``df`` as ``<stem>.csv`` inside ``EXPORT_DIR``.

    The ``applied_rules`` cells are written as JSON text (non-ASCII kept
    as-is) so the lists can be parsed back after reading the CSV. ``df``
    itself is left unchanged; the DataFrame index is not written.

    Args:
        df: Frame containing an ``applied_rules`` column of JSON-serializable values.
        stem: File name without extension, e.g. ``"train"``.

    Returns:
        Path of the CSV file that was written.
    """
    # Encode each rule list as a JSON string for the on-disk representation.
    rules_as_json = [
        json.dumps(rules, ensure_ascii=False) for rules in df["applied_rules"]
    ]
    csv_path = EXPORT_DIR / f"{stem}.csv"
    # assign() builds a new frame, so the caller's list-valued column is untouched.
    df.assign(applied_rules=rules_as_json).to_csv(csv_path, index=False)

    print(f"Saved {stem}:")
    print(f"  CSV    -> {csv_path}")
    return csv_path

def load_clean_save(ds_path: str, stem: str) -> pd.DataFrame:
    """Load one split from disk, normalize its schema, and export it as CSV.

    Args:
        ds_path: Location handed to ``load_from_disk``.
        stem: Output file name (without extension) handed to ``save_all``.

    Returns:
        pd.DataFrame: The normalized frame that was written out.
    """
    # NOTE(review): assumes load_from_disk returns an object exposing
    # .to_pandas() (i.e. a single split) — confirm for these paths.
    cleaned = ensure_schema(load_from_disk(ds_path).to_pandas())
    save_all(cleaned, stem)
    return cleaned

# Run the load -> normalize -> export pipeline once per split. Each call writes
# <stem>.csv under EXPORT_DIR and keeps the cleaned frame for later cells.
train_df = load_clean_save(MASTER_TRAIN_DATASET, "train")
eval_df  = load_clean_save(MASTER_EVAL_DATASET,  "eval")
test_df  = load_clean_save(MASTER_TEST_DATASET,  "test")


Saved train:
  CSV    -> sft_split_dataset/train.csv
Saved eval:
  CSV    -> sft_split_dataset/eval.csv
Saved test:
  CSV    -> sft_split_dataset/test.csv
