# 1_preprocess.ipynb

Jennifer Xu (Jennifer.Xu.26@dartmouth.edu)

This notebook loads the Tang and Song poetry JSON files, combines them into line-level Chinese–English pairs, cleans the text, and formats each pair as a record with `zh`, `en`, and a (currently empty) `gloss` field. It then shuffles the records, splits them into training and testing sets, and writes them as JSON Lines files (`data/proc/train.jsonl` and `data/proc/test.jsonl`).

In [10]:
import json
import random
import re
import textwrap
import unicodedata
from itertools import zip_longest
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm

# Setup paths & constants

In [11]:
# --- Reproducibility / sampling knobs -------------------------------------
SEED        = 42     # shared by random.seed, DataFrame.sample and the split
N_PAIRS     = 2000   # upper bound on the number of sampled (zh, en) pairs
TRAIN_FRAC  = 0.80   # fraction of records written to the training split
random.seed(SEED)

# --- Directory layout (relative to the notebook's working directory) ------
ROOT      = Path(".")
RAW_DIR   = ROOT.joinpath("data", "raw")
PROC_DIR  = ROOT.joinpath("data", "proc")
# Create the output directory up front so later writes cannot fail on it.
PROC_DIR.mkdir(parents=True, exist_ok=True)

# --- Raw bilingual corpora -------------------------------------------------
RAW_TANG  = RAW_DIR.joinpath("cn_en_tmp/Chinese-Poetry-Bilingual-master/Tang/Tang.json")
RAW_SONG  = RAW_DIR.joinpath("cn_en_tmp/Chinese-Poetry-Bilingual-master/Song/Song.json")

# Cleaning functions

In [12]:
# Translation table (for str.translate) mapping CJK / fullwidth punctuation
# to ASCII look-alikes.
# NOTE(review): no code visible in this notebook applies PUNCT_MAP; confirm
# whether it is still needed or should be wired into the cleaning step.
_PUNCT_PAIRS = {
    # sentence punctuation
    "，": ",",
    "。": ".",
    "？": "?",
    "！": "!",
    "：": ":",
    "；": ";",
    # corner brackets and title marks all collapse to a double quote
    "「": "\"",
    "」": "\"",
    "『": "\"",
    "』": "\"",
    "《": "\"",
    "》": "\"",
    # parentheses, dash, wavy low line
    "（": "(",
    "）": ")",
    "—": "-",
    "﹏": "_",
}
PUNCT_MAP = str.maketrans(_PUNCT_PAIRS)

# One or more commas / full stops (CJK or ASCII) or newlines act as a
# segment delimiter.
SPLIT_RE = re.compile(r"[，,.。\n]+")

def normalise(s: str) -> str:
    """Return ``s`` in Unicode NFKC form with surrounding whitespace removed.

    Falsy input (``None`` or ``""``) is treated as the empty string.
    """
    text = s if s else ""
    folded = unicodedata.normalize("NFKC", text)
    return folded.strip()

def load_poem_lines(fp: Path):
    """Flatten a poem JSON file into a list of line-level ``{"zh", "en"}`` dicts.

    The file may hold a single poem object or a list of them. Each poem must
    provide ``poem["Chinese"]["content"]`` and ``poem["English"]["content"]``
    as parallel sequences of lines.

    Args:
        fp: Path to the UTF-8 encoded JSON file.

    Returns:
        One dict per line position, with both values passed through
        ``normalise``. When one language has fewer lines than the other, the
        missing side is filled with ``""``, not skipped.

    Raises:
        KeyError: if a poem lacks the expected ``Chinese``/``English`` or
            ``content`` keys.
    """
    # Read inside a context manager so the file handle is closed as soon as
    # parsing finishes instead of being left to the garbage collector.
    with fp.open(encoding="utf-8") as fh:
        data = json.load(fh)
    if isinstance(data, dict):
        data = [data]

    rows = []
    for poem in data:
        zh_lines = poem["Chinese"]["content"]
        en_lines = poem["English"]["content"]
        # zip_longest keeps unmatched trailing lines, padding the other side.
        for zh, en in zip_longest(zh_lines, en_lines, fillvalue=""):
            rows.append({"zh": normalise(zh), "en": normalise(en)})
    return rows

def split_zh(t: str):
    """Split Chinese verse into segments on commas, full stops and newlines.

    This is a rudimentary segmentation: the text is cut wherever ``SPLIT_RE``
    matches. Each segment is stripped of surrounding whitespace, and segments
    that are empty after stripping are dropped, so input such as ``"a, b"``
    yields ``["a", "b"]`` and not ``["a", " b"]``.

    Args:
        t: Text to segment.

    Returns:
        List of non-empty, whitespace-trimmed segments in original order.
    """
    segments = (piece.strip() for piece in SPLIT_RE.split(t))
    return [seg for seg in segments if seg]

# 1. Load & sample corpora

In [13]:
rows      = load_poem_lines(RAW_TANG) + load_poem_lines(RAW_SONG)
# load_poem_lines pads unmatched lines with "" (not NaN), so dropna() alone
# never removes half-empty pairs. Keep only rows where both sides have text;
# NaN values also fail the length test, so they are dropped as before.
lines_df  = pd.DataFrame(rows, columns=["zh", "en"])
has_both  = lines_df["zh"].str.len().gt(0) & lines_df["en"].str.len().gt(0)
full_df   = lines_df[has_both]
print(f"Loaded {len(full_df):,} bilingual (zh,en) pairs")

Loaded 1,130 bilingual (zh,en) pairs


In [14]:
# Draw at most N_PAIRS rows without replacement; the cap keeps the request
# valid when the corpus is smaller than N_PAIRS.
n_sample  = min(N_PAIRS, len(full_df))
sampled   = full_df.sample(n=n_sample, random_state=SEED, replace=False)
sample_df = sampled.reset_index(drop=True)

print(f"Sampling {len(sample_df):,} parallel pairs")

Sampling 1,130 parallel pairs


# 2. Clean & build record list

In [15]:
# Build one record per sampled pair: the Chinese side is re-segmented with
# one segment per line, the English side is only normalised, and "gloss" is
# an empty placeholder.
pair_iter = tqdm(
    sample_df[["zh", "en"]].itertuples(index=False),
    total=len(sample_df),
    desc="Cleaning bilingual",
)
parallel_records = [
    {
        "zh": "\n".join(split_zh(normalise(zh_raw))),
        "en": normalise(en_raw),
        "gloss": "",
    }
    for zh_raw, en_raw in pair_iter
]

print(f"Total: {len(parallel_records):,} parallel")

Cleaning bilingual:   0%|          | 0/1130 [00:00<?, ?it/s]

Total: 1,130 parallel


# 3. Shuffle & split parallel corpus

In [16]:
# Shuffle a copy with a dedicated generator seeded from SEED. This keeps the
# cell idempotent: re-running it yields the same split, and parallel_records
# is left in its original order for any later inspection.
shuffled_records = list(parallel_records)
random.Random(SEED).shuffle(shuffled_records)

cut = int(len(shuffled_records) * TRAIN_FRAC)
train_recs, test_recs = shuffled_records[:cut], shuffled_records[cut:]

# The two splits must partition the corpus exactly.
assert len(train_recs) + len(test_recs) == len(parallel_records)

# 4. Output jsonl files

In [17]:
def write_jsonl(path: Path, data):
    """Write ``data`` to ``path`` as JSON Lines (one JSON document per line).

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced. Non-ASCII characters are written as-is, not escaped.

    Args:
        path: Destination file.
        data: Iterable of JSON-serialisable records.
    """
    with path.open("w", encoding="utf-8") as out:
        for record in data:
            out.write(json.dumps(record, ensure_ascii=False))
            out.write("\n")

# Persist both splits under PROC_DIR, then report how many records each got.
split_outputs = {"train.jsonl": train_recs, "test.jsonl": test_recs}
for file_name, recs in split_outputs.items():
    write_jsonl(PROC_DIR / file_name, recs)

print("train.jsonl ->", len(train_recs))
print("test.jsonl  ->", len(test_recs))

train.jsonl -> 904
test.jsonl  -> 226


In [18]:
## check
# Sanity check: pretty-print the first training record whose Chinese side is
# non-empty. This reuses PROC_DIR and the modules from the notebook's
# import/config cells; it no longer re-imports them or redefines the output
# path, so the check always reads the file the pipeline just wrote.
with (PROC_DIR / "train.jsonl").open(encoding="utf-8") as fh:
    for ln in fh:
        rec = json.loads(ln)
        if rec["zh"]:
            sample = textwrap.indent(json.dumps(rec, ensure_ascii=False, indent=2), "  ")
            print(sample)
            break

  {
    "zh": "徒此揖清芬",
    "en": "We can but breathe your fragrance the wind brings down.",
    "gloss": ""
  }
