# 0_setup.ipynb

Jennifer Xu (Jennifer.Xu.26@dartmouth.edu)

This notebook downloads the raw data used by the rest of the project into `data/raw`: Chinese-English poem pairs for training and testing translation, English poetry for training style, and Chinese glosses (CC-CEDICT) together with archaic poetic vocabulary for retrieval-augmented generation.

In [None]:
!pip install --upgrade "transformers>=4.40" peft faiss-cpu datasets evaluate sentencepiece tiktoken openai bitsandbytes accelerate tqdm

In [2]:
import urllib.request, requests, zipfile, gzip, shutil, json, random, pandas as pd
from pathlib import Path

# Every raw download in this notebook is stored beneath data/raw, relative to
# the current working directory; create it (and its parent) when absent.
root = Path("data", "raw")
root.mkdir(exist_ok=True, parents=True)

In [3]:
# Chinese-English poem pairs
# Archive of the master branch of the Chinese-Poetry-Bilingual repository.
zip_url = ("https://github.com/Leslie-Wong-H/Chinese-Poetry-Bilingual/"
           "archive/refs/heads/master.zip")
zip_path = root / "ch_poetry_bi.zip"
if not zip_path.exists():
    # Download under a scratch name and rename only after the transfer
    # finishes, so an interrupted download never leaves a truncated archive
    # that the exists() check above would accept on the next run.
    partial_path = zip_path.with_name(zip_path.name + ".part")
    urllib.request.urlretrieve(zip_url, partial_path)
    partial_path.replace(zip_path)
    print("Downloaded bilingual zip.")

# Extraction is repeated on every run; existing files are simply overwritten.
with zipfile.ZipFile(zip_path) as z:
    z.extractall(root / "cn_en_tmp")
print("Unzipped bilingual corpus.")

Downloaded bilingual zip.
Unzipped bilingual corpus.


In [4]:
## English poetry
# The corpus is gzip-compressed NDJSON: one JSON object per text line.
gp_gz  = root / "gutenberg_poetry.ndjson.gz"
url = "http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"

if not gp_gz.exists():
    # Download under a scratch name and rename only after the transfer
    # finishes, so an interrupted download is never mistaken for a complete
    # file by the exists() check on a later run.
    partial_path = gp_gz.with_name(gp_gz.name + ".part")
    urllib.request.urlretrieve(url, partial_path)
    partial_path.replace(gp_gz)
    print("Downloaded Gutenberg.")

# sample 5k lines
sample_n = 5000
sample_seed = 42  # fixed seed so a re-run writes the same CSV
lines = []
with gzip.open(gp_gz, "rt", encoding="utf-8") as f:
    for ln in f:
        # Only the value stored under the "s" key is kept from each record.
        lines.append(json.loads(ln)["s"])

# A private Random instance makes the draw repeatable without reseeding the
# module-level generator that other code may share.
sample = random.Random(sample_seed).sample(lines, sample_n)
pd.DataFrame({"text": sample}).to_csv(root / "gutenberg_en.csv",
                                      index=False, encoding="utf-8")
print(f"data/raw/gutenberg_en.csv written ({sample_n} rows)")

Downloaded Gutenberg.
data/raw/gutenberg_en.csv written (5000 rows)


In [5]:
## CC-CEDICT
url = "https://www.mdbg.net/chinese/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz"
gz_path = Path("data/raw/cedict_ts.u8.gz")
txt_path = Path("data/raw/cedict_ts.u8")

gz_path.parent.mkdir(parents=True, exist_ok=True)

# Reuse an archive fetched by an earlier run, matching the other download
# cells; delete the .gz file to pull a fresh copy of the dictionary.
if not gz_path.exists():
    # Scratch name + rename keeps a half-finished transfer from being cached.
    partial_path = gz_path.with_name(gz_path.name + ".part")
    urllib.request.urlretrieve(url, partial_path)
    partial_path.replace(gz_path)
    print("CC-CEDICT downloaded.")

# decompress (always redone so the text file matches the archive on disk)
with gzip.open(gz_path, "rb") as f_in, open(txt_path, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

CC-CEDICT downloaded.


In [6]:
## Archaic-poetry add-ons (成语 "idioms" + 词语 "words")
def github_raw(path):
    """Fetch a file from raw.githubusercontent.com and return its body as text.

    Args:
        path: Location appended to the raw.githubusercontent.com host, e.g.
            "<owner>/<repo>/<branch>/<path/to/file>".

    Returns:
        The response body decoded to ``str`` by ``requests``.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
        requests.RequestException: On connection problems or when the
            30-second timeout expires.
    """
    raw_url = f"https://raw.githubusercontent.com/{path}"
    resp = requests.get(raw_url, timeout=30)
    # Fail loudly on HTTP errors; otherwise the body of an error page (for
    # example "404: Not Found") would be returned as if it were the file.
    resp.raise_for_status()
    return resp.text

# Fetch each JSON file from the pwxcoo/chinese-xinhua repository once; a file
# that is already on disk is left untouched.
raw_dir = Path("data/raw")
for _label, fname in (("idiom", "idiom.json"), ("ci", "ci.json")):
    dest = raw_dir / fname
    if dest.exists():
        continue
    dest.write_text(
        github_raw(f"pwxcoo/chinese-xinhua/master/data/{fname}"),
        encoding="utf-8",
    )
print("Dictionary downloaded.")

Dictionary downloaded.
