# 2_rag.ipynb

Jennifer Xu (Jennifer.Xu.26@dartmouth.edu)

This retrieval-augmented generation pipeline takes train.jsonl and the Chinese dictionaries as input. It combines the dictionaries into a single lookup table, then uses Jieba tokenization to scan each poem line and retrieve English glosses. The resulting dataset is saved as train_gloss.jsonl.

In [1]:
from pathlib import Path
import urllib.request, gzip, shutil, os, json, re, jieba, unicodedata, random, textwrap

RAW_DIR = Path("data/raw")
PROC_DIR = Path("data/proc")
RAW_DIR.mkdir(parents=True, exist_ok=True)

# Build {word: short_gloss} dictionary

In [2]:
from collections import defaultdict

# load CC-CEDICT
cedict = {}
with open("data/raw/cedict_ts.u8", encoding="utf-8") as f:
    for ln in f:
        if ln.startswith("#"): continue
        parts = ln.strip().split(" ", 2)
        if len(parts) != 3: continue
        trad, simp, rest = parts
        gloss = rest.split("/")[1] if "/" in rest else ""
        if simp and gloss:
            cedict.setdefault(simp, gloss)

# merge idioms & expressions
for fname, key in [("idiom.json", "word"), ("ci.json", "ci")]:
    for obj in json.loads((RAW_DIR / fname).read_text(encoding="utf-8")):
        word  = obj[key]
        gloss = obj.get("explanation", obj.get("derivation", ""))[:60]
        if word not in cedict and gloss:
            cedict[word] = gloss

print(f"Combined dictionary size: {len(cedict):,}")

Combined dictionary size: 359,040


# Regex

In [3]:
# 4-char idioms
IDIOM_RE = re.compile(r"[\u4e00-\u9fff]{4}")
# none = match all
MAX_GLOSSES = 3

def collect_glosses(zh: str) -> str:
    zh = unicodedata.normalize("NFKC", zh)
    found, seen = [], set()

    for ido in IDIOM_RE.findall(zh):
        if ido in cedict:
            found.append(cedict[ido]);  seen.add(ido)

    for tok in jieba.cut(zh):
        if tok in cedict and tok not in seen:
            found.append(cedict[tok]);  seen.add(tok)

    if MAX_GLOSSES is not None:
        found = found[:MAX_GLOSSES]

    return "; ".join(found)

# Attach glosses to train/test

In [4]:
def augment_file(in_path, out_path):
    total = hit = 0
    with open(in_path, encoding="utf-8") as fin, \
         open(out_path, "w", encoding="utf-8") as fout:
        for ln in fin:
            rec = json.loads(ln)
            if rec["zh"]:
                g = collect_glosses(rec["zh"])
                if g:
                    hit += 1
                rec["gloss"] = g
            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
            total += 1
    pct = hit / total * 100
    print(f"{in_path.name}: {hit}/{total} lines glossed ({pct:.1f} %)")

In [5]:
augment_file(PROC_DIR / "train.jsonl", PROC_DIR / "train_gloss.jsonl")

Building prefix dict from C:\Users\bb\anaconda3\Lib\site-packages\jieba\dict.txt ...
Loading model from cache C:\Users\bb\AppData\Local\Temp\jieba.cache
Loading model cost 0.3609955310821533 seconds.
Prefix dict has been built succesfully.


train.jsonl: 834/904 lines glossed (92.3 %)


In [6]:
## check
import itertools, textwrap, json
with open(PROC_DIR / "train_gloss.jsonl", encoding="utf-8") as f:
    for ln in f:
        rec = json.loads(ln)
        if rec["gloss"]:
            print(textwrap.indent(json.dumps(rec, ensure_ascii=False, indent=2), "  "))
            break

  {
    "zh": "徒此揖清芬",
    "en": "We can but breathe your fragrance the wind brings down.",
    "gloss": "to greet by raising clasped hands; 1.清香。 \n2.喻高洁的德行。"
  }
