# 3_gpt4o_explain.ipynb

Jennifer Xu (Jennifer.Xu.26@dartmouth.edu)

This code reads train_gloss.jsonl as input and queries GPT-4o for a concise English explanation of each Chinese line. It outputs the explanations in explanations.sqlite, pairing each Chinese line with its English explanation in JSONL format. This file will be used later in the LoRA training stage.

# Configuration

In [2]:
# setup API
import os, openai

os.environ["OPENAI_API_KEY"] = "sk-XXX"

client = openai.OpenAI()
print("Key detected?  ->", bool(client.api_key))

Key detected?  -> True


In [3]:
from pathlib import Path
import sqlite3, hashlib, textwrap, json, time
from tqdm.auto import tqdm

DATA_DIR   = Path("data/proc")
TRAIN_FILE = DATA_DIR / "train_gloss.jsonl"

In [4]:
# cache
CACHE_DB   = DATA_DIR / "explanations.sqlite"
conn = sqlite3.connect(CACHE_DB)
cur  = conn.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS explain (
    id          TEXT PRIMARY KEY,
    zh          TEXT,
    gloss       TEXT,
    explanation TEXT
)
""")
conn.commit()

# Helper function: fetch & store explanation

In [5]:
# %% [code]
def row_id(zh: str, gloss: str) -> str:
    return hashlib.sha1(f"{zh}||{gloss}".encode("utf-8")).hexdigest()

def gpt_explain(zh: str, gloss: str) -> dict:
    # return GPT-4o explanation JSON
    rid = row_id(zh, gloss)
    row = cur.execute("SELECT explanation FROM explain WHERE id=?", (rid,)).fetchone()
    if row:
        return json.loads(row[0])

    # build prompt
    prompt = textwrap.dedent(f"""
    You are a helpful scholar of classical Chinese poetry.
    For the line below, return a *short* JSON with keys:
      literal, imagery, cultural, tone.
    <line>{zh}</line>
    <gloss>{gloss}</gloss>
    """).strip()

    # call GPT-4o
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=90,
    )
    content = resp.choices[0].message.content.strip()

    # parse to JSON
    try:
        expl = json.loads(content)
    except json.JSONDecodeError:
        expl = {"literal": content}

    # store in cache
    cur.execute(
        "INSERT OR REPLACE INTO explain VALUES (?,?,?,?)",
        (rid, zh, gloss, json.dumps(expl, ensure_ascii=False))
    )
    conn.commit()
    return expl

# Iterate through training data

In [6]:
SLEEP_SEC = 1.2
FILES = [TRAIN_FILE]

for fp in FILES:
    total = hit = 0
    with open(fp, encoding="utf-8") as f:
        for ln in tqdm(f, desc=f"GPT-4o explain  →  {fp.name}"):
            rec = json.loads(ln)
            # skip English-only rows
            if not rec["zh"]:
                continue
            # only query lines that have a gloss
            if rec["gloss"] == "":
                continue

            gpt_explain(rec["zh"], rec["gloss"])
            time.sleep(SLEEP_SEC)
            hit += 1
    print(f"{fp.name}: {hit} lines processed.")

GPT-4o explain  →  train_gloss.jsonl: 0it [00:00, ?it/s]

train_gloss.jsonl: 834 lines processed.


In [7]:
## check
row = cur.execute("SELECT zh, gloss, explanation FROM explain LIMIT 1").fetchone()
print("Line :", row[0])
print("Gloss:", row[1])
print("JSON :", json.dumps(json.loads(row[2]), indent=2, ensure_ascii=False))

Line : 徒此揖清芬
Gloss: to greet by raising clasped hands; 1.清香。 
2.喻高洁的德行。
JSON : {
  "literal": "```json\n{\n  \"literal\": \"to greet by raising clasped hands; fragrant and pure\",\n  \"imagery\": \"the act of greeting suggests reverence, while '清芬' evokes the scent of purity and elegance\",\n  \"cultural\": \"the gesture of clasped hands signifies respect and humility in Chinese culture, while '清芬' symbolizes noble character\",\n  \"tone\": \"elevated and respectful, conveying admiration"
}
