<a href="https://colab.research.google.com/github/leosammallahti/AnalysisCoLab/blob/main/Chinese_Philosophy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1 — Setup & Load
!pip -q install --upgrade pip
!pip -q install "spacy==3.8.2" "pandas==2.2.2" "matplotlib==3.9.0" "nltk==3.9.1" "wordfreq==3.1.1" "gradio>=4.41,<5" "seaborn==0.13.2" "openai>=1.30.0"
!python -m spacy download en_core_web_sm

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, pandas as pd

ROOT = "/content/drive/MyDrive/Chinese Philosophers"
TARGET = f"{ROOT}/chinese_philosophers_quotes_corrected.csv"
os.makedirs(ROOT, exist_ok=True)

# Load and normalize
df = pd.read_csv(TARGET)
df.columns = [c.strip().lower() for c in df.columns]

# Ensure a unified text column
for cand in ["text","quote","translation","english","content","verse_text","line_text"]:
    if cand in df.columns:
        if cand != "text":
            df = df.rename(columns={cand: "text"})
        break
assert "text" in df.columns, "Input needs a text/quote column."

# Add row_id if missing
if "row_id" not in df.columns:
    df.insert(0, "row_id", range(len(df)))

print(f"Loaded {len(df):,} rows; columns: {list(df.columns)}")
df.head(2)




Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m104.1 MB/s[0m  [33m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Mounted at /content/drive
Loaded 1,162 rows; columns: ['row_id', 'philosopher', 'work', 'chapter_verse', 'text', 'source']


Unnamed: 0,row_id,philosopher,work,chapter_verse,text,source
0,0,Confucius,Analects,1:1,"""The Master said, ‘Is it not a pleasure, havin...",Confucius (Analects)
1,1,Confucius,Analects,1:2,"""Tzu said, ‘It is rare for a man whose charact...",Confucius (Analects)


In [None]:
# Cell 2 — Metaphor Detection (spaCy + heuristics)
import importlib, pandas as pd, os
import nltk; nltk.download('wordnet'); nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
from wordfreq import zipf_frequency
import spacy

def load_nlp():
    try:
        return spacy.load("en_core_web_sm")
    except Exception:
        en_model = importlib.import_module("en_core_web_sm")
        return en_model.load()

nlp = load_nlp()

ROOT = "/content/drive/MyDrive/Chinese Philosophers"

CONCRETE_LEX = {"noun.artifact","noun.object","noun.substance","noun.body","noun.phenomenon"}
ABSTRACT_LEX  = {"noun.attribute","noun.cognition","noun.communication","noun.group",
                 "noun.motive","noun.act","noun.state","noun.time"}

def is_concrete_lemma(lemma:str):
    syns = wn.synsets(lemma)
    if any(s.lexname() in CONCRETE_LEX for s in syns):
        return True
    if any(s.lexname() in ABSTRACT_LEX for s in syns):
        return False
    return zipf_frequency(lemma, 'en') >= 4.5

def candidates_from_doc(doc, base_row_id):
    rows, local_idx = [], 0
    for sent in doc.sents:
        for tok in sent:
            reason = None
            if tok.pos_ in {"VERB","ADJ"}:
                nouns = [c for c in tok.children if c.pos_=="NOUN"]
                if tok.head.pos_=="NOUN":
                    nouns.append(tok.head)
                for n in nouns:
                    if is_concrete_lemma(tok.lemma_.lower()) and not is_concrete_lemma(n.lemma_.lower()):
                        reason = f"CCO: {tok.lemma_} (concrete) → {n.lemma_} (abstract)"
                        break
            if not reason and tok.dep_=="attr" and tok.head.pos_=="NOUN" and tok.pos_=="NOUN":
                head = tok.head
                if is_concrete_lemma(tok.lemma_.lower()) and not is_concrete_lemma(head.lemma_.lower()):
                    reason = f"Copula-CCO: {tok.lemma_} (concrete) ≈ {head.lemma_} (abstract)"
            uid = f"{base_row_id}_{local_idx}"
            rows.append({
                "uid": uid,
                "row_id": base_row_id,
                "sent_text": sent.text,
                "token": tok.text,
                "lemma": tok.lemma_,
                "pos": tok.pos_,
                "dep": tok.dep_,
                "is_metaphor_candidate": 1 if reason else 0,
                "reason": reason
            })
            local_idx += 1
    return rows

# Expect that 01_setup_and_load.py already defined df in the notebook session
# If pasting into a fresh cell, run Cell 1 first to create df
texts = df["text"].astype(str).tolist()
out_rows = []
for i, doc in enumerate(nlp.pipe(texts, batch_size=256, n_process=2)):
    out_rows.extend(candidates_from_doc(doc, base_row_id=int(df.iloc[i]["row_id"])))

meta_df = pd.DataFrame(out_rows)

# Join optional metadata (e.g., philosopher) for convenience
# Select columns from df excluding 'text' and 'row_id' to avoid duplicates
metadata_cols = [c for c in df.columns if c not in {"text", "row_id"}]
if metadata_cols:
    meta_df = meta_df.merge(df[["row_id"] + metadata_cols], on="row_id", how="left")

enriched_full = os.path.join(ROOT, "metaphor_enriched_full.csv")
enriched_flagged = os.path.join(ROOT, "metaphor_enriched.csv")
meta_df.to_csv(enriched_full, index=False)
meta_df[meta_df.is_metaphor_candidate==1].to_csv(enriched_flagged, index=False)
print("Saved:\n ", enriched_flagged, "\n ", enriched_full)
meta_df.head(3)




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Saved:
  /content/drive/MyDrive/Chinese Philosophers/metaphor_enriched.csv 
  /content/drive/MyDrive/Chinese Philosophers/metaphor_enriched_full.csv


Unnamed: 0,uid,row_id,sent_text,token,lemma,pos,dep,is_metaphor_candidate,reason,philosopher,work,chapter_verse,source
0,0_0,0,"""The Master said, ‘Is it not a pleasure, havin...","""","""",PUNCT,punct,0,,Confucius,Analects,1:1,Confucius (Analects)
1,0_1,0,"""The Master said, ‘Is it not a pleasure, havin...",The,the,DET,det,0,,Confucius,Analects,1:1,Confucius (Analects)
2,0_2,0,"""The Master said, ‘Is it not a pleasure, havin...",Master,Master,PROPN,nsubj,0,,Confucius,Analects,1:1,Confucius (Analects)


In [None]:
# Cell 3 — QA & Sampling artifacts
import pandas as pd, os

ROOT = "/content/drive/MyDrive/Chinese Philosophers"
enriched_flagged = os.path.join(ROOT, "metaphor_enriched.csv")
m = pd.read_csv(enriched_flagged)

# Summary by philosopher/reason if columns exist
summary_cols = [c for c in ["philosopher","reason"] if c in m.columns]
qa_summary = None
if summary_cols:
    qa_summary = m.groupby(summary_cols).size().reset_index(name="count")
else:
    qa_summary = m.groupby(["reason"]).size().reset_index(name="count")

qa_summary_path = os.path.join(ROOT, "metaphor_QA_summary.csv")
qa_summary.to_csv(qa_summary_path, index=False)

# Deduplicate candidates by row_id + reason (first instance)
dedup = m.sort_values(["row_id","uid"]).drop_duplicates(subset=["row_id","reason"], keep="first")
dedup_path = os.path.join(ROOT, "metaphor_candidates_dedup.csv")
dedup.to_csv(dedup_path, index=False)

# Label queue with minimal fields
queue_cols = [c for c in ["uid","row_id","philosopher","sent_text","token","lemma","reason"] if c in dedup.columns]
label_queue = dedup[queue_cols].copy()
label_queue_path = os.path.join(ROOT, "metaphor_label_queue.csv")
label_queue.to_csv(label_queue_path, index=False)

print("Saved:\n ", qa_summary_path, "\n ", dedup_path, "\n ", label_queue_path)
label_queue.head(3)




Saved:
  /content/drive/MyDrive/Chinese Philosophers/metaphor_QA_summary.csv 
  /content/drive/MyDrive/Chinese Philosophers/metaphor_candidates_dedup.csv 
  /content/drive/MyDrive/Chinese Philosophers/metaphor_label_queue.csv


Unnamed: 0,uid,row_id,philosopher,sent_text,token,lemma,reason
0,0_30,0,Confucius,Is it not a joy to have friends come from afar?,have,have,CCO: have (concrete) → joy (abstract)
1,0_48,0,Confucius,Is it not gentlemanly not to take offence when...,appreciate,appreciate,CCO: appreciate (concrete) → ability (abstract)
2,1_29,1,Confucius,"""Tzu said, ‘It is rare for a man whose charact...",have,have,CCO: have (concrete) → inclination (abstract)


In [None]:
# Cell 4 — Labeling UI (Gradio)
import os, pandas as pd, gradio as gr, datetime

ROOT = "/content/drive/MyDrive/Chinese Philosophers"
label_queue_path = os.path.join(ROOT, "metaphor_label_queue.csv")
labels_path = os.path.join(ROOT, "metaphor_labels.csv")
queue_df = pd.read_csv(label_queue_path)

if not os.path.exists(labels_path):
    pd.DataFrame(columns=["uid","row_id","metaphoricity_0to3","source_domain","target_domain","notes","timestamp"]).to_csv(labels_path, index=False)

def get_item(idx:int):
    idx = max(0, min(int(idx), len(queue_df)-1))
    row = queue_df.iloc[idx]
    header_parts = []
    if "philosopher" in row:
        header_parts.append(f"Philosopher: {row['philosopher']}")
    header = " | ".join(header_parts) if header_parts else "Item"
    return (
        int(idx),
        row["uid"],
        row["row_id"],
        header,
        row.get("sent_text",""),
        f"Token: {row.get('token','')} | Lemma: {row.get('lemma','')}",
        row.get("reason","")
    )

def submit_label(uid, row_id, metaphoricity, source_domain, target_domain, notes):
    labels = pd.read_csv(labels_path)
    timestamp = datetime.datetime.utcnow().isoformat()
    entry = {
        "uid": uid,
        "row_id": int(row_id),
        "metaphoricity_0to3": int(metaphoricity),
        "source_domain": source_domain,
        "target_domain": target_domain,
        "notes": notes,
        "timestamp": timestamp
    }
    # upsert by uid
    if (labels["uid"] == uid).any():
        labels.loc[labels["uid"] == uid, list(entry.keys())] = list(entry.values())
    else:
        labels = pd.concat([labels, pd.DataFrame([entry])], ignore_index=True)
    labels.to_csv(labels_path, index=False)
    return f"Saved label for uid={uid}"

with gr.Blocks(title="Metaphor Labeling") as demo:
    gr.Markdown("# Metaphor Labeling")
    with gr.Row():
        idx = gr.Slider(0, max(0, len(queue_df)-1), value=0, step=1, label="Index")
    with gr.Row():
        uid = gr.Textbox(label="uid", interactive=False)
        row_id = gr.Number(label="row_id", interactive=False, precision=0)
    meta_header = gr.Markdown()
    sent = gr.Textbox(label="Sentence", lines=4)
    token_info = gr.Textbox(label="Token Info", lines=1)
    reason = gr.Textbox(label="Heuristic Reason", lines=2)
    metaphoricity = gr.Slider(0,3,step=1,value=1,label="Metaphoricity (0–3)")
    source_domain = gr.Textbox(label="Source Domain")
    target_domain = gr.Textbox(label="Target Domain")
    notes = gr.Textbox(label="Notes", lines=2)
    save_btn = gr.Button("Save Label")
    msg = gr.Markdown()

    def refresh(i):
        return get_item(i)

    idx.change(fn=refresh, inputs=[idx], outputs=[idx, uid, row_id, meta_header, sent, token_info, reason])
    save_btn.click(fn=submit_label, inputs=[uid, row_id, metaphoricity, source_domain, target_domain, notes], outputs=[msg])

demo.launch(share=False, show_error=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>



In [None]:
# Cell 5 — CMI Analysis (verb–noun association scoring)
import os, pandas as pd, spacy, math
from collections import Counter
from nltk.corpus import wordnet as wn
from wordfreq import zipf_frequency

ROOT = "/content/drive/MyDrive/Chinese Philosophers"

# reuse is_concrete_lemma from earlier cell if in memory; otherwise define
def is_concrete_lemma(lemma:str):
    syns = wn.synsets(lemma)
    if any(s.lexname() in {"noun.artifact","noun.object","noun.substance","noun.body","noun.phenomenon"} for s in syns):
        return True
    if any(s.lexname() in {"noun.attribute","noun.cognition","noun.communication","noun.group","noun.motive","noun.act","noun.state","noun.time"} for s in syns):
        return False
    return zipf_frequency(lemma, 'en') >= 4.5

nlp = spacy.load("en_core_web_sm")

texts = df["text"].astype(str).tolist()
verb_counts, noun_counts, pair_counts = Counter(), Counter(), Counter()
sent_store = []

for i, doc in enumerate(nlp.pipe(texts, batch_size=256, n_process=2)):
    row_id = int(df.iloc[i]["row_id"])
    for sent in doc.sents:
        verbs = [t.lemma_.lower() for t in sent if t.pos_=="VERB"]
        nouns = [t.lemma_.lower() for t in sent if t.pos_=="NOUN"]
        for v in verbs:
            verb_counts[v] += 1
        for n in nouns:
            noun_counts[n] += 1
        for v in verbs:
            for n in nouns:
                pair_counts[(v,n)] += 1
                if len(sent_store) < 100000:
                    sent_store.append({"row_id": row_id, "verb": v, "noun": n, "sentence": sent.text})

total_pairs = sum(pair_counts.values())

def pmi(v,n):
    # add-1 smoothing
    pv = (verb_counts[v] + 1) / (sum(verb_counts.values()) + len(verb_counts))
    pn = (noun_counts[n] + 1) / (sum(noun_counts.values()) + len(noun_counts))
    pvn = (pair_counts[(v,n)] + 1) / (total_pairs + len(pair_counts))
    return math.log2(pvn / (pv * pn))

rows = []
for (v,n), c in pair_counts.items():
    if c < 3:
        continue
    concrete_v = is_concrete_lemma(v)
    abstract_n = not is_concrete_lemma(n)
    if concrete_v and abstract_n:
        rows.append({
            "verb": v, "noun": n, "count": c,
            "pmi": round(pmi(v,n), 4),
            "verb_concrete": concrete_v, "noun_abstract": abstract_n
        })

cmi_df = pd.DataFrame(rows).sort_values(["pmi","count"], ascending=[False, False])
cmi_path = os.path.join(ROOT, "cmi_candidates.csv")
cmi_df.to_csv(cmi_path, index=False)

# Evidence samples
evidence = pd.DataFrame(sent_store)
evidence = evidence.merge(cmi_df[["verb","noun"]].drop_duplicates(), on=["verb","noun"], how="inner")
evidence_samples = evidence.groupby(["verb","noun"]).head(5).reset_index(drop=True)
evidence_path = os.path.join(ROOT, "cmi_evidence_samples.csv")
evidence_samples.to_csv(evidence_path, index=False)

print("Saved:\n ", cmi_path, "\n ", evidence_path)
cmi_df.head(10)




Saved:
  /content/drive/MyDrive/Chinese Philosophers/cmi_candidates.csv 
  /content/drive/MyDrive/Chinese Philosophers/cmi_evidence_samples.csv


Unnamed: 0,verb,noun,count,pmi,verb_concrete,noun_abstract
1330,tail,deer,4,10.9611,True,True
1331,tail,insect,4,10.9611,True,True
1654,render,caste,3,10.4168,True,True
1656,render,bureaucrat,3,10.4168,True,True
1657,render,litterateur,3,10.4168,True,True
1648,render,elite,3,9.8318,True,True
211,refuse,conjecture,5,9.7647,True,True
413,cross,hi,3,9.7647,True,True
416,cross,drummer,3,9.7647,True,True
1647,render,shi,6,9.6392,True,True


In [None]:
# Cell 6 — Heatmaps
import os, pandas as pd, seaborn as sns, matplotlib.pyplot as plt

ROOT = "/content/drive/MyDrive/Chinese Philosophers"

flagged_path = os.path.join(ROOT, "metaphor_enriched.csv")
m = pd.read_csv(flagged_path)

# Count heatmap (philosopher x reason) if philosopher exists
if "philosopher" in m.columns:
    pivot_counts = m.pivot_table(index="philosopher", columns="reason", values="uid", aggfunc="count", fill_value=0)
    plt.figure(figsize=(min(20, 1 + 0.6*pivot_counts.shape[1]), 6))
    sns.heatmap(pivot_counts, annot=False, cmap="Blues")
    counts_path = os.path.join(ROOT, "metaphor_heatmap_counts.png")
    plt.tight_layout(); plt.savefig(counts_path, dpi=150); plt.close()

    # Row-normalized heatmap
    pivot_norm = pivot_counts.div(pivot_counts.sum(axis=1).replace(0,1), axis=0)
    plt.figure(figsize=(min(20, 1 + 0.6*pivot_norm.shape[1]), 6))
    sns.heatmap(pivot_norm, annot=False, cmap="YlOrRd")
    norm_path = os.path.join(ROOT, "metaphor_heatmap_normalized.png")
    plt.tight_layout(); plt.savefig(norm_path, dpi=150); plt.close()

    print("Saved:\n ", counts_path, "\n ", norm_path)
else:
    print("Column 'philosopher' not found; skipping philosopher heatmaps.")

# Optional: placeholder for by-topic heatmaps if topic files present
topic_csv_candidates = [
    os.path.join(ROOT, "mozi_topics_overview.csv"),
    os.path.join(ROOT, "mozi_topics_overview_refined.csv")
]
for tpath in topic_csv_candidates:
    if os.path.exists(tpath):
        print(f"Found topic file: {tpath} (add custom by-topic heatmaps as needed)")
        break




Saved:
  /content/drive/MyDrive/Chinese Philosophers/metaphor_heatmap_counts.png 
  /content/drive/MyDrive/Chinese Philosophers/metaphor_heatmap_normalized.png


In [None]:
# Cell 7 — Paraphrasing (LLM, optional; requires OPENAI_API_KEY)
import os, pandas as pd

ROOT = "/content/drive/MyDrive/Chinese Philosophers"

flagged_path = os.path.join(ROOT, "metaphor_enriched.csv")
m = pd.read_csv(flagged_path)
m = m[m["is_metaphor_candidate"]==1].drop_duplicates(subset=["row_id","sent_text"]).reset_index(drop=True)

api_key = os.environ.get("OPENAI_API_KEY", "").strip()
out_path = os.path.join(ROOT, "paraphrase_outputs.csv")

if not api_key:
    print("OPENAI_API_KEY not set; skipping paraphrasing.")
else:
    try:
        from openai import OpenAI
        client = OpenAI()

        def paraphrase(sentence:str):
            prompt = (
                "Rewrite the sentence in a literal, non-metaphorical way. "
                "Keep meaning; avoid figurative language. Sentence: " + sentence
            )
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role":"user","content":prompt}],
                temperature=0.2,
            )
            return resp.choices[0].message.content.strip()

        results = []
        limit = min(len(m), 200)
        for i in range(limit):
            s = str(m.iloc[i]["sent_text"])
            try:
                p = paraphrase(s)
            except Exception as e:
                p = f"[ERROR] {e}"
            results.append({
                "row_id": int(m.iloc[i]["row_id"]),
                "uid": m.iloc[i]["uid"],
                "original_sentence": s,
                "paraphrase_literal": p
            })
        pd.DataFrame(results).to_csv(out_path, index=False)
        print("Saved:", out_path)
    except Exception as e:
        print("Paraphrasing failed:", e)




OPENAI_API_KEY not set; skipping paraphrasing.


In [6]:
# Cell 7 — Philosopher Quote Paraphrasing (3 Top Models)
# Requires TOGETHER_API_KEY environment variable
import os, pandas as pd
import time

ROOT = "/content/drive/MyDrive/Chinese Philosophers"

# Load original philosopher quotes from the main file
quotes_path = os.path.join(ROOT, "chinese_philosophers_quotes_corrected.csv")
m = pd.read_csv(quotes_path)
m.columns = [c.strip().lower() for c in m.columns]

# Ensure we have the right text column
for cand in ["text","quote","translation","english","content","verse_text","line_text"]:
    if cand in m.columns:
        if cand != "text":
            m = m.rename(columns={cand: "text"})
        break

# Filter out very short quotes (less than 50 characters) and ensure we have meaningful content
m = m[m["text"].str.len() >= 50].reset_index(drop=True)
print(f"Loaded {len(m)} quotes from the original file")

# Try to get API key from Colab secrets first, then environment
try:
    from google.colab import userdata
    api_key = userdata.get("TOGETHER_API_KEY")
except:
    api_key = os.environ.get("TOGETHER_API_KEY", "").strip()
out_path = os.path.join(ROOT, "philosopher_quotes_paraphrased_test_5.csv")

# Top 3 models for paraphrasing
MODELS = {
    "DeepSeek_R1_70B": "deepseek-ai/DeepSeek-R1",
    "Qwen_2.5_72B": "Qwen/Qwen2.5-72B-Instruct-Turbo",
    "Llama_3.1_70B": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
}

if not api_key:
    print("TOGETHER_API_KEY not set; skipping paraphrasing.")
else:
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

        def paraphrase_quote(quote: str, model_id: str):
            prompt = f"Rewrite this philosophical quote in simple, literal language while preserving its core meaning. Remove metaphors and make it direct and concrete:\n\n{quote}"

            resp = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.2,
                max_tokens=200
            )
            return resp.choices[0].message.content.strip()

        # Randomly sample 5 quotes for testing
        import random
        random.seed(42)  # For reproducible results
        sample_indices = random.sample(range(len(m)), min(5, len(m)))

        results = []
        limit = len(sample_indices)

        print(f"Testing with {limit} randomly selected philosopher quotes...")
        print("Processing quotes and generating paraphrases...")

        for idx, i in enumerate(sample_indices):
            original_quote = str(m.iloc[i]["text"])
            print(f"Processing quote {idx+1}/{limit}: {original_quote[:50]}...")

            # Create row with original quote
            row = {
                "original_quote": original_quote,
                "philosopher": m.iloc[i].get("philosopher", "Unknown"),
                "work": m.iloc[i].get("work", "Unknown"),
                "chapter_verse": m.iloc[i].get("chapter_verse", ""),
                "row_id": i
            }

            # Get paraphrase from each model
            for model_name, model_id in MODELS.items():
                try:
                    paraphrase = paraphrase_quote(original_quote, model_id)
                    row[f"paraphrase_{model_name}"] = paraphrase
                    print(f"  ✓ {model_name}")
                    time.sleep(0.8)  # Rate limiting
                except Exception as e:
                    row[f"paraphrase_{model_name}"] = f"[ERROR: {str(e)[:50]}]"
                    print(f"  ✗ {model_name} failed")

            results.append(row)
            print()

        # Save results
        df = pd.DataFrame(results)
        df.to_csv(out_path, index=False)

        print(f"✅ Completed! Saved to: {out_path}")
        print(f"📊 Generated {len(results)} quote comparisons")

    except Exception as e:
        print(f"Paraphrasing failed: {e}")

# Display results from CSV (this runs outside the try-except block)
try:
    import pandas as pd
    df = pd.read_csv(out_path)

    print("\n" + "="*100)
    print("🎯 PHILOSOPHER QUOTE PARAPHRASING RESULTS")
    print("="*100)

    for i, row in df.iterrows():
        print(f"\n📜 QUOTE {i+1}/{len(df)}:")
        print("=" * 90)
        print(f"🔸 ORIGINAL QUOTE:")
        print(f"   {row['original_quote']}")
        print(f"👤 PHILOSOPHER: {row.get('philosopher', 'Unknown')}")
        print(f"📚 WORK: {row.get('work', 'Unknown')}")
        if row.get('chapter_verse', ''):
            print(f"📖 CHAPTER/VERSE: {row.get('chapter_verse', '')}")
        print()

        print("🤖 AI PARAPHRASES:")
        print("-" * 90)
        print(f"🧠 DeepSeek R1 70B:")
        print(f"   {row['paraphrase_DeepSeek_R1_70B']}")
        print()
        print(f"🌟 Qwen 2.5 72B:")
        print(f"   {row['paraphrase_Qwen_2.5_72B']}")
        print()
        print(f"🦙 Llama 3.1 70B:")
        print(f"   {row['paraphrase_Llama_3.1_70B']}")
        print("\n" + "=" * 90)

except Exception as e:
    print(f"Could not display results: {e}")

# Display results from saved CSV (run this cell to show results anytime)
print("\n" + "🔄 To display results anytime, run this command:")
print("="*60)
print("""
import pandas as pd
df = pd.read_csv("/content/drive/MyDrive/Chinese Philosophers/philosopher_quotes_paraphrased_test.csv")

print("🎯 PHILOSOPHER QUOTE PARAPHRASING RESULTS")
print("="*100)

for i, row in df.iterrows():
    print(f"\\n📜 QUOTE {i+1}/{len(df)}:")
    print("-" * 80)
    print(f"🔸 ORIGINAL: {row['original_quote']}")
    print(f"👤 PHILOSOPHER: {row.get('philosopher', 'Unknown')}")
    print()
    print("🤖 AI PARAPHRASES:")
    print(f"   🧠 DeepSeek R1: {row['paraphrase_DeepSeek_R1_70B']}")
    print()
    print(f"   🌟 Qwen 2.5:    {row['paraphrase_Qwen_2.5_72B']}")
    print()
    print(f"   🦙 Llama 3.1:   {row['paraphrase_Llama_3.1_70B']}")
    print("\\n" + "-" * 80)
""")




Loaded 1142 quotes from the original file
Testing with 5 randomly selected philosopher quotes...
Processing quotes and generating paraphrases...
Processing quote 1/5: "The Master said, ‘It was after my return from Wei...
  ✓ DeepSeek_R1_70B
  ✓ Qwen_2.5_72B
  ✓ Llama_3.1_70B

Processing quote 2/5: "Someone asked about the theory of the Sti sacrifi...
  ✓ DeepSeek_R1_70B
  ✓ Qwen_2.5_72B
  ✓ Llama_3.1_70B

Processing quote 3/5: ‘In all these five things, the sage is restrained ...
  ✓ DeepSeek_R1_70B
  ✓ Qwen_2.5_72B
  ✓ Llama_3.1_70B

Processing quote 4/5: "Zu-hsia said, ‘In the three following situations ...
  ✓ DeepSeek_R1_70B
  ✓ Qwen_2.5_72B
  ✓ Llama_3.1_70B

Processing quote 5/5: "The Master said, 'Men are close to one another by...
  ✓ DeepSeek_R1_70B
  ✓ Qwen_2.5_72B
  ✓ Llama_3.1_70B

✅ Completed! Saved to: /content/drive/MyDrive/Chinese Philosophers/philosopher_quotes_paraphrased_test_5.csv
📊 Generated 5 quote comparisons

🎯 PHILOSOPHER QUOTE PARAPHRASING RESULTS

📜 QUOTE 1/5

In [8]:
"""
Data-driven Confucius vs Mozi comparison using existing artifacts only.

Outputs:
- compare_confucius_mozi.json  (headline_pairs + metrics)
- compare_confucius_mozi_table.csv (compact table for UI)

This script intentionally avoids generating new paraphrases. It derives
contrasts from:
- Base CSV (philosopher, text)
- metaphor_enriched.csv / metaphor_enriched_full.csv (reasons, tokens)
- metaphor_labels.csv (metaphoricity, source/target domains)
- cmi_candidates.csv + cmi_evidence_samples.csv (verb→noun motifs)

It also produces public-friendly names using template-based rules that map
measured differences to plain-English headline pairs.
"""

import os
import json
import math
import re
from collections import Counter, defaultdict
from typing import Dict, List, Tuple

import pandas as pd


# --- Configuration ---
ROOT = "/content/drive/MyDrive/Chinese Philosophers"
OUT_JSON = os.path.join(ROOT, "compare_confucius_mozi.json")
OUT_CSV = os.path.join(ROOT, "compare_confucius_mozi_table.csv")

BASE_CSV = os.path.join(ROOT, "chinese_philosophers_quotes_corrected.csv")
ENRICHED = os.path.join(ROOT, "metaphor_enriched.csv")
ENRICHED_FULL = os.path.join(ROOT, "metaphor_enriched_full.csv")
LABELS = os.path.join(ROOT, "metaphor_labels.csv")
CMI = os.path.join(ROOT, "cmi_candidates.csv")
CMI_EVID = os.path.join(ROOT, "cmi_evidence_samples.csv")


# --- Utilities ---
def load_csv(path: str) -> pd.DataFrame:
    if not os.path.exists(path):
        return pd.DataFrame()
    df = pd.read_csv(path)
    # Normalize columns to lowercase for easier joins
    df.columns = [c.strip().lower() for c in df.columns]
    return df


def normalize_text_column(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    for cand in ["text", "quote", "translation", "english", "content", "verse_text", "line_text"]:
        if cand in df.columns:
            if cand != "text":
                df = df.rename(columns={cand: "text"})
            break
    return df


def ensure_row_id(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        return df
    if "row_id" not in df.columns:
        df.insert(0, "row_id", range(len(df)))
    return df


def attach_philosopher(df_tokens: pd.DataFrame, df_base: pd.DataFrame) -> pd.DataFrame:
    if df_tokens.empty:
        return df_tokens
    if "philosopher" in df_tokens.columns:
        return df_tokens
    if "row_id" in df_tokens.columns and "row_id" in df_base.columns and "philosopher" in df_base.columns:
        return df_tokens.merge(df_base[["row_id", "philosopher"]], on="row_id", how="left")
    return df_tokens


def compute_reason_distribution(df_enriched: pd.DataFrame) -> pd.DataFrame:
    cols = [c for c in ["philosopher", "reason"] if c in df_enriched.columns]
    if set(cols) == {"reason"}:
        # No philosopher column; cannot compute
        return pd.DataFrame()
    if not cols or df_enriched.empty:
        return pd.DataFrame()
    g = (
        df_enriched
        .dropna(subset=["reason", "philosopher"])
        .groupby(["philosopher", "reason"]).size().reset_index(name="count")
    )
    return g


def compute_cmi_motifs(df_cmi: pd.DataFrame, df_evid: pd.DataFrame, df_base: pd.DataFrame) -> pd.DataFrame:
    if df_evid.empty:
        return pd.DataFrame()
    cols_needed = {"row_id", "verb", "noun"}
    if not cols_needed.issubset(df_evid.columns):
        return pd.DataFrame()
    evid = df_evid.copy()
    evid = evid.merge(df_base[["row_id", "philosopher"]], on="row_id", how="left")
    cnt = (
        evid
        .dropna(subset=["philosopher", "verb", "noun"])
        .groupby(["philosopher", "verb", "noun"]).size().reset_index(name="count")
    )
    if not df_cmi.empty and {"verb", "noun", "pmi"}.issubset(df_cmi.columns):
        cnt = cnt.merge(df_cmi[["verb", "noun", "pmi"]], on=["verb", "noun"], how="left")
    return cnt.sort_values(["count", "pmi"], ascending=[False, False])


def compute_token_lemmas(df_tokens: pd.DataFrame) -> pd.DataFrame:
    if df_tokens.empty:
        return pd.DataFrame()
    if not {"philosopher", "lemma", "pos"}.issubset(df_tokens.columns):
        return pd.DataFrame()
    # Focus on content words
    df = df_tokens[df_tokens["pos"].isin(["NOUN", "VERB", "ADJ"])].copy()
    df["lemma"] = df["lemma"].astype(str).str.lower()
    df = df[df["lemma"].str.len() >= 3]
    counts = df.groupby(["philosopher", "lemma"]).size().reset_index(name="count")
    return counts


def score_concept_bins(lemma_counts: pd.DataFrame) -> Dict[str, Dict[str, int]]:
    """
    DEPRECATED: Use data_driven_discovery() instead for unbiased pattern finding.
    """
    if lemma_counts.empty:
        return {}

    return {}


def data_driven_discovery(lemma_counts: pd.DataFrame) -> List[Dict[str, object]]:
    """
    Data-driven discovery of the strongest word differences between philosophers.
    Finds the most distinctive words for each philosopher and groups them into
    meaningful contrasts.
    """
    if lemma_counts.empty:
        return []

    # Normalize counts by total words per philosopher to avoid size bias
    totals = lemma_counts.groupby("philosopher")["count"].sum().to_dict()
    lemma_counts["norm_count"] = lemma_counts.apply(
        lambda r: r["count"] / max(1, totals.get(r["philosopher"], 1)), axis=1
    )

    # Get top distinctive words for each philosopher
    conf_words = lemma_counts[lemma_counts["philosopher"].astype(str).str.contains("Confucius|Kongzi", case=False, na=False)]
    mozi_words = lemma_counts[lemma_counts["philosopher"].astype(str).str.contains("Mozi|Mo-tzu", case=False, na=False)]

    # Sort by normalized frequency and take top words
    conf_top = conf_words.sort_values("norm_count", ascending=False).head(20)
    mozi_top = mozi_words.sort_values("norm_count", ascending=False).head(20)

    # Find words that appear strongly in one philosopher but weakly in the other
    def find_contrasts(conf_df: pd.DataFrame, mozi_df: pd.DataFrame, min_count: int = 3) -> List[Dict]:
        contrasts = []

        # Create lookup dictionaries
        conf_lookup = {row["lemma"]: row["norm_count"] for _, row in conf_df.iterrows()}
        mozi_lookup = {row["lemma"]: row["norm_count"] for _, row in mozi_df.iterrows()}

        # Find words that are distinctive to each philosopher
        for _, conf_row in conf_df.iterrows():
            lemma = conf_row["lemma"]
            conf_norm = conf_row["norm_count"]
            mozi_norm = mozi_lookup.get(lemma, 0)

            # Must have minimum count and be distinctive
            if conf_row["count"] >= min_count and conf_norm > mozi_norm * 2:
                contrasts.append({
                    "conf_word": lemma,
                    "conf_count": int(conf_row["count"]),
                    "conf_norm": conf_norm,
                    "mozi_norm": mozi_norm,
                    "ratio": conf_norm / max(mozi_norm, 0.001),
                    "type": "confucius_distinctive"
                })

        for _, mozi_row in mozi_df.iterrows():
            lemma = mozi_row["lemma"]
            mozi_norm = mozi_row["norm_count"]
            conf_norm = conf_lookup.get(lemma, 0)

            if mozi_row["count"] >= min_count and mozi_norm > conf_norm * 2:
                contrasts.append({
                    "mozi_word": lemma,
                    "mozi_count": int(mozi_row["count"]),
                    "mozi_norm": mozi_norm,
                    "conf_norm": conf_norm,
                    "ratio": mozi_norm / max(conf_norm, 0.001),
                    "type": "mozi_distinctive"
                })

        return sorted(contrasts, key=lambda x: x["ratio"], reverse=True)

    distinctive_words = find_contrasts(conf_top, mozi_top)

    # Group distinctive words into thematic contrasts
    def group_into_themes(distinctive_words: List[Dict]) -> List[Dict]:
        themes = []

        # Simple thematic grouping based on word similarity and meaning
        theme_patterns = [
            {
                "name": "family_relationships",
                "conf_words": ["father", "mother", "son", "daughter", "family", "filial", "kin"],
                "mozi_words": ["everyone", "all", "people", "impartial", "equal", "universal"]
            },
            {
                "name": "governance_method",
                "conf_words": ["ritual", "rite", "propriety", "ceremony", "music", "form"],
                "mozi_words": ["benefit", "utility", "result", "measure", "test", "proof"]
            },
            {
                "name": "moral_focus",
                "conf_words": ["virtue", "benevolence", "righteous", "good", "worthy"],
                "mozi_words": ["ability", "skill", "merit", "talent", "capable"]
            },
            {
                "name": "social_structure",
                "conf_words": ["role", "lord", "minister", "subject", "hierarchy"],
                "mozi_words": ["impartial", "fair", "public", "common", "ordinary"]
            },
            {
                "name": "temporal_orientation",
                "conf_words": ["ancient", "old", "tradition", "former", "model"],
                "mozi_words": ["now", "present", "current", "immediate", "direct"]
            },
            {
                "name": "material_attitude",
                "conf_words": ["honor", "ceremony", "decorum", "ornament"],
                "mozi_words": ["simple", "frugal", "thrift", "cost", "waste"]
            }
        ]

        for pattern in theme_patterns:
            conf_matches = [w for w in distinctive_words if w.get("conf_word") in pattern["conf_words"]]
            mozi_matches = [w for w in distinctive_words if w.get("mozi_word") in pattern["mozi_words"]]

            if conf_matches or mozi_matches:
                conf_score = sum(w.get("conf_norm", 0) for w in conf_matches)
                mozi_score = sum(w.get("mozi_norm", 0) for w in mozi_matches)

                if conf_score > 0 or mozi_score > 0:
                    themes.append({
                        "theme_id": pattern["name"],
                        "conf_words": [w.get("conf_word") for w in conf_matches],
                        "mozi_words": [w.get("mozi_word") for w in mozi_matches],
                        "conf_score": conf_score,
                        "mozi_score": mozi_score,
                        "strength": abs(conf_score - mozi_score) / max(conf_score + mozi_score, 0.001)
                    })

        return sorted(themes, key=lambda x: x["strength"], reverse=True)

    themes = group_into_themes(distinctive_words)

    return themes


def generate_plain_labels(themes: List[Dict]) -> List[Dict[str, str]]:
    """
    Generate plain-language labels for discovered themes.
    """
    label_templates = {
        "family_relationships": {
            "conf_label": "Start with Family",
            "mozi_label": "Care for Everyone",
            "subtitle": "Confucius begins close and widens; Mozi treats all people the same."
        },
        "governance_method": {
            "conf_label": "Lead by Ritual",
            "mozi_label": "Lead by Results",
            "subtitle": "Confucius points to proper forms; Mozi asks what helps people now."
        },
        "moral_focus": {
            "conf_label": "Promote Worthy Character",
            "mozi_label": "Promote Able People",
            "subtitle": "Confucius prizes moral fitness; Mozi prizes skill that helps all."
        },
        "social_structure": {
            "conf_label": "Keep Roles Clear",
            "mozi_label": "Treat People Impartially",
            "subtitle": "Confucius organizes by roles; Mozi removes favoritism."
        },
        "temporal_orientation": {
            "conf_label": "Follow the Old Ways",
            "mozi_label": "Test What Works",
            "subtitle": "Confucius looks to models and tradition; Mozi checks outcomes."
        },
        "material_attitude": {
            "conf_label": "Honor and Ceremony",
            "mozi_label": "Simple and Frugal",
            "subtitle": "Confucius keeps forms meaningful; Mozi cuts cost and waste."
        }
    }

    pairs = []
    for theme in themes[:8]:  # Top 8 themes
        theme_id = theme["theme_id"]
        template = label_templates.get(theme_id, {})

        if template:
            pairs.append({
                "label": f"{template['conf_label']} vs {template['mozi_label']}",
                "confucius": template["conf_label"],
                "mozi": template["mozi_label"],
                "subtitle": template["subtitle"],
                "theme_id": theme_id,
                "data": theme,  # Include the raw data for transparency
                "discovery_method": "data_driven"
            })

    return pairs


def pick_headline_pairs(bin_scores: Dict[str, Dict[str, int]]) -> List[Dict[str, str]]:
    """
    DEPRECATED: Use data_driven_discovery() + generate_plain_labels() instead.
    """
    return []


def compute_keywords(lemma_counts: pd.DataFrame, top_k: int = 12) -> Dict[str, List[str]]:
    if lemma_counts.empty:
        return {}
    # Normalize by total to avoid size bias
    totals = lemma_counts.groupby("philosopher")["count"].sum().to_dict()
    lemma_counts["norm"] = lemma_counts.apply(
        lambda r: r["count"] / max(1, totals.get(r["philosopher"], 1)), axis=1
    )
    top = (
        lemma_counts.sort_values(["philosopher", "norm", "count"], ascending=[True, False, False])
        .groupby("philosopher")
        .head(top_k)
    )
    out: Dict[str, List[str]] = {}
    for ph, grp in top.groupby("philosopher"):
        out[str(ph)] = grp["lemma"].tolist()
    return out


def compute_metaphoricity(df_labels: pd.DataFrame, df_base: pd.DataFrame) -> Dict[str, Dict[str, float]]:
    if df_labels.empty:
        return {}
    if not {"row_id", "metaphoricity_0to3"}.issubset(df_labels.columns):
        return {}
    use = df_labels.merge(df_base[["row_id", "philosopher"]], on="row_id", how="left")
    grp = use.dropna(subset=["philosopher"]).groupby("philosopher")["metaphoricity_0to3"]
    stats = grp.agg(["count", "mean"]).reset_index()
    out: Dict[str, Dict[str, float]] = {}
    for _, row in stats.iterrows():
        out[str(row["philosopher"])]= {"count": float(row["count"]), "avg": float(row["mean"]) }
    return out


def compute_domains(df_labels: pd.DataFrame, df_base: pd.DataFrame) -> Dict[str, Dict[str, int]]:
    if df_labels.empty:
        return {}
    need = {"row_id", "source_domain", "target_domain"}
    if not need.issubset(df_labels.columns):
        return {}
    use = df_labels.merge(df_base[["row_id", "philosopher"]], on="row_id", how="left")
    use["pair"] = use.apply(lambda r: f"{str(r['source_domain']).strip()}→{str(r['target_domain']).strip()}", axis=1)
    out: Dict[str, Dict[str, int]] = {}
    for ph, grp in use.dropna(subset=["philosopher"]).groupby("philosopher"):
        counts = (
            grp["pair"]
            .value_counts()
            .to_dict()
        )
        out[str(ph)] = {k: int(v) for k, v in counts.items()}
    return out


def main() -> None:
    base = load_csv(BASE_CSV)
    base = normalize_text_column(base)
    base = ensure_row_id(base)

    enr = load_csv(ENRICHED)
    enr_full = load_csv(ENRICHED_FULL)
    labels = load_csv(LABELS)
    cmi = load_csv(CMI)
    cmi_evid = load_csv(CMI_EVID)

    # Attach philosopher where needed
    enr = attach_philosopher(enr, base)
    enr_full = attach_philosopher(enr_full, base)

    # 1) Reason distribution (metaphor mechanisms)
    reason_dist = compute_reason_distribution(enr)

    # 2) CMI motifs per philosopher
    cmi_motifs = compute_cmi_motifs(cmi, cmi_evid, base)

    # 3) Token lemma counts → keywords and data-driven discovery
    lemma_counts = compute_token_lemmas(enr_full if not enr_full.empty else enr)
    # Use data-driven discovery instead of predefined categories
    discovered_themes = data_driven_discovery(lemma_counts)
    headline_pairs = generate_plain_labels(discovered_themes)
    top_keywords = compute_keywords(lemma_counts)

    # 4) Metaphoricity and domains (from labels)
    metaphoricity = compute_metaphoricity(labels, base)
    domains = compute_domains(labels, base)

    # Summary text (plain)
    summary_lines: List[str] = []
    if headline_pairs:
        summary_lines.append("Data-driven analysis reveals the strongest word differences. Here are the most distinctive contrasts.")
    if not reason_dist.empty:
        summary_lines.append("They also use different kinds of metaphors, by how often our heuristics flag each pattern.")
    if metaphoricity:
        summary_lines.append("Human labels indicate different levels of imagery across the two.")
    if domains:
        summary_lines.append("When people labeled source→target domains, the two emphasize different mappings.")
    if top_keywords:
        summary_lines.append("Distinct keywords emerge for each, which we surface as chips for browsing.")
    if discovered_themes:
        summary_lines.append("Themes were discovered by finding words that appear much more frequently in one philosopher than the other.")

    out = {
        "headline_pairs": headline_pairs,  # [{label, confucius, mozi, subtitle, bin_id, scores}]
        "reason_distribution": (
            reason_dist.to_dict(orient="records") if not reason_dist.empty else []
        ),
        "top_cmi_motifs": (
            cmi_motifs.head(30).to_dict(orient="records") if not cmi_motifs.empty else []
        ),
        "keywords": top_keywords,
        "metaphoricity": metaphoricity,
        "domains": domains,
        "summary": " ".join(summary_lines) if summary_lines else "",
    }

    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    # Compact table: dimension/value pairs for quick UI
    table_rows: List[Dict[str, str]] = []
    for hp in headline_pairs:
        table_rows.append({
            "dimension": hp.get("label", "Contrast"),
            "confucius": hp.get("confucius", ""),
            "mozi": hp.get("mozi", ""),
            "note": hp.get("subtitle", ""),
        })

    # Add a small selection of top motifs (human-readable mini-phrases)
    if not cmi_motifs.empty:
        # Take top 3 per philosopher
        for ph in ["Confucius", "Mozi"]:
            top3 = (
                cmi_motifs[cmi_motifs["philosopher"].astype(str).str.contains(ph, case=False, na=False)]
                .head(3)
            )
            for _, r in top3.iterrows():
                table_rows.append({
                    "dimension": f"Key motif ({ph})",
                    "confucius": f"{r['verb']}→{r['noun']}" if ph.lower().startswith("confu") else "",
                    "mozi": f"{r['verb']}→{r['noun']}" if ph.lower().startswith("mozi") else "",
                    "note": f"count={int(r.get('count', 0))}, pmi={float(r.get('pmi', 0)):.2f}",
                })

    if table_rows:
        pd.DataFrame(table_rows).to_csv(OUT_CSV, index=False)

    print("Saved:", OUT_JSON)
    if os.path.exists(OUT_CSV):
        print("Saved:", OUT_CSV)

    # --- Plain-language Colab output ---
    print("\n==============================")
    print("Confucius vs Mozi — Data-Driven Comparison")
    print("==============================\n")

    # Headline pairs
    if headline_pairs:
        print("Top headline pairs (discovered from data):")
        for hp in headline_pairs[:8]:
            label = hp.get("label", "Contrast")
            conf = hp.get("confucius", "Confucius")
            moz = hp.get("mozi", "Mozi")
            sub = hp.get("subtitle", "")
            data = hp.get("data", {})
            print(f"- {label}")
            print(f"  Confucius: {conf}")
            print(f"  Mozi: {moz}")
            if sub:
                print(f"  Note: {sub}")
            if data:
                conf_words = data.get("conf_words", [])
                mozi_words = data.get("mozi_words", [])
                if conf_words or mozi_words:
                    print(f"  Data: Confucius distinctive words: {', '.join(conf_words[:5])}")
                    print(f"  Data: Mozi distinctive words: {', '.join(mozi_words[:5])}")
        print()

    # Keywords
    if top_keywords:
        conf_kw = top_keywords.get("Confucius") or top_keywords.get("Kongzi") or []
        mozi_kw = top_keywords.get("Mozi") or top_keywords.get("Mo-tzu") or []
        if conf_kw or mozi_kw:
            print("Keywords that show up a lot:")
            if conf_kw:
                print("  Confucius:", ", ".join(conf_kw[:12]))
            if mozi_kw:
                print("  Mozi:", ", ".join(mozi_kw[:12]))
            print()

    # Metaphoricity (from human labels)
    if metaphoricity:
        print("Imagery level (from human labels):")
        for who in ["Confucius", "Mozi"]:
            stats = metaphoricity.get(who)
            if not stats and who == "Confucius":
                stats = metaphoricity.get("Kongzi")
            if not stats and who == "Mozi":
                stats = metaphoricity.get("Mo-tzu")
            if stats:
                print(f"  {who}: avg={stats.get('avg', 0):.2f} over {int(stats.get('count', 0))} labels")
        print()

    # Domains (source→target) if labeled
    if domains:
        print("Common metaphor mappings (source → target):")
        for who in ["Confucius", "Mozi"]:
            d = domains.get(who) or (domains.get("Kongzi") if who == "Confucius" else domains.get("Mo-tzu"))
            if d:
                top = sorted(d.items(), key=lambda kv: kv[1], reverse=True)[:5]
                pairs = "; ".join([f"{k} ({v})" for k, v in top])
                print(f"  {who}: {pairs}")
        print()

    # Reason distribution (metaphor mechanisms)
    if not reason_dist.empty:
        print("How their metaphors are flagged (by pattern):")
        # Show top 5 reasons overall with per-philosopher counts
        top_reasons = (
            reason_dist.groupby("reason")["count"].sum().sort_values(ascending=False).head(5).index.tolist()
        )
        for r in top_reasons:
            conf_ct = int(reason_dist[(reason_dist["philosopher"].astype(str).str.contains("Confucius", case=False, na=False)) & (reason_dist["reason"] == r) ]["count"].sum())
            mozi_ct = int(reason_dist[(reason_dist["philosopher"].astype(str).str.contains("Mozi", case=False, na=False)) & (reason_dist["reason"] == r) ]["count"].sum())
            print(f"  {r}: Confucius={conf_ct}, Mozi={mozi_ct}")
        print()

    # CMI motifs (high-signal verb→noun patterns)
    if not cmi_motifs.empty:
        print("Key action→object patterns (examples):")
        for who in ["Confucius", "Mozi"]:
            sub = cmi_motifs[cmi_motifs["philosopher"].astype(str).str.contains(who, case=False, na=False)].head(3)
            if not sub.empty:
                items = [f"{r['verb']}→{r['noun']} (n={int(r['count'])}, pmi={float(r.get('pmi', 0)):.2f})" for _, r in sub.iterrows()]
                print(f"  {who}: "+"; ".join(items))
        print()


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print("Comparison failed:", e)




Saved: /content/drive/MyDrive/Chinese Philosophers/compare_confucius_mozi.json
Saved: /content/drive/MyDrive/Chinese Philosophers/compare_confucius_mozi_table.csv

Confucius vs Mozi — Data-Driven Comparison

Top headline pairs (discovered from data):
- Start with Family vs Care for Everyone
  Confucius: Start with Family
  Mozi: Care for Everyone
  Note: Confucius begins close and widens; Mozi treats all people the same.
  Data: Confucius distinctive words: 
  Data: Mozi distinctive words: people
- Promote Worthy Character vs Promote Able People
  Confucius: Promote Worthy Character
  Mozi: Promote Able People
  Note: Confucius prizes moral fitness; Mozi prizes skill that helps all.
  Data: Confucius distinctive words: good, benevolence
  Data: Mozi distinctive words: 
- Lead by Ritual vs Lead by Results
  Confucius: Lead by Ritual
  Mozi: Lead by Results
  Note: Confucius points to proper forms; Mozi asks what helps people now.
  Data: Confucius distinctive words: rite
  Data: Mozi di

In [9]:
"""
Unsupervised embeddings, clustering, and keyword discovery.

This script intentionally avoids predefining dimensions. It computes:
- Sentence embeddings (Sentence-Transformers if available; fallback: TF-IDF+SVD)
- MiniBatchKMeans clustering with heuristic K
- Cluster keywords via TF-IDF centroids and representative quotes
- 2D projections (PCA; UMAP if available)
- Discriminative keywords by philosopher via log-odds with prior

Outputs under ROOT for downstream comparison/visualization:
- unsupervised_summary.json
- unsupervised_cluster_keywords.csv
- unsupervised_cluster_assignments.csv
- unsupervised_embeddings_pca.csv
- unsupervised_embeddings_umap.csv (if UMAP available)
- logodds_keywords_by_philosopher.csv

ROOT follows prior scripts; override via env PHILOSOPHERS_ROOT if set.
"""

import os
import re
import json
import math
from collections import Counter, defaultdict
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import normalize as sk_normalize


# --- Configuration ---
ROOT = os.environ.get("PHILOSOPHERS_ROOT", "/content/drive/MyDrive/Chinese Philosophers")
BASE_CSV = os.path.join(ROOT, "chinese_philosophers_quotes_corrected.csv")

OUT_JSON = os.path.join(ROOT, "unsupervised_summary.json")
OUT_CLUSTER_KEYWORDS = os.path.join(ROOT, "unsupervised_cluster_keywords.csv")
OUT_ASSIGN = os.path.join(ROOT, "unsupervised_cluster_assignments.csv")
OUT_PCA = os.path.join(ROOT, "unsupervised_embeddings_pca.csv")
OUT_UMAP = os.path.join(ROOT, "unsupervised_embeddings_umap.csv")
OUT_LOGODDS = os.path.join(ROOT, "logodds_keywords_by_philosopher.csv")


# --- Utilities ---
def load_base(path: str) -> pd.DataFrame:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Base CSV not found at {path}")
    df = pd.read_csv(path)
    df.columns = [c.strip().lower() for c in df.columns]
    # Normalize text column
    if "text" not in df.columns:
        for cand in ["quote", "translation", "english", "content", "verse_text", "line_text"]:
            if cand in df.columns:
                df = df.rename(columns={cand: "text"})
                break
    assert "text" in df.columns, "Input needs a text/quote column."
    if "row_id" not in df.columns:
        df.insert(0, "row_id", range(len(df)))
    return df


def try_import_sentence_transformer():
    try:
        from sentence_transformers import SentenceTransformer  # type: ignore
        return SentenceTransformer
    except Exception:
        return None


def try_import_umap():
    try:
        import umap  # type: ignore
        return umap
    except Exception:
        return None


def try_load_spacy():
    try:
        import spacy  # type: ignore
        try:
            nlp = spacy.load("en_core_web_sm")
        except Exception:
            # Model may not be present; fall back to blank with simple rules
            nlp = spacy.blank("en")
            if "sentencizer" not in nlp.pipe_names:
                nlp.add_pipe("sentencizer")
        return nlp
    except Exception:
        return None


def basic_tokenize(text: str) -> List[str]:
    # Simple fallback tokenizer: words of length >=3, letters only
    return re.findall(r"[A-Za-z][A-Za-z\-]{2,}", text.lower())


def default_stopwords() -> set:
    # Minimal English stopword set; extend as needed
    common = {
        "the","and","for","are","but","not","with","you","that","this","was","his","her",
        "have","has","had","were","from","they","their","them","what","when","where","which",
        "who","whom","into","out","over","under","about","after","before","because","been",
        "can","could","should","would","may","might","will","shall","do","does","did",
        "of","in","to","on","at","as","by","an","a","be","or","if","it","is","we","i",
    }
    return common


# --- Embeddings ---
class EmbeddingResult:
    def __init__(self, vectors: np.ndarray, kind: str, tfidf: Optional[TfidfVectorizer] = None):
        self.vectors = vectors
        self.kind = kind
        self.tfidf = tfidf


def compute_embeddings(texts: List[str]) -> EmbeddingResult:
    st_cls = try_import_sentence_transformer()
    if st_cls is not None:
        try:
            model = st_cls("sentence-transformers/all-MiniLM-L6-v2")
            vectors = model.encode(texts, batch_size=128, show_progress_bar=False, normalize_embeddings=True)
            # Still build a TF-IDF for keyword extraction
            tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=50000, min_df=2)
            tfidf.fit(texts)
            return EmbeddingResult(vectors=np.asarray(vectors), kind="miniLM", tfidf=tfidf)
        except Exception:
            pass

    # Fallback: TF-IDF then SVD to dense vectors
    tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=50000, min_df=2)
    X = tfidf.fit_transform(texts)
    k = 256 if X.shape[1] >= 256 else max(50, min(128, X.shape[1]))
    svd = TruncatedSVD(n_components=k, random_state=42)
    dense = svd.fit_transform(X)
    dense = sk_normalize(dense)
    return EmbeddingResult(vectors=dense, kind="tfidf_svd", tfidf=tfidf)


# --- Clustering and keywords ---
def heuristic_k(n: int) -> int:
    if n <= 200:
        return max(4, min(10, int(round(math.sqrt(max(1, n)) / 1.1))))
    if n <= 2000:
        return 12
    return 16


def cluster_vectors(vectors: np.ndarray, n_clusters: int) -> MiniBatchKMeans:
    model = MiniBatchKMeans(
        n_clusters=n_clusters,
        random_state=42,
        batch_size=512,
        n_init=10,
        reassignment_ratio=0.01,
    )
    model.fit(vectors)
    return model


def extract_cluster_keywords(texts: List[str], labels: np.ndarray, tfidf: TfidfVectorizer, top_n: int = 15) -> pd.DataFrame:
    X = tfidf.transform(texts)
    vocab = np.array(tfidf.get_feature_names_out())
    out_rows = []
    for cl in sorted(set(labels.tolist())):
        idx = np.where(labels == cl)[0]
        if idx.size == 0:
            continue
        sub = X[idx]
        # Mean TF-IDF per term in cluster
        means = np.asarray(sub.mean(axis=0)).ravel()
        top_idx = np.argsort(-means)[:top_n]
        top_terms = vocab[top_idx]
        top_vals = means[top_idx]
        for rank, (term, score) in enumerate(zip(top_terms, top_vals), start=1):
            out_rows.append({
                "cluster": int(cl),
                "rank": int(rank),
                "term": term,
                "score": float(score),
            })
    return pd.DataFrame(out_rows)


def nearest_representatives(vectors: np.ndarray, labels: np.ndarray, km: MiniBatchKMeans, k: int = 3) -> Dict[int, List[int]]:
    reps: Dict[int, List[int]] = {}
    centers = km.cluster_centers_
    # cosine distance on L2-normalized vectors approximated by Euclidean on normalized space
    for cl in range(centers.shape[0]):
        idx = np.where(labels == cl)[0]
        if idx.size == 0:
            reps[cl] = []
            continue
        sub = vectors[idx]
        c = centers[cl]
        # Ensure normalization
        c = c / (np.linalg.norm(c) + 1e-9)
        sub = sub / (np.linalg.norm(sub, axis=1, keepdims=True) + 1e-9)
        # cosine similarity = dot
        sims = sub @ c
        order = np.argsort(-sims)[:k]
        reps[cl] = idx[order].tolist()
    return reps


# --- Projections ---
def compute_pca(vectors: np.ndarray) -> np.ndarray:
    pca = PCA(n_components=2, random_state=42)
    coords = pca.fit_transform(vectors)
    return coords


def compute_umap(vectors: np.ndarray) -> Optional[np.ndarray]:
    umap = try_import_umap()
    if umap is None:
        return None
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    return reducer.fit_transform(vectors)


# --- Log-odds keywords by philosopher ---
def tokenize_for_counts(texts: List[str], nlp) -> List[List[str]]:
    sw = default_stopwords()
    if nlp is None:
        return [[t for t in basic_tokenize(tx) if t not in sw] for tx in texts]

    tokens: List[List[str]] = []
    try:
        for doc in nlp.pipe(texts, batch_size=256):
            toks = []
            for t in doc:
                if not t.is_alpha:
                    continue
                s = t.lemma_.lower() if t.lemma_ else t.text.lower()
                if len(s) < 3 or s in sw:
                    continue
                toks.append(s)
            tokens.append(toks)
        return tokens
    except Exception:
        return [[t for t in basic_tokenize(tx) if t not in sw] for tx in texts]


def log_odds_two_groups(counts_a: Counter, counts_b: Counter, alpha: float = 0.01) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Monroe et al. 2008 inspired, simplified symmetric Dirichlet prior
    vocab = set(counts_a.keys()) | set(counts_b.keys())
    Na = sum(counts_a.values())
    Nb = sum(counts_b.values())
    Va = len(vocab)
    # Prior mass per term
    alpha0 = alpha * Va
    rows_a, rows_b = [], []
    for term in vocab:
        ya = counts_a.get(term, 0)
        yb = counts_b.get(term, 0)
        # Log-odds
        lo_a = math.log((ya + alpha) / (Na + alpha0 - (ya + alpha) + 1e-12))
        lo_b = math.log((yb + alpha) / (Nb + alpha0 - (yb + alpha) + 1e-12))
        delta = lo_a - lo_b
        # Variance approximation
        var = 1.0 / (ya + alpha) + 1.0 / (yb + alpha)
        z = delta / math.sqrt(var)
        rows_a.append({"term": term, "z": z, "delta": delta, "count": ya})
        rows_b.append({"term": term, "z": -z, "delta": -delta, "count": yb})
    da = pd.DataFrame(rows_a).sort_values(["z", "count"], ascending=[False, False])
    db = pd.DataFrame(rows_b).sort_values(["z", "count"], ascending=[False, False])
    return da, db


def compute_logodds_by_philosopher(df: pd.DataFrame) -> pd.DataFrame:
    if "philosopher" not in df.columns:
        return pd.DataFrame()
    nlp = try_load_spacy()
    grouped = df.groupby("philosopher")
    # Focus on the two main philosophers if present, otherwise top two by count
    phs = [p for p in ["Confucius", "Kongzi", "Mozi", "Mo-tzu"] if p in grouped.groups]
    if len(phs) < 2:
        ph_counts = df["philosopher"].value_counts().index.tolist()
        if len(ph_counts) >= 2:
            phs = ph_counts[:2]
        else:
            return pd.DataFrame()

    # Map alternative names
    def unify(ph: str) -> str:
        pl = ph.lower()
        if "confucius" in pl or "kongzi" in pl:
            return "Confucius"
        if "mozi" in pl or "mo-tzu" in pl:
            return "Mozi"
        return ph

    df_2 = df.copy()
    df_2["ph2"] = df_2["philosopher"].astype(str).map(unify)
    # Tokenize per-quote
    tokens = tokenize_for_counts(df_2["text"].astype(str).tolist(), nlp)
    df_2["_tokens"] = tokens
    # Aggregate counts by philosopher
    counts_by_ph: Dict[str, Counter] = defaultdict(Counter)
    for ph, toks in zip(df_2["ph2"].tolist(), df_2["_tokens"].tolist()):
        counts_by_ph[ph].update(toks)

    if len(counts_by_ph) < 2:
        return pd.DataFrame()
    ph_list = list(counts_by_ph.keys())[:2]
    a, b = ph_list[0], ph_list[1]
    da, db = log_odds_two_groups(counts_by_ph[a], counts_by_ph[b], alpha=0.01)
    da["philosopher"] = a
    db["philosopher"] = b
    out = pd.concat([da.head(100), db.head(100)], ignore_index=True)
    return out[["philosopher", "term", "z", "delta", "count"]]


# --- Main ---
def main() -> None:
    os.makedirs(ROOT, exist_ok=True)
    base = load_base(BASE_CSV)
    texts = base["text"].astype(str).tolist()

    emb = compute_embeddings(texts)
    n_clusters = heuristic_k(len(texts))
    km = cluster_vectors(emb.vectors, n_clusters=n_clusters)
    labels = km.labels_

    # Cluster keywords and representatives
    tfidf_for_kw = emb.tfidf if emb.tfidf is not None else TfidfVectorizer(ngram_range=(1,2), max_features=50000, min_df=2).fit(texts)
    kw_df = extract_cluster_keywords(texts, labels, tfidf=tfidf_for_kw, top_n=15)
    reps = nearest_representatives(emb.vectors, labels, km, k=3)

    # Per-quote outputs
    assign_df = base[["row_id"]].copy()
    if "philosopher" in base.columns:
        assign_df["philosopher"] = base["philosopher"].astype(str)
    else:
        assign_df["philosopher"] = ""
    assign_df["cluster"] = labels.astype(int)
    assign_df.to_csv(OUT_ASSIGN, index=False)

    # Projections
    pca_xy = compute_pca(emb.vectors)
    pca_df = assign_df.copy()
    pca_df["x"] = pca_xy[:, 0]
    pca_df["y"] = pca_xy[:, 1]
    pca_df.to_csv(OUT_PCA, index=False)

    umap_xy = compute_umap(emb.vectors)
    if umap_xy is not None:
        umap_df = assign_df.copy()
        umap_df["x"] = umap_xy[:, 0]
        umap_df["y"] = umap_xy[:, 1]
        umap_df.to_csv(OUT_UMAP, index=False)

    # Discriminative keywords
    logodds_df = compute_logodds_by_philosopher(base)
    if not logodds_df.empty:
        logodds_df.to_csv(OUT_LOGODDS, index=False)

    # Aggregate cluster-level summary
    cluster_summary: List[Dict] = []
    # Philosopher distribution per cluster
    ph_series = assign_df["philosopher"].astype(str)
    ph_vals = ph_series.unique().tolist()
    for cl in sorted(assign_df["cluster"].unique().tolist()):
        mask = assign_df["cluster"] == cl
        size = int(mask.sum())
        ph_dist = (
            assign_df[mask]["philosopher"].value_counts().to_dict() if "philosopher" in assign_df.columns else {}
        )
        top_kw = kw_df[kw_df["cluster"] == cl].sort_values(["rank"]).head(10)["term"].tolist()
        rep_idx = reps.get(int(cl), [])
        rep_rows = base.iloc[rep_idx] if rep_idx else pd.DataFrame(columns=base.columns)
        rep_quotes = [
            {
                "row_id": int(r["row_id"]) if "row_id" in r else None,
                "philosopher": str(r.get("philosopher", "")),
                "text": str(r.get("text", ""))[:400],
            }
            for _, r in rep_rows.iterrows()
        ]
        cluster_summary.append({
            "cluster": int(cl),
            "size": size,
            "philosopher_distribution": {str(k): int(v) for k, v in ph_dist.items()},
            "top_keywords": top_kw,
            "representatives": rep_quotes,
        })

    # Save keywords per cluster
    if not kw_df.empty:
        kw_df.to_csv(OUT_CLUSTER_KEYWORDS, index=False)

    # Save summary JSON
    out = {
        "model": emb.kind,
        "n_clusters": int(n_clusters),
        "outputs": {
            "assignments_csv": OUT_ASSIGN,
            "cluster_keywords_csv": OUT_CLUSTER_KEYWORDS,
            "pca_csv": OUT_PCA,
            "umap_csv": OUT_UMAP if umap_xy is not None else "",
            "logodds_csv": OUT_LOGODDS if not logodds_df.empty else "",
        },
        "clusters": cluster_summary,
        "notes": "No predefined dimensions. Clusters, keywords, and discriminative terms are data-driven.",
    }
    with open(OUT_JSON, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print("Saved:")
    print(" ", OUT_JSON)
    print(" ", OUT_CLUSTER_KEYWORDS)
    print(" ", OUT_ASSIGN)
    print(" ", OUT_PCA)
    if umap_xy is not None:
        print(" ", OUT_UMAP)
    if not logodds_df.empty:
        print(" ", OUT_LOGODDS)


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        print("Unsupervised analysis failed:", e)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  warn(


Saved:
  /content/drive/MyDrive/Chinese Philosophers/unsupervised_summary.json
  /content/drive/MyDrive/Chinese Philosophers/unsupervised_cluster_keywords.csv
  /content/drive/MyDrive/Chinese Philosophers/unsupervised_cluster_assignments.csv
  /content/drive/MyDrive/Chinese Philosophers/unsupervised_embeddings_pca.csv
  /content/drive/MyDrive/Chinese Philosophers/unsupervised_embeddings_umap.csv
  /content/drive/MyDrive/Chinese Philosophers/logodds_keywords_by_philosopher.csv


In [13]:
import os, json, random, pandas as pd

# Set to your ROOT (Colab default below)
ROOT = "/content/drive/MyDrive/Chinese Philosophers"
BASE = os.path.join(ROOT, "chinese_philosophers_quotes_corrected.csv")
JSONL = os.path.join(ROOT, "per_quote_llm_keywords.jsonl")

# Load base and normalize columns
base = pd.read_csv(BASE)
base.columns = [c.strip().lower() for c in base.columns]
text_col = "quote" if "quote" in base.columns else ("text" if "text" in base.columns else None)
assert text_col, "No quote/text column found in base CSV"
if text_col != "text":
    base = base.rename(columns={text_col: "text"})
if "row_id" not in base.columns:
    base.insert(0, "row_id", range(len(base)))

# Load per-quote LLM results
recs = []
with open(JSONL, "r", encoding="utf-8") as f:
    for line in f:
        rec = json.loads(line)
        if rec.get("keywords"):
            recs.append(rec)

# Sample 10 quotes with keywords
random.seed()  # change to a number for repeatability
sample = random.sample(recs, k=min(10, len(recs)))

# Pretty print
for i, rec in enumerate(sample, 1):
    rid = int(rec["row_id"])
    row = base.loc[base["row_id"] == rid].iloc[0]
    print(f"\n[{i}] row_id={rid} | philosopher={row.get('philosopher', '')}")
    print("Quote:")
    print(row["text"])
    print("Keywords:")
    for kw in rec["keywords"]:
        term = kw.get("term", "")
        kind = kw.get("kind", "keyword")
        conf = kw.get("confidence", 0.0)
        span = kw.get("support_span", "")
        print(f" - {term} ({kind}, conf={conf:.2f})" + (f" — span: \"{span}\"" if span else ""))


[1] row_id=80 | philosopher=Confucius
Quote:
The Master said, ‘If one is guided by profit in one's actions, one will incur much ill will.'
Keywords:
 - profit (keyword, conf=0.90) — span: "guided by profit"
 - ill will (keyphrase, conf=0.90) — span: "incur much ill will"
 - guided by (keyphrase, conf=0.80) — span: "guided by profit"
 - one's actions (keyphrase, conf=0.80) — span: "one's actions"
 - incur (keyword, conf=0.70) — span: "incur much ill will"
 - The Master (keyphrase, conf=0.60) — span: "The Master said"

[2] row_id=18 | philosopher=Confucius
Quote:
"The Master said, ‘The rule of virtue can be compared to the Pole Star which commands the homage of the multitude of stars without leaving its place.'"
Keywords:
 - Pole Star (keyphrase, conf=1.00) — span: "the Pole Star"
 - rule of virtue (keyphrase, conf=1.00) — span: "the rule of virtue"
 - commands homage (keyphrase, conf=1.00) — span: "commands the homage"
 - multitude of stars (keyphrase, conf=1.00) — span: "the multitude

Looking for CSV at: /content/drive/MyDrive/Chinese Philosophers/chinese_philosophers_quotes_corrected.csv
File exists: True
Loaded 1162 quotes from CSV
Cache hits: 1162; to process: 0
All items are cached. To force re-run, set LLM_RESUME=0 or clear cache at:
  /content/drive/MyDrive/Chinese Philosophers/cache/qwen_multidim
Saved:
  /content/drive/MyDrive/Chinese Philosophers/per_quote_qwen_multidim.jsonl
  /content/drive/MyDrive/Chinese Philosophers/per_quote_qwen_multidim.csv
Estimated cost for 1162 quotes (avg in 300, out 220): $0.2231 (in $0.0697 + out $0.1534)


In [None]:
"""
One-cell Colab runner: paste this entire file into a single Colab cell and run to generate
multi-dimensional keywords into JSONL/CSV.

Behaviour:
- Mounts Drive, installs dependencies, prompts for Together API key if needed
- Processes quotes in batches with concurrency
- Writes snapshots to JSONL/CSV after each batch so you can inspect mid-run

Outputs (in PHILOSOPHERS_ROOT):
- per_quote_qwen_multidim.jsonl
- per_quote_qwen_multidim.csv
- cache/qwen_multidim/{row_id}.json
"""

import os, sys, json, time, subprocess
from typing import Dict, List, Optional, Any, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# 0) Quiet install
try:
    import pandas as pd  # type: ignore
    from openai import OpenAI  # type: ignore
except Exception:
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas", "openai>=1.40.0"], check=True)
    import pandas as pd  # type: ignore
    from openai import OpenAI  # type: ignore

# 1) Mount Drive
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
except Exception:
    pass

# 2) Configuration (edit if needed)
ROOT = "/content/drive/MyDrive/Chinese Philosophers"
BASE_CSV = os.path.join(ROOT, "chinese_philosophers_quotes_corrected.csv")
OUT_JSONL = os.path.join(ROOT, "per_quote_qwen_multidim.jsonl")
OUT_CSV = os.path.join(ROOT, "per_quote_qwen_multidim.csv")
CACHE_DIR = os.path.join(ROOT, "cache", "qwen_multidim")

MODEL = "Qwen/Qwen3-235B-A22B-fp8"
BATCH_SIZE = 8
CONCURRENCY = 8
RUN_LIMIT = None  # None = full run; snapshots are written after each batch
RESUME = False    # False = ignore cache; True = use per-quote cache if present
SNAPSHOT_EVERY_N_BATCHES = 1  # write JSONL/CSV after every N completed batches

# 3) API key
api_key = None
try:
    from google.colab import userdata  # type: ignore
    api_key = userdata.get("TOGETHER_API_KEY") or userdata.get("TOGETHER_AF")
except Exception:
    pass
if not api_key:
    try:
        import getpass
        api_key = getpass.getpass("Enter TOGETHER_API_KEY: ")
    except Exception:
        pass
if not api_key:
    raise RuntimeError("TOGETHER_API_KEY is required.")

# --- Client ---
def make_client():
    return OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

# --- Data loading ---
def load_base(path: str) -> pd.DataFrame:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Base CSV not found at {path}")
    df = pd.read_csv(path)
    df.columns = [c.strip().lower() for c in df.columns]
    text_col: Optional[str] = None
    if "quote" in df.columns:
        text_col = "quote"
    elif "text" in df.columns:
        text_col = "text"
    else:
        for cand in ["translation", "english", "content", "verse_text", "line_text"]:
            if cand in df.columns:
                text_col = cand
                break
    if not text_col:
        raise RuntimeError("No quote/text-like column found.")
    df = df.rename(columns={text_col: "text"})
    if "row_id" not in df.columns:
        df.insert(0, "row_id", range(len(df)))
    if "philosopher" not in df.columns:
        df["philosopher"] = ""
    df["text"] = df["text"].astype(str).str.strip()
    df = df[df["text"].str.len() > 0].reset_index(drop=True)
    return df

# --- Prompt ---
def build_batch_prompt(items: List[Dict[str, Any]]) -> str:
    header = (
        "You are an expert in classical Chinese philosophy text analysis. "
        "Extract comprehensive, structured features for search, clustering, and deep analysis from English translations.\n\n"
        "FORMATTING RULES\n"
        "- Output VALID JSON only (no extra text)\n"
        "- Keywords: lowercase, snake_case, ASCII\n"
        "- Deduplicate within each dimension\n"
        "- Use standard philosophical terminology consistently\n\n"
        "Return a single JSON object with this structure only:\n"
        "{"\
        "\n  \"results\": ["\
        "\n    {"\
        "\n      \"row_id\": int,"\
        "\n      \"philosopher\": str,"\
        "\n      \"core_concepts\": [{\"term\": str, \"importance\": \"primary\"|\"secondary\"}],"\
        "\n      \"themes\": [str],"\
        "\n      \"oppositions\": [{\"term\": str, \"type\": str, \"description\": str}],"\
        "\n      \"metaphors\": [{\"image\": str, \"category\": str, \"represents\": str}],"\
        "\n      \"prescriptive\": [{\"type\": str, \"prescription\": str}],"\
        "\n      \"argument\": {\"method\": [str], \"devices\": [str], \"structure\": str},"\
        "\n      \"actors\": [str],"\
        "\n      \"scope\": {\"level\": str, \"context\": str|null},"\
        "\n      \"connections\": [{\"concept\": str, \"connects_to\": str, \"relationship\": str}],"\
        "\n      \"distinctive\": [str]"\
        "\n    }"\
        "\n  ]"\
        "\n}\n\n"
        "Analyze strictly within each quote's content. Now the quotes follow as bullet points:"
    )
    body_lines = []
    for it in items:
        rid = int(it["row_id"])  # type: ignore
        ph = str(it.get("philosopher", ""))
        tx = str(it.get("text", ""))
        body_lines.append(f"- row_id: {rid}\n- philosopher: {ph}\n- quote: {tx}\n")
    return header + "\n\n".join(body_lines)

# --- LLM call ---
def call_llm(client, model: str, system_prompt: str, user_prompt: str, max_retries: int = 3, max_tokens: int = 900) -> Optional[str]:
    delay = 1.0
    for attempt in range(max_retries):
        try:
            kwargs = dict(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a philosophical analysis system. Return only valid JSON."},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.2,
                top_p=0.9,
                max_tokens=max_tokens,
            )
            try:
                kwargs["response_format"] = {"type": "json_object"}
            except Exception:
                pass
            resp = client.chat.completions.create(**kwargs)
            return resp.choices[0].message.content
        except Exception as e:
            error_str = str(e).lower()
            if "429" in error_str or "rate limit" in error_str or "too many requests" in error_str:
                rate_limit_delay = min(30.0, delay * 3.0)
                if attempt == max_retries - 1:
                    print(f"Rate limit exceeded after {max_retries} retries: {e}")
                    return None
                print(f"Rate limit hit, waiting {rate_limit_delay:.1f}s before retry {attempt + 1}/{max_retries}")
                time.sleep(rate_limit_delay)
                delay = rate_limit_delay
            else:
                if attempt == max_retries - 1:
                    print(f"LLM call failed after {max_retries} retries: {e}")
                    return None
                time.sleep(delay)
                delay = min(15.0, delay * 2.0)
    return None

def extract_json(text: str) -> Optional[Dict[str, Any]]:
    if not text:
        return None
    try:
        return json.loads(text)
    except Exception:
        pass
    try:
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end != -1 and end > start:
            return json.loads(text[start:end+1])
    except Exception:
        pass
    return None

def build_single_prompt(item: Dict[str, Any]) -> str:
    rid = int(item["row_id"])  # type: ignore
    ph = str(item.get("philosopher", ""))
    tx = str(item.get("text", ""))
    return (
        "You are an expert in classical Chinese philosophy text analysis. "
        "Extract comprehensive, structured features for search, clustering, and deep analysis from English translations.\n\n"
        "FORMATTING RULES\n"
        "- Output VALID JSON only (no extra text)\n"
        "- Keywords: lowercase, snake_case, ASCII\n"
        "- Deduplicate within each dimension\n"
        "- Use standard philosophical terminology consistently\n\n"
        "Return ONLY this JSON object (no wrapper, no narration):\n"
        "{"\
        f"\n  \"row_id\": {rid},"\
        f"\n  \"philosopher\": \"{ph}\","\
        "\n  \"core_concepts\": [{\"term\": str, \"importance\": \"primary\"|\"secondary\"}],"\
        "\n  \"themes\": [str],"\
        "\n  \"oppositions\": [{\"term\": str, \"type\": str, \"description\": str}],"\
        "\n  \"metaphors\": [{\"image\": str, \"category\": str, \"represents\": str}],"\
        "\n  \"prescriptive\": [{\"type\": str, \"prescription\": str}],"\
        "\n  \"argument\": {\"method\": [str], \"devices\": [str], \"structure\": str},"\
        "\n  \"actors\": [str],"\
        "\n  \"scope\": {\"level\": str, \"context\": str|null},"\
        "\n  \"connections\": [{\"concept\": str, \"connects_to\": str, \"relationship\": str}],"\
        "\n  \"distinctive\": [str]"\
        "\n}\n\n"
        f"Analyze strictly within the quote.\n- row_id: {rid}\n- philosopher: {ph}\n- quote: {tx}"
    )

def is_meaningful(clean: Dict[str, Any]) -> bool:
    if clean.get("core_concepts"): return True
    if clean.get("themes"): return True
    if clean.get("oppositions"): return True
    if clean.get("metaphors"): return True
    if clean.get("prescriptive"): return True
    arg = clean.get("argument") or {}
    if arg.get("method") or arg.get("devices") or (arg.get("structure") or "").strip():
        return True
    if clean.get("actors"): return True
    sc = clean.get("scope") or {}
    if (sc.get("level") or "").strip(): return True
    if clean.get("connections"): return True
    if clean.get("distinctive"): return True
    return False

# --- Cache ---
def ensure_cache_dirs():
    os.makedirs(CACHE_DIR, exist_ok=True)

def cache_path(row_id: int) -> str:
    return os.path.join(CACHE_DIR, f"{row_id}.json")

# --- Normalization & flattening ---
def to_ascii_snake(s: str) -> str:
    try:
        ascii_txt = s.encode("ascii", "ignore").decode("ascii")
    except Exception:
        ascii_txt = s
    cleaned = ascii_txt.strip().lower().replace(" ", "_").replace("-", "_")
    cleaned = "".join(ch if (ch.isalnum() or ch == "_") else "_" for ch in cleaned)
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")
    return cleaned.strip("_")

def sanitize_record(obj: Dict[str, Any]) -> Dict[str, Any]:
    def as_str_list(v: Any) -> List[str]:
        if isinstance(v, list):
            return [str(x).strip() for x in v if str(x).strip()]
        if v is None:
            return []
        return [str(v).strip()]

    def dedupe_order(items: List[Any]) -> List[Any]:
        seen = set()
        out: List[Any] = []
        for it in items:
            key = json.dumps(it, ensure_ascii=False, sort_keys=True) if isinstance(it, dict) else str(it)
            if key not in seen:
                seen.add(key)
                out.append(it)
        return out

    rid_val = obj.get("row_id", None)
    rid: Optional[int]
    if isinstance(rid_val, (int, float)):
        rid = int(rid_val)
    elif isinstance(rid_val, str):
        try:
            rid = int(rid_val.strip())
        except Exception:
            rid = None
    else:
        rid = None

    core_concepts: List[Dict[str, str]] = []
    for item in obj.get("core_concepts", []) or []:
        term_raw = str(item.get("term", "")).strip()
        term = to_ascii_snake(term_raw) if term_raw else ""
        importance = str(item.get("importance", "secondary")).strip().lower()
        if importance not in {"primary", "secondary"}:
            importance = "secondary"
        if term:
            core_concepts.append({"term": term, "importance": importance})
    core_concepts = dedupe_order(core_concepts)

    themes = [to_ascii_snake(x) for x in as_str_list(obj.get("themes"))]
    themes = dedupe_order(themes)

    oppositions: List[Dict[str, str]] = []
    for it in obj.get("oppositions", []) or []:
        term = to_ascii_snake(str(it.get("term", "")).strip())
        typ = to_ascii_snake(str(it.get("type", "")).strip())
        desc = str(it.get("description", "")).strip()
        if term:
            oppositions.append({"term": term, "type": typ, "description": desc})
    oppositions = dedupe_order(oppositions)

    metaphors: List[Dict[str, str]] = []
    for it in obj.get("metaphors", []) or []:
        image = to_ascii_snake(str(it.get("image", "")).strip())
        category = to_ascii_snake(str(it.get("category", "")).strip())
        represents = to_ascii_snake(str(it.get("represents", "")).strip())
        if image or category or represents:
            metaphors.append({"image": image, "category": category, "represents": represents})
    metaphors = dedupe_order(metaphors)

    prescriptive: List[Dict[str, str]] = []
    for it in obj.get("prescriptive", []) or []:
        ptype = to_ascii_snake(str(it.get("type", "")).strip())
        presc = to_ascii_snake(str(it.get("prescription", "")).strip())
        if ptype or presc:
            prescriptive.append({"type": ptype, "prescription": presc})
    prescriptive = dedupe_order(prescriptive)

    arg = obj.get("argument") or {}
    arg_methods = [to_ascii_snake(x) for x in as_str_list(arg.get("method"))]
    arg_devices = [to_ascii_snake(x) for x in as_str_list(arg.get("devices"))]
    argument = {"method": dedupe_order(arg_methods), "devices": dedupe_order(arg_devices), "structure": str(arg.get("structure", "")).strip()}

    actors = [to_ascii_snake(x) for x in as_str_list(obj.get("actors"))]
    actors = dedupe_order(actors)

    sc = obj.get("scope") or {}
    level = to_ascii_snake(str(sc.get("level", "")).strip()) if sc.get("level") not in (None, "") else ""
    context = str(sc.get("context", "")).strip() if sc.get("context") not in (None, "") else None
    scope = {"level": level, "context": context}

    connections: List[Dict[str, str]] = []
    for it in obj.get("connections", []) or []:
        concept = to_ascii_snake(str(it.get("concept", "")).strip())
        connects_to = str(it.get("connects_to", "")).strip()
        relationship = to_ascii_snake(str(it.get("relationship", "")).strip())
        if concept or connects_to or relationship:
            connections.append({"concept": concept, "connects_to": connects_to, "relationship": relationship})
    connections = dedupe_order(connections)

    distinctive = [str(x).strip() for x in as_str_list(obj.get("distinctive")) if str(x).strip()]
    distinctive = dedupe_order(distinctive)

    return {
        "row_id": rid,
        "core_concepts": core_concepts,
        "themes": themes,
        "oppositions": oppositions,
        "metaphors": metaphors,
        "prescriptive": prescriptive,
        "argument": argument,
        "actors": actors,
        "scope": scope,
        "connections": connections,
        "distinctive": distinctive,
    }

def flatten_records(records: List[Dict[str, Any]], base_df: pd.DataFrame) -> pd.DataFrame:
    if not records:
        return pd.DataFrame()
    base_index = base_df.set_index("row_id")
    rows: List[Dict[str, Any]] = []
    for rec in records:
        rid_val = rec.get("row_id")
        try:
            rid = int(rid_val) if rid_val is not None else None
        except Exception:
            rid = None
        if rid is None:
            continue

        philosopher = ""
        quote_text = ""
        try:
            if rid in base_index.index:
                philosopher = str(base_index.loc[rid]["philosopher"])  # type: ignore[index]
                quote_text = str(base_index.loc[rid]["text"])  # type: ignore[index]
        except Exception:
            pass

        optional_meta_cols = [
            "work", "book", "chapter", "chapter_title", "section", "subsection",
            "source", "reference", "translator", "collection", "dynasty", "year",
        ]
        meta_values: Dict[str, Any] = {}
        for col in optional_meta_cols:
            try:
                if col in base_index.columns and rid in base_index.index:
                    meta_values[col] = str(base_index.loc[rid][col])  # type: ignore[index]
            except Exception:
                meta_values[col] = ""

        cc_primary = [it["term"] for it in rec.get("core_concepts", []) if it.get("importance") == "primary"]
        cc_secondary = [it["term"] for it in rec.get("core_concepts", []) if it.get("importance") == "secondary"]
        argument = rec.get("argument", {}) or {}

        row = {
            "row_id": rid,
            "philosopher": philosopher,
            "text": quote_text,
            **meta_values,
            "core_concepts_primary": "; ".join(cc_primary),
            "core_concepts_secondary": "; ".join(cc_secondary),
            "themes": "; ".join(rec.get("themes", [])),
            "oppositions_json": json.dumps(rec.get("oppositions", []), ensure_ascii=False),
            "metaphors_json": json.dumps(rec.get("metaphors", []), ensure_ascii=False),
            "prescriptive_json": json.dumps(rec.get("prescriptive", []), ensure_ascii=False),
            "argument_method": "; ".join(argument.get("method", [])),
            "argument_devices": "; ".join(argument.get("devices", [])),
            "argument_structure": argument.get("structure", ""),
            "actors": "; ".join(rec.get("actors", [])),
            "scope_level": (rec.get("scope", {}) or {}).get("level", ""),
            "scope_context": (rec.get("scope", {}) or {}).get("context", None),
            "connections_json": json.dumps(rec.get("connections", []), ensure_ascii=False),
            "distinctive": "; ".join(rec.get("distinctive", [])),
        }
        rows.append(row)
    return pd.DataFrame(rows)

# --- Processing ---
def process_batch(client, model: str, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Scale max_tokens with batch size to avoid truncation
    scaled_max_tokens = min(4000, 700 + 250 * max(1, len(items)))
    user_prompt = build_batch_prompt(items)
    resp_text = call_llm(client, model, "You are a philosophical analysis system. Return only valid JSON.", user_prompt, max_retries=3, max_tokens=scaled_max_tokens)
    parsed = extract_json(resp_text or "") if resp_text else None
    results_map: Dict[int, Dict[str, Any]] = {}
    if parsed and isinstance(parsed, dict):
        results = parsed.get("results")
        if isinstance(results, list):
            for obj in results:
                try:
                    rid = int(obj.get("row_id"))
                    results_map[rid] = obj
                except Exception:
                    continue

    outputs: List[Dict[str, Any]] = []
    for it in items:
        rid = int(it["row_id"])  # type: ignore
        obj = results_map.get(rid, {"row_id": rid})
        obj["row_id"] = rid
        clean = sanitize_record(obj)
        # If empty, retry single-quote extraction once
        if not is_meaningful(clean):
            single_prompt = build_single_prompt(it)
            single_text = call_llm(client, model, "You are a philosophical analysis system. Return only valid JSON.", single_prompt, max_retries=2, max_tokens=1200)
            parsed_single = extract_json(single_text or "") if single_text else None
            if isinstance(parsed_single, dict):
                parsed_single["row_id"] = rid
                clean_single = sanitize_record(parsed_single)
                if is_meaningful(clean_single):
                    clean = clean_single
        try:
            with open(cache_path(rid), "w", encoding="utf-8") as f:
                json.dump(clean, f, ensure_ascii=False)
        except Exception:
            pass
        outputs.append(clean)
    return outputs

def chunker(lst: List[Dict[str, Any]], n: int):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

def estimate_cost(num_quotes: int, avg_input_tokens: int = 300, avg_output_tokens: int = 220) -> Tuple[float, float, float]:
    in_cost = (num_quotes * avg_input_tokens) / 1_000_000 * 0.20
    out_cost = (num_quotes * avg_output_tokens) / 1_000_000 * 0.60
    return (round(in_cost, 4), round(out_cost, 4), round(in_cost + out_cost, 4))

# --- Run ---
def run():
    os.makedirs(ROOT, exist_ok=True)
    ensure_cache_dirs()

    if not os.path.exists(BASE_CSV):
        raise FileNotFoundError(f"CSV not found at {BASE_CSV}")

    base = load_base(BASE_CSV)
    print(f"Loaded {len(base)} quotes from CSV")

    client = make_client()

    # Determine rows to process
    rows: List[Dict[str, Any]] = []
    out_records: List[Dict[str, Any]] = []
    already = 0
    for _, r in base.iterrows():
        rid = int(r["row_id"])  # type: ignore[index]
        cpath = cache_path(rid)
        if RESUME and os.path.exists(cpath):
            try:
                with open(cpath, "r", encoding="utf-8") as f:
                    cached = json.load(f)
                out_records.append(cached)
                already += 1
                continue
            except Exception:
                pass
        rows.append({
            "row_id": rid,
            "philosopher": str(r.get("philosopher", "")),
            "text": str(r["text"]).strip(),
        })

    if RUN_LIMIT is not None:
        rows = rows[:max(0, RUN_LIMIT)]

    print(f"Cache hits used: {already}; to process now: {len(rows)}")
    total_to_process = len(rows)

    out_lock = Lock()
    if total_to_process:
        print(f"Processing {total_to_process} quotes in batches of {BATCH_SIZE} with concurrency={CONCURRENCY}…")
        futures = []
        done_batches = 0
        with ThreadPoolExecutor(max_workers=max(1, CONCURRENCY)) as ex:
            for batch in chunker(rows, max(1, BATCH_SIZE)):
                futures.append(ex.submit(process_batch, client, MODEL, batch))
            for fut in as_completed(futures):
                try:
                    batch_out = fut.result()
                except Exception as e:
                    print("Batch failed:", e)
                    batch_out = []
                with out_lock:
                    out_records.extend(batch_out)
                    done_batches += 1
                    # Snapshot after each N batches
                    if done_batches % SNAPSHOT_EVERY_N_BATCHES == 0:
                        # Write JSONL snapshot
                        try:
                            with open(OUT_JSONL, "w", encoding="utf-8") as f:
                                for rec in out_records:
                                    f.write(json.dumps(rec, ensure_ascii=False) + "\n")
                        except Exception as e:
                            print("Failed writing JSONL snapshot:", e)
                        # Write CSV snapshot
                        try:
                            flat = flatten_records(out_records, base)
                            if not flat.empty:
                                flat.to_csv(OUT_CSV, index=False)
                        except Exception as e:
                            print("Failed writing CSV snapshot:", e)
                        print(f"Completed {done_batches} batches… snapshot written to CSV/JSONL.")
    else:
        print("Nothing to process (all cached or limit=0). Set RESUME=False to force fresh generations.")

    # Final write
    if out_records:
        try:
            with open(OUT_JSONL, "w", encoding="utf-8") as f:
                for rec in out_records:
                    f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        except Exception as e:
            print("Final JSONL write failed:", e)

        try:
            flat = flatten_records(out_records, base)
            if not flat.empty:
                flat.to_csv(OUT_CSV, index=False)
        except Exception as e:
            print("Final CSV write failed:", e)

    print("Saved:")
    print(" ", OUT_JSONL)
    if os.path.exists(OUT_CSV):
        print(" ", OUT_CSV)

    if out_records:
        num_quotes = len(out_records)
        in_cost, out_cost, total = estimate_cost(num_quotes)
        print(f"Estimated cost for {num_quotes} quotes (avg in 300, out 220): ${total} (in ${in_cost} + out ${out_cost})")


# Execute when run in Colab cell or as a script
if __name__ == "__main__":
    run()




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 1162 quotes from CSV
Cache hits used: 0; to process now: 1162
Processing 1162 quotes in batches of 8 with concurrency=8…
Completed 1 batches… snapshot written to CSV/JSONL.
Completed 2 batches… snapshot written to CSV/JSONL.
Completed 3 batches… snapshot written to CSV/JSONL.


KeyboardInterrupt: 

In [31]:
"""
One-cell Colab runner: paste this entire file into a single Colab cell and run to generate
multi-dimensional keywords into JSONL/CSV.

Behaviour:
- Mounts Drive, installs dependencies, prompts for Together API key if needed
- Processes quotes in batches with concurrency
- Writes snapshots to JSONL/CSV after each batch so you can inspect mid-run

Outputs (in PHILOSOPHERS_ROOT):
- per_quote_qwen_multidim.jsonl
- per_quote_qwen_multidim.csv
- cache/qwen_multidim/{row_id}.json
"""

import os, sys, json, time, subprocess, shutil, math
from typing import Dict, List, Optional, Any, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# 0) Quiet install
try:
    import pandas as pd  # type: ignore
    from openai import OpenAI  # type: ignore
except Exception:
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "pandas", "openai>=1.40.0"], check=True)
    import pandas as pd  # type: ignore
    from openai import OpenAI  # type: ignore

# 1) Mount Drive
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
except Exception:
    pass

# 2) Configuration (edit if needed)
ROOT = "/content/drive/MyDrive/Chinese Philosophers"
BASE_CSV = os.path.join(ROOT, "chinese_philosophers_quotes_corrected.csv")
OUT_JSONL = os.path.join(ROOT, "per_quote_qwen_multidim.jsonl")
OUT_CSV = os.path.join(ROOT, "per_quote_qwen_multidim.csv")
CACHE_DIR = os.path.join(ROOT, "cache", "qwen_multidim")

MODEL = "Qwen/Qwen3-235B-A22B-fp8"
BATCH_SIZE = 8
CONCURRENCY = 8
RUN_LIMIT = None  # None = full run; snapshots are written after each batch
RESUME = False    # False = ignore cache; True = use per-quote cache if present
SNAPSHOT_EVERY_N_BATCHES = 1  # write JSONL/CSV after every N completed batches
CLEAR_CACHE = False  # True to wipe per-quote cache before running
BATCH_PREVIEW = True  # Print a short preview after each snapshot
PREVIEW_MAX_ITEMS = 1

# 3) API key
api_key = os.environ.get("TOGETHER_API_KEY")
try:
    from google.colab import userdata  # type: ignore
    if not api_key:
        api_key = userdata.get("TOGETHER_API_KEY") or userdata.get("TOGETHER_AF")
except Exception:
    pass
if not api_key:
    try:
        import getpass
        api_key = getpass.getpass("Enter TOGETHER_API_KEY: ")
    except Exception:
        pass
if not api_key:
    raise RuntimeError("TOGETHER_API_KEY is required.")

# --- Client ---
def make_client():
    return OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

# --- Data loading ---
def load_base(path: str) -> pd.DataFrame:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Base CSV not found at {path}")
    df = pd.read_csv(path)
    df.columns = [c.strip().lower() for c in df.columns]
    text_col: Optional[str] = None
    if "quote" in df.columns:
        text_col = "quote"
    elif "text" in df.columns:
        text_col = "text"
    else:
        for cand in ["translation", "english", "content", "verse_text", "line_text"]:
            if cand in df.columns:
                text_col = cand
                break
    if not text_col:
        raise RuntimeError("No quote/text-like column found.")
    df = df.rename(columns={text_col: "text"})
    if "row_id" not in df.columns:
        df.insert(0, "row_id", range(len(df)))
    if "philosopher" not in df.columns:
        df["philosopher"] = ""
    df["text"] = df["text"].astype(str).str.strip()
    df = df[df["text"].str.len() > 0].reset_index(drop=True)
    return df

# --- Prompt ---
def build_batch_prompt(items: List[Dict[str, Any]]) -> str:
    header = (
        "You are an expert in classical Chinese philosophy text analysis. "
        "Extract comprehensive, structured features for search, clustering, and deep analysis from English translations.\n\n"
        "FORMATTING RULES\n"
        "- Output VALID JSON only (no extra text)\n"
        "- Keywords: lowercase, snake_case, ASCII\n"
        "- Deduplicate within each dimension\n"
        "- Use standard philosophical terminology consistently\n\n"
        "Return a single JSON object with this structure only:\n"
        "{"\
        "\n  \"results\": ["\
        "\n    {"\
        "\n      \"row_id\": int,"\
        "\n      \"philosopher\": str,"\
        "\n      \"core_concepts\": [{\"term\": str, \"importance\": \"primary\"|\"secondary\"}],"\
        "\n      \"themes\": [str],"\
        "\n      \"oppositions\": [{\"term\": str, \"type\": str, \"description\": str}],"\
        "\n      \"metaphors\": [{\"image\": str, \"category\": str, \"represents\": str}],"\
        "\n      \"prescriptive\": [{\"type\": str, \"prescription\": str}],"\
        "\n      \"argument\": {\"method\": [str], \"devices\": [str], \"structure\": str},"\
        "\n      \"actors\": [str],"\
        "\n      \"scope\": {\"level\": str, \"context\": str|null},"\
        "\n      \"connections\": [{\"concept\": str, \"connects_to\": str, \"relationship\": str}],"\
        "\n      \"distinctive\": [str]"\
        "\n    }"\
        "\n  ]"\
        "\n}\n\n"
        "Analyze strictly within each quote's content. Now the quotes follow as bullet points:"
    )
    body_lines = []
    for it in items:
        rid = int(it["row_id"])  # type: ignore
        ph = str(it.get("philosopher", ""))
        tx = str(it.get("text", ""))
        body_lines.append(f"- row_id: {rid}\n- philosopher: {ph}\n- quote: {tx}\n")
    return header + "\n\n".join(body_lines)

# --- LLM call ---
def call_llm(client, model: str, system_prompt: str, user_prompt: str, max_retries: int = 3, max_tokens: int = 900) -> Optional[str]:
    delay = 1.0
    for attempt in range(max_retries):
        try:
            kwargs = dict(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a philosophical analysis system. Return only valid JSON."},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.2,
                top_p=0.9,
                max_tokens=max_tokens,
            )
            try:
                kwargs["response_format"] = {"type": "json_object"}
            except Exception:
                pass
            resp = client.chat.completions.create(**kwargs)
            return resp.choices[0].message.content
        except Exception as e:
            error_str = str(e).lower()
            if "429" in error_str or "rate limit" in error_str or "too many requests" in error_str:
                rate_limit_delay = min(30.0, delay * 3.0)
                if attempt == max_retries - 1:
                    print(f"Rate limit exceeded after {max_retries} retries: {e}")
                    return None
                print(f"Rate limit hit, waiting {rate_limit_delay:.1f}s before retry {attempt + 1}/{max_retries}")
                time.sleep(rate_limit_delay)
                delay = rate_limit_delay
            else:
                if attempt == max_retries - 1:
                    print(f"LLM call failed after {max_retries} retries: {e}")
                    return None
                time.sleep(delay)
                delay = min(15.0, delay * 2.0)
    return None

def extract_json(text: str) -> Optional[Dict[str, Any]]:
    if not text:
        return None
    try:
        return json.loads(text)
    except Exception:
        pass
    try:
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end != -1 and end > start:
            return json.loads(text[start:end+1])
    except Exception:
        pass
    return None

def build_single_prompt(item: Dict[str, Any]) -> str:
    rid = int(item["row_id"])  # type: ignore
    ph = str(item.get("philosopher", ""))
    tx = str(item.get("text", ""))
    return (
        "You are an expert in classical Chinese philosophy text analysis. "
        "Extract comprehensive, structured features for search, clustering, and deep analysis from English translations.\n\n"
        "FORMATTING RULES\n"
        "- Output VALID JSON only (no extra text)\n"
        "- Keywords: lowercase, snake_case, ASCII\n"
        "- Deduplicate within each dimension\n"
        "- Use standard philosophical terminology consistently\n\n"
        "Return ONLY a JSON object with these keys (populate with actual values, no placeholders):\n"
        "{"\
        f"\n  \"row_id\": {rid},"\
        f"\n  \"philosopher\": \"{ph}\","\
        "\n  \"core_concepts\": [{\"term\": "" , \"importance\": \"primary\"}],"\
        "\n  \"themes\": [""],"\
        "\n  \"oppositions\": [{\"term\": "" , \"type\": \"binary\", \"description\": ""}],"\
        "\n  \"metaphors\": [{\"image\": "" , \"category\": \"water\", \"represents\": ""}],"\
        "\n  \"prescriptive\": [{\"type\": \"moral\", \"prescription\": ""}],"\
        "\n  \"argument\": {\"method\": [""], \"devices\": [""], \"structure\": ""},"\
        "\n  \"actors\": [""],"\
        "\n  \"scope\": {\"level\": \"individual\", \"context\": null},"\
        "\n  \"connections\": [{\"concept\": "" , \"connects_to\": "" , \"relationship\": \"analogy\"}],"\
        "\n  \"distinctive\": ["" ]"\
        "\n}\n\n"
        f"Analyze strictly within the quote.\n- row_id: {rid}\n- philosopher: {ph}\n- quote: {tx}"
    )

def is_meaningful(clean: Dict[str, Any]) -> bool:
    if clean.get("core_concepts"): return True
    if clean.get("themes"): return True
    if clean.get("oppositions"): return True
    if clean.get("metaphors"): return True
    if clean.get("prescriptive"): return True
    arg = clean.get("argument") or {}
    if arg.get("method") or arg.get("devices") or (arg.get("structure") or "").strip():
        return True
    if clean.get("actors"): return True
    sc = clean.get("scope") or {}
    if (sc.get("level") or "").strip(): return True
    if clean.get("connections"): return True
    if clean.get("distinctive"): return True
    return False

# --- Cache ---
def ensure_cache_dirs():
    os.makedirs(CACHE_DIR, exist_ok=True)

def cache_path(row_id: int) -> str:
    return os.path.join(CACHE_DIR, f"{row_id}.json")

# --- Normalization & flattening ---
def to_ascii_snake(s: str) -> str:
    try:
        ascii_txt = s.encode("ascii", "ignore").decode("ascii")
    except Exception:
        ascii_txt = s
    cleaned = ascii_txt.strip().lower().replace(" ", "_").replace("-", "_")
    cleaned = "".join(ch if (ch.isalnum() or ch == "_") else "_" for ch in cleaned)
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")
    return cleaned.strip("_")

def sanitize_record(obj: Dict[str, Any]) -> Dict[str, Any]:
    def as_str_list(v: Any) -> List[str]:
        if isinstance(v, list):
            return [str(x).strip() for x in v if str(x).strip()]
        if v is None:
            return []
        return [str(v).strip()]

    def dedupe_order(items: List[Any]) -> List[Any]:
        seen = set()
        out: List[Any] = []
        for it in items:
            key = json.dumps(it, ensure_ascii=False, sort_keys=True) if isinstance(it, dict) else str(it)
            if key not in seen:
                seen.add(key)
                out.append(it)
        return out

    rid_val = obj.get("row_id", None)
    rid: Optional[int]
    if isinstance(rid_val, (int, float)):
        rid = int(rid_val)
    elif isinstance(rid_val, str):
        try:
            rid = int(rid_val.strip())
        except Exception:
            rid = None
    else:
        rid = None

    core_concepts: List[Dict[str, str]] = []
    for item in obj.get("core_concepts", []) or []:
        term_raw = str(item.get("term", "")).strip()
        term = to_ascii_snake(term_raw) if term_raw else ""
        importance = str(item.get("importance", "secondary")).strip().lower()
        if importance not in {"primary", "secondary"}:
            importance = "secondary"
        if term:
            core_concepts.append({"term": term, "importance": importance})
    core_concepts = dedupe_order(core_concepts)

    themes = [to_ascii_snake(x) for x in as_str_list(obj.get("themes"))]
    themes = dedupe_order(themes)

    oppositions: List[Dict[str, str]] = []
    for it in obj.get("oppositions", []) or []:
        term = to_ascii_snake(str(it.get("term", "")).strip())
        typ = to_ascii_snake(str(it.get("type", "")).strip())
        desc = str(it.get("description", "")).strip()
        if term:
            oppositions.append({"term": term, "type": typ, "description": desc})
    oppositions = dedupe_order(oppositions)

    metaphors: List[Dict[str, str]] = []
    for it in obj.get("metaphors", []) or []:
        image = to_ascii_snake(str(it.get("image", "")).strip())
        category = to_ascii_snake(str(it.get("category", "")).strip())
        represents = to_ascii_snake(str(it.get("represents", "")).strip())
        if image or category or represents:
            metaphors.append({"image": image, "category": category, "represents": represents})
    metaphors = dedupe_order(metaphors)

    prescriptive: List[Dict[str, str]] = []
    for it in obj.get("prescriptive", []) or []:
        ptype = to_ascii_snake(str(it.get("type", "")).strip())
        presc = to_ascii_snake(str(it.get("prescription", "")).strip())
        if ptype or presc:
            prescriptive.append({"type": ptype, "prescription": presc})
    prescriptive = dedupe_order(prescriptive)

    arg = obj.get("argument") or {}
    arg_methods = [to_ascii_snake(x) for x in as_str_list(arg.get("method"))]
    arg_devices = [to_ascii_snake(x) for x in as_str_list(arg.get("devices"))]
    argument = {"method": dedupe_order(arg_methods), "devices": dedupe_order(arg_devices), "structure": str(arg.get("structure", "")).strip()}

    actors = [to_ascii_snake(x) for x in as_str_list(obj.get("actors"))]
    actors = dedupe_order(actors)

    sc = obj.get("scope") or {}
    level = to_ascii_snake(str(sc.get("level", "")).strip()) if sc.get("level") not in (None, "") else ""
    context = str(sc.get("context", "")).strip() if sc.get("context") not in (None, "") else None
    scope = {"level": level, "context": context}

    connections: List[Dict[str, str]] = []
    for it in obj.get("connections", []) or []:
        concept = to_ascii_snake(str(it.get("concept", "")).strip())
        connects_to = str(it.get("connects_to", "")).strip()
        relationship = to_ascii_snake(str(it.get("relationship", "")).strip())
        if concept or connects_to or relationship:
            connections.append({"concept": concept, "connects_to": connects_to, "relationship": relationship})
    connections = dedupe_order(connections)

    distinctive = [str(x).strip() for x in as_str_list(obj.get("distinctive")) if str(x).strip()]
    distinctive = dedupe_order(distinctive)

    return {
        "row_id": rid,
        "core_concepts": core_concepts,
        "themes": themes,
        "oppositions": oppositions,
        "metaphors": metaphors,
        "prescriptive": prescriptive,
        "argument": argument,
        "actors": actors,
        "scope": scope,
        "connections": connections,
        "distinctive": distinctive,
    }

def flatten_records(records: List[Dict[str, Any]], base_df: pd.DataFrame) -> pd.DataFrame:
    if not records:
        return pd.DataFrame()
    base_index = base_df.set_index("row_id")
    rows: List[Dict[str, Any]] = []
    for rec in records:
        rid_val = rec.get("row_id")
        try:
            rid = int(rid_val) if rid_val is not None else None
        except Exception:
            rid = None
        if rid is None:
            continue

        philosopher = ""
        quote_text = ""
        try:
            if rid in base_index.index:
                philosopher = str(base_index.loc[rid]["philosopher"])  # type: ignore[index]
                quote_text = str(base_index.loc[rid]["text"])  # type: ignore[index]
        except Exception:
            pass

        optional_meta_cols = [
            "work", "book", "chapter", "chapter_title", "section", "subsection",
            "source", "reference", "translator", "collection", "dynasty", "year",
        ]
        meta_values: Dict[str, Any] = {}
        for col in optional_meta_cols:
            try:
                if col in base_index.columns and rid in base_index.index:
                    meta_values[col] = str(base_index.loc[rid][col])  # type: ignore[index]
            except Exception:
                meta_values[col] = ""

        cc_primary = [it["term"] for it in rec.get("core_concepts", []) if it.get("importance") == "primary"]
        cc_secondary = [it["term"] for it in rec.get("core_concepts", []) if it.get("importance") == "secondary"]
        argument = rec.get("argument", {}) or {}

        row = {
            "row_id": rid,
            "philosopher": philosopher,
            "text": quote_text,
            **meta_values,
            "core_concepts_primary": "; ".join(cc_primary),
            "core_concepts_secondary": "; ".join(cc_secondary),
            "themes": "; ".join(rec.get("themes", [])),
            "oppositions_json": json.dumps(rec.get("oppositions", []), ensure_ascii=False),
            "metaphors_json": json.dumps(rec.get("metaphors", []), ensure_ascii=False),
            "prescriptive_json": json.dumps(rec.get("prescriptive", []), ensure_ascii=False),
            "argument_method": "; ".join(argument.get("method", [])),
            "argument_devices": "; ".join(argument.get("devices", [])),
            "argument_structure": argument.get("structure", ""),
            "actors": "; ".join(rec.get("actors", [])),
            "scope_level": (rec.get("scope", {}) or {}).get("level", ""),
            "scope_context": (rec.get("scope", {}) or {}).get("context", None),
            "connections_json": json.dumps(rec.get("connections", []), ensure_ascii=False),
            "distinctive": "; ".join(rec.get("distinctive", [])),
        }
        rows.append(row)
    return pd.DataFrame(rows)

# --- Processing ---
def process_batch(client, model: str, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Scale max_tokens with batch size to avoid truncation
    scaled_max_tokens = min(4000, 700 + 250 * max(1, len(items)))
    user_prompt = build_batch_prompt(items)
    resp_text = call_llm(client, model, "You are a philosophical analysis system. Return only valid JSON.", user_prompt, max_retries=3, max_tokens=scaled_max_tokens)
    parsed = extract_json(resp_text or "") if resp_text else None
    results_map: Dict[int, Dict[str, Any]] = {}
    if parsed and isinstance(parsed, dict):
        results = parsed.get("results")
        if isinstance(results, list):
            for obj in results:
                try:
                    rid = int(obj.get("row_id"))
                    results_map[rid] = obj
                except Exception:
                    continue

    outputs: List[Dict[str, Any]] = []
    for it in items:
        rid = int(it["row_id"])  # type: ignore
        obj = results_map.get(rid, {"row_id": rid})
        obj["row_id"] = rid
        clean = sanitize_record(obj)
        # If empty, retry single-quote extraction once
        if not is_meaningful(clean):
            single_prompt = build_single_prompt(it)
            single_text = call_llm(client, model, "You are a philosophical analysis system. Return only valid JSON.", single_prompt, max_retries=2, max_tokens=1200)
            parsed_single = extract_json(single_text or "") if single_text else None
            if isinstance(parsed_single, dict):
                parsed_single["row_id"] = rid
                clean_single = sanitize_record(parsed_single)
                if is_meaningful(clean_single):
                    clean = clean_single
        try:
            with open(cache_path(rid), "w", encoding="utf-8") as f:
                json.dump(clean, f, ensure_ascii=False)
        except Exception:
            pass
        outputs.append(clean)
    return outputs

def chunker(lst: List[Dict[str, Any]], n: int):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

def estimate_cost(num_quotes: int, avg_input_tokens: int = 300, avg_output_tokens: int = 220) -> Tuple[float, float, float]:
    in_cost = (num_quotes * avg_input_tokens) / 1_000_000 * 0.20
    out_cost = (num_quotes * avg_output_tokens) / 1_000_000 * 0.60
    return (round(in_cost, 4), round(out_cost, 4), round(in_cost + out_cost, 4))

# --- Run ---
def run():
    os.makedirs(ROOT, exist_ok=True)
    ensure_cache_dirs()
    if CLEAR_CACHE and os.path.exists(CACHE_DIR):
        try:
            shutil.rmtree(CACHE_DIR)
            ensure_cache_dirs()
            print("Cleared cache directory:", CACHE_DIR)
        except Exception as e:
            print("Failed to clear cache:", e)

    if not os.path.exists(BASE_CSV):
        raise FileNotFoundError(f"CSV not found at {BASE_CSV}")

    base = load_base(BASE_CSV)
    print(f"Loaded {len(base)} quotes from CSV")

    client = make_client()

    # Determine rows to process
    rows: List[Dict[str, Any]] = []
    out_records: List[Dict[str, Any]] = []
    already = 0
    for _, r in base.iterrows():
        rid = int(r["row_id"])  # type: ignore[index]
        cpath = cache_path(rid)
        if RESUME and os.path.exists(cpath):
            try:
                with open(cpath, "r", encoding="utf-8") as f:
                    cached = json.load(f)
                out_records.append(cached)
                already += 1
                continue
            except Exception:
                pass
        rows.append({
            "row_id": rid,
            "philosopher": str(r.get("philosopher", "")),
            "text": str(r["text"]).strip(),
        })

    if RUN_LIMIT is not None:
        rows = rows[:max(0, RUN_LIMIT)]

    print(f"Cache hits used: {already}; to process now: {len(rows)}")
    total_to_process = len(rows)
    total_quotes_all = already + total_to_process

    out_lock = Lock()
    if total_to_process:
        print(f"Processing {total_to_process} quotes in batches of {BATCH_SIZE} with concurrency={CONCURRENCY}…")
        futures = []
        done_batches = 0
        with ThreadPoolExecutor(max_workers=max(1, CONCURRENCY)) as ex:
            for batch in chunker(rows, max(1, BATCH_SIZE)):
                futures.append(ex.submit(process_batch, client, MODEL, batch))
            for fut in as_completed(futures):
                try:
                    batch_out = fut.result()
                except Exception as e:
                    print("Batch failed:", e)
                    batch_out = []
                with out_lock:
                    out_records.extend(batch_out)
                    done_batches += 1
                    # Snapshot after each N batches
                    if done_batches % SNAPSHOT_EVERY_N_BATCHES == 0:
                        # Write JSONL snapshot
                        try:
                            with open(OUT_JSONL, "w", encoding="utf-8") as f:
                                for rec in out_records:
                                    f.write(json.dumps(rec, ensure_ascii=False) + "\n")
                        except Exception as e:
                            print("Failed writing JSONL snapshot:", e)
                        # Write CSV snapshot
                        try:
                            flat = flatten_records(out_records, base)
                            if not flat.empty:
                                flat.to_csv(OUT_CSV, index=False)
                        except Exception as e:
                            print("Failed writing CSV snapshot:", e)
                        # Progress summary (derive completed batches from processed_count for robustness)
                        total_batches = max(1, (total_to_process + max(1, BATCH_SIZE) - 1) // max(1, BATCH_SIZE))
                        processed_count = len(out_records)
                        completed_batches = min(total_batches, math.ceil(processed_count / max(1, BATCH_SIZE)))
                        print(
                            f"Completed {completed_batches}/{total_batches} batches; "
                            f"processed {processed_count}/{total_quotes_all} quotes… snapshot written to CSV/JSONL.",
                            flush=True,
                        )
                        if BATCH_PREVIEW:
                            try:
                                _df = flatten_records(out_records[-max(1, PREVIEW_MAX_ITEMS*BATCH_SIZE):], base)
                                if not _df.empty:
                                    _row = _df.iloc[-1]
                                    print("\nPreview (last item):")
                                    print("- row_id:", _row.get("row_id", ""))
                                    print("- philosopher:", _row.get("philosopher", ""))
                                    _qt = str(_row.get("text", ""))
                                    print("- quote:", (_qt[:200] + "...") if len(_qt) > 200 else _qt)

                                    # Core concepts
                                    _prim = str(_row.get("core_concepts_primary", "")).strip()
                                    _sec = str(_row.get("core_concepts_secondary", "")).strip()
                                    print("- primary concepts:", _prim if _prim else "None")
                                    print("- secondary concepts:", _sec if _sec else "None")

                                    # Themes
                                    _themes = str(_row.get("themes", "")).strip()
                                    print("- themes:", _themes if _themes else "None")

                                    # Oppositions
                                    try:
                                        _opps_raw = _row.get("oppositions_json", "[]")
                                        _opps = json.loads(_opps_raw if isinstance(_opps_raw, str) and _opps_raw.strip() else "[]")
                                    except Exception:
                                        _opps = []
                                    if _opps:
                                        _opp_str = "; ".join([f"{o.get('term','')} ({o.get('type','')})".strip() for o in _opps if o])
                                        print("- oppositions:", _opp_str if _opp_str else "None")
                                    else:
                                        print("- oppositions:", "None")

                                    # Metaphors
                                    try:
                                        _met_raw = _row.get("metaphors_json", "[]")
                                        _mets = json.loads(_met_raw if isinstance(_met_raw, str) and _met_raw.strip() else "[]")
                                    except Exception:
                                        _mets = []
                                    if _mets:
                                        _met_str = "; ".join([f"{m.get('image','')} ({m.get('category','')})→{m.get('represents','')}".strip() for m in _mets if m])
                                        print("- metaphors:", _met_str if _met_str else "None")
                                    else:
                                        print("- metaphors:", "None")

                                    # Prescriptive
                                    try:
                                        _pre_raw = _row.get("prescriptive_json", "[]")
                                        _pres = json.loads(_pre_raw if isinstance(_pre_raw, str) and _pre_raw.strip() else "[]")
                                    except Exception:
                                        _pres = []
                                    if _pres:
                                        _pre_str = "; ".join([f"{p.get('type','')}: {p.get('prescription','')}".strip() for p in _pres if p])
                                        print("- prescriptive:", _pre_str if _pre_str else "None")
                                    else:
                                        print("- prescriptive:", "None")

                                    # Argument
                                    _am = str(_row.get("argument_method", "")).strip()
                                    _ad = str(_row.get("argument_devices", "")).strip()
                                    _as = str(_row.get("argument_structure", "")).strip()
                                    print("- argument methods:", _am if _am else "None")
                                    print("- rhetorical devices:", _ad if _ad else "None")
                                    print("- structure:", _as if _as else "None")

                                    # Actors
                                    _actors = str(_row.get("actors", "")).strip()
                                    print("- actors:", _actors if _actors else "None")

                                    # Scope
                                    _lvl = str(_row.get("scope_level", ""))
                                    _ctx = _row.get("scope_context", "")
                                    if _lvl and _ctx not in (None, "", float('nan')):
                                        print("- scope:", f"{_lvl} [{_ctx}]")
                                    elif _lvl:
                                        print("- scope:", _lvl)
                                    else:
                                        print("- scope:", "None")

                                    # Connections
                                    try:
                                        _conn_raw = _row.get("connections_json", "[]")
                                        _conns = json.loads(_conn_raw if isinstance(_conn_raw, str) and _conn_raw.strip() else "[]")
                                    except Exception:
                                        _conns = []
                                    if _conns:
                                        _conn_str = "; ".join([f"{c.get('concept','')}→{c.get('connects_to','')} ({c.get('relationship','')})".strip() for c in _conns if c])
                                        print("- connections:", _conn_str if _conn_str else "None")
                                    else:
                                        print("- connections:", "None")

                                    # Distinctive
                                    _dist = str(_row.get("distinctive", "")).strip()
                                    print("- distinctive:", _dist if _dist else "None")
                            except Exception as _e:
                                print("Preview failed:", _e)
    else:
        print("Nothing to process (all cached or limit=0). Set RESUME=False to force fresh generations.")

    # Final write
    if out_records:
        try:
            with open(OUT_JSONL, "w", encoding="utf-8") as f:
                for rec in out_records:
                    f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        except Exception as e:
            print("Final JSONL write failed:", e)

        try:
            flat = flatten_records(out_records, base)
            if not flat.empty:
                flat.to_csv(OUT_CSV, index=False)
        except Exception as e:
            print("Final CSV write failed:", e)

    print("Saved:")
    print(" ", OUT_JSONL)
    if os.path.exists(OUT_CSV):
        print(" ", OUT_CSV)

    if out_records:
        num_quotes = len(out_records)
        in_cost, out_cost, total = estimate_cost(num_quotes)
        print(f"Estimated cost for {num_quotes} quotes (avg in 300, out 220): ${total} (in ${in_cost} + out ${out_cost})")


# Execute when run in Colab cell or as a script
if __name__ == "__main__":
    run()




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded 1162 quotes from CSV
Cache hits used: 0; to process now: 1162
Processing 1162 quotes in batches of 8 with concurrency=8…
Completed 1/146 batches; processed 8/1162 quotes… snapshot written to CSV/JSONL.

Preview (last item):
- row_id: 15
- philosopher: Confucius
- quote: "The Master said, ‘The gentleman seeks neither a full belly nor a comfortable home. He is quick in action but cautious in speech. He goes to men possessed of the Way to have himself put right. Such a ...
- primary concepts: junzi; learning
- secondary concepts: self_restraint
- themes: moral_character; intellectual_pursuit
- oppositions: material_comfort_vs_moral_pursuit (ethical)
- metaphors: None
- prescriptive: ethical_guidance: seek_neither_full_belly_nor_comfortable_home; learning_guidance: be_quick_in_action_but_cautious_in_speech; self_cultivation: consult_those_who_possess_the_w

In [33]:
# === COMPLETE QUOTE EMBEDDINGS GENERATOR (FIXED) ===
# Paste this cell into Colab and run to create embeddings for all philosopher quotes

import os
import pandas as pd
import numpy as np
from typing import List, Tuple
from sklearn.metrics.pairwise import cosine_similarity

# Install dependencies
!pip -q install sentence-transformers pandas numpy scikit-learn

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Configuration
ROOT = "/content/drive/MyDrive/Chinese Philosophers"
CSV_PATH = os.path.join(ROOT, "chinese_philosophers_quotes_corrected.csv")
EMBEDDINGS_CSV = os.path.join(ROOT, "quote_embeddings_full.csv")
EMBEDDINGS_NPY = os.path.join(ROOT, "quote_embeddings.npy")
METADATA_CSV = os.path.join(ROOT, "quote_metadata.csv")

os.makedirs(ROOT, exist_ok=True)

def load_and_embed_quotes():
    """Load quotes and create embeddings"""
    print("📚 Loading quotes...")

    # Load the CSV
    df = pd.read_csv(CSV_PATH)
    df.columns = [c.strip().lower() for c in df.columns]

    # Add row_id if missing
    if "row_id" not in df.columns:
        df.insert(0, "row_id", range(len(df)))
        print("✅ Added row_id column")

    # Ensure we have the right columns
    if "text" not in df.columns:
        for cand in ["quote", "translation", "english", "content"]:
            if cand in df.columns:
                df = df.rename(columns={cand: "text"})
                break

    print(f"✅ Loaded {len(df)} quotes")
    print(f"Columns: {list(df.columns)}")

    # Create embeddings
    print("🧠 Computing embeddings...")
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    texts = df["text"].astype(str).tolist()

    # Compute embeddings in batches
    batch_size = 128
    embeddings_list = []

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_embeddings = model.encode(batch, normalize_embeddings=True, show_progress_bar=False)
        embeddings_list.append(batch_embeddings)
        print(f"  Processed {min(i+batch_size, len(texts))}/{len(texts)} quotes")

    # Combine all embeddings
    embeddings = np.vstack(embeddings_list).astype(np.float32)
    print(f"✅ Created embeddings: {embeddings.shape}")

    return df, embeddings

def save_embeddings(df, embeddings):
    """Save embeddings in multiple formats"""
    print("💾 Saving embeddings...")

    # 1. Save raw embeddings as numpy array (most efficient)
    np.save(EMBEDDINGS_NPY, embeddings)
    print(f"✅ Saved embeddings array: {EMBEDDINGS_NPY}")

    # 2. Save metadata separately (for easy loading)
    # Use available columns, fallback gracefully
    metadata_cols = ["row_id", "text"]
    if "philosopher" in df.columns:
        metadata_cols.insert(1, "philosopher")
    if "work" in df.columns:
        metadata_cols.insert(-1, "work")
    if "chapter_verse" in df.columns:
        metadata_cols.insert(-1, "chapter_verse")

    metadata_df = df[metadata_cols].copy()
    metadata_df.to_csv(METADATA_CSV, index=False)
    print(f"✅ Saved metadata: {METADATA_CSV}")

    # 3. Save complete CSV with embeddings (larger but self-contained)
    # Create column names for embedding dimensions
    embed_cols = [f"dim_{i}" for i in range(embeddings.shape[1])]

    # Combine metadata with embeddings
    full_df = metadata_df.copy()
    embed_df = pd.DataFrame(embeddings, columns=embed_cols)
    result_df = pd.concat([full_df, embed_df], axis=1)

    result_df.to_csv(EMBEDDINGS_CSV, index=False)
    print(f"✅ Saved full embeddings CSV: {EMBEDDINGS_CSV}")

    return result_df

def create_search_function():
    """Create a search function for finding similar quotes"""
    search_code = '''
def search_similar_quotes(query_text, top_k=5):
    """Find quotes most similar to the query text"""
    # Load embeddings and metadata
    embeddings = np.load("''' + EMBEDDINGS_NPY + '''")
    metadata = pd.read_csv("''' + METADATA_CSV + '''")

    # Encode query
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    query_embedding = model.encode([query_text], normalize_embeddings=True)

    # Compute similarities
    similarities = cosine_similarity(query_embedding, embeddings)[0]

    # Get top results
    top_indices = np.argsort(similarities)[::-1][:top_k]

    results = []
    for idx in top_indices:
        result = {
            'row_id': int(metadata.iloc[idx]['row_id']),
            'text': metadata.iloc[idx]['text'],
            'similarity': float(similarities[idx])
        }
        # Add optional columns if they exist
        if 'philosopher' in metadata.columns:
            result['philosopher'] = metadata.iloc[idx]['philosopher']
        if 'work' in metadata.columns:
            result['work'] = metadata.iloc[idx]['work']
        if 'chapter_verse' in metadata.columns:
            result['chapter_verse'] = metadata.iloc[idx]['chapter_verse']

        results.append(result)

    return results

# Example usage:
# results = search_similar_quotes("love and compassion", top_k=3)
# for r in results:
#     philosopher = r.get('philosopher', 'Unknown')
#     print(f"{r['similarity']:.3f} | {philosopher}: {r['text'][:100]}...")
'''

    # Save search function to file
    search_file = os.path.join(ROOT, "quote_search.py")
    with open(search_file, 'w') as f:
        f.write(search_code)

    print(f"✅ Saved search function: {search_file}")
    return search_code

# === MAIN EXECUTION ===
print("🚀 Starting quote embedding generation...")

try:
    # Load and embed
    df, embeddings = load_and_embed_quotes()

    # Save in multiple formats
    result_df = save_embeddings(df, embeddings)

    # Create search functionality
    search_code = create_search_function()

    print("\n" + "="*60)
    print("✅ SUCCESS! Created the following files:")
    print(f"  📊 {EMBEDDINGS_CSV} - Full CSV with embeddings")
    print(f"  🔢 {EMBEDDINGS_NPY} - Numpy array (efficient)")
    print(f"  📋 {METADATA_CSV} - Quote metadata only")
    print(f"  🔍 {os.path.join(ROOT, 'quote_search.py')} - Search function")

    print(f"\n📈 Embedding dimensions: {embeddings.shape[1]}")
    print(f"📚 Total quotes embedded: {len(df)}")

    # Show preview
    print("\n🔍 Preview of embeddings CSV:")
    preview_cols = ['row_id', 'text']
    if 'philosopher' in result_df.columns:
        preview_cols.insert(1, 'philosopher')
    preview_cols.extend(['dim_0', 'dim_1', 'dim_2'])
    print(result_df[preview_cols].head(3))

    # Test search function
    print("\n🧪 Testing search function...")
    exec(search_code)  # Load the search function

    test_results = search_similar_quotes("universal love", top_k=3)
    print("Top 3 results for 'universal love':")
    for i, r in enumerate(test_results, 1):
        philosopher = r.get('philosopher', 'Unknown')
        print(f"  {i}. [{r['similarity']:.3f}] {philosopher}: {r['text'][:80]}...")

except Exception as e:
    print(f"❌ Error: {e}")
    import traceback
    traceback.print_exc()

print("\n🎉 Done! You now have a complete vectorial search system for philosopher quotes!")

Mounted at /content/drive
🚀 Starting quote embedding generation...
📚 Loading quotes...
✅ Added row_id column
✅ Loaded 1162 quotes
Columns: ['row_id', 'philosopher', 'work', 'chapter_verse', 'text', 'source']
🧠 Computing embeddings...
  Processed 128/1162 quotes
  Processed 256/1162 quotes
  Processed 384/1162 quotes
  Processed 512/1162 quotes
  Processed 640/1162 quotes
  Processed 768/1162 quotes
  Processed 896/1162 quotes
  Processed 1024/1162 quotes
  Processed 1152/1162 quotes
  Processed 1162/1162 quotes
✅ Created embeddings: (1162, 384)
💾 Saving embeddings...
✅ Saved embeddings array: /content/drive/MyDrive/Chinese Philosophers/quote_embeddings.npy
✅ Saved metadata: /content/drive/MyDrive/Chinese Philosophers/quote_metadata.csv
✅ Saved full embeddings CSV: /content/drive/MyDrive/Chinese Philosophers/quote_embeddings_full.csv
✅ Saved search function: /content/drive/MyDrive/Chinese Philosophers/quote_search.py

✅ SUCCESS! Created the following files:
  📊 /content/drive/MyDrive/Ch

In [34]:
# ============================================
# SECTION 1: INITIAL SETUP AND IMPORTS
# ============================================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Core imports
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

# Visualization imports
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning imports
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Set visualization defaults
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11

# Define paths
BASE_PATH = '/content/drive/MyDrive/Chinese Philosophers/'
print(f"✓ Setup complete. Base path: {BASE_PATH}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✓ Setup complete. Base path: /content/drive/MyDrive/Chinese Philosophers/


In [38]:
# ============================================
# SECTION 2: HELPER FUNCTIONS FOR DATA LOADING & PROCESSING
# ============================================

def load_embeddings_and_metadata(base_path=BASE_PATH):
    """Load the raw embeddings and corresponding metadata."""
    try:
        # Load the numpy embeddings
        embeddings = np.load(f'{base_path}quote_embeddings.npy')

        # Load the metadata
        metadata = pd.read_csv(f'{base_path}quote_metadata.csv')

        print(f"✓ Loaded embeddings: shape {embeddings.shape}")
        print(f"✓ Loaded metadata: {len(metadata)} rows")
        print(f"  Philosophers: {metadata['philosopher'].value_counts().to_dict()}")

        return embeddings, metadata
    except Exception as e:
        print(f"❌ Error loading embeddings/metadata: {e}")
        return None, None

def load_csv_safe(filename, base_path=BASE_PATH):
    """Safely load CSV with error handling."""
    try:
        df = pd.read_csv(f'{base_path}{filename}')
        print(f"✓ Loaded {filename}: {len(df)} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"❌ Error loading {filename}: {e}")
        return None

def parse_semicolon_field(field_value):
    """Parse semicolon-separated string into list, handling NaN."""
    if pd.isna(field_value):
        return []
    return [item.strip() for item in str(field_value).split(';') if item.strip()]

def parse_json_field(field_value):
    """Parse JSON string field, returning empty list if invalid."""
    if pd.isna(field_value):
        return []
    try:
        return json.loads(field_value)
    except:
        return []

def compute_philosopher_centroids(embeddings, metadata):
    """Compute the centroid (mean vector) for each philosopher."""
    centroids = {}

    for philosopher in metadata['philosopher'].unique():
        # Get indices for this philosopher
        mask = metadata['philosopher'] == philosopher
        philosopher_embeddings = embeddings[mask]

        # Compute centroid (mean across all quotes)
        centroid = philosopher_embeddings.mean(axis=0)
        centroids[philosopher] = centroid

        print(f"✓ Computed centroid for {philosopher}: shape {centroid.shape}")

    return centroids

def find_bridge_quotes(embeddings, metadata, n_bridges=10):
    """Find quotes from each philosopher closest to the other's centroid."""
    centroids = compute_philosopher_centroids(embeddings, metadata)
    bridges = {}

    for source_phil in ['Confucius', 'Mozi']:
        target_phil = 'Mozi' if source_phil == 'Confucius' else 'Confucius'

        # Get source philosopher's quotes
        source_mask = metadata['philosopher'] == source_phil
        source_embeddings = embeddings[source_mask]
        source_metadata = metadata[source_mask].copy()

        # Compute similarities to target centroid
        target_centroid = centroids[target_phil].reshape(1, -1)
        similarities = cosine_similarity(source_embeddings, target_centroid).flatten()

        # Get top N most similar
        top_indices = similarities.argsort()[-n_bridges:][::-1]

        bridge_df = source_metadata.iloc[top_indices].copy()
        bridge_df['similarity_to_other'] = similarities[top_indices]

        bridges[f"{source_phil}_to_{target_phil}"] = bridge_df

    return bridges

# Test the loading functions
print("=" * 50)
print("TESTING HELPER FUNCTIONS")
print("=" * 50)

# Test embedding loading
embeddings, metadata = load_embeddings_and_metadata()

# Test CSV loading
quotes_df = load_csv_safe('chinese_philosophers_quotes_corrected.csv')
analysis_df = load_csv_safe('per_quote_qwen_multidim.csv')

print("\n✓ All helper functions defined and tested!")

TESTING HELPER FUNCTIONS
✓ Loaded embeddings: shape (1162, 384)
✓ Loaded metadata: 1162 rows
  Philosophers: {'Mozi': 634, 'Confucius': 528}
✓ Loaded chinese_philosophers_quotes_corrected.csv: 1162 rows, 5 columns
✓ Loaded per_quote_qwen_multidim.csv: 1090 rows, 19 columns

✓ All helper functions defined and tested!


In [39]:
# ============================================
# ANALYSIS 1: EMBEDDING SPACE PHILOSOPHICAL TERRITORIES
# ============================================

print("ANALYSIS 1: Philosophical Territories in Embedding Space")
print("=" * 60)

# Reduce embeddings to 2D using UMAP and PCA
print("Computing dimensionality reductions...")

# UMAP reduction (better for preserving local structure)
umap_reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=30, min_dist=0.1)
embeddings_umap = umap_reducer.fit_transform(embeddings)

# PCA reduction (better for preserving global structure)
pca_reducer = PCA(n_components=2, random_state=42)
embeddings_pca = pca_reducer.fit_transform(embeddings)

print(f"✓ UMAP variance preserved: Not applicable (non-linear)")
print(f"✓ PCA variance explained: {pca_reducer.explained_variance_ratio_.sum():.2%}")

# Create visualization dataframe
viz_df = metadata.copy()
viz_df['UMAP_1'] = embeddings_umap[:, 0]
viz_df['UMAP_2'] = embeddings_umap[:, 1]
viz_df['PCA_1'] = embeddings_pca[:, 0]
viz_df['PCA_2'] = embeddings_pca[:, 1]

# Create side-by-side interactive plots
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('UMAP Projection', 'PCA Projection'),
    horizontal_spacing=0.15
)

# Color mapping
colors = {'Confucius': '#1f77b4', 'Mozi': '#ff7f0e'}

# UMAP plot
for philosopher in ['Confucius', 'Mozi']:
    phil_data = viz_df[viz_df['philosopher'] == philosopher]
    fig.add_trace(
        go.Scatter(
            x=phil_data['UMAP_1'],
            y=phil_data['UMAP_2'],
            mode='markers',
            name=philosopher,
            marker=dict(
                color=colors[philosopher],
                size=6,
                opacity=0.7,
                line=dict(width=0.5, color='white')
            ),
            text=[f"{row['philosopher']}<br>Quote: {row['text'][:100]}..."
                  for _, row in phil_data.iterrows()],
            hovertemplate='%{text}<extra></extra>',
            legendgroup=philosopher,
            showlegend=True
        ),
        row=1, col=1
    )

# PCA plot
for philosopher in ['Confucius', 'Mozi']:
    phil_data = viz_df[viz_df['philosopher'] == philosopher]
    fig.add_trace(
        go.Scatter(
            x=phil_data['PCA_1'],
            y=phil_data['PCA_2'],
            mode='markers',
            name=philosopher,
            marker=dict(
                color=colors[philosopher],
                size=6,
                opacity=0.7,
                line=dict(width=0.5, color='white')
            ),
            text=[f"{row['philosopher']}<br>Quote: {row['text'][:100]}..."
                  for _, row in phil_data.iterrows()],
            hovertemplate='%{text}<extra></extra>',
            legendgroup=philosopher,
            showlegend=False
        ),
        row=1, col=2
    )

fig.update_layout(
    title_text="<b>Philosophical Territories in Semantic Space</b><br><sup>Do Mozi and Confucius occupy distinct regions of thought?</sup>",
    title_x=0.5,
    height=600,
    width=1200,
    template='plotly_white',
    hovermode='closest',
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
    )
)

fig.update_xaxes(title_text="UMAP Dimension 1", row=1, col=1)
fig.update_yaxes(title_text="UMAP Dimension 2", row=1, col=1)
fig.update_xaxes(title_text=f"PCA Dimension 1 ({pca_reducer.explained_variance_ratio_[0]:.1%} var)", row=1, col=2)
fig.update_yaxes(title_text=f"PCA Dimension 2 ({pca_reducer.explained_variance_ratio_[1]:.1%} var)", row=1, col=2)

fig.show()

# Compute overlap statistics
print("\n📊 TERRITORIAL ANALYSIS:")
print("-" * 40)

# Find border regions (quotes in mixed neighborhoods)
from sklearn.neighbors import NearestNeighbors

for projection_name, projection_data in [('UMAP', embeddings_umap), ('PCA', embeddings_pca)]:
    # Find 10 nearest neighbors for each quote
    nbrs = NearestNeighbors(n_neighbors=11).fit(projection_data)  # 11 because it includes self
    distances, indices = nbrs.kneighbors(projection_data)

    # Check philosophical diversity in neighborhood
    border_quotes = []
    for i, neighbors in enumerate(indices):
        neighbor_philosophers = metadata.iloc[neighbors[1:]]['philosopher'].values  # Exclude self
        confucius_count = (neighbor_philosophers == 'Confucius').sum()
        mozi_count = (neighbor_philosophers == 'Mozi').sum()

        # Consider it a border quote if neighborhood is mixed (at least 30% from other philosopher)
        if min(confucius_count, mozi_count) >= 3:
            border_quotes.append(i)

    border_percentage = len(border_quotes) / len(metadata) * 100
    print(f"\n{projection_name} Results:")
    print(f"  • Border quotes (mixed neighborhoods): {len(border_quotes)} ({border_percentage:.1f}%)")
    print(f"  • Suggests {'significant overlap' if border_percentage > 30 else 'moderate overlap' if border_percentage > 15 else 'distinct territories'}")

print("\n💡 PHILOSOPHICAL INSIGHT:")
print("If the philosophers occupy distinct regions with minimal overlap,")
print("it suggests fundamentally different conceptual frameworks.")
print("Border quotes represent potential common ground or transition ideas.")

ANALYSIS 1: Philosophical Territories in Embedding Space
Computing dimensionality reductions...
✓ UMAP variance preserved: Not applicable (non-linear)
✓ PCA variance explained: 13.89%



📊 TERRITORIAL ANALYSIS:
----------------------------------------

UMAP Results:
  • Border quotes (mixed neighborhoods): 57 (4.9%)
  • Suggests distinct territories

PCA Results:
  • Border quotes (mixed neighborhoods): 239 (20.6%)
  • Suggests moderate overlap

💡 PHILOSOPHICAL INSIGHT:
If the philosophers occupy distinct regions with minimal overlap,
it suggests fundamentally different conceptual frameworks.
Border quotes represent potential common ground or transition ideas.


In [40]:
# ============================================
# ANALYSIS 2: CONCEPT FREQUENCY RADAR CHARTS
# ============================================

print("ANALYSIS 2: Philosophical Fingerprints via Concept Radar Charts")
print("=" * 60)

# Merge the analysis data to get concept information
merged_df = metadata.merge(analysis_df[['row_id', 'core_concepts_primary', 'core_concepts_secondary']],
                           on='row_id', how='left')

# Parse and collect all primary concepts
all_primary_concepts = []
philosopher_concepts = {'Confucius': [], 'Mozi': []}

for _, row in merged_df.iterrows():
    if pd.notna(row['core_concepts_primary']):
        concepts = parse_semicolon_field(row['core_concepts_primary'])
        all_primary_concepts.extend(concepts)
        philosopher_concepts[row['philosopher']].extend(concepts)

# Count concept frequencies for each philosopher
from collections import Counter

confucius_counts = Counter(philosopher_concepts['Confucius'])
mozi_counts = Counter(philosopher_concepts['Mozi'])

# Get top 15 most common concepts across both philosophers
all_concept_counts = Counter(all_primary_concepts)
top_concepts = [concept for concept, _ in all_concept_counts.most_common(15)]

print(f"✓ Found {len(set(all_primary_concepts))} unique primary concepts")
print(f"✓ Confucius total concept mentions: {len(philosopher_concepts['Confucius'])}")
print(f"✓ Mozi total concept mentions: {len(philosopher_concepts['Mozi'])}")

# Prepare data for radar chart
radar_data = []
for concept in top_concepts:
    # Normalize by total concepts to get relative frequency
    conf_freq = confucius_counts.get(concept, 0) / len(philosopher_concepts['Confucius']) * 100
    mozi_freq = mozi_counts.get(concept, 0) / len(philosopher_concepts['Mozi']) * 100
    radar_data.append({
        'concept': concept.replace('_', ' ').title(),
        'Confucius': conf_freq,
        'Mozi': mozi_freq
    })

radar_df = pd.DataFrame(radar_data)

# Create side-by-side radar charts
fig = go.Figure()

# Add Confucius trace
fig.add_trace(go.Scatterpolar(
    r=radar_df['Confucius'].values,
    theta=radar_df['concept'].values,
    fill='toself',
    name='Confucius',
    line_color='#1f77b4',
    fillcolor='rgba(31, 119, 180, 0.3)',
    hovertemplate='%{theta}<br>Frequency: %{r:.1f}%<extra></extra>'
))

# Add Mozi trace
fig.add_trace(go.Scatterpolar(
    r=radar_df['Mozi'].values,
    theta=radar_df['concept'].values,
    fill='toself',
    name='Mozi',
    line_color='#ff7f0e',
    fillcolor='rgba(255, 127, 14, 0.3)',
    hovertemplate='%{theta}<br>Frequency: %{r:.1f}%<extra></extra>'
))

fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, max(radar_df['Confucius'].max(), radar_df['Mozi'].max()) * 1.1],
            ticksuffix='%'
        )
    ),
    showlegend=True,
    title={
        'text': "<b>Philosophical Fingerprints: Core Concept Frequencies</b><br><sup>What does each philosopher emphasize?</sup>",
        'x': 0.5,
        'xanchor': 'center'
    },
    height=700,
    width=900,
    template='plotly_white'
)

fig.show()

# Create comparison bar chart for clearer differences
print("\n📊 CONCEPT EMPHASIS COMPARISON:")
print("-" * 40)

# Calculate difference in emphasis
radar_df['Difference'] = radar_df['Mozi'] - radar_df['Confucius']
radar_df['Abs_Difference'] = radar_df['Difference'].abs()
radar_df_sorted = radar_df.sort_values('Difference')

# Create diverging bar chart
fig2 = go.Figure()

# Add bars colored by who emphasizes more
colors = ['#ff7f0e' if x > 0 else '#1f77b4' for x in radar_df_sorted['Difference']]

fig2.add_trace(go.Bar(
    y=radar_df_sorted['concept'],
    x=radar_df_sorted['Difference'],
    orientation='h',
    marker_color=colors,
    text=[f"{abs(x):.1f}%" for x in radar_df_sorted['Difference']],
    textposition='outside',
    hovertemplate='%{y}<br>Difference: %{x:.1f}%<br>(Positive = Mozi emphasizes more)<extra></extra>'
))

fig2.update_layout(
    title={
        'text': "<b>Concept Emphasis Differences</b><br><sup>← Confucius emphasizes more | Mozi emphasizes more →</sup>",
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title="Difference in Frequency (%)",
    yaxis_title="",
    height=600,
    width=900,
    template='plotly_white',
    xaxis=dict(zeroline=True, zerolinewidth=2, zerolinecolor='black'),
    showlegend=False
)

fig2.show()

# Analysis summary
print("\nTop concepts by philosopher:")
print("\n🔷 CONFUCIUS's Top 5 Concepts:")
for concept, count in confucius_counts.most_common(5):
    freq = count / len(philosopher_concepts['Confucius']) * 100
    print(f"  • {concept.replace('_', ' ').title()}: {freq:.1f}%")

print("\n🔶 MOZI's Top 5 Concepts:")
for concept, count in mozi_counts.most_common(5):
    freq = count / len(philosopher_concepts['Mozi']) * 100
    print(f"  • {concept.replace('_', ' ').title()}: {freq:.1f}%")

# Find unique concepts
unique_confucius = set(confucius_counts.keys()) - set(mozi_counts.keys())
unique_mozi = set(mozi_counts.keys()) - set(confucius_counts.keys())

print(f"\n💡 PHILOSOPHICAL INSIGHT:")
print(f"  • Confucius has {len(unique_confucius)} unique concepts")
print(f"  • Mozi has {len(unique_mozi)} unique concepts")
print(f"  • Shared concepts: {len(set(confucius_counts.keys()) & set(mozi_counts.keys()))}")
print("\nThe radar chart reveals each philosopher's 'conceptual DNA' -")
print("what ideas they return to repeatedly in their teachings.")

ANALYSIS 2: Philosophical Fingerprints via Concept Radar Charts
✓ Found 1027 unique primary concepts
✓ Confucius total concept mentions: 1246
✓ Mozi total concept mentions: 1579



📊 CONCEPT EMPHASIS COMPARISON:
----------------------------------------



Top concepts by philosopher:

🔷 CONFUCIUS's Top 5 Concepts:
  • Virtue: 5.6%
  • Junzi: 5.4%
  • Propriety: 4.9%
  • Benevolence: 4.0%
  • Ren: 3.9%

🔶 MOZI's Top 5 Concepts:
  • Yi: 3.7%
  • Defensive Military Strategy: 2.5%
  • Meritocracy: 2.3%
  • Universal Love: 2.2%
  • Ren: 1.9%

💡 PHILOSOPHICAL INSIGHT:
  • Confucius has 323 unique concepts
  • Mozi has 622 unique concepts
  • Shared concepts: 82

The radar chart reveals each philosopher's 'conceptual DNA' -
what ideas they return to repeatedly in their teachings.


In [41]:
# ============================================
# ANALYSIS 3: RHETORICAL STRATEGY COMPARISON
# ============================================

print("ANALYSIS 3: Rhetorical Strategies - How They Argue")
print("=" * 60)

# Ensure we have the analysis data with rhetorical fields
rhetorical_df = metadata.merge(
    analysis_df[['row_id', 'argument_method', 'argument_devices', 'argument_structure']],
    on='row_id',
    how='left'
)

# Remove rows with missing rhetorical data
rhetorical_df = rhetorical_df.dropna(subset=['argument_method'])
print(f"✓ Analyzing {len(rhetorical_df)} quotes with rhetorical data")

# 1. ARGUMENT METHODS COMPARISON
print("\n📊 ARGUMENT METHODS:")
print("-" * 40)

method_comparison = []
for philosopher in ['Confucius', 'Mozi']:
    phil_data = rhetorical_df[rhetorical_df['philosopher'] == philosopher]
    method_counts = phil_data['argument_method'].value_counts()
    method_freq = (method_counts / len(phil_data) * 100).to_dict()

    for method, freq in method_freq.items():
        method_comparison.append({
            'Philosopher': philosopher,
            'Method': method.replace('_', ' ').title(),
            'Frequency': freq,
            'Count': method_counts[method]
        })

method_df = pd.DataFrame(method_comparison)

# Create grouped bar chart for argument methods
fig_methods = go.Figure()

for philosopher in ['Confucius', 'Mozi']:
    phil_methods = method_df[method_df['Philosopher'] == philosopher]
    color = '#1f77b4' if philosopher == 'Confucius' else '#ff7f0e'

    fig_methods.add_trace(go.Bar(
        name=philosopher,
        x=phil_methods['Method'],
        y=phil_methods['Frequency'],
        marker_color=color,
        text=[f"{freq:.1f}%" for freq in phil_methods['Frequency']],
        textposition='outside',
        hovertemplate='%{x}<br>Frequency: %{y:.1f}%<br>Count: %{customdata}<extra></extra>',
        customdata=phil_methods['Count']
    ))

fig_methods.update_layout(
    title="<b>Argument Methods: How They Make Their Points</b>",
    xaxis_title="Argument Method",
    yaxis_title="Frequency (%)",
    barmode='group',
    height=500,
    width=1000,
    template='plotly_white',
    legend=dict(x=0.85, y=0.95)
)

fig_methods.show()

# 2. ARGUMENT DEVICES ANALYSIS
print("\n📊 RHETORICAL DEVICES:")
print("-" * 40)

# Parse and collect all devices
all_devices = []
philosopher_devices = {'Confucius': [], 'Mozi': []}

for _, row in rhetorical_df.iterrows():
    if pd.notna(row['argument_devices']):
        devices = parse_semicolon_field(row['argument_devices'])
        all_devices.extend(devices)
        philosopher_devices[row['philosopher']].extend(devices)

# Count device frequencies
from collections import Counter
confucius_devices = Counter(philosopher_devices['Confucius'])
mozi_devices = Counter(philosopher_devices['Mozi'])

# Get top 10 most common devices
all_device_counts = Counter(all_devices)
top_devices = [device for device, _ in all_device_counts.most_common(10)]

# Create comparison data
device_comparison = []
for device in top_devices:
    conf_count = confucius_devices.get(device, 0)
    mozi_count = mozi_devices.get(device, 0)

    device_comparison.append({
        'Device': device.replace('_', ' ').title(),
        'Confucius': conf_count / len(philosopher_devices['Confucius']) * 100,
        'Mozi': mozi_count / len(philosopher_devices['Mozi']) * 100,
        'Difference': (mozi_count / len(philosopher_devices['Mozi']) * 100) -
                     (conf_count / len(philosopher_devices['Confucius']) * 100)
    })

device_df = pd.DataFrame(device_comparison)
device_df = device_df.sort_values('Difference')

# Create diverging bar chart for devices
fig_devices = go.Figure()

colors = ['#ff7f0e' if x > 0 else '#1f77b4' for x in device_df['Difference']]

fig_devices.add_trace(go.Bar(
    y=device_df['Device'],
    x=device_df['Difference'],
    orientation='h',
    marker_color=colors,
    text=[f"{abs(x):.1f}%" for x in device_df['Difference']],
    textposition='outside',
    hovertemplate='%{y}<br>Difference: %{x:.1f}%<br>(Positive = Mozi uses more)<extra></extra>'
))

fig_devices.update_layout(
    title={
        'text': "<b>Rhetorical Device Preferences</b><br><sup>← Confucius uses more | Mozi uses more →</sup>",
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title="Difference in Usage (%)",
    yaxis_title="",
    height=500,
    width=900,
    template='plotly_white',
    xaxis=dict(zeroline=True, zerolinewidth=2, zerolinecolor='black')
)

fig_devices.show()

# 3. ARGUMENT STRUCTURE PATTERNS
print("\n📊 ARGUMENT STRUCTURES:")
print("-" * 40)

# Analyze argument structures
structure_pivot = rhetorical_df.groupby(['philosopher', 'argument_structure']).size().unstack(fill_value=0)
structure_pct = structure_pivot.div(structure_pivot.sum(axis=1), axis=0) * 100

# Create heatmap
fig_structure = go.Figure(data=go.Heatmap(
    z=structure_pct.values,
    x=structure_pct.columns.str.replace('_', ' ').str.title(),
    y=structure_pct.index,
    colorscale='RdBu_r',
    text=[[f"{val:.1f}%" for val in row] for row in structure_pct.values],
    texttemplate="%{text}",
    textfont={"size": 12},
    colorbar=dict(title="Frequency (%)")
))

fig_structure.update_layout(
    title="<b>Argument Structure Preferences</b>",
    xaxis_title="Structure Type",
    yaxis_title="Philosopher",
    height=400,
    width=900,
    template='plotly_white'
)

fig_structure.show()

# Summary statistics
print("\n💡 RHETORICAL INSIGHTS:")
print("-" * 40)

for philosopher in ['Confucius', 'Mozi']:
    phil_data = rhetorical_df[rhetorical_df['philosopher'] == philosopher]

    # Most common method
    top_method = phil_data['argument_method'].value_counts().iloc[0]
    top_method_name = phil_data['argument_method'].value_counts().index[0]

    # Most common structure
    top_structure = phil_data['argument_structure'].value_counts().iloc[0]
    top_structure_name = phil_data['argument_structure'].value_counts().index[0]

    # Device variety
    unique_devices = set()
    for devices in phil_data['argument_devices'].dropna():
        unique_devices.update(parse_semicolon_field(devices))

    print(f"\n{philosopher}:")
    print(f"  • Primary method: {top_method_name.replace('_', ' ').title()} ({top_method/len(phil_data)*100:.1f}%)")
    print(f"  • Primary structure: {top_structure_name.replace('_', ' ').title()} ({top_structure/len(phil_data)*100:.1f}%)")
    print(f"  • Rhetorical device variety: {len(unique_devices)} unique devices")

print("\n🔍 INTERPRETATION:")
print("These patterns reveal HOW each philosopher persuades:")
print("• Methods show their logical approach (analogies vs principles)")
print("• Devices reveal their rhetorical toolkit")
print("• Structures indicate how they organize arguments")

ANALYSIS 3: Rhetorical Strategies - How They Argue
✓ Analyzing 1057 quotes with rhetorical data

📊 ARGUMENT METHODS:
----------------------------------------



📊 RHETORICAL DEVICES:
----------------------------------------



📊 ARGUMENT STRUCTURES:
----------------------------------------



💡 RHETORICAL INSIGHTS:
----------------------------------------

Confucius:
  • Primary method: Exemplification (5.2%)
  • Primary structure: Assertion With Qualification (2.7%)
  • Rhetorical device variety: 277 unique devices

Mozi:
  • Primary method: Empirical Observation; Technical Specification (3.2%)
  • Primary structure: Problem-Solution (6.5%)
  • Rhetorical device variety: 307 unique devices

🔍 INTERPRETATION:
These patterns reveal HOW each philosopher persuades:
• Methods show their logical approach (analogies vs principles)
• Devices reveal their rhetorical toolkit
• Structures indicate how they organize arguments


In [42]:
# ============================================
# ANALYSIS 4: METAPHOR SOURCE DOMAIN ANALYSIS
# ============================================

print("ANALYSIS 4: Metaphor Source Domains - Where They Find Meaning")
print("=" * 60)

# Merge to get metaphor data
metaphor_df = metadata.merge(
    analysis_df[['row_id', 'metaphors_json']],
    on='row_id',
    how='left'
)

# Parse metaphors and extract source domains
def extract_metaphor_domains(metaphor_json):
    """Extract source domains from metaphor JSON."""
    domains = []
    if pd.notna(metaphor_json):
        try:
            metaphors = json.loads(metaphor_json)
            for metaphor in metaphors:
                if isinstance(metaphor, str):
                    # Simple heuristic: first noun-like word often indicates domain
                    domains.append(metaphor.split()[0] if metaphor else 'unknown')
                elif isinstance(metaphor, dict) and 'domain' in metaphor:
                    domains.append(metaphor['domain'])
        except:
            pass
    return domains

# Categorize metaphors into broader domains
domain_categories = {
    'Nature': ['water', 'river', 'mountain', 'tree', 'plant', 'animal', 'bird', 'fish',
               'wind', 'sky', 'earth', 'stone', 'jade', 'gold', 'season', 'weather'],
    'Governance': ['ruler', 'king', 'minister', 'state', 'government', 'law', 'decree',
                   'territory', 'border', 'army', 'soldier', 'war', 'peace'],
    'Crafts': ['craft', 'tool', 'wheel', 'cart', 'boat', 'building', 'construction',
               'weaving', 'pottery', 'smith', 'carpenter', 'artisan', 'skill'],
    'Family': ['father', 'mother', 'son', 'daughter', 'brother', 'sister', 'ancestor',
               'family', 'parent', 'child', 'elder', 'younger'],
    'Body': ['heart', 'mind', 'body', 'hand', 'eye', 'ear', 'mouth', 'face', 'head',
             'foot', 'blood', 'bone', 'health', 'illness'],
    'Agriculture': ['farm', 'field', 'crop', 'harvest', 'seed', 'grain', 'rice', 'soil',
                    'plow', 'cultivation', 'farmer', 'agriculture'],
    'Learning': ['teacher', 'student', 'master', 'disciple', 'learning', 'knowledge',
                 'wisdom', 'book', 'study', 'practice', 'cultivation'],
    'Journey': ['path', 'road', 'way', 'journey', 'travel', 'destination', 'bridge',
                'gate', 'door', 'step', 'walking', 'distance']
}

# Collect all metaphors by philosopher
philosopher_metaphors = {'Confucius': [], 'Mozi': []}
philosopher_domain_counts = {'Confucius': {d: 0 for d in domain_categories.keys()},
                            'Mozi': {d: 0 for d in domain_categories.keys()}}

for _, row in metaphor_df.iterrows():
    if pd.notna(row['metaphors_json']):
        try:
            metaphors = json.loads(row['metaphors_json'])
            if metaphors:  # Check if list is not empty
                philosopher_metaphors[row['philosopher']].extend(metaphors)

                # Categorize each metaphor
                for metaphor in metaphors:
                    metaphor_lower = str(metaphor).lower()
                    categorized = False
                    for domain, keywords in domain_categories.items():
                        if any(keyword in metaphor_lower for keyword in keywords):
                            philosopher_domain_counts[row['philosopher']][domain] += 1
                            categorized = True
                            break
                    if not categorized:
                        # Add to 'Other' category if not found
                        if 'Other' not in philosopher_domain_counts[row['philosopher']]:
                            philosopher_domain_counts[row['philosopher']]['Other'] = 0
                        philosopher_domain_counts[row['philosopher']]['Other'] += 1
        except:
            pass

# Prepare data for visualization
domain_data = []
for domain in list(domain_categories.keys()) + ['Other']:
    if domain in philosopher_domain_counts['Confucius'] or domain in philosopher_domain_counts['Mozi']:
        conf_count = philosopher_domain_counts['Confucius'].get(domain, 0)
        mozi_count = philosopher_domain_counts['Mozi'].get(domain, 0)

        # Only include domains that have at least some metaphors
        if conf_count > 0 or mozi_count > 0:
            domain_data.append({
                'Domain': domain,
                'Confucius': conf_count,
                'Mozi': mozi_count,
                'Total': conf_count + mozi_count
            })

domain_df = pd.DataFrame(domain_data)
domain_df = domain_df.sort_values('Total', ascending=True)

# Create bubble chart
fig_bubble = go.Figure()

# Calculate positions for bubble chart
import numpy as np
x_positions = {'Confucius': 1, 'Mozi': 3}
y_scale = np.linspace(0, len(domain_df)-1, len(domain_df))

for i, row in enumerate(domain_df.itertuples()):
    # Confucius bubble
    if row.Confucius > 0:
        fig_bubble.add_trace(go.Scatter(
            x=[x_positions['Confucius']],
            y=[y_scale[i]],
            mode='markers+text',
            marker=dict(
                size=np.sqrt(row.Confucius) * 15,
                color='#1f77b4',
                opacity=0.7,
                line=dict(color='white', width=2)
            ),
            text=f"{row.Confucius}",
            textposition="middle center",
            textfont=dict(color='white', size=10),
            name='Confucius',
            showlegend=i==0,
            hovertemplate=f"{row.Domain}<br>Count: {row.Confucius}<extra></extra>"
        ))

    # Mozi bubble
    if row.Mozi > 0:
        fig_bubble.add_trace(go.Scatter(
            x=[x_positions['Mozi']],
            y=[y_scale[i]],
            mode='markers+text',
            marker=dict(
                size=np.sqrt(row.Mozi) * 15,
                color='#ff7f0e',
                opacity=0.7,
                line=dict(color='white', width=2)
            ),
            text=f"{row.Mozi}",
            textposition="middle center",
            textfont=dict(color='white', size=10),
            name='Mozi',
            showlegend=i==0,
            hovertemplate=f"{row.Domain}<br>Count: {row.Mozi}<extra></extra>"
        ))

    # Domain label
    fig_bubble.add_trace(go.Scatter(
        x=[2],
        y=[y_scale[i]],
        mode='text',
        text=row.Domain,
        textposition="middle center",
        textfont=dict(size=12, color='black'),
        showlegend=False,
        hoverinfo='skip'
    ))

fig_bubble.update_layout(
    title={
        'text': "<b>Metaphor Source Domains</b><br><sup>What imagery do they use to explain their ideas?</sup>",
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis=dict(
        range=[0, 4],
        showticklabels=False,
        showgrid=False,
        zeroline=False
    ),
    yaxis=dict(
        showticklabels=False,
        showgrid=False,
        zeroline=False
    ),
    height=600,
    width=900,
    template='plotly_white',
    annotations=[
        dict(x=1, y=-1.5, text="<b>Confucius</b>", showarrow=False, font=dict(size=14, color='#1f77b4')),
        dict(x=3, y=-1.5, text="<b>Mozi</b>", showarrow=False, font=dict(size=14, color='#ff7f0e'))
    ]
)

fig_bubble.show()

# Create stacked percentage bar chart
domain_df['Conf_pct'] = domain_df['Confucius'] / domain_df['Total'] * 100
domain_df['Mozi_pct'] = domain_df['Mozi'] / domain_df['Total'] * 100

fig_stacked = go.Figure()

fig_stacked.add_trace(go.Bar(
    y=domain_df['Domain'],
    x=domain_df['Conf_pct'],
    name='Confucius',
    orientation='h',
    marker_color='#1f77b4',
    hovertemplate='%{y}<br>Confucius: %{x:.1f}%<extra></extra>'
))

fig_stacked.add_trace(go.Bar(
    y=domain_df['Domain'],
    x=domain_df['Mozi_pct'],
    name='Mozi',
    orientation='h',
    marker_color='#ff7f0e',
    hovertemplate='%{y}<br>Mozi: %{x:.1f}%<extra></extra>'
))

fig_stacked.update_layout(
    title="<b>Metaphor Domain Distribution</b><br><sup>Relative preference for each source domain</sup>",
    xaxis_title="Percentage of Domain Usage",
    yaxis_title="",
    barmode='stack',
    height=500,
    width=900,
    template='plotly_white',
    legend=dict(x=0.85, y=0.95)
)

fig_stacked.show()

# Print insights
print("\n💡 METAPHORICAL INSIGHTS:")
print("-" * 40)

print(f"\nTotal metaphors found:")
print(f"  • Confucius: {len(philosopher_metaphors['Confucius'])} metaphors")
print(f"  • Mozi: {len(philosopher_metaphors['Mozi'])} metaphors")

print("\nTop 3 domains by philosopher:")
for philosopher in ['Confucius', 'Mozi']:
    print(f"\n{philosopher}:")
    sorted_domains = sorted(philosopher_domain_counts[philosopher].items(),
                          key=lambda x: x[1], reverse=True)[:3]
    for domain, count in sorted_domains:
        if count > 0:
            pct = count / sum(philosopher_domain_counts[philosopher].values()) * 100
            print(f"  • {domain}: {count} metaphors ({pct:.1f}%)")

# Find distinctive domains
print("\n🔍 DISTINCTIVE PREFERENCES:")
for domain in domain_df['Domain'].values:
    row = domain_df[domain_df['Domain'] == domain].iloc[0]
    if row['Total'] >= 5:  # Only consider domains with enough data
        if row['Conf_pct'] > 70:
            print(f"  • {domain}: Strongly preferred by Confucius ({row['Conf_pct']:.0f}%)")
        elif row['Mozi_pct'] > 70:
            print(f"  • {domain}: Strongly preferred by Mozi ({row['Mozi_pct']:.0f}%)")

print("\n📚 INTERPRETATION:")
print("Metaphor domains reveal how each philosopher sees the world:")
print("• Nature metaphors → organic, harmonious worldview")
print("• Governance metaphors → political, hierarchical thinking")
print("• Craft metaphors → practical, skill-based approach")
print("• Family metaphors → relational, social emphasis")

ANALYSIS 4: Metaphor Source Domains - Where They Find Meaning



💡 METAPHORICAL INSIGHTS:
----------------------------------------

Total metaphors found:
  • Confucius: 401 metaphors
  • Mozi: 884 metaphors

Top 3 domains by philosopher:

Confucius:
  • Other: 210 metaphors (52.4%)
  • Nature: 63 metaphors (15.7%)
  • Governance: 32 metaphors (8.0%)

Mozi:
  • Other: 463 metaphors (52.4%)
  • Nature: 139 metaphors (15.7%)
  • Governance: 82 metaphors (9.3%)

🔍 DISTINCTIVE PREFERENCES:
  • Crafts: Strongly preferred by Mozi (81%)
  • Governance: Strongly preferred by Mozi (72%)

📚 INTERPRETATION:
Metaphor domains reveal how each philosopher sees the world:
• Nature metaphors → organic, harmonious worldview
• Governance metaphors → political, hierarchical thinking
• Craft metaphors → practical, skill-based approach
• Family metaphors → relational, social emphasis


In [46]:
# ============================================
# ANALYSIS 5: PHILOSOPHICAL BRIDGE FINDER (FIXED V2)
# ============================================

print("ANALYSIS 5: Finding Common Ground Through Bridge Quotes")
print("=" * 60)

# Find bridge quotes using the helper function
bridges = find_bridge_quotes(embeddings, metadata, n_bridges=10)

# Since metadata already has row_id and text, we can use it directly
# Let's just get additional info from analysis_df if available
concept_cols = ['row_id', 'core_concepts_primary', 'themes', 'argument_method']
concepts_available = [col for col in concept_cols if col in analysis_df.columns]
if concepts_available:
    concepts_for_merge = analysis_df[concepts_available].copy()
else:
    concepts_for_merge = None

print("\n🌉 BRIDGE QUOTES - Finding Common Ground")
print("=" * 60)

for bridge_type, bridge_df in bridges.items():
    source_phil = bridge_type.split('_to_')[0]
    target_phil = bridge_type.split('_to_')[1]

    print(f"\n📖 {source_phil} quotes closest to {target_phil}'s thinking:")
    print("-" * 50)

    # Bridge_df already has text from metadata
    bridge_full = bridge_df.copy()

    # Merge with concepts if available
    if concepts_for_merge is not None:
        bridge_full = bridge_full.merge(concepts_for_merge, on='row_id', how='left')

    # Display top 5 with analysis
    for i, row in enumerate(bridge_full.head(5).itertuples(), 1):
        print(f"\n{i}. Similarity Score: {row.similarity_to_other:.3f}")

        # Show quote from the text field
        quote = row.text if hasattr(row, 'text') and pd.notna(row.text) else "Text not available"
        if len(quote) > 300:
            quote = quote[:297] + "..."
        print(f"   Quote: \"{quote}\"")

        # Show source if available (from metadata)
        if hasattr(row, 'work') and pd.notna(row.work):
            source_info = f"{row.work}"
            if hasattr(row, 'chapter_verse') and pd.notna(row.chapter_verse):
                source_info += f" ({row.chapter_verse})"
            print(f"   Source: {source_info}")

        # Show concepts if available
        if hasattr(row, 'core_concepts_primary') and pd.notna(row.core_concepts_primary):
            concepts = parse_semicolon_field(row.core_concepts_primary)[:5]
            if concepts:
                print(f"   Key Concepts: {', '.join(concepts)}")

# Create visualization of bridge quotes in embedding space
print("\n\n📊 VISUALIZING BRIDGE QUOTES IN SEMANTIC SPACE")
print("-" * 50)

# Get UMAP coordinates (reuse from Analysis 1 if available, or compute new)
if 'viz_df' not in locals():
    umap_reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=30, min_dist=0.1)
    embeddings_umap = umap_reducer.fit_transform(embeddings)
    viz_df = metadata.copy()
    viz_df['UMAP_1'] = embeddings_umap[:, 0]
    viz_df['UMAP_2'] = embeddings_umap[:, 1]

# Create interactive plot highlighting bridge quotes
fig = go.Figure()

# Plot all quotes in gray first
for philosopher in ['Confucius', 'Mozi']:
    phil_data = viz_df[viz_df['philosopher'] == philosopher]

    # Non-bridge quotes (background)
    bridge_ids = set()
    for bridge_df in bridges.values():
        bridge_ids.update(bridge_df['row_id'].values)

    non_bridge_mask = ~phil_data['row_id'].isin(bridge_ids)
    non_bridge_data = phil_data[non_bridge_mask]

    fig.add_trace(go.Scatter(
        x=non_bridge_data['UMAP_1'],
        y=non_bridge_data['UMAP_2'],
        mode='markers',
        name=f'{philosopher} (regular)',
        marker=dict(
            color='lightgray',
            size=4,
            opacity=0.3
        ),
        hovertemplate='%{text}<extra></extra>',
        text=[f"{row['philosopher']}<br>{row['text'][:100]}..." for _, row in non_bridge_data.iterrows()],
        showlegend=True
    ))

# Highlight bridge quotes
colors = {'Confucius_to_Mozi': '#1f77b4', 'Mozi_to_Confucius': '#ff7f0e'}
symbols = {'Confucius_to_Mozi': 'circle', 'Mozi_to_Confucius': 'square'}

for bridge_type, bridge_df in bridges.items():
    # Get UMAP coordinates for bridge quotes
    bridge_viz = viz_df[viz_df['row_id'].isin(bridge_df['row_id'])]
    source_phil = bridge_type.split('_to_')[0]

    fig.add_trace(go.Scatter(
        x=bridge_viz['UMAP_1'],
        y=bridge_viz['UMAP_2'],
        mode='markers',
        name=f'{source_phil} bridges',
        marker=dict(
            color=colors[bridge_type],
            size=12,
            opacity=0.8,
            symbol=symbols[bridge_type],
            line=dict(width=2, color='white')
        ),
        text=[f"BRIDGE: {row['philosopher']}<br>{row['text'][:150]}..."
              for _, row in bridge_viz.iterrows()],
        hovertemplate='%{text}<extra></extra>'
    ))

# Add centroids as large markers
centroids = compute_philosopher_centroids(embeddings, metadata)
centroids_2d = umap_reducer.transform(np.array([centroids['Confucius'], centroids['Mozi']]))

fig.add_trace(go.Scatter(
    x=centroids_2d[:, 0],
    y=centroids_2d[:, 1],
    mode='markers+text',
    name='Centroids',
    marker=dict(
        color=['#1f77b4', '#ff7f0e'],
        size=20,
        symbol='star',
        line=dict(width=2, color='white')
    ),
    text=['Confucius Center', 'Mozi Center'],
    textposition='top center',
    showlegend=True
))

fig.update_layout(
    title={
        'text': "<b>Bridge Quotes: Philosophical Common Ground</b><br><sup>Highlighted quotes are closest to the other philosopher's thinking</sup>",
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis_title="UMAP Dimension 1",
    yaxis_title="UMAP Dimension 2",
    height=700,
    width=1000,
    template='plotly_white'
)

fig.show()

# Analyze common themes in bridge quotes
print("\n💡 BRIDGE QUOTE ANALYSIS - What Unites Them?")
print("-" * 50)

if concepts_for_merge is not None:
    all_bridge_ids = []
    for bridge_df in bridges.values():
        all_bridge_ids.extend(bridge_df['row_id'].values)

    bridge_concepts_df = concepts_for_merge[concepts_for_merge['row_id'].isin(all_bridge_ids)]

    # Collect all concepts from bridge quotes
    bridge_concepts = []
    bridge_themes = []

    for _, row in bridge_concepts_df.iterrows():
        if pd.notna(row['core_concepts_primary']):
            bridge_concepts.extend(parse_semicolon_field(row['core_concepts_primary']))
        if pd.notna(row['themes']):
            bridge_themes.extend(parse_semicolon_field(row['themes']))

    from collections import Counter
    concept_counts = Counter(bridge_concepts)
    theme_counts = Counter(bridge_themes)

    if concept_counts:
        print("\nMost common concepts in bridge quotes:")
        for concept, count in concept_counts.most_common(5):
            print(f"  • {concept}: appears in {count} bridge quotes")

    if theme_counts:
        print("\nMost common themes in bridge quotes:")
        for theme, count in theme_counts.most_common(5):
            print(f"  • {theme}: appears in {count} bridge quotes")

print("\n🔍 INTERPRETATION:")
print("-" * 50)
print("Bridge quotes reveal surprising common ground between the philosophers.")
print("These are the ideas where their thinking converges, suggesting")
print("universal principles or shared cultural values that transcend")
print("their philosophical differences. They may represent:")
print("• Shared cultural foundations")
print("• Universal human concerns")
print("• Points where their philosophies complement rather than oppose")

ANALYSIS 5: Finding Common Ground Through Bridge Quotes
✓ Computed centroid for Confucius: shape (384,)
✓ Computed centroid for Mozi: shape (384,)

🌉 BRIDGE QUOTES - Finding Common Ground

📖 Confucius quotes closest to Mozi's thinking:
--------------------------------------------------

1. Similarity Score: 0.661
   Quote: "'En Ch'eng Tzu killed Duke Chien. After washing himself ceremonially, Confucius went to court and reported to Duke Ai, saying, ‘He, Ch'en Heng has killed his lord. May I request that an army be sent to punish him?' The Duke answered, ‘Tell the three noble lords.' Confucius said, ‘I have reported ..."
   Source: Analects (14:21)
   Key Concepts: duty, ritual_purification, feudal_hierarchy

2. Similarity Score: 0.649
   Quote: "The Master said, 'I cannot say.' Meng Wu Po repeated the question. The Master said, 'Yu can be given the responsibility of managing the military levies in a state of a thousand chariots, but whether he is benevolent or not I cannot say.' [2] 'W


💡 BRIDGE QUOTE ANALYSIS - What Unites Them?
--------------------------------------------------

Most common concepts in bridge quotes:
  • benevolence: appears in 3 bridge quotes
  • yi: appears in 3 bridge quotes
  • impartial_caring: appears in 3 bridge quotes
  • ritual: appears in 2 bridge quotes
  • the_way: appears in 2 bridge quotes

Most common themes in bridge quotes:
  • ethical_governance: appears in 4 bridge quotes
  • ritual_practice: appears in 2 bridge quotes
  • political_ethics: appears in 2 bridge quotes
  • social_order: appears in 2 bridge quotes
  • moral_character: appears in 1 bridge quotes

🔍 INTERPRETATION:
--------------------------------------------------
Bridge quotes reveal surprising common ground between the philosophers.
These are the ideas where their thinking converges, suggesting
universal principles or shared cultural values that transcend
their philosophical differences. They may represent:
• Shared cultural foundations
• Universal human concerns
•