In [2]:


!pip install simalign==1.0.8    # or latest simalign
!pip install pandas scipy tqdm



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
import csv
from collections import defaultdict, Counter

# ---------- CONFIG ----------
CONLLU_ROOT = "conllu/conllu_file"
OUT_CSV = "conllu/adjectives_appraisal_with_examples_by_Domain_thresholded.csv"
KEEP_TOP_N = 20            # top-N adjectives per (lang, Domain) by total count
EXAMPLES_PER_CLASS = 5     # how many example sentences to keep per classification
CONFIDENCE_THRESHOLD = 0.50  # set to 0.50 or 0.60 as desired
# ---------- end CONFIG ----------

# ---- (keep your pattern sets unchanged) ----
FIRST_PERSON = {
    "i", "me", "we", "us",
    "my", "our",
    "mine", "ours",
    "myself", "ourselves"
}
SECOND_PERSON = {"you", "your", "yours", "yourself", "yourselves"}
THIRD_PERSON_PRONOUNS = {"he", "she", "it", "they", "him", "her", "them", "his", "hers", "theirs"}
APPRECIATION_PRONOUNS = {
    "it", "there", "here", "this", "that", "these", "those",
    "something", "anything", "nothing", "everything",
    "someone", "anyone", "no one", "everyone",
    "somebody", "anybody", "nobody", "everybody",
    "all", "each", "both", "neither", "none", "one",
    "what", "whatever", "which", "whichever",
    "this one", "that one", "such", "so"
}
FEEL_VERBS = {
    "be", "become", "get", "remain", "stay", "keep", "prove", "turn",
    "seem", "appear", "sound", "look", "smell", "taste", "feel",
    "think", "consider", "believe", "find", "judge", "deem", "regard",
    "say", "claim", "argue", "maintain", "suggest", "assert", "contend",
    "want", "wish", "hope", "desire", "prefer",
    "like", "love", "enjoy", "admire", "appreciate",
    "dislike", "hate", "detest", "despise",
    "fear", "dread", "regret", "worry", "doubt", "mourn",
    "must", "should", "ought", "need", "have_to",
    "watch", "observe", "notice", "perceive", "recognize",
    "value", "assess", "evaluate", "review", "approve", "disapprove",
    "render", "make", "leave", "drive"
}

# ---------- IO: parse conllu ----------
def parse_conllu_file(path):
    """Return list of sentences; each sentence is a list of token dicts."""
    sentences = []
    sent = []
    with open(path, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.rstrip("\n")
            if not line:
                if sent:
                    sentences.append(sent)
                    sent = []
                continue
            if line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) < 8:
                continue
            if "-" in parts[0] or "." in parts[0]:
                continue
            try:
                idx = int(parts[0])
            except:
                continue
            token = {
                "id": idx,
                "form": parts[1],
                "lemma": parts[2].lower(),
                "upos": parts[3],
                "head": int(parts[6]) if parts[6].isdigit() else 0,
                "deprel": parts[7],
            }
            sent.append(token)
    if sent:
        sentences.append(sent)
    return sentences

# ---------- classification & example collection ----------
def _append_example(adj_examples, lemma, category, sentence_text):
    """Append sentence_text to adj_examples[lemma][category] up to EXAMPLES_PER_CLASS, avoiding duplicates."""
    lst = adj_examples.setdefault(lemma, {"Affect":[], "Judgement":[], "Appreciation":[], "ambiguous":[]})
    if sentence_text in lst[category]:
        return
    if len(lst[category]) < EXAMPLES_PER_CLASS:
        lst[category].append(sentence_text)

def classify_adjectives_in_sentence(sent, adj_stats, adj_examples):
    by_id = {t["id"]: t for t in sent}
    children = defaultdict(list)
    for t in sent:
        children[t["head"]].append(t)

    sentence_text = " ".join(t["form"] for t in sent)

    for t in sent:
        if t["upos"] != "ADJ":
            continue
        lemma = t["lemma"]
        adj_stats[lemma]["total"] += 1

        subj_candidates = [c for c in children.get(t["id"], []) if c["deprel"].startswith("nsubj")]
        cop_children = [c for c in children.get(t["id"], []) if c["deprel"] == "cop"]
        classified = False

        if subj_candidates:
            subj = subj_candidates[0]
            s_lemma = subj["lemma"]
            s_upos = subj["upos"]
            if s_upos == "PRON" and s_lemma in FIRST_PERSON:
                adj_stats[lemma]["Affect"] += 1
                _append_example(adj_examples, lemma, "Affect", sentence_text)
                classified = True
            elif s_upos == "PRON" and s_lemma in THIRD_PERSON_PRONOUNS:
                adj_stats[lemma]["Judgement"] += 1
                _append_example(adj_examples, lemma, "Judgement", sentence_text)
                classified = True
            elif s_upos == "PROPN":
                adj_stats[lemma]["Judgement"] += 1
                _append_example(adj_examples, lemma, "Judgement", sentence_text)
                classified = True
            elif s_upos == "PRON" and s_lemma in APPRECIATION_PRONOUNS:
                adj_stats[lemma]["Appreciation"] += 1
                _append_example(adj_examples, lemma, "Appreciation", sentence_text)
                classified = True
            elif s_upos == "NOUN":
                adj_stats[lemma]["Appreciation"] += 1
                _append_example(adj_examples, lemma, "Appreciation", sentence_text)
                classified = True

        if not classified and t["deprel"] == "xcomp":
            head = by_id.get(t["head"])
            if head and head["upos"].startswith("V"):
                verb_children = children.get(head["id"], [])
                v_subjs = [c for c in verb_children if c["deprel"].startswith("nsubj")]
                if v_subjs:
                    vsub = v_subjs[0]
                    vl = vsub["lemma"]; vup = vsub["upos"]
                    if vup == "PRON" and vl in FIRST_PERSON:
                        adj_stats[lemma]["Affect"] += 1
                        _append_example(adj_examples, lemma, "Affect", sentence_text)
                        classified = True
                    elif vup == "PRON" and vl in THIRD_PERSON_PRONOUNS:
                        adj_stats[lemma]["Judgement"] += 1
                        _append_example(adj_examples, lemma, "Judgement", sentence_text)
                        classified = True
                    elif vup == "PROPN":
                        adj_stats[lemma]["Judgement"] += 1
                        _append_example(adj_examples, lemma, "Judgement", sentence_text)
                        classified = True
                    elif vup == "NOUN":
                        adj_stats[lemma]["Appreciation"] += 1
                        _append_example(adj_examples, lemma, "Appreciation", sentence_text)
                        classified = True
                else:
                    if head["lemma"] in FEEL_VERBS:
                        adj_stats[lemma]["Affect"] += 1
                        _append_example(adj_examples, lemma, "Affect", sentence_text)
                        classified = True

        if not classified and cop_children:
            if subj_candidates:
                subj = subj_candidates[0]
                sl = subj["lemma"]; sup = subj["upos"]
                if sup == "PRON" and sl in FIRST_PERSON:
                    adj_stats[lemma]["Affect"] += 1
                    _append_example(adj_examples, lemma, "Affect", sentence_text)
                    classified = True
                elif sup == "PRON" and sl in THIRD_PERSON_PRONOUNS:
                    adj_stats[lemma]["Judgement"] += 1
                    _append_example(adj_examples, lemma, "Judgement", sentence_text)
                    classified = True
                elif sup == "PROPN":
                    adj_stats[lemma]["Judgement"] += 1
                    _append_example(adj_examples, lemma, "Judgement", sentence_text)
                    classified = True
                elif sup == "NOUN":
                    adj_stats[lemma]["Appreciation"] += 1
                    _append_example(adj_examples, lemma, "Appreciation", sentence_text)
                    classified = True
                elif sup == "PRON" and sl in APPRECIATION_PRONOUNS:
                    adj_stats[lemma]["Appreciation"] += 1
                    _append_example(adj_examples, lemma, "Appreciation", sentence_text)
                    classified = True

        if not classified:
            adj_stats[lemma]["ambiguous"] += 1
            _append_example(adj_examples, lemma, "ambiguous", sentence_text)

# ---------- aggregation (with thresholding) ----------
def compute_appraisal_with_examples_by_class(root, top_n=None, lang_filter=None, confidence_threshold=0.50):
    all_rows = []
    for lang in sorted(os.listdir(root)):
        if lang_filter and lang != lang_filter:
            continue
        lpath = os.path.join(root, lang)
        if not os.path.isdir(lpath):
            continue
        for Domain in sorted(os.listdir(lpath)):
            dpath = os.path.join(lpath, Domain)
            if not os.path.isdir(dpath):
                continue

            # Capitalize Domain names
            Domain = Domain.capitalize()

            adj_stats = defaultdict(lambda: Counter({"total":0, "Affect":0, "Judgement":0, "Appreciation":0, "ambiguous":0}))
            adj_examples = {}

            for fname in sorted(os.listdir(dpath)):
                if not fname.endswith(".conllu"):
                    continue
                path = os.path.join(dpath, fname)
                sents = parse_conllu_file(path)
                for s in sents:
                    classify_adjectives_in_sentence(s, adj_stats, adj_examples)

            rows = []
            for lemma, cnts in adj_stats.items():
                tot = cnts["total"]
                aff = cnts["Affect"]
                jud = cnts["Judgement"]
                app = cnts["Appreciation"]
                amb = cnts["ambiguous"]
                co_total = aff + jud + app
                if co_total > 0:
                    p_aff = aff / co_total
                    p_jud = jud / co_total
                    p_app = app / co_total
                else:
                    p_aff = p_jud = p_app = 0.0

                probs = {"Affect": p_aff, "Judgement": p_jud, "Appreciation": p_app}
                max_prob = max(probs.values())
                classes_meeting = [k for k, v in probs.items() if v >= confidence_threshold]
                assigned_class = None
                assigned_reason = ""
                is_confident = False

                if len(classes_meeting) == 1:
                    assigned_class = classes_meeting[0]
                    is_confident = True
                    assigned_reason = f"single_class_above_threshold ({confidence_threshold})"
                elif len(classes_meeting) > 1:
                    sorted_by_prob = sorted(probs.items(), key=lambda x: x[1], reverse=True)
                    top_name, top_val = sorted_by_prob[0]
                    second_val = sorted_by_prob[1][1]
                    if top_val > second_val and top_val >= confidence_threshold:
                        assigned_class = top_name
                        is_confident = True
                        assigned_reason = "highest_prob_above_threshold (others below)"
                    else:
                        assigned_class = "ambiguous_tie"
                        assigned_reason = "multiple_classes_above_threshold_or_tied"
                else:
                    if max_prob == 0:
                        assigned_class = "ambiguous_no_evidence"
                        assigned_reason = "no_Affect/Judgement/Appreciation_evidence"
                    else:
                        assigned_class = "ambiguous_low_confidence"
                        assigned_reason = f"max_prob_below_threshold ({max_prob:.3f} < {confidence_threshold})"
                    is_confident = False

                examples = adj_examples.get(lemma, {"Affect":[], "Judgement":[], "Appreciation":[], "ambiguous":[]})
                ex_aff = " || ".join(examples["Affect"])
                ex_jud = " || ".join(examples["Judgement"])
                ex_app = " || ".join(examples["Appreciation"])
                ex_amb = " || ".join(examples["ambiguous"])
                if round(max_prob, 4) > confidence_threshold:
                    rows.append({
                        "language": lang,
                        "Domain": Domain,
                        "adjective": lemma,
                        "total_count": tot,
                        "Affect_count": aff,
                        "Judgement_count": jud,
                        "Appreciation_count": app,
                        "ambiguous_count": amb,
                        "p_Affect": round(p_aff, 4),
                        "p_Judgement": round(p_jud, 4),
                        "p_Appreciation": round(p_app, 4),
                        "assigned_class": assigned_class,
                        "assigned_confidence": round(max_prob, 4),
                        "example_Affect": ex_aff,
                        "example_Judgement": ex_jud,
                        "example_Appreciation": ex_app,
                        "example_ambiguous": ex_amb
                    })

            rows = sorted(rows, key=lambda r: r["total_count"], reverse=True)
            if top_n:
                rows = rows[:top_n]
            all_rows.extend(rows)
    return all_rows

# ---------- run ----------
def main(threshold=CONFIDENCE_THRESHOLD):
    out = compute_appraisal_with_examples_by_class(CONLLU_ROOT, top_n=KEEP_TOP_N, lang_filter="en", confidence_threshold=threshold)
    if not out:
        print("No results (check your CONLLU_ROOT path and folder structure).")
        return

    os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)
    with open(OUT_CSV, "w", encoding="utf-8", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=list(out[0].keys()))
        writer.writeheader()
        for row in out:
            writer.writerow(row)
    print("Saved:", OUT_CSV)
    print(f"Used confidence threshold = {threshold}")

if __name__ == "__main__":



  
    main()


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'conllu/conllu_file'