# Working with Universal Dependencies


In [5]:
# if you don't have conllu yet, uncomment the following
!python -m pip install conllu

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import conllu # reading Universal Dependency files in the CONLLu format
import os
import json
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from collections import defaultdict, Counter

In [7]:
# Keep the raw CoNLL-U text of the training split in memory as one string.
with open("ko_kaist-ud-train.conllu", encoding="utf-8") as conllu_file:
    data = conllu_file.read()

## Task 1

In [8]:
# Task 1 artefacts (CSV summaries) are written below this directory.
OUT_DIR = 'data/project_outputs'
if not os.path.isdir(OUT_DIR):
    os.makedirs(OUT_DIR, exist_ok=True)

def parse_conllu(path):
    """
    Read a CoNLL-U file and return its sentences.

    Each sentence is a list of token dicts with the keys
    {id:int, form:str, lemma:str, upos:str, head:int or None, deprel:str}.
    Sentences are separated by empty lines; lines starting with '#' are
    skipped, as are lines that do not have exactly 10 tab-separated columns.
    Multiword ranges (e.g. 1-2) and empty nodes (e.g. 3.1) are ignored.
    """
    def _to_int(text):
        # "_" (and any other non-numeric text) yields None.
        try:
            return int(text)
        except ValueError:
            return None

    sentences = []
    current = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            line = raw.rstrip("\n")
            if line == "":
                # An empty line closes the sentence collected so far.
                if current:
                    sentences.append(current)
                    current = []
                continue
            if line.startswith("#"):
                continue
            fields = line.split("\t")
            if len(fields) != 10:
                continue
            tok_id, form, lemma, upos, _xpos, _feats, head, deprel, _deps, _misc = fields
            # Multiword ranges contain "-", empty nodes contain ".".
            if "-" in tok_id or "." in tok_id:
                continue
            tid = _to_int(tok_id)
            if tid is None:
                continue
            current.append({
                "id": tid,
                "form": form,
                "lemma": lemma,
                "upos": upos,
                "head": _to_int(head),
                "deprel": deprel,
            })
    # The file may end without a trailing empty line.
    if current:
        sentences.append(current)
    return sentences

# Parse the training treebank into lists of token dicts.
sents = parse_conllu("ko_kaist-ud-train.conllu")

# Running tallies over every (VERB, obj-dependent) pair in the corpus.
verb_instances = 0
obj_instances = 0
obj_immediate = 0    # object is the token directly before the verb
obj_within3 = 0      # object is 1-3 tokens before the verb
obj_to_right = 0     # object follows the verb
obj_any_before = 0   # object precedes the verb at any distance
dist_counter = Counter()  # (verb index - object index) -> number of pairs

# verb lemma -> up to 5 (object form, verb form, sentence text) examples
examples = defaultdict(list)

for sent_idx, tokens in enumerate(sents):
    id_to_tok = {t['id']: t for t in tokens}
    id_to_index = {t['id']: i for i, t in enumerate(tokens)}
    # head id -> dependents, restricted to heads that exist in this sentence
    dependents = defaultdict(list)
    for t in tokens:
        if t['head'] is not None and t['head'] in id_to_tok:
            dependents[t['head']].append(t)

    for verb_idx, tok in enumerate(tokens):
        if tok['upos'] != 'VERB':
            continue
        verb_instances += 1
        for dep in dependents.get(tok['id'], []):
            if dep['deprel'] != 'obj':
                continue
            obj_instances += 1
            obj_idx = id_to_index.get(dep['id'])
            if obj_idx is None:
                continue
            diff = verb_idx - obj_idx  # positive if object is to verb's left
            dist_counter[diff] += 1
            if diff == 1:
                obj_immediate += 1
            if 1 <= diff <= 3:
                obj_within3 += 1
            if diff < 0:
                obj_to_right += 1
            if diff > 0:
                obj_any_before += 1

            lemma = tok['lemma'] if tok['lemma'] != '_' else tok['form']
            if len(examples[lemma]) < 5:
                sent_form = ' '.join([t['form'] for t in tokens])
                examples[lemma].append((dep['form'], tok['form'], sent_form))


def _pct_of_objects(count):
    """Share of all obj instances, in percent; 0.0 when the corpus has no objects."""
    return count / obj_instances * 100 if obj_instances > 0 else 0.0


# Every percentage is defined even when no object was found, so the prints
# below cannot hit an unassigned name.
pct_immediate = _pct_of_objects(obj_immediate)
pct_within3 = _pct_of_objects(obj_within3)
pct_after = _pct_of_objects(obj_to_right)
pct_before = _pct_of_objects(obj_any_before)

print('Verb instances (VERB tokens):', verb_instances)
print('Total obj instances:', obj_instances)
print(f'Objects immediately before verb: {obj_immediate} ({pct_immediate:.2f}%)')
print(f'Objects within 3 before verb: {obj_within3} ({pct_within3:.2f}%)')
print(f'Objects after verb: {obj_to_right} ({pct_after:.2f}%)')
print(f'Objects before verb (any distance): {obj_any_before} ({pct_before:.2f}%)')

# Persist the full distance histogram for later inspection.
dist_items = sorted(dist_counter.items())
df_dist = pd.DataFrame(dist_items, columns=['verb_minus_obj_index', 'count'])
dist_csv_path = os.path.join(OUT_DIR, 'object_verb_distance_distribution.csv')
df_dist.to_csv(dist_csv_path, index=False)
print('Saved distance distribution to', dist_csv_path)


Verb instances (VERB tokens): 55805
Total obj instances: 13912
Objects immediately before verb: 10066 (72.35%)
Objects within 3 before verb: 12937 (92.99%)
Objects after verb: 0 (0.00%)
Objects before verb (any distance): 13912 (100.00%)
Saved distance distribution to data/project_outputs\object_verb_distance_distribution.csv


### Task 1.1
The generalization we can draw from the above results is that, in this corpus, an object _never_ occurs after its verb; it _always_ occurs before the verb.

### Task 1.2
Our generalization holds throughout the corpus. While an object _never_ follows its verb in our corpus, it does not always come immediately before the verb (it does so 72.35% of the time). In most cases the object occurs within 3 tokens before the verb, though this is also not always true (it holds 92.99% of the time). The exceptions to this tendency are objects that are more than 3 tokens away from the verb but still precede it; we could not find any case where an object follows its verb, at any distance.

## Task 2

In [None]:
import json
import os
import re
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

from gensim.models import KeyedVectors  # used by the loader (optional)

# Input: all data files sit in the same folder as this notebook.
CONLLU_PATH = "ko_kaist-ud-train.conllu"

# UPOS tags that count as verbs in the Task 2 analyses.
VERB_UPOS = ("VERB", "AUX")
# Canonical key under which copula (이) occurrences are grouped.
COPULA_CANONICAL = "이"
# Normalized verbs that are left out of the negation analyses.
EXCLUDE_VERBS = {COPULA_CANONICAL}

# Output: every Task 2 artefact is written below this directory.
OUT_DIR = "data/project_outputs_task2"
os.makedirs(OUT_DIR, exist_ok=True)

def normalize_korean_verb(lemma_or_form):
    """
        Normalize a Korean verb token (lemma or form) to a canonical representation.

        Rules:
        - Empty input, None, or the placeholder '_' gives None.
        - Full-width '＋' and '‧' are treated as the morpheme separator '+'.
        - If segmented with '+', return the first non-empty morpheme, unless the
          copula morpheme '이' is one of the morphemes, in which case return
          COPULA_CANONICAL.
        - If the token consists only of separators (e.g. '+'), it has no
          morphemes and is returned unchanged.
        - If unsegmented, longer than one character and ending with '다',
          strip the terminal '다'.
        - Otherwise return the stripped token as is.
    """
    if not lemma_or_form or lemma_or_form == "_":
        return None

    s = lemma_or_form.replace("＋", "+").replace("‧", "+").strip()

    # segmented with '+': split into morphemes
    if "+" in s:
        parts = [p for p in s.split("+") if p]  # drop empty parts
        if not parts:
            # Nothing but separators: there is no first morpheme to return.
            return s
        # any morpheme equal to the copula '이' makes the whole token a copula
        if "이" in parts:
            return COPULA_CANONICAL
        return parts[0]

    # unsegmented but ends with '다': strip the terminal '다'
    if len(s) > 1 and s.endswith("다"):
        return s[:-1]

    return s

def aggregate_normalized_counts(counter, locations):
    """
        Collapse counts and locations onto normalized verb keys.

        When `locations` is non-empty, `counter` is not consulted: each
        normalized key is credited with the number of recorded locations.
        When `locations` is empty or None, the counts in `counter` are summed
        per normalized key and the returned locations mapping stays empty.

        Returns (Counter, defaultdict(list)).
    """
    merged_counts = Counter()
    merged_locations = defaultdict(list)

    if not locations:
        for key, count in counter.items():
            merged_counts[normalize_korean_verb(key)] += count
        return merged_counts, merged_locations

    for key, occurrences in locations.items():
        canonical = normalize_korean_verb(key)
        merged_counts[canonical] += len(occurrences)
        merged_locations[canonical].extend(occurrences)
    return merged_counts, merged_locations

def verb_frequencies(sents, verb_upos=VERB_UPOS):
    """
        Tally normalized verbs and remember where each one occurs.

        A token counts as a verb when its UPOS is in `verb_upos`. Its lemma
        (or its form, when the lemma is '_') is normalized with
        normalize_korean_verb and used as the key.

        Returns (Counter, dict) where the dict maps each normalized verb to a
        list of (sentence index, token index, token dict) tuples.
    """
    freq = Counter()
    where = defaultdict(list)
    for sent_no, sentence in enumerate(sents):
        for position, token in enumerate(sentence):
            if token["upos"] not in verb_upos:
                continue
            raw = token["form"] if token["lemma"] == "_" else token["lemma"]
            key = normalize_korean_verb(raw)
            freq[key] += 1
            where[key].append((sent_no, position, token))
    return freq, where

def select_verbs_by_quantiles(counter, top_pct=0.20, next_pct=0.20, top_k_each=20):
    """
        Pick verbs from two adjacent frequency bands.

        Verb types are ranked by descending frequency. The first band holds the
        top `top_pct` share of types, the second band the following `next_pct`
        share (each band spans at least one rank). At most `top_k_each` verbs
        are returned per band, most frequent first.

        Returns (top band verbs, next band verbs).
    """
    ranked = [verb for verb, _ in counter.most_common()]
    top_size = max(1, int(len(ranked) * top_pct))
    next_size = max(1, int(len(ranked) * next_pct))
    top_band = ranked[:top_size]
    next_band = ranked[top_size: top_size + next_size]
    return top_band[:top_k_each], next_band[:top_k_each]

def extract_verb_sets(sents, verb_locations, verbs, 
                      subj_deprels=("nsubj", "nsubj:pass", "csubj"), 
                      obj_deprels_prefix=("obj",), 
                      modifier_deprels_prefixes=("advmod", "amod", "nmod", "obl", "advcl", "compound")):
    """
        Collect the syntactic and linear neighbourhood of each requested verb.

        For every recorded location of a verb, its direct dependents are sorted
        into subjects (deprel in `subj_deprels` or starting with 'nsubj'),
        objects (deprel starting with one of `obj_deprels_prefix`) and
        modifiers (deprel starting with one of `modifier_deprels_prefixes`),
        counted by lemma (or by form when the lemma is '_'). The forms of the
        tokens directly before and after the verb are counted as well.

        Returns a dict: verb -> {subject:Counter, object:Counter, modifier:Counter,
        before:Counter, after:Counter, occurrences:int}
    """
    def _label(tok):
        return tok['form'] if tok['lemma'] == "_" else tok['lemma']

    obj_prefixes = tuple(obj_deprels_prefix)
    mod_prefixes = tuple(modifier_deprels_prefixes)

    results = {}
    for verb in verbs:
        tallies = {field: Counter() for field in ("subject", "object", "modifier", "before", "after")}
        seen = 0
        for si, vi, verb_tok in verb_locations.get(verb, []):
            sent = sents[si]
            if not 0 <= vi < len(sent):
                continue
            seen += 1
            known_ids = {t['id'] for t in sent}
            # Direct dependents of this verb occurrence, in sentence order.
            children = [
                t for t in sent
                if t['head'] is not None and t['head'] in known_ids and t['head'] == verb_tok['id']
            ]
            for child in children:
                deprel = child['deprel']
                if deprel in subj_deprels or deprel.startswith("nsubj"):
                    tallies["subject"][_label(child)] += 1
                if deprel.startswith(obj_prefixes):
                    tallies["object"][_label(child)] += 1
                if deprel.startswith(mod_prefixes):
                    tallies["modifier"][_label(child)] += 1
            if vi > 0:
                tallies["before"][sent[vi - 1]['form']] += 1
            if vi + 1 < len(sent):
                tallies["after"][sent[vi + 1]['form']] += 1
        results[verb] = {**tallies, "occurrences": seen}
    return results

def load_korean_vector_model(bin_path=None, tsv_path=None, verbose=True):
    """
        Build a KeyedVectors instance from a text vector file.

        The parser targets the multi-line bracketed layout
        (index<TAB>word<TAB>[ v v v ... across lines ... ]): an entry opens on a
        line holding the index, the word and '[', and is closed by the first
        later line that contains ']'.

        Parameters:
        - bin_path: accepted for interface compatibility; it is never read here.
        - tsv_path: path of the text vector file; parsing is only attempted
          when it is set and the file exists.
        - verbose: when true, progress and warning messages are printed.

        Returns a KeyedVectors instance holding every successfully parsed word.

        Raises RuntimeError when no model could be built (tsv_path missing or
        nonexistent, or any error during parsing, including zero parsed vectors).
    """
    # (label, error text) for every failed attempt; reported in the final error.
    attempts = []
    
    def _log(msg):
        if verbose:
            print(msg)

    if tsv_path and os.path.exists(tsv_path):
        try:
            _log(f"Inspecting text vector file {tsv_path} ...")
            words = []
            vecs = []
            dim = None  # vector length, fixed by the first parsed vector
            # errors="replace": undecodable bytes do not abort the read
            with open(tsv_path, "r", encoding="utf-8", errors="replace") as fh:
                line_num = 0
                collecting = False    # True while inside an open '[' ... ']' entry
                current_word = None
                current_vals = []     # value strings gathered for current_word
                # entry opener: optional whitespace, digits, TAB, word, TAB, '['
                open_bracket_re = re.compile(r'^\s*\d+\t([^\t]+)\t\[')
                for raw_line in fh:
                    line_num += 1
                    line = raw_line.rstrip("\n")
                    if not line.strip():
                        continue
                    if not collecting:
                        m = open_bracket_re.match(line)
                        if m:
                            current_word = m.group(1)
                            # values may already start after the '[' on this line
                            idx = line.find('[')
                            after = line[idx+1:].strip()
                            if after:
                                parts = re.split(r'\s+', after)
                                for p in parts:
                                    if p == ']' or p == '':
                                        continue
                                    p_clean = p.strip("[],")
                                    if p_clean:
                                        current_vals.append(p_clean)
                            # NOTE(review): a ']' on this same line does not close
                            # the entry; closing only happens on a later line.
                            collecting = True
                        else:
                            # fallback opener: at least two TAB fields, numeric first field
                            parts0 = line.split("\t")
                            if len(parts0) >= 2 and parts0[0].isdigit():
                                current_word = parts0[1]
                                current_vals = []
                                collecting = True
                                if '[' in line:
                                    idx = line.find('[')
                                    after = line[idx+1:].strip()
                                    if after:
                                        parts = re.split(r'\s+', after)
                                        for p in parts:
                                            p_clean = p.strip("[],")
                                            if p_clean:
                                                current_vals.append(p_clean)
                            else:
                                # not an entry opener: ignore the line
                                continue
                    else:
                        if ']' in line:
                            # closing line: keep only the text before the first ']'
                            before = line.split(']')[0].strip()
                            if before:
                                parts = re.split(r'\s+', before)
                                for p in parts:
                                    p_clean = p.strip("[],")
                                    if p_clean:
                                        current_vals.append(p_clean)
                            try:
                                vec = np.array([float(x) for x in current_vals], dtype=np.float32)
                            except Exception as e:
                                # drop this entry and resume with the next opener
                                _log(f"Warning: failed to convert vector for word {current_word} at line {line_num}: {e}")
                                current_word = None
                                current_vals = []
                                collecting = False
                                continue
                            if dim is None:
                                dim = vec.shape[0]
                            else:
                                if vec.shape[0] != dim:
                                    # vectors whose length differs from the first one are skipped
                                    _log(f"Warning: inconsistent dim for word {current_word}: {vec.shape[0]} vs {dim}; skipping")
                                    current_word = None
                                    current_vals = []
                                    collecting = False
                                    continue
                            words.append(current_word)
                            vecs.append(vec)
                            # reset state for the next entry
                            current_word = None
                            current_vals = []
                            collecting = False
                        else:
                            # continuation line: every whitespace-separated item is a value
                            parts = re.split(r'\s+', line.strip())
                            for p in parts:
                                p_clean = p.strip("[],")
                                if p_clean:
                                    current_vals.append(p_clean)
            if not words:
                raise RuntimeError("Parsed zero word vectors from the TSV file.")
            arr = np.vstack(vecs)
            kv = KeyedVectors(vector_size=arr.shape[1])
            kv.add_vectors(words, arr)
            _log(f"Constructed KeyedVectors from text file with {len(words)} words and dim {arr.shape[1]}.")
            return kv
        except Exception as e:
            # any failure above (including the zero-vector RuntimeError) is
            # recorded and reported through the final error below
            attempts.append(("w2v_text_kyubyong", str(e)))

    # reached when tsv_path is unset/missing or when parsing failed
    msg = "Failed to load model via TSV Kyubyong parser. Attempts:\n" + "\n".join(f"{k}: {v}" for k, v in attempts)
    raise RuntimeError(msg)

# Particles that may be cut off the end of a '+'-segmented token when looking
# for a surface form known to the vector model.
COMMON_PARTICLES = {
    "이", "가", "을", "를", "은", "는",
    "에", "에서", "으로", "로", "와", "과",
    "에게", "께", "부터", "까지", "만", "도",
    "뿐", "처럼", "보다",
}

def kaist_to_surface_candidates(token):
    """
        List surface-form candidates for a (possibly '+'-segmented) token.

        Candidates are returned in order of preference without duplicates or
        empty strings. For a segmented token these are the joined form, the
        joined form without a terminal '다', the first morpheme and, when the
        last morpheme is a common particle, the token without that particle.
        For an unsegmented token these are the token itself and the token
        without a terminal '다'. None, empty input and '_' give [].
    """
    if token is None:
        return []
    text = str(token).strip()
    if text in ("", "_"):
        return []

    def _without_final_da(word):
        # Only words longer than one character lose their terminal '다'.
        return [word[:-1]] if len(word) > 1 and word.endswith("다") else []

    if "+" not in text:
        candidates = [text] + _without_final_da(text)
    else:
        joined = text.replace("+", "")
        candidates = [joined] + _without_final_da(joined)
        morphemes = [m for m in text.split("+") if m]
        if morphemes:
            candidates.append(morphemes[0])
            if len(morphemes) >= 2 and morphemes[-1] in COMMON_PARTICLES:
                candidates.append("".join(morphemes[:-1]))

    # dict.fromkeys keeps the first occurrence of every candidate, in order.
    return [c for c in dict.fromkeys(candidates) if c]

def centroid_of_words(model, words):
    """
        Average the vectors of the given words.

        Each entry of `words` is a word or a tuple/list whose first item is the
        word; None entries are skipped. A word is looked up through its
        kaist_to_surface_candidates in order, then as the raw keyword.

        Returns (centroid, number of words found, list of keywords not found);
        the centroid is None when no word was found.
    """
    found_vectors = []
    not_found = []
    for entry in words:
        if entry is None:
            continue
        keyword = entry[0] if isinstance(entry, (tuple, list)) else str(entry)
        hit = next((c for c in kaist_to_surface_candidates(keyword) if c in model), None)
        if hit is None and keyword in model:
            hit = keyword
        if hit is None:
            not_found.append(keyword)
        else:
            found_vectors.append(model[hit])
    if not found_vectors:
        return None, 0, not_found
    stacked = np.vstack(found_vectors)
    return np.mean(stacked, axis=0), stacked.shape[0], not_found

def topk_neighbors_from_centroid(model, centroid_vec, k=10):
    """Return the model's k entries most similar to centroid_vec, or [] when it is None."""
    return [] if centroid_vec is None else model.similar_by_vector(centroid_vec, topn=k)
# -------------------------------------------------------------------------------

# ----------------------------
# Negation analysis helpers and extraction
# ----------------------------
def is_short_neg_token(tok):
    """True when tok is the short negator: form '안', or lemma '안' tagged ADV."""
    if not tok:
        return False
    if tok.get("form") == "안":
        return True
    return tok.get("lemma") == "안" and tok.get("upos") == "ADV"

def is_long_neg_token(tok):
    """
    True when tok looks like the long-negation auxiliary: its lemma contains
    '않' or its form starts with '않', and it is tagged AUX or VERB or carries
    the deprel 'aux'.
    """
    if not tok:
        return False
    lemma = tok.get("lemma") or ""
    form = tok.get("form") or ""
    has_neg_stem = "않" in lemma or form.startswith("않")
    is_verbal = (tok.get("upos") or "") in ("AUX", "VERB") or (tok.get("deprel") or "") == "aux"
    return has_neg_stem and is_verbal

def is_irregular_neg_token(tok):
    """True when the lemma or the form of tok starts with '없' or '아니'."""
    if not tok:
        return False
    irregular_stems = ("없", "아니")
    texts = (tok.get("lemma") or "", tok.get("form") or "")
    return any(text.startswith(irregular_stems) for text in texts)

def classify_verb_negation(sent, verb_idx, verb_tok, dependents_map):
    """
    Label one verb occurrence as 'long', 'short', 'irregular' or 'affirmative'.

    Checks are applied in this order: a long-negation token among the verb's
    dependents, a short-negation token among its dependents, a short-negation
    token directly before the verb in the sentence, and finally an irregular
    negative verb itself. The first match wins.
    """
    children = dependents_map.get(verb_tok['id'], [])
    if any(is_long_neg_token(child) for child in children):
        return "long"
    if any(is_short_neg_token(child) for child in children):
        return "short"
    # The short negator may also simply precede the verb without being attached to it.
    if verb_idx >= 1 and is_short_neg_token(sent[verb_idx - 1]):
        return "short"
    return "irregular" if is_irregular_neg_token(verb_tok) else "affirmative"

def extract_negation_sensitive_sets(sents, verb_locations, verbs):
    """
    Collect the neighbourhood of each verb separately per negation type.

    Every recorded occurrence of a verb is classified with
    classify_verb_negation; its subjects, objects and modifiers (counted by
    lemma, or by form when the lemma is '_') and the forms of the adjacent
    tokens are added to the bucket of that negation type.

    Returns a dict: verb -> negation type -> {subject, object, modifier,
    before, after: Counter, occurrences: int}.
    """
    NEG_TYPES = ("long","short","irregular","affirmative")
    FIELDS = ("subject", "object", "modifier", "before", "after")
    MODIFIER_PREFIXES = ("advmod","amod","nmod","obl","advcl","compound")

    def _label(tok):
        return tok['form'] if tok['lemma'] == "_" else tok['lemma']

    results = {}
    for verb in verbs:
        buckets = {}
        for neg_type in NEG_TYPES:
            bucket = {field: Counter() for field in FIELDS}
            bucket["occurrences"] = 0
            buckets[neg_type] = bucket

        for si, vi, verb_tok in verb_locations.get(verb, []):
            sent = sents[si]
            if not 0 <= vi < len(sent):
                continue
            known_ids = {t['id'] for t in sent}
            # head id -> dependents, restricted to heads present in the sentence
            dependents = defaultdict(list)
            for t in sent:
                head = t['head']
                if head is not None and head in known_ids:
                    dependents[head].append(t)

            bucket = buckets[classify_verb_negation(sent, vi, verb_tok, dependents)]
            bucket["occurrences"] += 1
            for child in dependents.get(verb_tok['id'], []):
                deprel = child['deprel']
                if deprel.startswith("nsubj") or deprel == "csubj":
                    bucket["subject"][_label(child)] += 1
                if deprel.startswith("obj"):
                    bucket["object"][_label(child)] += 1
                if deprel.startswith(MODIFIER_PREFIXES):
                    bucket["modifier"][_label(child)] += 1
            if vi > 0:
                bucket["before"][sent[vi - 1]['form']] += 1
            if vi + 1 < len(sent):
                bucket["after"][sent[vi + 1]['form']] += 1
        results[verb] = buckets
    return results

def noun_negation_association(neg_buckets, top_n=50):
    """
    Relate subject nouns to the negation type of the verbs they occur with.

    Subject counts are summed over all verbs not in EXCLUDE_VERBS and over all
    four negation types; counts under long and short negation are tracked
    separately.

    Returns one row per noun for the top_n most frequent subjects:
    (noun, total count, long count, short count, long count / total count).
    """
    totals = Counter()
    long_counts = Counter()
    short_counts = Counter()
    for verb, buckets in neg_buckets.items():
        if verb in EXCLUDE_VERBS:
            continue
        for neg_type in ("long","short","irregular","affirmative"):
            for noun, count in buckets[neg_type]["subject"].items():
                totals[noun] += count
                if neg_type == "long":
                    long_counts[noun] += count
                elif neg_type == "short":
                    short_counts[noun] += count

    rows = []
    for noun, total in totals.most_common(top_n):
        n_long = long_counts[noun]
        ratio = n_long / total if total > 0 else 0.0
        rows.append((noun, int(total), int(n_long), int(short_counts[noun]), float(ratio)))
    return rows

# ----------------------------
# Orchestration: main pipeline adapted for negation analysis 
# ----------------------------
def _summarize_counters(entry, top_n=50):
    """Turn one bucket of Counters into a JSON-serialisable summary of its top_n items."""
    return {
        "occurrences": int(entry["occurrences"]),
        "top_subjects": entry["subject"].most_common(top_n),
        "top_objects": entry["object"].most_common(top_n),
        "top_modifiers": entry["modifier"].most_common(top_n),
        "top_before": entry["before"].most_common(top_n),
        "top_after": entry["after"].most_common(top_n),
    }


def build_task2_analysis_negation(conllu_path=CONLLU_PATH, model_bin=None, model_tsv=None, k_neighbors=10, top_k_each_top20=6, top_k_each_mid=20):
    """
    Run the Task 2 pipeline and write its outputs below OUT_DIR.

    Steps: parse the treebank, count normalized verbs (copula removed), pick
    verbs from the top and the following frequency band plus a fixed list of
    transitive verbs, extract their neighbourhoods overall and per negation
    type, and, when a vector file exists, compute nearest neighbours of the
    subject centroids per negation bucket.

    Parameters:
    - conllu_path: CoNLL-U file to analyse.
    - model_bin, model_tsv: vector model paths; loading is attempted when
      either file exists, and both are forwarded to load_korean_vector_model.
    - k_neighbors: number of neighbours stored per centroid.
    - top_k_each_top20: number of verbs kept from the top frequency band.
    - top_k_each_mid: number of verbs kept from the following band.

    Files written: verb_frequencies_normalized.csv,
    verb_sets_summary_normalized.json, verb_negation_summary.json,
    noun_negation_association.csv and, when a model was loaded,
    verb_negation_neighbors.json.

    Returns a dict with the intermediate results; "neighbors_summary" is None
    when no vector model was loaded.
    """
    sents = parse_conllu(conllu_path)
    print(f"Loaded {len(sents)} sentences from {conllu_path}")
    raw_counter, raw_locations = verb_frequencies(sents)

    verb_counter, verb_locations = aggregate_normalized_counts(raw_counter, raw_locations)

    # Remove copula from consideration so it doesn't inflate negation files/analyses
    verb_counter.pop(COPULA_CANONICAL, None)
    verb_locations.pop(COPULA_CANONICAL, None)

    vf_df = pd.DataFrame(verb_counter.most_common(), columns=["verb_norm", "freq"])
    vf_df.to_csv(os.path.join(OUT_DIR, "verb_frequencies_normalized.csv"), index=False, encoding="utf-8-sig")

    # select by quantiles, then crop the top band to top_k_each_top20 verbs
    top_verbs, mid_verbs = select_verbs_by_quantiles(verb_counter, top_pct=0.20, next_pct=0.20, top_k_each=top_k_each_mid)
    top_verbs = top_verbs[:top_k_each_top20]

    # Filter out any excluded verbs (the copula was already deleted above; this
    # only matters if EXCLUDE_VERBS grows)
    top_verbs = [v for v in top_verbs if v not in EXCLUDE_VERBS]
    mid_verbs = [v for v in mid_verbs if v not in EXCLUDE_VERBS]

    print("Top verbs (sample):", top_verbs)
    print("Mid verbs (sample):", mid_verbs[:top_k_each_mid])
    # hardcoded transitives
    hardcoded_list = ['하','찾','주','쓰','가지','깨뜨리','따','먹이','고르','지르']
    # dict.fromkeys removes duplicates while keeping the first-seen order
    verbs_to_analyze = list(dict.fromkeys(list(top_verbs) + list(mid_verbs[:top_k_each_mid]) + hardcoded_list))
    # ensure we don't accidentally analyze copula
    verbs_to_analyze = [v for v in verbs_to_analyze if v not in EXCLUDE_VERBS]

    sets = extract_verb_sets(sents, verb_locations, verbs_to_analyze)
    empty_entry = {"subject":Counter(),"object":Counter(),"modifier":Counter(),"before":Counter(),"after":Counter(),"occurrences":0}
    summary = {v: _summarize_counters(sets.get(v, empty_entry)) for v in verbs_to_analyze}
    with open(os.path.join(OUT_DIR, "verb_sets_summary_normalized.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # NEGATION-SENSITIVE extraction
    neg_buckets = extract_negation_sensitive_sets(sents, verb_locations, verbs_to_analyze)

    # Write negation summary JSON
    neg_summary = {}
    for v, buckets in neg_buckets.items():
        if v in EXCLUDE_VERBS:
            continue
        neg_summary[v] = {nt: _summarize_counters(bucket) for nt, bucket in buckets.items()}
    neg_out_path = os.path.join(OUT_DIR, "verb_negation_summary.json")
    with open(neg_out_path, "w", encoding="utf-8") as f:
        json.dump(neg_summary, f, ensure_ascii=False, indent=2)
    print("Saved negation-aware summary to", neg_out_path)

    # Noun association summary (CSV)
    noun_assoc_rows = noun_negation_association(neg_buckets, top_n=100)
    noun_assoc_df = pd.DataFrame(noun_assoc_rows, columns=["noun","total_subj_count","long_count","short_count","long_ratio"])
    noun_assoc_df.to_csv(os.path.join(OUT_DIR, "noun_negation_association.csv"), index=False, encoding="utf-8-sig")

    # centroids & knn per negation bucket (Unsure if this is necessary)
    model = None
    neighbors_summary = {}
    if (model_bin and os.path.exists(model_bin)) or (model_tsv and os.path.exists(model_tsv)):
        print("Attempting to load vector model...")
        model = load_korean_vector_model(bin_path=model_bin, tsv_path=model_tsv)
        print("Computing centroids and nearest neighbors...")
        for v, buckets in neg_buckets.items():
            if v in EXCLUDE_VERBS:
                continue
            neighbors_summary[v] = {}
            for nt, bucket in buckets.items():
                words = [w for w, cnt in bucket["subject"].most_common(200)]
                centroid, n_in_vocab, missed = centroid_of_words(model, words)
                knn = topk_neighbors_from_centroid(model, centroid, k=k_neighbors) if centroid is not None else []
                neighbors_summary[v][nt] = {
                    "subject_centroid_n_in_vocab": int(n_in_vocab),
                    "subject_missed_count": len(missed),
                    "subject_missed_examples": missed[:30],
                    "subject_knn": [(w, float(sim)) for w, sim in knn]
                }
        with open(os.path.join(OUT_DIR, "verb_negation_neighbors.json"), "w", encoding="utf-8") as f:
            json.dump(neighbors_summary, f, ensure_ascii=False, indent=2)

    return {
        "sentences": len(sents),
        "verb_freq_df": vf_df,
        "top_verbs": top_verbs,
        "mid_verbs": mid_verbs,
        "verbs_to_analyze": verbs_to_analyze,
        "sets": sets,
        "summary": summary,
        "neg_buckets": neg_buckets,
        "neg_summary": neg_summary,
        "noun_assoc_df": noun_assoc_df,
        # explicit None check: the truth value of a model object depends on its length
        "neighbors_summary": neighbors_summary if model is not None else None
    }

# ----------------------------

# Run the Task 2 pipeline on the training split with the local vector files.
task2_settings = dict(
    conllu_path="ko_kaist-ud-train.conllu",
    model_bin="ko.bin",
    model_tsv="ko.tsv",
    k_neighbors=20,
    top_k_each_top20=6,
    top_k_each_mid=5,
)
res = build_task2_analysis_negation(**task2_settings)

print("Done. Outputs saved in:", OUT_DIR)


Loaded 23010 sentences from ko_kaist-ud-train.conllu
Top verbs (sample): ['하', '있', '되', '않', '보']
Mid verbs (sample): ['민감', '내려오', '제창', '갈', '문지르']
Saved negation-aware summary to data/project_outputs_task2\verb_negation_summary.json
Attempting to load vector model...
Inspecting text vector file ko.tsv ...
Constructed KeyedVectors from text file with 30185 words and dim 200.
Computing centroids and nearest neighbors...
Done. Outputs saved in: data/project_outputs_task2


In [14]:
#!/usr/bin/env python3
import json, os
from collections import Counter, defaultdict
import pandas as pd
from scipy.stats import fisher_exact

# Directory shared with the Task 2 pipeline; its negation summary is the input here.
OUT_DIR = "data/project_outputs_task2"
os.makedirs(OUT_DIR, exist_ok=True)
INPUT_NEG_SUMMARY = os.path.join(OUT_DIR, "verb_negation_summary.json")
OUTPUT_PREFIX = os.path.join(OUT_DIR, "verb_negation_summary_no_copula")

# Verbs that are always assigned to the "transitive_action" class.
HARDCODED_TRANSITIVE = [
    '하', '찾', '주', '쓰', '가지',
    '깨뜨리', '따', '먹이', '고르', '지르',
]

# Semantic class -> signature strings. The classes are tried in the order
# listed here.
CLASS_SIGNATURES = {
    "transitive_action": HARDCODED_TRANSITIVE,
    "speech": [
        "말", "묻", "질문", "이야기", "말하",
        "발화", "선언", "답변", "전하", "말씀",
    ],
    "cognition_belief": [
        "생각", "믿", "추정", "판단", "의견",
        "알", "인지", "신념", "추측", "생각하",
    ],
    "motion": [
        "가", "오", "내려", "도착", "올라", "움직",
        "들어", "나가", "돌아", "출발", "도망",
    ],
    "change_of_state": [
        "되", "변하", "증가", "감소", "생기",
        "개선", "시작", "멈추", "끝나", "확산",
    ],
    "existential": ["있", "없"],
    "perception": ["보이", "보", "들리", "느끼", "관찰"],
}

def load_json(path):
    """Read the UTF-8 encoded JSON file at path and return the decoded object."""
    with open(path, "r", encoding="utf-8") as source:
        text = source.read()
    return json.loads(text)

def write_json(obj, path):
    """Write obj to path as UTF-8 JSON, indented by 2 and without ASCII escaping."""
    text = json.dumps(obj, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as target:
        target.write(text)

def safe_int(x):
    """Convert x with int(); any conversion error gives 0."""
    try:
        value = int(x)
    except Exception:
        value = 0
    return value

def noun_negation_association(neg_summary, top_n=100):
    """Tabulate how often each subject noun occurs with long vs. short negation.

    Args:
        neg_summary: mapping verb -> {"long" | "short" | "affirmative" ->
            {"top_subjects": [(noun, count), ...], ...}}. Missing buckets or a
            missing "top_subjects" entry count as empty.
        top_n: number of most frequent nouns (by total subject count over all
            three buckets) to keep.

    Returns:
        DataFrame with columns noun, total_subj_count, long_count, short_count
        and long_ratio (long_count / total_subj_count, 0.0 if the total is not
        positive), ordered by descending total count.
    """
    overall = Counter()
    # Only the two negated polarities get their own tally; affirmative counts
    # contribute to the overall total alone.
    negated = {"long": Counter(), "short": Counter()}
    for buckets in neg_summary.values():
        for polarity in ("long", "short", "affirmative"):
            pairs = buckets.get(polarity, {}).get("top_subjects", [])
            polarity_tally = negated.get(polarity)
            for noun, cnt in pairs:
                overall[noun] += cnt
                if polarity_tally is not None:
                    polarity_tally[noun] += cnt

    records = []
    for noun, tot in overall.most_common(top_n):
        n_long = negated["long"][noun]
        n_short = negated["short"][noun]
        ratio = n_long / tot if tot > 0 else 0.0
        records.append((noun, int(tot), int(n_long), int(n_short), float(ratio)))
    header = ["noun", "total_subj_count", "long_count", "short_count", "long_ratio"]
    return pd.DataFrame(records, columns=header)

def remove_copula_from_summary(neg_summary, copula_key="이"):
    """Return the summary without the `copula_key` entry.

    When the key is present a shallow copy without it is returned and the input
    is left untouched; when it is absent the input object itself is returned.
    """
    if copula_key not in neg_summary:
        return neg_summary
    trimmed = {**neg_summary}
    trimmed.pop(copula_key)
    return trimmed

def _sum_pair_counts(verb_summary, key):
    """Total the counts of the (item, count) pairs stored under `key`.

    Accepts a flat summary ({key: [(item, count), ...]}) as well as the
    polarity-bucketed layout ({"affirmative" | "long" | "short": {key: [...]}}),
    in which case the counts of all three buckets are added together.
    """
    if key in verb_summary:
        groups = [verb_summary.get(key) or []]
    else:
        groups = []
        for polarity in ("affirmative", "long", "short"):
            bucket = verb_summary.get(polarity)
            if isinstance(bucket, dict):
                groups.append(bucket.get(key) or [])
    return sum(cnt for pairs in groups for _, cnt in pairs)


def assign_semantic_class(verb, verb_summary):
    """Assign a coarse semantic class label to a verb.

    Rules, applied in order:
      1. exact membership in HARDCODED_TRANSITIVE -> "transitive_action";
      2. the first class in CLASS_SIGNATURES (insertion order) that has a
         non-empty signature occurring as a substring of the verb;
      3. if `verb_summary` is given and the verb's objects outnumber its
         subjects by more than 1.5x with at least 3 object tokens
         -> "transitive_action";
      4. otherwise "other".

    Args:
        verb: verb lemma (converted with str()).
        verb_summary: None, a flat summary with "top_subjects"/"top_objects"
            lists of (item, count) pairs, or the polarity-bucketed summary
            (keys "affirmative"/"long"/"short") whose buckets hold those lists.

    Returns:
        The class label as a string.
    """
    v = str(verb)
    if v in HARDCODED_TRANSITIVE:
        return "transitive_action"
    for cls, sigs in CLASS_SIGNATURES.items():
        for s in sigs:
            if s and s in v:
                return cls
    if verb_summary is not None:
        # The caller in this file passes the polarity-bucketed summary, where
        # the lists live one level down; reading them only at the top level
        # made both counts 0 and turned this rule into dead code.
        subj_count = _sum_pair_counts(verb_summary, "top_subjects")
        obj_count = _sum_pair_counts(verb_summary, "top_objects")
        if obj_count > subj_count * 1.5 and obj_count >= 3:
            return "transitive_action"
    return "other"

def aggregate_class_stats(neg_summary, verb_to_class):
    """Aggregate affirmative / long / short occurrence counts per semantic class.

    Args:
        neg_summary: mapping verb -> {"affirmative" | "long" | "short" ->
            {"occurrences": int, ...}}. Missing or null buckets count as 0.
        verb_to_class: mapping verb -> class label; verbs without an entry are
            placed in "other".

    Returns:
        DataFrame with one row per class, sorted by descending total_all, with
        columns class, affirmative, long, short, neg_total, p_long_of_neg,
        p_short_of_neg (NaN when the class has no negated tokens),
        neg_rate_of_all, total_all and verbs (list of member verbs). An empty
        `neg_summary` yields an empty frame that still has these columns.
    """
    columns = [
        "class", "affirmative", "long", "short", "neg_total",
        "p_long_of_neg", "p_short_of_neg", "neg_rate_of_all", "total_all", "verbs",
    ]
    class_stats = defaultdict(lambda: {"affirmative": 0, "long": 0, "short": 0, "verbs": []})
    for verb, buckets in neg_summary.items():
        cls = verb_to_class.get(verb, "other")
        class_stats[cls]["verbs"].append(verb)
        for key in ("affirmative", "long", "short"):
            # `or {}` also covers a bucket stored as JSON null.
            cnt = (buckets.get(key) or {}).get("occurrences", 0)
            class_stats[cls][key] += safe_int(cnt)
    out = []
    for cls, d in class_stats.items():
        aff = d["affirmative"]
        lo = d["long"]
        sh = d["short"]
        neg_total = lo + sh
        p_long_of_neg = (lo / neg_total) if neg_total > 0 else float("nan")
        p_short_of_neg = (sh / neg_total) if neg_total > 0 else float("nan")
        total_all = aff + neg_total
        neg_rate_of_all = (neg_total / total_all) if total_all > 0 else 0.0
        out.append({
            "class": cls,
            "affirmative": aff,
            "long": lo,
            "short": sh,
            "neg_total": neg_total,
            "p_long_of_neg": p_long_of_neg,
            "p_short_of_neg": p_short_of_neg,
            "neg_rate_of_all": neg_rate_of_all,
            "total_all": total_all,
            "verbs": d["verbs"]
        })
    # Passing `columns` keeps the schema stable when `out` is empty; without it
    # the frame has no columns and sorting by "total_all" raises KeyError.
    df = pd.DataFrame(out, columns=columns)
    return df.sort_values(by="total_all", ascending=False).reset_index(drop=True)

def fisher_test_for_class(class_df):
    """Fisher's exact test of long vs. short negation for each class against the rest.

    For every row of `class_df` a 2x2 table
    [[cls_long, cls_short], [rest_long, rest_short]] is built, where "rest" is
    the sum over all other rows, and tested two-sided.

    Args:
        class_df: DataFrame with columns "class", "long" and "short".

    Returns:
        DataFrame with columns class, cls_long, cls_short, rest_long,
        rest_short, test, odds_ratio, p_value. `test` is "fisher_exact" for a
        computed test, "none" when the class or the rest has no negated tokens,
        and "error" when the test raised; in the last two cases odds_ratio and
        p_value are missing. The columns are present even for empty input.
    """
    columns = [
        "class", "cls_long", "cls_short", "rest_long", "rest_short",
        "test", "odds_ratio", "p_value",
    ]
    per_class = [(r["class"], int(r["long"]), int(r["short"])) for _, r in class_df.iterrows()]
    # Totals are taken over the rows themselves so they always agree with the
    # per-row counts subtracted below.
    grand_long = sum(n_long for _, n_long, _ in per_class)
    grand_short = sum(n_short for _, _, n_short in per_class)
    rows = []
    for cls, cls_long, cls_short in per_class:
        rest_long = grand_long - cls_long
        rest_short = grand_short - cls_short
        row = {
            "class": cls,
            "cls_long": cls_long,
            "cls_short": cls_short,
            "rest_long": rest_long,
            "rest_short": rest_short,
            "test": "none",
            "odds_ratio": None,
            "p_value": None,
        }
        degenerate = (cls_long + cls_short == 0) or (rest_long + rest_short == 0)
        if not degenerate:
            table = [[cls_long, cls_short], [rest_long, rest_short]]
            try:
                oddsratio, p = fisher_exact(table, alternative="two-sided")
            except Exception as exc:
                # Stay best-effort (one bad table must not abort the analysis),
                # but report the failure instead of hiding it.
                import warnings
                warnings.warn(f"fisher_exact failed for class {cls!r}: {exc}")
                row["test"] = "error"
            else:
                row["test"] = "fisher_exact"
                row["odds_ratio"] = float(oddsratio)
                row["p_value"] = float(p)
        rows.append(row)
    # Explicit columns keep the schema stable when there are no rows.
    return pd.DataFrame(rows, columns=columns)

def main(input_path=INPUT_NEG_SUMMARY):
    """Run the copula-free negation analysis and write all of its artifacts.

    Steps: load the negation summary, drop the copula entry "이", save the
    trimmed summary, then derive and save the noun/negation association table,
    the verb -> class mapping, the per-class statistics and the per-class
    Fisher tests. A class overview and the significant (p < 0.05) tests are
    printed.

    Args:
        input_path: path of the negation summary JSON.

    Returns:
        Dict mapping artifact names to the paths that were written.

    Raises:
        FileNotFoundError: if `input_path` does not exist.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input not found: {input_path}")

    def _export_csv(frame, filename):
        # Every table goes to OUT_DIR without the index, encoded as "utf-8-sig".
        destination = os.path.join(OUT_DIR, filename)
        frame.to_csv(destination, index=False, encoding="utf-8-sig")
        return destination

    # Summary without the copula.
    neg_summary = remove_copula_from_summary(load_json(input_path), copula_key="이")
    out_neg_path = OUTPUT_PREFIX + ".json"
    write_json(neg_summary, out_neg_path)

    # Subject nouns vs. negation type.
    noun_assoc_csv = _export_csv(
        noun_negation_association(neg_summary, top_n=200),
        "noun_negation_association_no_copula.csv",
    )

    # Semantic class per verb; a "copula" label, should one ever be produced,
    # is folded into "other".
    verb_to_class = {}
    for verb, buckets in neg_summary.items():
        label = assign_semantic_class(verb, buckets)
        verb_to_class[verb] = "other" if label == "copula" else label
    verb_to_class_path = os.path.join(OUT_DIR, "verb_to_class_no_copula.json")
    write_json(verb_to_class, verb_to_class_path)

    # Per-class counts and significance tests.
    class_df = aggregate_class_stats(neg_summary, verb_to_class)
    class_csv = _export_csv(class_df, "semantic_class_neg_stats_no_copula.csv")
    fisher_df = fisher_test_for_class(class_df)
    fisher_csv = _export_csv(fisher_df, "semantic_class_fisher_tests_no_copula.csv")

    # Console report.
    p_values = fisher_df["p_value"]
    sig = fisher_df[p_values.notnull() & (p_values < 0.05)].sort_values("p_value")
    overview_columns = ["class", "total_all", "neg_total", "p_long_of_neg", "p_short_of_neg"]
    print(class_df[overview_columns].head(20).to_string(index=False))
    if not sig.empty:
        report_columns = ["class", "odds_ratio", "p_value", "cls_long", "cls_short", "rest_long", "rest_short"]
        print(sig[report_columns].to_string(index=False))
    else:
        print("No significant class-level differences (p < 0.05).")

    return {
        "neg_summary_no_copula": out_neg_path,
        "noun_assoc_csv": noun_assoc_csv,
        "verb_to_class_json": verb_to_class_path,
        "class_stats_csv": class_csv,
        "fisher_csv": fisher_csv,
    }

# Run the pipeline when this code executes as the main module (which includes
# running it as a notebook cell); `out` keeps the dict of written artifact paths.
if __name__ == "__main__":
    out = main()


            class  total_all  neg_total  p_long_of_neg  p_short_of_neg
transitive_action       4964         87       0.954023        0.045977
      existential       3468          0            NaN             NaN
  change_of_state       2017        128       0.289062        0.710938
            other       1706          0            NaN             NaN
       perception        780         10       1.000000        0.000000
           motion          7          0            NaN             NaN
            class  odds_ratio      p_value  cls_long  cls_short  rest_long  rest_short
  change_of_state    0.017488 4.147050e-27        37         91         93           4
transitive_action   40.175532 2.526940e-22        83          4         47          91
       perception         inf 5.738008e-03        10          0        120          95


In [12]:
def verb_info(verb):
    """Print the occurrence count and the top-5 context lists for `verb`.

    Reads the entry from the notebook-level `res['summary']`; a verb without an
    entry raises KeyError after the header line has been printed.
    """
    print(f"Verb: {verb}")
    entry = res['summary'][verb]
    print(f"  Occurrences: {entry['occurrences']}")
    sections = (
        ("Top Subjects", "top_subjects"),
        ("Top Objects", "top_objects"),
        ("Top Modifiers", "top_modifiers"),
        ("Top Before", "top_before"),
        ("Top After", "top_after"),
    )
    for label, key in sections:
        # Only the five most frequent items of each list are shown.
        print(f"  {label}: {entry[key][:5]}")

### Task 2.1
#### 5 Verbs from Top 20%
List of verbs: ['이', '하', '있', '되', '않']

In [39]:
# Print the context profile of every verb in res['top_verbs'], separated by a blank line.
for v in res['top_verbs']:
    verb_info(v)
    print()

Verb: 이
  Occurrences: 10508
  Top Subjects: [('것+이', 213), ('이것+이', 59), ('그것+이', 32), ('자체+가', 10), ('사람+이', 8)]
  Top Objects: [('이+를', 4), ('이것+을', 3), ('작품+을', 2), ('정+을', 1), ('세계관+을', 1)]
  Top Modifiers: [('것+이+다', 131), ('바로', 120), ('것+도', 64), ('가장', 40), ('때문+이+다', 34)]
  Top Before: [('있는', 322), ('있을', 133), ("'", 126), ('할', 125), ('될', 115)]
  Top After: [('.', 6029), ('것이다', 185), ('?', 150), ('할', 109), ('때문이다', 54)]

Verb: 하
  Occurrences: 3704
  Top Subjects: [('내+가', 7), ('우리+가', 6), ('자기+가', 6), ('그+가', 6), ('그것+이', 3)]
  Top Objects: [('역할+을', 45), ('일+을', 26), ('것+을', 22), ('생각+을', 13), ('말+을', 12)]
  Top Modifiers: [('수', 269), ('것+이+다', 75), ('필요+로', 24), ('때', 23), ('것+이+ㅂ니다', 19)]
  Top Before: [('있다고', 48), ('해야', 47), ('역할을', 43), ('있어야', 42), ('필요로', 24)]
  Top After: [('.', 1546), ('수', 271), ('것이다', 163), ('것은', 62), (',', 58)]

Verb: 있
  Occurrences: 3468
  Top Subjects: [('문제+가', 4), ('것+이', 4), ('관계+가', 4), ('수+가', 3), ('필요+가', 2)]
  Top Objects: []


### Task 2.2
#### 5 Verbs from Top 20-40%
List of verbs: ['물리치', '민감', '내려오', '제창', '갈']

In [40]:
# Print the context profile of every verb in res['mid_verbs'], separated by a blank line.
for v in res['mid_verbs']:
    verb_info(v)
    print()

Verb: 물리치
  Occurrences: 8
  Top Subjects: []
  Top Objects: [('이민족+을', 1), ('도깨비+를', 1), ('기운+을', 1), ('왜구+를', 1), ('재도전+을', 1)]
  Top Modifiers: [('수', 2), ('그걸+로', 1), ('의심+도', 1)]
  Top Before: [('이민족을', 1), ('도깨비를', 1), ('기운을', 1), ('왜구를', 1), ('재도전을', 1)]
  Top After: [('수', 2), ('존왕양이의', 1), ('위해', 1), ('영약으로', 1), ('결심을', 1)]

Verb: 민감
  Occurrences: 7
  Top Subjects: [('동물+들+이', 1)]
  Top Objects: []
  Top Modifiers: [('더', 2), ('측면+에', 1), ('여론+에', 1), ('것+에', 1), ('특성+상', 1)]
  Top Before: [('더', 2), ('여론에', 1), ('정책에', 1), ('것에', 1), ('덜', 1)]
  Top After: [('.', 2), ('인사라고나', 1), ('반응을', 1), ('법', 1), ('부분의', 1)]

Verb: 내려오
  Occurrences: 7
  Top Subjects: []
  Top Objects: []
  Top Modifiers: [('예+로+부터', 1), ('고장+에', 1), ('가장자리+까지', 1), ('옛+부터', 1), ('조상+도', 1)]
  Top Before: [('전해', 2), ('가장자리까지', 1), ('옛부터', 1), ('묻고', 1), ('하늘에서', 1)]
  Top After: [('.', 2), ('좋은', 1), ('한다', 1), ('한', 1), ('날', 1)]

Verb: 제창
  Occurrences: 7
  Top Subjects: [('사람+이', 1), ('루스벨트+가', 1)

2A 

The object set gives the most useful signal about verb meaning. The before set also carries semantic information, though it is much noisier. Referring back to the verb 되다 (doeda, “to become”) from earlier (the examples below are English glosses of the Korean tokens), most of its context words were fairly general, but the objects ([('children+is', 1), ('arrow+is', 1), ('providers+is', 1)]) and before ([('in', 92), ('have', 48), ('know', 42), ('thing', 31), ('have', 28)]) sets gave us the most information (which holds true for most of the verbs we examined), as opposed to the after ([('.', 917), ('is', 179), ('is', 50), ('can', 49), ('is', 40)]), modifiers ([('thing+is+this', 5), ('number', 3), ('came+at+da', 2), ('message+with', 1), ('girl+with', 1)]), and subjects ([('thing+this', 44), ('problem+is', 19), ('help+is', 18), ('person+this', 17)]) sets.
It is important to note that the object and before sets are not mutually exclusive: because Korean uses subject–object–verb (SOV) word order, an object often stands directly before its verb and therefore appears in the before set as well. One drawback of the object set is that it does not generalize to intransitive verbs, whereas the before set does (and also contains some objects). The drawback of the before set, however, is its inherent noise: because it captures many different types of words, it is hard to generalize a meaning from what it captures.