# Working with Universal Dependencies


In [19]:
# if you don't have conllu yet, uncomment the following
!python -m pip install conllu



In [None]:
import conllu # reading Universal Dependency files in the CONLLu format
import os
import json
import re
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from collections import defaultdict, Counter

We open the GUM corpus as a text file, and look at its first few lines. After the initial metadata, the first sentence starts with the line
      "# text = Aesthetic Appreciation and Spanish Art:"

In [21]:
with open("ko_kaist-ud-train.conllu", encoding="utf-8") as f:
    data = f.read()

## Task 1

In [None]:
OUT_DIR = 'data/project_outputs'
os.makedirs(OUT_DIR, exist_ok=True)

def parse_conllu(path):
    """
    Return list of sentences; each sentence is a list of token dicts:
    {id:int, form:str, lemma:str, upos:str, head:int or None, deprel:str}
    Ignores multiword lines (1-2) and empty-node decimal ids (3.1).
    """
    sents = []
    tokens = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                if tokens:
                    sents.append(tokens)
                    tokens = []
                continue
            if line.startswith("#"):
                continue
            cols = line.split("\t")
            if len(cols) != 10:
                continue
            id_field = cols[0]
            # skip multiword / empty nodes
            if "-" in id_field or "." in id_field:
                continue
            try:
                tid = int(id_field)
            except ValueError:
                continue
            form = cols[1]
            lemma = cols[2]
            upos = cols[3]
            head = cols[6]
            deprel = cols[7]
            try:
                head_int = int(head) if head != "_" else None
            except ValueError:
                head_int = None
            tok = {"id": tid, "form": form, "lemma": lemma, "upos": upos, "head": head_int, "deprel": deprel}
            tokens.append(tok)
    # final
    if tokens:
        sents.append(tokens)
    return sents

sents = parse_conllu("ko_kaist-ud-train.conllu")

verb_instances = 0
obj_instances = 0
obj_immediate = 0
obj_within3 = 0
obj_to_right = 0
dist_counter = Counter()

examples = defaultdict(list) 

for sent_idx, tokens in enumerate(sents):
    id_to_tok = {t['id']: t for t in tokens}
    dependents = defaultdict(list)
    for t in tokens:
        if t['head'] is not None and t['head'] in id_to_tok:
            dependents[t['head']].append(t)
    id_to_index = {t['id']: i for i, t in enumerate(tokens)}

    for i, tok in enumerate(tokens):
        if tok['upos'] == 'VERB':
            verb_instances += 1
            verb_id = tok['id']
            deps = dependents.get(verb_id, [])
            for dep in deps:
                if dep['deprel'] == 'obj':
                    obj_instances += 1
                    obj_idx = id_to_index.get(dep['id'])
                    verb_idx = i
                    if obj_idx is None:
                        continue
                    diff = verb_idx - obj_idx  # positive if object is to verb's left
                    dist_counter[diff] += 1
                    if diff == 1:
                        obj_immediate += 1
                    if 1 <= diff <= 3:
                        obj_within3 += 1
                    if diff < 0:
                        obj_to_right += 1

                    
                    lemma = tok['lemma'] if tok['lemma'] != '_' else tok['form']
                    if len(examples[lemma]) < 5:
                        sent_form = ' '.join([t['form'] for t in tokens])
                        examples[lemma].append((dep['form'], tok['form'], sent_form))

if obj_instances > 0:
    pct_immediate = obj_immediate / obj_instances * 100
    pct_within3 = obj_within3 / obj_instances * 100
    pct_after = obj_to_right / obj_instances * 100
else:
    pct_immediate = pct_within3 = 0.0

print('Verb instances (VERB tokens):', verb_instances)
print('Total obj instances:', obj_instances)
print(f'Objects immediately before verb: {obj_immediate} ({pct_immediate:.2f}%)')
print(f'Objects within 3 before verb: {obj_within3} ({pct_within3:.2f}%)')
print(f'Objects after verb: {obj_to_right} ({pct_after:.2f}%)')

dist_items = sorted(dist_counter.items())
df_dist = pd.DataFrame(dist_items, columns=['verb_minus_obj_index', 'count'])
df_dist.to_csv(os.path.join(OUT_DIR, 'object_verb_distance_distribution.csv'), index=False)
print('Saved distance distribution to', os.path.join(OUT_DIR, 'object_verb_distance_distribution.csv'))


Verb instances (VERB tokens): 55805
Total obj instances: 13912
Objects immediately before verb: 10066 (72.35%)
Objects within 3 before verb: 12937 (92.99%)
Objects after verb: 0 (0.00%)
Saved distance distribution to data/project_outputs/object_verb_distance_distribution.csv


### Task 1.1
The generalization we can see from the above results is that an object _never_ occurs after a verb, it _must_ occur before the verb.

### Task 1.2
Our generalization generally holds true. While in our corupus, an object _never_ follows a verb, it does not always have to come before a verb (it does at a 72.35% rate). We have found that the object comes within 3 tokens before the verb, though this is also not always true (it occurs 92.99% of the time). The exceptions to this rule probably occur when objects are farther than 3 tokens from the verb, but still are before the verb, since we could not find any case where it follows a verb (at any length).

## Task 2

In [None]:

CONLLU_PATH = "ko_kaist-ud-train.conllu"   #Just a reminder, all files are directly just in the same folder as this code.
OUT_DIR = "data/project_outputs_task2"
os.makedirs(OUT_DIR, exist_ok=True)

VERB_UPOS = ("VERB", "AUX")   # token UPOS considered verbs
COPULA_CANONICAL = "이"       # canonical token for copula (이)

def normalize_korean_verb(lemma_or_form):

    if not lemma_or_form or lemma_or_form == "_":
        return lemma_or_form

    
    s = lemma_or_form.replace("＋", "+").replace("‧", "+").strip()

    #if segmented with '+', split into morphemes
    if "+" in s:
        parts = [p for p in s.split("+") if p]  # drop empty parts
        #if any part equals the copula morpheme '이', treat as copula
        if any(p == "이" for p in parts):
            return COPULA_CANONICAL
        return parts[0]

    #if unsegmented but ends with '다', strip terminal '다' 
    if len(s) > 1 and s.endswith("다"):
        return s[:-1]

    return s

# ensures counts/locations are collapsed to normalized keys
def aggregate_normalized_counts(counter, locations):

    norm_counter = Counter()
    norm_locations = defaultdict(list)
    if locations:
        for raw_key, locs in locations.items():
            norm = normalize_korean_verb(raw_key)
            norm_counter[norm] += len(locs)
            norm_locations[norm].extend(locs)
    else:
        for raw_key, cnt in counter.items():
            norm = normalize_korean_verb(raw_key)
            norm_counter[norm] += cnt
    return norm_counter, norm_locations



def verb_frequencies(sents, verb_upos=VERB_UPOS):
    counter = Counter()
    locations = defaultdict(list)  # canonical_verb -> list of (sent_idx, token_idx, original_token)
    for si, tokens in enumerate(sents):
        for i, tok in enumerate(tokens):
            if tok["upos"] in verb_upos:
                lemma = tok["lemma"] if tok["lemma"] != "_" else tok["form"]
                norm = normalize_korean_verb(lemma)
                counter[norm] += 1
                locations[norm].append((si, i, tok))
    return counter, locations


def select_verbs_by_quantiles(counter, top_pct=0.20, next_pct=0.20, top_k_each=20):
    items = counter.most_common()
    types = [v for v, _ in items]
    n_types = len(types)
    top_n_types = max(1, int(n_types * top_pct))
    next_n_types = max(1, int(n_types * next_pct))
    top_type_set = set(types[:top_n_types])
    next_type_set = set(types[top_n_types: top_n_types + next_n_types])
    top_candidates = [v for v, _ in items if v in top_type_set]
    next_candidates = [v for v, _ in items if v in next_type_set]
    return top_candidates[:top_k_each], next_candidates[:top_k_each]


def extract_verb_sets(sents, verb_locations, verbs, 
                      subj_deprels=("nsubj", "nsubj:pass", "csubj"), 
                      obj_deprels_prefix=("obj",), 
                      modifier_deprels_prefixes=("advmod", "amod", "nmod", "obl", "advcl", "compound")):
    results = {}
    for verb in verbs:
        subj_c = Counter()
        obj_c = Counter()
        mod_c = Counter()
        before_c = Counter()
        after_c = Counter()
        occ = 0
        locs = verb_locations.get(verb, [])
        for si, vi, original_tok in locs:
            sent = sents[si]
            if vi < 0 or vi >= len(sent):
                continue
            occ += 1
            id_to_tok = {t['id']: t for t in sent}
            dependents = defaultdict(list)
            for t in sent:
                h = t['head']
                if h is not None and h in id_to_tok:
                    dependents[h].append(t)
            v_deps = dependents.get(original_tok['id'], [])
            for dep in v_deps:
                deprel = dep['deprel']
                if deprel in subj_deprels or deprel.startswith("nsubj"):
                    head_form = dep['lemma'] if dep['lemma'] != "_" else dep['form']
                    subj_c[head_form] += 1
                if any(deprel == p or deprel.startswith(p) for p in obj_deprels_prefix):
                    head_form = dep['lemma'] if dep['lemma'] != "_" else dep['form']
                    obj_c[head_form] += 1
                if any(deprel == p or deprel.startswith(p) for p in modifier_deprels_prefixes):
                    head_form = dep['lemma'] if dep['lemma'] != "_" else dep['form']
                    mod_c[head_form] += 1
            if vi - 1 >= 0:
                before_c[sent[vi-1]['form']] += 1
            if vi + 1 < len(sent):
                after_c[sent[vi+1]['form']] += 1
        results[verb] = {
            "subject": subj_c,
            "object": obj_c,
            "modifier": mod_c,
            "before": before_c,
            "after": after_c,
            "occurrences": occ
        }
    return results

def load_korean_vector_model(bin_path=None, tsv_path=None, verbose=True):

    attempts = []
    
    def _log(msg):
        if verbose:
            print(msg)

    # Tried multiple ways to load the word2vec model but it was acting super weird and the stackoverflow page got very confusing for me
    # (https://stackoverflow.com/questions/70458726/cant-load-the-pre-trained-word2vec-of-korean-language)
    # Now I am just manually loading the whole model, let me know if you think this is the correct way
    if tsv_path and os.path.exists(tsv_path):
        try:
            _log(f"Inspecting text vector file {tsv_path} ...")
            with open(tsv_path, 'r', encoding='utf-8', errors='ignore') as f:
                
                sample = []
                max_lines = 5000  
                for i, line in enumerate(f):
                    if not line.strip():
                        continue
                    sample.append(line.rstrip('\n'))
                    if i >= max_lines:
                        break
            if not sample:
                raise RuntimeError("Text vector file appears empty or unreadable.")
            
            first_tokens = sample[0].strip().split()
            header_like = False
            if len(first_tokens) >= 2 and first_tokens[0].isdigit() and first_tokens[1].isdigit():
                header_like = True

            

            _log("No headers so parsing text file line-by-line and building KeyedVectors (this may take time on your computers).")
            words = []
            vecs = []
            dim = None
            with open(tsv_path, 'r', encoding='utf-8', errors='ignore') as fh:
                for line in fh:
                    line = line.strip()
                    if not line:
                        continue
                    parts = line.split()
                    if len(parts) < 2:
                        # malformed; skip
                        continue
                    word = parts[0]
                    num_tokens = parts[1:]
                    # try convert to floats
                    try:
                        vec = np.array([float(x) for x in num_tokens], dtype=np.float32)
                    except Exception:
                        # kip line with non-numeric tokens
                        continue
                    if dim is None:
                        dim = vec.shape[0]
                    else:
                        # if vector dims mismatch, skip
                        if vec.shape[0] != dim:
                            continue
                    words.append(word)
                    vecs.append(vec)
            if not words:
                raise RuntimeError("No valid word/vector lines parsed from file.")
            arr = np.vstack(vecs)
            kv = KeyedVectors(vector_size=arr.shape[1])
            kv.add_vectors(words, arr)
            _log(f"Constructed KeyedVectors from text file with {len(words)} words and dim {arr.shape[1]}.")
            return kv
        except Exception as e:
            attempts.append(("w2v_text_manual", str(e)))


def centroid_of_words(model, words):
    vecs = []
    missed = []
    for w in words:
        if w is None:
            continue
        key = w
        # for Korean we might want to try forms and lemmas as separate keys present assumption is that keys are surface tokens
        if key in model:
            vecs.append(model[key])
        else:
            missed.append(w)
    if not vecs:
        return None, 0, missed
    arr = np.vstack(vecs)
    return np.mean(arr, axis=0), arr.shape[0], missed

def topk_neighbors_from_centroid(model, centroid_vec, k=10):
    if centroid_vec is None:
        return []
    return model.similar_by_vector(centroid_vec, topn=k)


def build_task2_analysis(conllu_path=CONLLU_PATH, model_bin=None, model_tsv=None, k_neighbors=10, top_k_each=5):
    sents = parse_conllu(conllu_path)
    print(f"Loaded {len(sents)} sentences from {conllu_path}")
    raw_counter, raw_locations = verb_frequencies(sents)

    verb_counter, verb_locations = aggregate_normalized_counts(raw_counter, raw_locations)

    vf_df = pd.DataFrame(verb_counter.most_common(), columns=["verb_norm", "freq"])
    
    vf_df.to_csv(os.path.join(OUT_DIR, "verb_frequencies_normalized.csv"), index=False, encoding="utf-8-sig")
    top_verbs, mid_verbs = select_verbs_by_quantiles(verb_counter, top_pct=0.20, next_pct=0.20, top_k_each=top_k_each)
    print("Top verbs (sample):", top_verbs)
    print("Mid verbs (sample):", mid_verbs)
    verbs_to_analyze = list(top_verbs) + list(mid_verbs)
    sets = extract_verb_sets(sents, verb_locations, verbs_to_analyze)
    summary = {}
    for v in verbs_to_analyze:
        entry = sets[v]
        summary[v] = {
            "occurrences": entry["occurrences"],
            "top_subjects": entry["subject"].most_common(50),
            "top_objects": entry["object"].most_common(50),
            "top_modifiers": entry["modifier"].most_common(50),
            "top_before": entry["before"].most_common(50),
            "top_after": entry["after"].most_common(50)
        }
    with open(os.path.join(OUT_DIR, "verb_sets_summary_normalized.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    model = None
    neighbors_summary = {}
    if (model_bin and os.path.exists(model_bin)) or (model_tsv and os.path.exists(model_tsv)):
        print("Attempting to load vector model...")
        model = load_korean_vector_model(bin_path=model_bin, tsv_path=model_tsv)
        print("Computing centroids and nearest neighbors...")
        for v in verbs_to_analyze:
            neighbors_summary[v] = {}
            for set_name in ("top_subjects", "top_objects", "top_modifiers", "top_before", "top_after"):
                words = [w for w, cnt in summary[v][set_name][:200]]
                centroid, n_in_vocab, missed = centroid_of_words(model, words)
                knn = topk_neighbors_from_centroid(model, centroid, k=k_neighbors) if centroid is not None else []
                neighbors_summary[v][set_name] = {
                    "centroid_n_in_vocab": int(n_in_vocab),
                    "missed_count": len(missed),
                    "missed_examples": missed[:30],
                    "knn": [(w, float(sim)) for w, sim in knn]
                }
        with open(os.path.join(OUT_DIR, "verb_neighbors_summary_normalized.json"), "w", encoding="utf-8") as f:
            json.dump(neighbors_summary, f, ensure_ascii=False, indent=2)
    else:
        print("No model files provided or found; skipping centroid/knn stage.")

    return {
        "sentences": len(sents),
        "verb_freq_df": vf_df,
        "top_verbs": top_verbs,
        "mid_verbs": mid_verbs,
        "sets": sets,
        "summary": summary,
        "neighbors_summary": neighbors_summary if model else None
    }


res = build_task2_analysis(
    conllu_path="ko_kaist-ud-train.conllu",
    model_bin="ko.bin",     
    model_tsv="ko.tsv",     
    k_neighbors=20,
    top_k_each=20
)

Loaded 23010 sentences from ko_kaist-ud-train.conllu
Top verbs (sample): ['이', '하', '있', '되', '않', '보', '이러하', '대하', '주', '알', '보이', '생각', '가지', '말하', '못하', '오', '만들', '살', '받', '이루']
Mid verbs (sample): ['물리치', '민감', '내려오', '제창', '갈', '문지르', '벗기', '자르', '되돌리', '담그', '금하', '대신', '낫', '운동', '공개', '소멸', '상호', '이바지', '낭비', '거절']
Attempting to load vector model...
Inspecting text vector file ko.tsv ...
No headers so parsing text file line-by-line and building KeyedVectors (this may take time on your computers).
Constructed KeyedVectors from text file with 603232 words and dim 5.
Computing centroids and nearest neighbors...


In [None]:

verb_obj_counts = defaultdict(int)
verb_obj_immediate = defaultdict(int)

for sent_idx, tokens in enumerate(sents):
    id_to_tok = {t['id']: t for t in tokens}
    dependents = defaultdict(list)
    for t in tokens:
        if t['head'] is not None and t['head'] in id_to_tok:
            dependents[t['head']].append(t)
    id_to_index = {t['id']: i for i, t in enumerate(tokens)}

    for i, tok in enumerate(tokens):
        if tok['upos'] == 'VERB':
            verb_id = tok['id']
            lemma = tok['lemma'] if tok['lemma'] != '_' else tok['form']
            deps = dependents.get(verb_id, [])
            for dep in deps:
                if dep['deprel'] == 'obj':
                    verb_obj_counts[lemma] += 1
                    obj_idx = id_to_index.get(dep['id'])
                    if obj_idx is not None and i - obj_idx == 1:
                        verb_obj_immediate[lemma] += 1

rows = []
for lemma, cnt in verb_obj_counts.items():
    rows.append({
        'lemma': lemma,
        'obj_count': cnt,
        'obj_immediate_count': verb_obj_immediate.get(lemma, 0),
        'pct_immediate': verb_obj_immediate.get(lemma, 0) / cnt * 100 if cnt>0 else 0
    })
df_verb_obj = pd.DataFrame(rows).sort_values('obj_count', ascending=False)

display(df_verb_obj.head(20))
csv_path = os.path.join(OUT_DIR, 'verbs_with_obj_stats.csv')
df_verb_obj.to_csv(csv_path, index=False)
print('Saved per-verb obj stats to', csv_path)


Unnamed: 0,lemma,obj_count,obj_immediate_count,pct_immediate
93,하+는,122,59,48.360656
388,가지+고,113,98,86.725664
284,위하+ㄴ,98,90,91.836735
88,하+ㄴ,89,38,42.696629
48,알+ㄹ,84,49,58.333333
386,하+었+다,82,44,53.658537
7,가지+ㄴ,77,68,88.311688
693,하+ㄹ,68,37,54.411765
79,하+ㄴ다,61,27,44.262295
46,지니+고,58,48,82.758621


Saved per-verb obj stats to data/project_outputs_task2/verbs_with_obj_stats.csv
