# Working with Universal Dependencies


In [25]:
# Installs conllu if it is missing; comment the following line out once it is installed
!python -m pip install conllu



In [26]:
import json
import os
import re
from collections import defaultdict, Counter

import conllu # reading Universal Dependency files in the CONLLu format
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec

In [27]:
# Keep the raw text of the training treebank in `data`.
with open("ko_kaist-ud-train.conllu", mode="r", encoding="utf-8") as conllu_file:
    data = conllu_file.read()

## Task 1

In [32]:
# Folder that receives the Task 1 output files; it is created here when missing
# (exist_ok=True makes re-running this cell harmless).
OUT_DIR = 'data/project_outputs'
os.makedirs(OUT_DIR, exist_ok=True)

def parse_conllu(path):
    """
    Read a CoNLL-U file and return its sentences.

    The result is a list of sentences; every sentence is a list of token dicts
    with the keys id (int), form (str), lemma (str), upos (str),
    head (int, or None when the column is "_" or not an integer) and deprel (str).

    Skipped lines: comments starting with "#", lines that do not have exactly
    ten tab-separated columns, multiword ranges such as 1-2, empty nodes such
    as 3.1, and lines whose id is not an integer. An empty line closes the
    sentence collected so far.
    """
    sentences = []
    current = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            row = raw.rstrip("\n")
            if row == "":
                # Sentence boundary: flush whatever has been collected.
                if current:
                    sentences.append(current)
                    current = []
                continue
            if row.startswith("#"):
                continue
            fields = row.split("\t")
            if len(fields) != 10:
                continue
            raw_id, form, lemma, upos, _xpos, _feats, raw_head, deprel = fields[:8]
            # Multiword ranges contain "-", empty nodes contain ".".
            if "-" in raw_id or "." in raw_id:
                continue
            try:
                token_id = int(raw_id)
            except ValueError:
                continue
            if raw_head == "_":
                head_id = None
            else:
                try:
                    head_id = int(raw_head)
                except ValueError:
                    head_id = None
            current.append({
                "id": token_id,
                "form": form,
                "lemma": lemma,
                "upos": upos,
                "head": head_id,
                "deprel": deprel,
            })
    # A file that does not end with a blank line still yields its last sentence.
    if current:
        sentences.append(current)
    return sentences

# Parse the training treebank; `sents` is reused by later cells.
sents = parse_conllu("ko_kaist-ud-train.conllu")

# Tallies over every (VERB head, `obj` dependent) pair found in the corpus.
verb_instances = 0
obj_instances = 0
obj_immediate = 0
obj_within3 = 0
obj_to_right = 0
obj_any_before = 0
dist_counter = Counter()  # (verb index - object index) -> number of pairs

examples = defaultdict(list)  # verb lemma -> up to 5 (object form, verb form, sentence text)

for sent_idx, tokens in enumerate(sents):
    known_ids = {t['id'] for t in tokens}
    position_of = {t['id']: pos for pos, t in enumerate(tokens)}
    # head id -> tokens attached to it (only heads that exist in this sentence)
    children_of = defaultdict(list)
    for t in tokens:
        head_id = t['head']
        if head_id is not None and head_id in known_ids:
            children_of[head_id].append(t)

    for verb_pos, verb_tok in enumerate(tokens):
        if verb_tok['upos'] != 'VERB':
            continue
        verb_instances += 1
        for child in children_of.get(verb_tok['id'], []):
            if child['deprel'] != 'obj':
                continue
            obj_instances += 1
            obj_pos = position_of.get(child['id'])
            if obj_pos is None:
                continue
            # Positive offset: the object stands to the left of its verb.
            offset = verb_pos - obj_pos
            dist_counter[offset] += 1
            if offset > 0:
                obj_any_before += 1
                if offset <= 3:
                    obj_within3 += 1
                    if offset == 1:
                        obj_immediate += 1
            elif offset < 0:
                obj_to_right += 1

            # Keep a handful of example sentences per verb lemma.
            verb_key = verb_tok['form'] if verb_tok['lemma'] == '_' else verb_tok['lemma']
            bucket = examples[verb_key]
            if len(bucket) < 5:
                sentence_text = ' '.join(t['form'] for t in tokens)
                bucket.append((child['form'], verb_tok['form'], sentence_text))

# Percentages are relative to all `obj` dependents of VERB tokens. Every
# percentage gets a value in both branches so the report below also works
# when the corpus contains no objects at all.
if obj_instances > 0:
    pct_immediate = obj_immediate / obj_instances * 100
    pct_within3 = obj_within3 / obj_instances * 100
    pct_after = obj_to_right / obj_instances * 100
    pct_before = obj_any_before / obj_instances * 100
else:
    pct_immediate = pct_within3 = pct_after = pct_before = 0.0

print('Verb instances (VERB tokens):', verb_instances)
print('Total obj instances:', obj_instances)
print(f'Objects immediately before verb: {obj_immediate} ({pct_immediate:.2f}%)')
print(f'Objects within 3 before verb: {obj_within3} ({pct_within3:.2f}%)')
print(f'Objects after verb: {obj_to_right} ({pct_after:.2f}%)')
print(f'Objects before verb (any distance): {obj_any_before} ({pct_before:.2f}%)')

# Persist the full distance histogram (verb index minus object index).
dist_items = sorted(dist_counter.items())
df_dist = pd.DataFrame(dist_items, columns=['verb_minus_obj_index', 'count'])
dist_csv_path = os.path.join(OUT_DIR, 'object_verb_distance_distribution.csv')
df_dist.to_csv(dist_csv_path, index=False)
print('Saved distance distribution to', dist_csv_path)


Verb instances (VERB tokens): 55805
Total obj instances: 13912
Objects immediately before verb: 10066 (72.35%)
Objects within 3 before verb: 12937 (92.99%)
Objects after verb: 0 (0.00%)
Objects before verb (any distance): 13912 (100.00%)
Saved distance distribution to data/project_outputs/object_verb_distance_distribution.csv


### Task 1.1
The generalization we can draw from the above results is that, in this corpus, an object _never_ occurs after its verb: it _always_ occurs before the verb.

### Task 1.2
Our generalization holds throughout the corpus: an object _never_ follows its verb. However, the object does not always have to come immediately before the verb (it does so in 72.35% of cases). We have found that the object usually comes within 3 tokens before the verb, though this is also not always true (it occurs 92.99% of the time). The exceptions to this tendency are objects that are farther than 3 tokens from the verb but still precede it, since we could not find any case where the object follows the verb (at any distance).

## Task 2

In [33]:

# Task 2 configuration.
# NOTE: OUT_DIR is rebound here, so any cell executed after this one that
# writes to OUT_DIR targets the Task 2 folder.
CONLLU_PATH = "ko_kaist-ud-train.conllu"   # Reminder: all input files are expected in the same folder as this notebook.
OUT_DIR = "data/project_outputs_task2"
os.makedirs(OUT_DIR, exist_ok=True)

VERB_UPOS = ("VERB", "AUX")   # token UPOS values that are counted as verbs
COPULA_CANONICAL = "이"       # canonical key returned for the copula morpheme (이)

def normalize_korean_verb(lemma_or_form):
    """
        Normalize a Korean verb token (lemma or form) to a canonical representation.

        Rules, applied in this order:
        - Empty input or the placeholder '_' gives None.
        - Fullwidth '＋' and '‧' are treated as the '+' morpheme separator.
        - If segmented with '+': return the copula key when any morpheme is
          exactly '이', otherwise return the first morpheme.
        - If the token consists only of separators (e.g. a literal '+'),
          return it unchanged instead of failing on an empty morpheme list.
        - If unsegmented, longer than one character and ending in '다',
          strip the final '다'.
        - Otherwise return the stripped token as is.

        Returns a str, or None for empty/placeholder input.
    """
    if not lemma_or_form or lemma_or_form == "_":
        return None

    s = lemma_or_form.replace("＋", "+").replace("‧", "+").strip()

    # Segmented token: work on its morphemes.
    if "+" in s:
        parts = [p for p in s.split("+") if p]  # drop empty parts
        if not parts:
            # Nothing but separators; there is no morpheme to pick.
            return s
        # Any morpheme equal to '이' marks the token as the copula.
        if "이" in parts:
            return COPULA_CANONICAL
        return parts[0]

    # Unsegmented dictionary form: strip the terminal '다' ending.
    if len(s) > 1 and s.endswith("다"):
        return s[:-1]

    return s

def aggregate_normalized_counts(counter, locations):
    """
        Collapse counts and locations onto normalized verb keys.

        When `locations` is non-empty it is the source of truth: each key is
        normalized, its count is the number of recorded locations, and the
        locations of keys that normalize to the same value are concatenated.
        When `locations` is empty, the counts in `counter` are summed per
        normalized key and the returned locations mapping stays empty.

        Returns (Counter, defaultdict(list)).
    """
    merged_counts = Counter()
    merged_locations = defaultdict(list)

    if not locations:
        for key, n in counter.items():
            merged_counts[normalize_korean_verb(key)] += n
        return merged_counts, merged_locations

    for key, occurrences in locations.items():
        canonical = normalize_korean_verb(key)
        merged_counts[canonical] += len(occurrences)
        merged_locations[canonical].extend(occurrences)
    return merged_counts, merged_locations



def verb_frequencies(sents, verb_upos=VERB_UPOS):
    """
        Tally verb tokens by normalized key and remember where each occurs.

        A token counts as a verb when its UPOS is in `verb_upos`. Its lemma
        (or its form when the lemma is '_') is normalized with
        normalize_korean_verb to obtain the key.

        Returns (counter, locations): a Counter of key -> frequency and a
        dict of key -> list of (sentence index, token index, token dict).
    """
    counter = Counter()
    locations = defaultdict(list)
    for si, tokens in enumerate(sents):
        for i, tok in enumerate(tokens):
            if tok["upos"] not in verb_upos:
                continue
            source = tok["form"] if tok["lemma"] == "_" else tok["lemma"]
            key = normalize_korean_verb(source)
            counter[key] += 1
            locations[key].append((si, i, tok))
    return counter, locations


def select_verbs_by_quantiles(counter, top_pct=0.20, next_pct=0.20, top_k_each=20):
    """
        Pick verbs from two adjacent frequency bands.

        Verb types are ranked by descending frequency. The first band holds the
        top `top_pct` share of types and the second band the following
        `next_pct` share; each band spans at least one rank. At most
        `top_k_each` verbs are returned from the start of each band.

        Returns (top verbs, next verbs) as two lists.
    """
    ranked = [verb for verb, _freq in counter.most_common()]
    top_width = max(1, int(len(ranked) * top_pct))
    next_width = max(1, int(len(ranked) * next_pct))

    top_band = ranked[:top_width]
    next_band = ranked[top_width:top_width + next_width]
    return top_band[:top_k_each], next_band[:top_k_each]


def extract_verb_sets(sents, verb_locations, verbs, 
                      subj_deprels=("nsubj", "nsubj:pass", "csubj"), 
                      obj_deprels_prefix=("obj",), 
                      modifier_deprels_prefixes=("advmod", "amod", "nmod", "obl", "advcl", "compound")):
    """
        Collect the context sets of each requested verb.

        For every recorded location (sentence index, token index, token) of a
        verb, the direct dependents of that token are sorted into
        - subject:  deprel listed in `subj_deprels` or starting with "nsubj",
        - object:   deprel starting with one of `obj_deprels_prefix`,
        - modifier: deprel starting with one of `modifier_deprels_prefixes`,
        keyed by the dependent's lemma (its form when the lemma is "_").
        The surface forms of the tokens directly before and after the verb are
        counted as well. Locations whose token index lies outside the sentence
        are ignored.

        Returns a dict: verb -> {subject, object, modifier, before, after: Counter,
        occurrences: int}.
    """
    def _label(token):
        # Prefer the lemma; fall back to the surface form for the "_" placeholder.
        return token['form'] if token['lemma'] == "_" else token['lemma']

    obj_prefixes = tuple(obj_deprels_prefix)
    mod_prefixes = tuple(modifier_deprels_prefixes)

    results = {}
    for verb in verbs:
        subjects, objects, modifiers = Counter(), Counter(), Counter()
        left_neighbours, right_neighbours = Counter(), Counter()
        seen = 0
        for si, vi, original_tok in verb_locations.get(verb, []):
            sent = sents[si]
            if not 0 <= vi < len(sent):
                continue
            seen += 1

            # Direct dependents of the verb; the head must be a real token id
            # of this sentence.
            ids_in_sent = {t['id'] for t in sent}
            verb_id = original_tok['id']
            children = [
                t for t in sent
                if t['head'] is not None and t['head'] in ids_in_sent and t['head'] == verb_id
            ]
            for child in children:
                relation = child['deprel']
                if relation in subj_deprels or relation.startswith("nsubj"):
                    subjects[_label(child)] += 1
                if relation.startswith(obj_prefixes):
                    objects[_label(child)] += 1
                if relation.startswith(mod_prefixes):
                    modifiers[_label(child)] += 1

            if vi >= 1:
                left_neighbours[sent[vi - 1]['form']] += 1
            if vi + 1 < len(sent):
                right_neighbours[sent[vi + 1]['form']] += 1

        results[verb] = {
            "subject": subjects,
            "object": objects,
            "modifier": modifiers,
            "before": left_neighbours,
            "after": right_neighbours,
            "occurrences": seen
        }
    return results

def _parse_kyubyong_vectors(tsv_path, log):
    """
        Parse a Kyubyong-style text vector file into parallel lists.

        Every entry starts with ``index<TAB>word<TAB>[`` followed by
        whitespace-separated numbers that may run over several lines and end
        at the first ``]``. A header line of the form ``index<TAB>word`` whose
        bracket only appears on a later line is accepted as well.

        Args:
            tsv_path: path of the text file (read as UTF-8, undecodable bytes replaced).
            log: callable taking one message string; receives the warnings.

        Returns:
            (words, vecs): list of str and list of 1-D float32 arrays, index-aligned.
            Entries that are empty, contain non-numeric values, or whose length
            differs from the first accepted vector are skipped with a warning.
    """
    start_re = re.compile(r'^\s*\d+\t([^\t]+)\t\[')

    def _numeric_tokens(text):
        # Split on whitespace and peel off brackets/commas glued to a number.
        cleaned = (piece.strip("[],") for piece in text.split())
        return [c for c in cleaned if c]

    words = []
    vecs = []
    dim = None
    current_word = None
    current_vals = []
    collecting = False

    with open(tsv_path, "r", encoding="utf-8", errors="replace") as fh:
        for line_num, raw_line in enumerate(fh, start=1):
            line = raw_line.rstrip("\n")
            if not line.strip():
                continue

            if collecting:
                # Continuation line of the vector that is currently open.
                payload = line
            else:
                m = start_re.match(line)
                if m:
                    current_word = m.group(1)
                    payload = line[m.end():]
                else:
                    cols = line.split("\t")
                    if len(cols) < 2 or not cols[0].isdigit():
                        # Not the start of an entry; ignore the line.
                        continue
                    current_word = cols[1]
                    # Only look for '[' after the word column so that a word
                    # containing a bracket is not mistaken for the vector start.
                    rest = "\t".join(cols[2:])
                    bracket = rest.find('[')
                    payload = rest[bracket + 1:] if bracket != -1 else ""
                current_vals = []
                collecting = True

            closing = payload.find(']')
            if closing == -1:
                current_vals.extend(_numeric_tokens(payload))
                continue

            # The vector ends on this line. This also covers vectors written
            # on a single line together with their header.
            current_vals.extend(_numeric_tokens(payload[:closing]))
            word, values = current_word, current_vals
            current_word, current_vals, collecting = None, [], False

            try:
                vec = np.array([float(x) for x in values], dtype=np.float32)
            except ValueError as e:
                # Skip the broken entry but keep parsing the rest of the file.
                log(f"Warning: failed to convert vector for word {word} at line {line_num}: {e}")
                continue
            if vec.shape[0] == 0:
                # An empty vector must not define the expected dimension.
                log(f"Warning: empty vector for word {word} at line {line_num}; skipping")
                continue
            if dim is None:
                dim = vec.shape[0]
            elif vec.shape[0] != dim:
                log(f"Warning: inconsistent dim for word {word}: {vec.shape[0]} vs {dim}; skipping")
                continue
            words.append(word)
            vecs.append(vec)

    if collecting:
        log(f"Warning: file ended before the vector for word {current_word} was closed; entry dropped")
    return words, vecs


def load_korean_vector_model(bin_path=None, tsv_path=None, verbose=True):
    """
        Load Korean word vectors from a Kyubyong-style text file.

        The text format is ``index<TAB>word<TAB>[ v v v ... ]`` where the
        bracketed vector may span several lines.

        Args:
            bin_path: accepted so existing callers keep working; this loader
                does not read it.
            tsv_path: path of the text vector file.
            verbose: when True, progress and warnings are printed.

        Returns:
            A KeyedVectors instance holding the parsed words and vectors.

        Raises:
            RuntimeError: when `tsv_path` is missing or cannot be parsed into
                at least one vector. The message lists what was attempted and
                the original exception (if any) is chained as the cause.
    """
    attempts = []
    last_error = None

    def _log(msg):
        if verbose:
            print(msg)

    if tsv_path and os.path.exists(tsv_path):
        try:
            _log(f"Inspecting text vector file {tsv_path} ...")
            words, vecs = _parse_kyubyong_vectors(tsv_path, _log)
            if not words:
                raise RuntimeError("Parsed zero word vectors from the TSV file.")
            arr = np.vstack(vecs)
            kv = KeyedVectors(vector_size=arr.shape[1])
            kv.add_vectors(words, arr)
            _log(f"Constructed KeyedVectors from text file with {len(words)} words and dim {arr.shape[1]}.")
            return kv
        except Exception as e:
            # Remember the failure (with its type, so e.g. a NameError is
            # recognisable) and report it through the RuntimeError below.
            last_error = e
            attempts.append(("w2v_text_kyubyong", f"{type(e).__name__}: {e}"))
    else:
        attempts.append(("w2v_text_kyubyong", f"TSV file not provided or not found: {tsv_path!r}"))

    # No TSV path, or parsing failed: raise an informative error.
    msg = "Failed to load model via TSV Kyubyong parser. Attempts:\n" + "\n".join(f"{k}: {v}" for k, v in attempts)
    raise RuntimeError(msg) from last_error


# -------------- NEW: KAIST -> surface mapping + robust centroid lookup --------------
# Particles that may be dropped from the end of a segmented token when
# looking for a surface form known to the vector model.
COMMON_PARTICLES = {
    "이", "가", "을", "를", "은", "는",
    "에", "에서", "으로", "로", "와", "과",
    "에게", "께", "부터", "까지", "만", "도",
    "뿐", "처럼", "보다",
}

def kaist_to_surface_candidates(token):
    """
    List the surface keys to try in the vector model for a KAIST-style token
    such as '사람+들+이', '것+이' or '있+는', most preferred first.

    Segmented tokens yield: all morphemes joined, the joined form without a
    final '다', the first morpheme, and the joined form without a trailing
    common particle. Unsegmented tokens yield the token itself and, when it
    ends in '다', its stem. Empty strings and duplicates are left out;
    None, '' and '_' give an empty list.
    """
    if token is None:
        return []
    text = str(token).strip()
    if text in ("", "_"):
        return []

    ordered = {}  # insertion-ordered set of candidates

    def _offer(candidate):
        if candidate:
            ordered.setdefault(candidate, None)

    if "+" not in text:
        _offer(text)
        if len(text) > 1 and text.endswith("다"):
            _offer(text[:-1])
        return list(ordered)

    morphemes = [m for m in text.split("+") if m]
    surface = "".join(text.split("+"))  # e.g. '사람+들+이' -> '사람들이'
    _offer(surface)
    if len(surface) > 1 and surface.endswith("다"):
        _offer(surface[:-1])
    if morphemes:
        _offer(morphemes[0])
        # e.g. '것+들+이' -> '것들' once the final particle is removed
        if len(morphemes) >= 2 and morphemes[-1] in COMMON_PARTICLES:
            _offer("".join(morphemes[:-1]))
    return list(ordered)

def centroid_of_words(model, words):
    """
    Average the model vectors of the given KAIST tokens.

    Each item may be a token or a (token, count) pair; None items are ignored.
    For every token the candidates from kaist_to_surface_candidates are tried
    in order, then the raw token itself; the first key found in `model` is used.

    Returns (centroid vector or None, number of tokens found, list of tokens not found).
    """
    hits = []
    missed = []
    for item in words:
        if item is None:
            continue
        keyword = item[0] if isinstance(item, (tuple, list)) else str(item)
        match = next((c for c in kaist_to_surface_candidates(keyword) if c in model), None)
        # Last resort: the token exactly as it was given.
        if match is None and keyword in model:
            match = keyword
        if match is None:
            missed.append(keyword)
        else:
            hits.append(model[match])

    if not hits:
        return None, 0, missed
    stacked = np.vstack(hits)
    return np.mean(stacked, axis=0), stacked.shape[0], missed

def topk_neighbors_from_centroid(model, centroid_vec, k=10):
    """
    Return the result of model.similar_by_vector(centroid_vec, topn=k),
    or an empty list when there is no centroid.
    """
    return [] if centroid_vec is None else model.similar_by_vector(centroid_vec, topn=k)
# -------------------------------------------------------------------------------


def build_task2_analysis(conllu_path=CONLLU_PATH, model_bin=None, model_tsv=None, k_neighbors=10, top_k_each=5):
    """
        Run the Task 2 pipeline and write its result files into OUT_DIR.

        Steps:
        1. Parse the treebank and count normalized verbs
           (written to verb_frequencies_normalized.csv).
        2. Pick `top_k_each` verbs from the top 20% of verb types and from the
           next 20%.
        3. Collect subject/object/modifier/before/after sets for those verbs
           (top 50 of each written to verb_sets_summary_normalized.json).
        4. If a vector model file exists and loads, compute the centroid of each
           set and its `k_neighbors` nearest neighbours
           (written to verb_neighbors_summary_normalized.json). When the model
           cannot be loaded, the failure is printed and this step is skipped so
           that the results of steps 1-3 are still returned.

        Returns a dict with the keys sentences, verb_freq_df, top_verbs,
        mid_verbs, sets, summary and neighbors_summary (None when no model was
        loaded).
    """
    sents = parse_conllu(conllu_path)
    print(f"Loaded {len(sents)} sentences from {conllu_path}")
    raw_counter, raw_locations = verb_frequencies(sents)

    verb_counter, verb_locations = aggregate_normalized_counts(raw_counter, raw_locations)

    vf_df = pd.DataFrame(verb_counter.most_common(), columns=["verb_norm", "freq"])

    # utf-8-sig so that spreadsheet tools detect the encoding of the Korean text
    vf_df.to_csv(os.path.join(OUT_DIR, "verb_frequencies_normalized.csv"), index=False, encoding="utf-8-sig")
    top_verbs, mid_verbs = select_verbs_by_quantiles(verb_counter, top_pct=0.20, next_pct=0.20, top_k_each=top_k_each)
    print("Top verbs (sample):", top_verbs)
    print("Mid verbs (sample):", mid_verbs)
    verbs_to_analyze = list(top_verbs) + list(mid_verbs)
    sets = extract_verb_sets(sents, verb_locations, verbs_to_analyze)
    summary = {}
    for v in verbs_to_analyze:
        entry = sets[v]
        summary[v] = {
            "occurrences": entry["occurrences"],
            "top_subjects": entry["subject"].most_common(50),
            "top_objects": entry["object"].most_common(50),
            "top_modifiers": entry["modifier"].most_common(50),
            "top_before": entry["before"].most_common(50),
            "top_after": entry["after"].most_common(50)
        }
    with open(os.path.join(OUT_DIR, "verb_sets_summary_normalized.json"), "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    model = None
    neighbors_summary = {}
    if (model_bin and os.path.exists(model_bin)) or (model_tsv and os.path.exists(model_tsv)):
        print("Attempting to load vector model...")
        try:
            model = load_korean_vector_model(bin_path=model_bin, tsv_path=model_tsv)
        except RuntimeError as err:
            # Keep the frequency/set results usable even without embeddings.
            print(f"Could not load vector model; skipping centroid/knn stage.\n{err}")
    else:
        print("No model files provided or found; skipping centroid/knn stage.")

    if model is not None:
        print("Computing centroids and nearest neighbors...")
        for v in verbs_to_analyze:
            neighbors_summary[v] = {}
            for set_name in ("top_subjects", "top_objects", "top_modifiers", "top_before", "top_after"):
                words = [w for w, cnt in summary[v][set_name]]
                centroid, n_in_vocab, missed = centroid_of_words(model, words)
                knn = topk_neighbors_from_centroid(model, centroid, k=k_neighbors) if centroid is not None else []
                neighbors_summary[v][set_name] = {
                    "centroid_n_in_vocab": int(n_in_vocab),
                    "missed_count": len(missed),
                    "missed_examples": missed[:30],
                    "knn": [(w, float(sim)) for w, sim in knn]
                }
        with open(os.path.join(OUT_DIR, "verb_neighbors_summary_normalized.json"), "w", encoding="utf-8") as f:
            json.dump(neighbors_summary, f, ensure_ascii=False, indent=2)

    return {
        "sentences": len(sents),
        "verb_freq_df": vf_df,
        "top_verbs": top_verbs,
        "mid_verbs": mid_verbs,
        "sets": sets,
        "summary": summary,
        # Explicit None test: do not rely on the truthiness of the model object.
        "neighbors_summary": neighbors_summary if model is not None else None
    }


# Run the Task 2 pipeline; later cells read the verb lists and per-verb
# summaries from `res`. Five verbs are taken from each frequency band and
# 20 nearest neighbours are requested per centroid.
res = build_task2_analysis(
    conllu_path="ko_kaist-ud-train.conllu",
    model_bin="ko.bin",     
    model_tsv="ko.tsv",     
    k_neighbors=20,
    top_k_each=5
)

Loaded 23010 sentences from ko_kaist-ud-train.conllu
Top verbs (sample): ['이', '하', '있', '되', '않']
Mid verbs (sample): ['물리치', '민감', '내려오', '제창', '갈']
Attempting to load vector model...
Inspecting text vector file ko.tsv ...
No headers so parsing text file line-by-line and building KeyedVectors (this may take time on your computers).
Constructed KeyedVectors from text file with 603232 words and dim 5.
Computing centroids and nearest neighbors...


In [37]:
def verb_info(verb):
    """
    Print the occurrence count and the first five entries of every context
    set stored for `verb` in the global `res['summary']`.
    """
    print(f"Verb: {verb}")
    entry = res['summary'][verb]
    print(f"  Occurrences: {entry['occurrences']}")
    sections = (
        ("Top Subjects", "top_subjects"),
        ("Top Objects", "top_objects"),
        ("Top Modifiers", "top_modifiers"),
        ("Top Before", "top_before"),
        ("Top After", "top_after"),
    )
    for label, key in sections:
        print(f"  {label}: {entry[key][:5]}")

### Task 2.1
#### 5 Verbs from Top 20%
List of verbs: ['이', '하', '있', '되', '않']

In [39]:
# Show the context sets of each verb selected from the top frequency band,
# with a blank line between verbs.
for v in res['top_verbs']:
    verb_info(v)
    print()

Verb: 이
  Occurrences: 10508
  Top Subjects: [('것+이', 213), ('이것+이', 59), ('그것+이', 32), ('자체+가', 10), ('사람+이', 8)]
  Top Objects: [('이+를', 4), ('이것+을', 3), ('작품+을', 2), ('정+을', 1), ('세계관+을', 1)]
  Top Modifiers: [('것+이+다', 131), ('바로', 120), ('것+도', 64), ('가장', 40), ('때문+이+다', 34)]
  Top Before: [('있는', 322), ('있을', 133), ("'", 126), ('할', 125), ('될', 115)]
  Top After: [('.', 6029), ('것이다', 185), ('?', 150), ('할', 109), ('때문이다', 54)]

Verb: 하
  Occurrences: 3704
  Top Subjects: [('내+가', 7), ('우리+가', 6), ('자기+가', 6), ('그+가', 6), ('그것+이', 3)]
  Top Objects: [('역할+을', 45), ('일+을', 26), ('것+을', 22), ('생각+을', 13), ('말+을', 12)]
  Top Modifiers: [('수', 269), ('것+이+다', 75), ('필요+로', 24), ('때', 23), ('것+이+ㅂ니다', 19)]
  Top Before: [('있다고', 48), ('해야', 47), ('역할을', 43), ('있어야', 42), ('필요로', 24)]
  Top After: [('.', 1546), ('수', 271), ('것이다', 163), ('것은', 62), (',', 58)]

Verb: 있
  Occurrences: 3468
  Top Subjects: [('문제+가', 4), ('것+이', 4), ('관계+가', 4), ('수+가', 3), ('필요+가', 2)]
  Top Objects: []


### Task 2.2
#### 5 Verbs from Top 20-40%
List of verbs: ['물리치', '민감', '내려오', '제창', '갈']

In [40]:
# Show the context sets of each verb selected from the second frequency band,
# with a blank line between verbs.
for v in res['mid_verbs']:
    verb_info(v)
    print()

Verb: 물리치
  Occurrences: 8
  Top Subjects: []
  Top Objects: [('이민족+을', 1), ('도깨비+를', 1), ('기운+을', 1), ('왜구+를', 1), ('재도전+을', 1)]
  Top Modifiers: [('수', 2), ('그걸+로', 1), ('의심+도', 1)]
  Top Before: [('이민족을', 1), ('도깨비를', 1), ('기운을', 1), ('왜구를', 1), ('재도전을', 1)]
  Top After: [('수', 2), ('존왕양이의', 1), ('위해', 1), ('영약으로', 1), ('결심을', 1)]

Verb: 민감
  Occurrences: 7
  Top Subjects: [('동물+들+이', 1)]
  Top Objects: []
  Top Modifiers: [('더', 2), ('측면+에', 1), ('여론+에', 1), ('것+에', 1), ('특성+상', 1)]
  Top Before: [('더', 2), ('여론에', 1), ('정책에', 1), ('것에', 1), ('덜', 1)]
  Top After: [('.', 2), ('인사라고나', 1), ('반응을', 1), ('법', 1), ('부분의', 1)]

Verb: 내려오
  Occurrences: 7
  Top Subjects: []
  Top Objects: []
  Top Modifiers: [('예+로+부터', 1), ('고장+에', 1), ('가장자리+까지', 1), ('옛+부터', 1), ('조상+도', 1)]
  Top Before: [('전해', 2), ('가장자리까지', 1), ('옛부터', 1), ('묻고', 1), ('하늘에서', 1)]
  Top After: [('.', 2), ('좋은', 1), ('한다', 1), ('한', 1), ('날', 1)]

Verb: 제창
  Occurrences: 7
  Top Subjects: [('사람+이', 1), ('루스벨트+가', 1)

In [30]:

# Per-verb object statistics: for each VERB lemma, how many `obj` dependents
# it has and how many of them stand directly before the verb.
# NOTE: relies on `sents` from the Task 1 cell and writes to whatever OUT_DIR
# is currently bound to.
verb_obj_counts = defaultdict(int)
verb_obj_immediate = defaultdict(int)

for sent_idx, tokens in enumerate(sents):
    known_ids = {t['id'] for t in tokens}
    dependents = defaultdict(list)
    for t in tokens:
        if t['head'] is not None and t['head'] in known_ids:
            dependents[t['head']].append(t)
    id_to_index = {t['id']: i for i, t in enumerate(tokens)}

    for i, tok in enumerate(tokens):
        if tok['upos'] == 'VERB':
            verb_id = tok['id']
            lemma = tok['lemma'] if tok['lemma'] != '_' else tok['form']
            deps = dependents.get(verb_id, [])
            for dep in deps:
                if dep['deprel'] == 'obj':
                    verb_obj_counts[lemma] += 1
                    obj_idx = id_to_index.get(dep['id'])
                    # A difference of exactly 1 means the object is the token right before the verb.
                    if obj_idx is not None and i - obj_idx == 1:
                        verb_obj_immediate[lemma] += 1

rows = []
for lemma, cnt in verb_obj_counts.items():
    # cnt is at least 1 here: a lemma only enters verb_obj_counts when it is incremented.
    immediate = verb_obj_immediate.get(lemma, 0)
    rows.append({
        'lemma': lemma,
        'obj_count': cnt,
        'obj_immediate_count': immediate,
        'pct_immediate': immediate / cnt * 100
    })
# Passing the columns explicitly keeps sort_values working when no verb has
# an object (an empty frame would otherwise have no 'obj_count' column).
stat_columns = ['lemma', 'obj_count', 'obj_immediate_count', 'pct_immediate']
df_verb_obj = pd.DataFrame(rows, columns=stat_columns).sort_values('obj_count', ascending=False)

display(df_verb_obj.head(20))
csv_path = os.path.join(OUT_DIR, 'verbs_with_obj_stats.csv')
df_verb_obj.to_csv(csv_path, index=False)
print('Saved per-verb obj stats to', csv_path)


Unnamed: 0,lemma,obj_count,obj_immediate_count,pct_immediate
93,하+는,122,59,48.360656
388,가지+고,113,98,86.725664
284,위하+ㄴ,98,90,91.836735
88,하+ㄴ,89,38,42.696629
48,알+ㄹ,84,49,58.333333
386,하+었+다,82,44,53.658537
7,가지+ㄴ,77,68,88.311688
693,하+ㄹ,68,37,54.411765
79,하+ㄴ다,61,27,44.262295
46,지니+고,58,48,82.758621


Saved per-verb obj stats to data/project_outputs_task2/verbs_with_obj_stats.csv
