# Working with Universal Dependencies

In [45]:
# if you don't have conllu yet, uncomment the following
# !python3 -m pip install conllu

In [46]:
import conllu # reading Universal Dependency files in the CONLLu format

In [47]:
with open("ko_kaist-ud-train.conllu", encoding="utf-8") as f:
    data = f.read()

In [48]:

def parse_conllu():
    sentences = []
    with open("ko_kaist-ud-train.conllu", encoding='utf-8') as fh:
        sent_lines = []
        for raw in fh:
            line = raw.rstrip('\n')
            if line.startswith('#'):
                sent_lines.append(line)
            elif line.strip() == '':
                if sent_lines:
                    sentences.append(sent_lines)
                    sent_lines = []
            else:
                sent_lines.append(line)
        if sent_lines:
            sentences.append(sent_lines)

    parsed = []
    for s in sentences:
        tokens = []
        for ln in s:
            if ln.startswith('#'):
                continue
            parts = ln.split('\t')
            if len(parts) != 10:
                continue
            id_, form, lemma, upos, xpos, feats, head, deprel, deps, misc = parts
            if '-' in id_:
                continue
            try:
                id_int = int(id_)
            except Exception:
                continue
            token = {
                'id': id_int,
                'form': form,
                'lemma': lemma,
                'upos': upos,
                'xpos': xpos,
                'feats': feats,
                'head': int(head) if head != '_' else None,
                'deprel': deprel,
                'deps': deps,
                'misc': misc
            }
            tokens.append(token)
        if tokens:
            tokens.sort(key=lambda t: t['id'])
            parsed.append(tokens)
    return parsed

sents = parse_conllu()
print(f'Parsed {len(sents)} sentences')

Parsed 23010 sentences


In [49]:
import pandas as pd
import os
OUT_DIR = 'data/project_outputs'
os.makedirs(OUT_DIR, exist_ok=True)

from collections import defaultdict, Counter
verb_instances = 0
obj_instances = 0
obj_immediate = 0
obj_within3 = 0
dist_counter = Counter()

examples = defaultdict(list) 

for sent_idx, tokens in enumerate(sents):
    id_to_tok = {t['id']: t for t in tokens}
    dependents = defaultdict(list)
    for t in tokens:
        if t['head'] is not None and t['head'] in id_to_tok:
            dependents[t['head']].append(t)
    id_to_index = {t['id']: i for i, t in enumerate(tokens)}

    for i, tok in enumerate(tokens):
        if tok['upos'] == 'VERB':
            verb_instances += 1
            verb_id = tok['id']
            deps = dependents.get(verb_id, [])
            for dep in deps:
                if dep['deprel'] == 'obj':
                    obj_instances += 1
                    obj_idx = id_to_index.get(dep['id'])
                    verb_idx = i
                    if obj_idx is None:
                        continue
                    diff = verb_idx - obj_idx
                    dist_counter[diff] += 1
                    if diff == 1:
                        obj_immediate += 1
                    if 0 <= diff <= 1:
                        obj_within3 += 1
                    lemma = tok['lemma'] if tok['lemma'] != '_' else tok['form']
                    if len(examples[lemma]) < 5:
                        sent_form = ' '.join([t['form'] for t in tokens])
                        examples[lemma].append((dep['form'], tok['form'], sent_form))

if obj_instances > 0:
    pct_immediate = obj_immediate / obj_instances * 100
    pct_within3 = obj_within3 / obj_instances * 100
else:
    pct_immediate = pct_within3 = 0.0

print('Verb instances (VERB tokens):', verb_instances)
print('Total obj instances:', obj_instances)
print(f'Objects immediately before verb: {obj_immediate} ({pct_immediate:.2f}%)')
print(f'Objects within 1 tokens before verb: {obj_within3} ({pct_within3:.2f}%)')

dist_items = sorted(dist_counter.items())
df_dist = pd.DataFrame(dist_items, columns=['verb_minus_obj_index', 'count'])
df_dist.to_csv(os.path.join(OUT_DIR, 'object_verb_distance_distribution.csv'), index=False)
print('Saved distance distribution to', os.path.join(OUT_DIR, 'object_verb_distance_distribution.csv'))


# parse words within words
def num_words_in_word(word):
    length = len(word.split('+'))
    return length

# analyze how many words within words by part of speech
pos_wordcount = defaultdict(list)
for tokens in sents:
    for t in tokens:
        pos = t['upos']
        word = t['lemma']
        count = num_words_in_word(word)
        pos_wordcount[pos].append(count)
import numpy as np
pos_wordcount_stats = {}
for pos, counts in pos_wordcount.items():
    arr = np.array(counts)
    pos_wordcount_stats[pos] = {
        'mean': np.mean(arr),
        'median': np.median(arr),
        'max': np.max(arr),
        'min': np.min(arr),
        'count': len(arr)
    }
df_wordcount = pd.DataFrame.from_dict(pos_wordcount_stats, orient='index')
df_wordcount.to_csv(os.path.join(OUT_DIR, 'pos_wordcount_stats.csv'))
print('Saved POS word count stats to', os.path.join(OUT_DIR, 'pos_wordcount_stats.csv'))
print(df_wordcount)


Verb instances (VERB tokens): 55805
Total obj instances: 13912
Objects immediately before verb: 10066 (72.35%)
Objects within 1 tokens before verb: 10066 (72.35%)
Saved distance distribution to data/project_outputs/object_verb_distance_distribution.csv
Saved POS word count stats to data/project_outputs/pos_wordcount_stats.csv
           mean  median  max  min  count
CCONJ  1.993268     2.0    8    1  16192
ADV    1.919793     2.0    7    1  41555
SCONJ  2.394573     2.0    7    1  14925
DET    1.000000     1.0    1    1   4286
NOUN   1.887334     2.0    8    1  88820
VERB   2.835391     3.0    7    2  55805
ADJ    1.961284     2.0    5    1  12088
PUNCT  1.000000     1.0    1    1  33005
AUX    1.000000     1.0    1    1   6901
PRON   1.982317     2.0    5    1   6786
PROPN  1.638292     2.0    5    1  10232
NUM    1.460066     1.0    5    1   3919
INTJ   1.145833     1.0    3    1     48
PART   2.970085     3.0    6    1    234
X      1.005495     1.0    2    1    546
ADP    1.032226 

## Task 1
### Task 1.1

### Task 1.2

## Task 2
### Top 20 Verbs

### High Frequency Verbs