In [1]:
from collections import Counter, defaultdict, namedtuple
import glob
import pandas as pd
import pickle
from conllu import parse_tree, parse
import math

# summarize verb_dep args
CORE_TAGS = ["nsubj",
"obj",
"iobj",
"csubj",
"ccomp",
"xcomp"]

# get all verbs and dependents
# from UD 2.5, with Richard Futrell cliqs tool for
# processing
fns = glob.glob("../UD-2.5/ud-treebanks-v2.5/*/all.c*")
fns = [i for i in fns if "UD_" in i]

In [4]:
def get_info(sent):
    p = parse_tree(sent)
    try:
        stack = [p[0]]
    except:
        return None
    a = []
    while len(stack) > 0:
        cur = stack.pop(0)
        curargs = []
        for i in cur.children:
            if (cur.token['upostag'] in ["VERB"]):            
                deps = ""
                deps = "-".join([j.token["deprel"] for j in 
                                 i.children if j.token["deprel"] != "punct"])
                curargs += [{"pos": i.token['upostag'],
                             "dep": i.token["deprel"],
                             "deps": deps,
                              "feats": cur.token["feats"]}]
            stack += [i]
        if cur.token['upostag'] in ["VERB"]:
            a += [{"pos":cur.token["upostag"],
               "deprel":cur.token["deprel"],
               "curargs": curargs,
                  "token": cur.token,
                  "feats": cur.token["feats"]}]
    return a

def classify_subj(arg):
    if "csubj" in arg["deps"]:
        return "comp"
    if arg["deps"] == "":
        return arg["pos"]
    elif arg["deps"] == "det":
        return arg["pos"] + "-det"
    else:
        return arg["pos"] + "-modified"
    
def classify_obj(arg):
    if "xcomp" in arg["deps"] or "ccomp" in arg["dep"]:
        return "comp"
    if arg["deps"] == "":
        return arg["pos"]
    elif arg["deps"] == "det":
        return arg["pos"] + "-det"
    else:
        return arg["pos"] + "-modified"

def classify_verb_arg(a, lang):
    d = {"lang": lang,
         "deprel": a["deprel"],
         "pos": a["pos"],
         "feats": a["token"]["feats"],
         "has_subj": any(["subj" in i["dep"] for i in a["curargs"]]),
         "has_obj": any(["obj" in i["dep"] for i in a["curargs"]]),
         "has_comp": any(["ccomp" in i["dep"] or "xcomp" in i["dep"] for i in a["curargs"]]),
         "subj_type": "_".join([classify_subj(i) for i in a["curargs"] if "subj" in i["dep"]]),
         "obj_type": "_".join([classify_obj(i) for i in a["curargs"] if "obj" in i["dep"]]),
         
        }
    return d

In [5]:
# process all sentences, from all corpora
langs = {fn.split("/")[-2]: [get_info(sent) for sent in 
                             open(fn).read().split("\n\n")] for fn in fns if "Czech" in fn} 

In [None]:
lang_d = defaultdict(list)
for lang in langs:
    for sent in langs[lang]:
        if sent is not None:
            for verb in sent:
                lang_d[lang] += [classify_verb_arg(verb, lang)]
df = pd.concat([pd.DataFrame(lang_d[i]) for i in lang_d])

In [6]:
def get_person_animacy(features):
    x = ["", ""]
    if features is None:
        return x
    if "Person" in features:
        x[0] = features["Person"]
    if "Animacy" in features:
        x[1] = features["Animacy"]
    return x

def get_info_lang(lang, langs):
    x = [i for i in langs[lang] if i is not None]
    l = []
    for i in x:
        for j in i:
            c = {"deprel": j["deprel"],
                "feats": j["feats"]}
            depcounter = Counter()
            for arg in j["curargs"]:
                if arg["dep"] != "punct":
                    argwrite = depcounter[arg["dep"]]
                    c[arg["dep"] + "_" + str(argwrite)] = (arg['pos'], arg['deps'], get_person_animacy(arg["feats"]))
                    depcounter[arg["dep"]] += 1
            l += [c]
    df = pd.DataFrame(l)
    df["lang"] = lang
    return df

In [7]:
l = [get_info_lang(lang, langs) for lang in list(langs.keys())]

In [8]:
def get_noun_type(tup):
    if isinstance(tup, float):
        if math.isnan(tup):
            return None
    if tup[1] != "" and tup[1] != "det":
        return 5
    elif tup[1] == "det" and tup[0] == "NOUN":
        return 4 #tup[0] + "_" + tup[1]
    elif tup[0] == "PROPN" and tup[1] == "":
        return 2
    elif tup[0] == "PRON":
        return 1
    elif tup[0] == "NOUN" and tup[1] == "":
        return 3
    return None



In [9]:
for x in l:
    x.to_pickle("pickled_dfs/" + str(x["lang"][0]) + ".pkl")
    y = pd.DataFrame(x).copy()
    for col in y.columns:
        if "nsubj" in col or "obj" in col or "obl" in col:
            y[col] = [get_noun_type(tup) for tup in list(y[col])]
        elif col not in ["lang", "deprel", "feats"]:
            y[col] = [int(not isinstance(tup, float)) for tup in list(y[col])]
        y.to_csv("matrices/" + list(y["lang"])[0])

In [164]:
a = list(langs.keys())[0]
langs[a][0]

[{'pos': 'VERB',
  'deprel': 'acl:relcl',
  'curargs': [{'pos': 'PUNCT',
    'dep': 'punct',
    'deps': '',
    'feats': OrderedDict([('Aspect', 'Imp'),
                 ('Subcat', 'Intr'),
                 ('VerbForm', 'Part')])},
   {'pos': 'PRON',
    'dep': 'nsubj',
    'deps': '',
    'feats': OrderedDict([('Aspect', 'Imp'),
                 ('Subcat', 'Intr'),
                 ('VerbForm', 'Part')])},
   {'pos': 'AUX',
    'dep': 'aux',
    'deps': '',
    'feats': OrderedDict([('Aspect', 'Imp'),
                 ('Subcat', 'Intr'),
                 ('VerbForm', 'Part')])},
   {'pos': 'NOUN',
    'dep': 'obl',
    'deps': '',
    'feats': OrderedDict([('Aspect', 'Imp'),
                 ('Subcat', 'Intr'),
                 ('VerbForm', 'Part')])}],
  'token': Token([('id', 8),
         ('form', 'ավարտվում'),
         ('lemma', 'ավարտվել'),
         ('upos', 'VERB'),
         ('xpos', None),
         ('feats',
          OrderedDict([('Aspect', 'Imp'),
                       ('Sub

In [159]:
"Person " in None

TypeError: argument of type 'NoneType' is not iterable

In [158]:
len(list(langs.keys()))

157

In [11]:
l[0]

Unnamed: 0,deprel,feats,xcomp_0,advmod_0,obj_0,conj_0,cc_0,obl_0,obl_1,mark_0,...,appos_2,cop_0,obj_2,flat:foreign_2,conj_7,conj_8,conj_9,nmod_2,conj_10,lang
0,root,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...","(VERB, advmod, [2, ])",,,,,,,,...,,,,,,,,,,UD_Czech-PDT
1,xcomp,"{'Polarity': 'Pos', 'VerbForm': 'Inf'}",,"(ADV, , [, ])",,,,,,,...,,,,,,,,,,UD_Czech-PDT
2,root,"{'Mood': 'Imp', 'Number': 'Plur', 'Person': '2...",,,"(NOUN, , [2, ])","(VERB, cc, [2, ])",,,,,...,,,,,,,,,,UD_Czech-PDT
3,conj,"{'Aspect': 'Perf', 'Mood': 'Imp', 'Number': 'P...",,,,,"(CCONJ, , [2, ])",,,,...,,,,,,,,,,UD_Czech-PDT
4,xcomp,"{'Polarity': 'Pos', 'VerbForm': 'Inf'}",,,"(NOUN, det-amod, [, ])",,,,,,...,,,,,,,,,,UD_Czech-PDT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135474,root,"{'Aspect': 'Imp', 'Mood': 'Ind', 'Number': 'Pl...",,"(ADV, , [3, ])",,,,,,,...,,,,,,,,,,UD_Czech-PDT
135475,ccomp,"{'Gender': 'Fem,Neut', 'Number': 'Plur,Sing', ...",,"(ADV, obl, [, ])",,"(VERB, cc-nsubj-aux-obj, [, ])",,"(NOUN, case-nmod-nmod, [, ])",,"(SCONJ, , [, ])",...,,,,,,,,,,UD_Czech-PDT
135476,conj,"{'Animacy': 'Anim', 'Gender': 'Masc', 'Number'...",,,"(ADJ, , [, Anim])",,"(CCONJ, , [, Anim])",,,,...,,,,,,,,,,UD_Czech-PDT
135477,csubj,"{'Polarity': 'Pos', 'VerbForm': 'Inf'}",,,,,,,,,...,,,,,,,,,,UD_Czech-PDT


In [167]:
l[100]

Unnamed: 0,deprel,feats,advcl_0,iobj_0,obl_0,nsubj_0,mark_0,cc_0,det_0,det:poss_0,...,advmod_3,advmod_4,conj_2,parataxis_3,parataxis_4,discourse:emo_1,nsubj_1,discourse_3,cc_1,lang
0,root,"{'Mood': 'Ind', 'Number': 'Sing', 'Person': '3...","(VERB, mark-obl, [3, ])","(PRON, , [3, ])","(NOUN, case, [3, ])","(SYM, conj, [3, ])",,,,,...,,,,,,,,,,UD_Italian-TWITTIRO
1,advcl,"{'Mood': 'Ind', 'Number': 'Sing', 'Person': '1...",,,"(NOUN, case-det-nmod, [1, ])",,"(SCONJ, , [1, ])",,,,...,,,,,,,,,,UD_Italian-TWITTIRO
2,conj,"{'Mood': 'Ind', 'Number': 'Plur', 'Person': '3...",,"(PRON, , [3, ])",,"(NOUN, det-amod, [3, ])",,"(CCONJ, , [3, ])","(DET, , [3, ])","(PRON, , [3, ])",...,,,,,,,,,,UD_Italian-TWITTIRO
3,parataxis,"{'Mood': 'Imp', 'Number': 'Plur', 'Person': '2...","(VERB, mark, [2, ])",,,,,,,,...,,,,,,,,,,UD_Italian-TWITTIRO
4,advcl,"{'Mood': 'Ind', 'Number': 'Plur', 'Person': '2...",,,,,"(SCONJ, , [2, ])",,,,...,,,,,,,,,,UD_Italian-TWITTIRO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2841,acl:relcl,"{'Mood': 'Ind', 'Number': 'Sing', 'Person': '3...",,,,,,,,,...,,,,,,,,,,UD_Italian-TWITTIRO
2842,acl:relcl,"{'Mood': 'Ind', 'Number': 'Sing', 'Person': '3...",,,,,,,,,...,,,,,,,,,,UD_Italian-TWITTIRO
2843,root,"{'Mood': 'Ind', 'Number': 'Sing', 'Person': '1...",,,"(NOUN, case, [1, ])",,,,,,...,,,,,,,,,,UD_Italian-TWITTIRO
2844,parataxis,{'VerbForm': 'Ger'},,,,,,,,,...,,,,,,,,,,UD_Italian-TWITTIRO


In [168]:
len(l)

157

SyntaxError: invalid syntax (<ipython-input-3-0f44b287cd63>, line 1)