In [102]:
from collections import Counter, defaultdict, namedtuple
import glob
import pandas as pd
import pickle
from conllu import parse_tree, parse
import math

# Dependency relations treated as core arguments when summarizing the
# dependents of each verb.
CORE_TAGS = ["nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp"]

# Get all verbs and their dependents from UD 2.5, processed with
# Richard Futrell's cliqs tool. Only paths that contain "UD_" are kept.
fns = [
    path
    for path in glob.glob("../UD-2.5/ud-treebanks-v2.5/*/all.c*")
    if "UD_" in path
]

In [111]:
def get_info(sent):
    """Collect every VERB node of one sentence together with its direct dependents.

    Parameters
    ----------
    sent : str
        Text of a single sentence; passed unchanged to ``parse_tree``.

    Returns
    -------
    list of dict or None
        ``None`` when ``parse_tree`` yields no tree for ``sent``.
        Otherwise one dict per node whose ``upostag`` is ``"VERB"``, in
        breadth-first order starting at the root of the first tree, with keys

        * ``pos`` / ``deprel`` / ``feats`` -- copied from the node's token,
        * ``token`` -- the node's token itself,
        * ``curargs`` -- one ``{"pos", "dep", "deps"}`` dict per direct child,
          where ``deps`` is the "-"-joined deprels of that child's own
          children, ignoring ``punct``.
    """
    trees = parse_tree(sent)
    # An explicit emptiness check replaces the former bare ``except`` around
    # ``p[0]``, which would also have hidden unrelated errors.
    if not trees:
        return None

    verbs = []
    queue = [trees[0]]  # only the first tree is traversed
    head = 0  # read cursor; avoids the O(n) cost of list.pop(0)
    while head < len(queue):
        cur = queue[head]
        head += 1
        queue.extend(cur.children)

        if cur.token['upostag'] != "VERB":
            continue

        curargs = [
            {"pos": child.token['upostag'],
             "dep": child.token["deprel"],
             "deps": "-".join(grandchild.token["deprel"]
                              for grandchild in child.children
                              if grandchild.token["deprel"] != "punct")}
            for child in cur.children
        ]
        verbs.append({"pos": cur.token["upostag"],
                      "deprel": cur.token["deprel"],
                      "curargs": curargs,
                      "token": cur.token,
                      "feats": cur.token["feats"]})
    return verbs

def classify_subj(arg):
    """Label a subject argument by how it is modified.

    ``arg`` is a dict with a ``"pos"`` string and a ``"deps"`` string
    (the "-"-joined deprels of the argument's own children).

    Returns ``"comp"`` if ``deps`` contains ``"csubj"``; the bare POS if
    ``deps`` is empty; ``POS + "-det"`` if ``deps`` is exactly ``"det"``;
    otherwise ``POS + "-modified"``.
    """
    # NOTE(review): this inspects the argument's dependents ("deps"), not its
    # own relation ("dep") -- confirm that is the intended test for csubj.
    dependents = arg["deps"]
    if "csubj" in dependents:
        return "comp"
    if dependents == "":
        return arg["pos"]
    suffix = "-det" if dependents == "det" else "-modified"
    return arg["pos"] + suffix
    
def classify_obj(arg):
    """Label an object argument by how it is modified.

    ``arg`` is a dict with ``"pos"``, ``"dep"`` (the argument's own relation)
    and ``"deps"`` (the "-"-joined deprels of the argument's children).

    Returns ``"comp"`` if ``deps`` contains ``"xcomp"`` or ``dep`` contains
    ``"ccomp"``; the bare POS if ``deps`` is empty; ``POS + "-det"`` if
    ``deps`` is exactly ``"det"``; otherwise ``POS + "-modified"``.
    """
    # NOTE(review): xcomp is looked up in the dependents but ccomp in the
    # argument's own relation -- confirm the asymmetry is intended.
    is_clausal = "xcomp" in arg["deps"] or "ccomp" in arg["dep"]
    if is_clausal:
        return "comp"
    dependents = arg["deps"]
    if dependents == "":
        return arg["pos"]
    suffix = "-det" if dependents == "det" else "-modified"
    return arg["pos"] + suffix

def classify_verb_arg(a, lang):
    """Summarize one verb record as a flat dict suitable for a DataFrame row.

    Parameters
    ----------
    a : dict
        Verb record with keys ``"deprel"``, ``"pos"``, ``"token"`` (which has
        a ``"feats"`` entry) and ``"curargs"`` (list of dicts with a ``"dep"``
        string each).
    lang : str
        Label stored under ``"lang"``.

    Returns
    -------
    dict
        Keys, in order: ``lang``, ``deprel``, ``pos``, ``feats``,
        ``has_subj``, ``has_obj``, ``has_comp``, ``subj_type``, ``obj_type``.
        Matching is by substring, so ``"subj"`` also selects e.g. ``csubj``
        and ``"obj"`` also selects ``iobj``. The ``*_type`` values are the
        "_"-joined classifications of the selected arguments.
    """
    args = a["curargs"]
    subjects = [arg for arg in args if "subj" in arg["dep"]]
    objects = [arg for arg in args if "obj" in arg["dep"]]
    has_comp = any("ccomp" in arg["dep"] or "xcomp" in arg["dep"] for arg in args)

    return {
        "lang": lang,
        "deprel": a["deprel"],
        "pos": a["pos"],
        "feats": a["token"]["feats"],
        "has_subj": bool(subjects),
        "has_obj": bool(objects),
        "has_comp": has_comp,
        "subj_type": "_".join(classify_subj(arg) for arg in subjects),
        "obj_type": "_".join(classify_obj(arg) for arg in objects),
    }

In [121]:
# Process all sentences from all corpora.
# `langs` maps each treebank directory name (the path component just above
# the file) to a list with one get_info() result per blank-line-separated
# chunk of the file; entries are None where get_info() found no tree.
langs = {}
for fn in fns:
    # Read with an explicit UTF-8 encoding and close the handle right away;
    # the previous bare open(fn).read() leaked the file object and depended
    # on the platform's default encoding.
    with open(fn, encoding="utf-8") as conllu_file:
        raw_sentences = conllu_file.read().split("\n\n")
    langs[fn.split("/")[-2]] = [get_info(sent) for sent in raw_sentences]

In [None]:
# Flatten the per-sentence verb records into one row per verb, grouped by
# language, then stack the per-language frames into a single DataFrame.
lang_d = defaultdict(list)
for lang, sentences in langs.items():
    for sent in sentences:
        if sent is None:  # chunk for which get_info() found no tree
            continue
        for verb in sent:
            lang_d[lang].append(classify_verb_arg(verb, lang))
df = pd.concat([pd.DataFrame(rows) for rows in lang_d.values()])

In [122]:
def get_info_lang(lang, langs):
    """Build a one-row-per-verb DataFrame for a single language.

    Parameters
    ----------
    lang : str
        Key into ``langs``; also written to the ``lang`` column.
    langs : dict
        Maps language keys to lists of sentences, where each sentence is
        either ``None`` (skipped) or a list of verb dicts with ``"deprel"``,
        ``"feats"`` and ``"curargs"`` entries.

    Returns
    -------
    pandas.DataFrame
        Columns ``deprel`` and ``feats`` plus one ``<dep>_<k>`` column per
        non-``punct`` argument slot, holding ``(pos, deps)`` tuples; ``k``
        counts repeated relations on the same verb from 0. A constant
        ``lang`` column is appended last.
    """
    rows = []
    for sentence in langs[lang]:
        if sentence is None:
            continue
        for verb in sentence:
            row = {"deprel": verb["deprel"], "feats": verb["feats"]}
            seen = Counter()  # how many times each relation occurred so far
            for arg in verb["curargs"]:
                relation = arg["dep"]
                if relation == "punct":
                    continue
                row[relation + "_" + str(seen[relation])] = (arg['pos'], arg['deps'])
                seen[relation] += 1
            rows.append(row)
    df = pd.DataFrame(rows)
    df["lang"] = lang
    return df

In [123]:
# One per-verb DataFrame per language, in the iteration order of `langs`.
l = [get_info_lang(lang, langs) for lang in langs]

In [119]:
def get_noun_type(tup):
    """Map a ``(pos, deps)`` argument tuple to a type code.

    ``tup[0]`` is the POS tag and ``tup[1]`` the "-"-joined deprels of the
    argument's dependents. A float NaN stands for a missing argument.

    Returns
    -------
    int or str
        * ``0`` -- NaN (argument absent)
        * ``5`` -- any dependents other than none or a lone ``det``
        * ``4`` -- NOUN with a lone ``det``
        * ``3`` -- bare NOUN
        * ``2`` -- bare PROPN
        * ``1`` -- PRON, bare or with a lone ``det``
        * the POS string itself for any other bare POS
        * ``POS + "-mod"`` for any other POS with a lone ``det``
    """
    if isinstance(tup, float) and math.isnan(tup):
        return 0

    dependents = tup[1]
    pos = tup[0]

    if dependents not in ("", "det"):
        return 5

    if dependents == "det":
        if pos == "NOUN":
            return 4  # tup[0] + "_" + tup[1]
        if pos == "PRON":
            return 1
        return pos + "-mod"

    # No dependents at all: known nominal tags get a code, others pass through.
    bare_codes = {"PROPN": 2, "PRON": 1, "NOUN": 3}
    return bare_codes.get(pos, pos)



In [126]:
# Recode every per-language frame into a numeric matrix and write it to
# "matrices/<lang>". `x` and `y` are kept as the loop names because later
# cells display them.
for x in l:
    y = x.copy()
    for col in y.columns:
        if "nsubj" in col or "obj" in col or "obl" in col:
            # Columns whose name contains nsubj/obj/obl: noun-type code.
            y[col] = [get_noun_type(tup) for tup in y[col]]
        elif col not in ["lang", "deprel", "feats"]:
            # Any other dependent slot: 1 if filled, 0 if the cell is a float
            # (i.e. the NaN pandas stores for a missing value).
            y[col] = [int(not isinstance(tup, float)) for tup in y[col]]
    if y.empty:
        # No verbs for this language: there is no "lang" value to name the
        # file after, so skip instead of raising IndexError.
        continue
    # Write once per language, after all columns are recoded. This call used
    # to sit inside the column loop, rewriting the file for every column.
    y.to_csv("matrices/" + y["lang"].iloc[0])

In [94]:
# Display `x`. This relies on `x` already being bound in the kernel
# namespace; in this notebook it is the loop variable of the `for x in l` cell.
x

Unnamed: 0,nsubj_0,aux_0,obl_0,obj_0,conj_0,conj_1,parataxis_0,cc_0,advmod_0,advcl_0,...,fixed_0,fixed_1,compound_0,aux_2,advmod_2,compound:svc_1,cc_1,nmod_0,dep_0,lang
0,"(PRON, )","(AUX, )","(NOUN, )",,,,,,,,...,,,,,,,,,,UD_Armenian-ArmTDP
1,"(NOUN, )",,,"(NOUN, )","(VERB, cc-obj)","(NOUN, mark-orphan)","(VERB, nsubj-aux-obj-advmod-advcl)",,,,...,,,,,,,,,,UD_Armenian-ArmTDP
2,,,,"(NOUN, )",,,,"(CCONJ, )",,,...,,,,,,,,,,UD_Armenian-ArmTDP
3,"(PRON, )","(AUX, )",,"(NOUN, )",,,,,"(ADV, )","(VERB, obj-obl)",...,,,,,,,,,,UD_Armenian-ArmTDP
4,,,"(NOUN, nmod:poss-amod)","(PRON, )",,,,,,,...,,,,,,,,,,UD_Armenian-ArmTDP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6744,"(NOUN, nmod:poss-amod)","(AUX, )","(NOUN, nmod:poss)",,"(PROPN, cc-amod-orphan)",,,,,,...,,,,,,,,,,UD_Armenian-ArmTDP
6745,"(NOUN, nmod:poss-amod)","(AUX, )","(NOUN, nmod:poss)",,"(PROPN, cc-amod-orphan)",,,,,,...,,,,,,,,,,UD_Armenian-ArmTDP
6746,"(NOUN, nmod:poss)","(AUX, )","(NOUN, nmod:poss)",,"(PROPN, cc-amod-orphan)",,,,,,...,,,,,,,,,,UD_Armenian-ArmTDP
6747,"(NOUN, nmod:poss)","(AUX, )","(NOUN, nmod:poss)",,"(NOUN, cc-orphan)",,,,,,...,,,,,,,,,,UD_Armenian-ArmTDP


In [95]:
# Display a handful of argument columns of `y`. This relies on `y` already
# being bound in the kernel namespace; it is assigned inside the `for x in l` cell.
y[["nsubj_0", "obj_0", "iobj_0", "obl_0", "expl_0", "xcomp_0", "ccomp_0", "advmod_0"]]

Unnamed: 0,nsubj_0,obj_0,iobj_0,obl_0,expl_0,xcomp_0,ccomp_0,advmod_0
0,1,0,0,3,0,0,0,0
1,3,3,0,0,0,0,0,0
2,0,3,0,0,0,0,0,0
3,1,3,0,0,0,0,0,1
4,0,1,0,5,0,0,0,0
...,...,...,...,...,...,...,...,...
6744,5,0,0,5,0,0,0,0
6745,5,0,0,5,0,0,0,0
6746,5,0,0,5,0,0,0,0
6747,5,0,0,5,0,0,0,0


In [97]:
# List the column labels of `y` (relies on `y` left over from the
# `for x in l` cell).
y.columns

Index(['nsubj_0', 'aux_0', 'obl_0', 'obj_0', 'conj_0', 'conj_1', 'parataxis_0',
       'cc_0', 'advmod_0', 'advcl_0', 'xcomp_0', 'mark_0', 'discourse_0',
       'nsubj:caus_0', 'iobj:agent_0', 'nsubj:pass_0', 'aux:ex_0',
       'dislocated_0', 'parataxis_1', 'nmod:poss_0', 'compound:lvc_0',
       'appos_0', 'conj_2', 'conj_3', 'conj_4', 'conj_5', 'obl_1', 'case_0',
       'nsubj_1', 'ccomp_0', 'conj_6', 'iobj_0', 'parataxis_2', 'advmod_1',
       'amod_0', 'orphan_0', 'obl_2', 'csubj_0', 'obl:agent_0',
       'advmod:emph_0', 'csubj:pass_0', 'aux_1', 'cop_0', 'acl:relcl_0',
       'vocative_0', 'discourse_1', 'compound:lvc_1', 'obj_1', 'xcomp_1',
       'obl_3', 'compound:svc_0', 'conj_7', 'conj_8', 'conj_9', 'conj_10',
       'compound:redup_0', 'advcl_1', 'advcl_2', 'det_0', 'acl_0',
       'nmod:npmod_0', 'expl_0', 'discourse_2', 'mark_1', 'det:poss_0',
       'aux:caus_0', 'fixed_0', 'fixed_1', 'compound_0', 'aux_2', 'advmod_2',
       'compound:svc_1', 'cc_1', 'nmod_0', 'dep_0', 

In [98]:
# TODO: add root-verb information
# TODO: add all features and person-marking information to the verb column
