In [102]:
import glob
import math
import pickle
from collections import Counter, defaultdict, deque, namedtuple

import pandas as pd
from conllu import parse_tree, parse

# summarize verb_dep args: dependency relation labels for core arguments
CORE_TAGS = [
    "nsubj",
    "obj",
    "iobj",
    "csubj",
    "ccomp",
    "xcomp",
]

# get all verbs and dependents
# from UD 2.5, with Richard Futrell's cliqs tool for processing.
# Only paths containing "UD_" are kept.
# glob returns matches in a file-system-dependent order, so the paths are
# sorted to make the treebank order (and everything derived from it, such as
# "the first language") identical on every run.
fns = sorted(
    fn
    for fn in glob.glob("../UD-2.5/ud-treebanks-v2.5/*/all.c*")
    if "UD_" in fn
)

In [141]:
def get_info(sent):
    """Collect every VERB of one sentence together with its direct dependents.

    The text is parsed with ``parse_tree`` and the first resulting tree is
    walked breadth-first.

    Args:
        sent: Text of one sentence chunk, passed unchanged to ``parse_tree``.

    Returns:
        ``None`` when ``parse_tree`` yields no tree. Otherwise a list
        (possibly empty) with one dict per node whose ``upostag`` is
        ``"VERB"``, in breadth-first order:

        * ``"pos"``, ``"deprel"``, ``"feats"``: the verb's ``upostag``,
          ``deprel`` and ``feats`` token fields.
        * ``"token"``: the verb's full token.
        * ``"curargs"``: one dict per direct child of the verb, holding the
          child's ``"pos"`` (``upostag``), ``"dep"`` (``deprel``),
          ``"deps"`` (the ``deprel`` values of the child's own non-``punct``
          children joined with ``"-"``) and ``"feats"``.
    """
    trees = parse_tree(sent)
    if not trees:
        # Nothing to traverse (e.g. an empty chunk of text).
        return None

    verbs = []
    queue = deque([trees[0]])
    while queue:
        cur = queue.popleft()
        is_verb = cur.token["upostag"] == "VERB"
        curargs = []
        for child in cur.children:
            if is_verb:
                deps = "-".join(
                    grandchild.token["deprel"]
                    for grandchild in child.children
                    if grandchild.token["deprel"] != "punct"
                )
                curargs.append({
                    "pos": child.token["upostag"],
                    "dep": child.token["deprel"],
                    "deps": deps,
                    # The argument's own features, so that per-argument
                    # lookups (e.g. Person / Animacy) describe the argument
                    # and not the governing verb, whose features are already
                    # stored once at the verb level below.
                    "feats": child.token["feats"],
                })
            # Every child is visited later, whether or not its head is a verb.
            queue.append(child)
        if is_verb:
            verbs.append({
                "pos": cur.token["upostag"],
                "deprel": cur.token["deprel"],
                "curargs": curargs,
                "token": cur.token,
                "feats": cur.token["feats"],
            })
    return verbs

def classify_subj(arg):
    """Label a subject argument by its POS tag and how it is modified.

    Args:
        arg: Argument dict with a ``"pos"`` entry and a ``"deps"`` string
            (the "-"-joined relations of the argument's own children).

    Returns:
        ``"comp"`` if ``"csubj"`` occurs in ``arg["deps"]``; otherwise the
        POS tag alone when ``deps`` is empty, the tag plus ``"-det"`` when
        ``deps`` is exactly ``"det"``, and the tag plus ``"-modified"`` for
        anything else.
    """
    child_rels = arg["deps"]
    # NOTE(review): this looks at the children's relations ("deps"), not at
    # the argument's own relation ("dep") -- confirm that this is intended.
    if "csubj" in child_rels:
        return "comp"

    tag = arg["pos"]
    if child_rels == "":
        return tag
    suffix = "-det" if child_rels == "det" else "-modified"
    return tag + suffix
    
def classify_obj(arg):
    """Label an object argument by its POS tag and how it is modified.

    Args:
        arg: Argument dict with ``"pos"``, ``"dep"`` (the argument's own
            relation) and ``"deps"`` (the "-"-joined relations of the
            argument's own children).

    Returns:
        ``"comp"`` if ``"xcomp"`` occurs in ``arg["deps"]`` or ``"ccomp"``
        occurs in ``arg["dep"]``; otherwise the POS tag alone when ``deps``
        is empty, the tag plus ``"-det"`` when ``deps`` is exactly ``"det"``,
        and the tag plus ``"-modified"`` for anything else.
    """
    child_rels = arg["deps"]
    # NOTE(review): "xcomp" is looked up in the children's relations while
    # "ccomp" is looked up in the argument's own relation -- confirm that
    # the asymmetry is intended.
    if "xcomp" in child_rels or "ccomp" in arg["dep"]:
        return "comp"

    tag = arg["pos"]
    if child_rels == "":
        return tag
    suffix = "-det" if child_rels == "det" else "-modified"
    return tag + suffix

def classify_verb_arg(a, lang):
    """Summarize one verb record as a flat dict (one future DataFrame row).

    Args:
        a: Verb dict with ``"deprel"``, ``"pos"``, ``"token"`` (whose
            ``"feats"`` entry is read) and ``"curargs"`` (a list of argument
            dicts, each with a ``"dep"`` string).
        lang: Value stored under ``"lang"`` in the result.

    Returns:
        Dict with ``lang``, ``deprel``, ``pos`` and ``feats``, three booleans
        (``has_subj``, ``has_obj``, ``has_comp``) telling whether any
        argument's ``dep`` contains ``"subj"``, ``"obj"``, or
        ``"ccomp"``/``"xcomp"``, plus ``subj_type`` / ``obj_type``: the
        ``classify_subj`` / ``classify_obj`` labels of the matching arguments
        joined with ``"_"``.
    """
    arguments = a["curargs"]
    subjects = [arg for arg in arguments if "subj" in arg["dep"]]
    objects = [arg for arg in arguments if "obj" in arg["dep"]]
    has_comp = False
    for arg in arguments:
        if "ccomp" in arg["dep"] or "xcomp" in arg["dep"]:
            has_comp = True
            break

    summary = {}
    summary["lang"] = lang
    summary["deprel"] = a["deprel"]
    summary["pos"] = a["pos"]
    summary["feats"] = a["token"]["feats"]
    summary["has_subj"] = len(subjects) > 0
    summary["has_obj"] = len(objects) > 0
    summary["has_comp"] = has_comp
    summary["subj_type"] = "_".join(classify_subj(arg) for arg in subjects)
    summary["obj_type"] = "_".join(classify_obj(arg) for arg in objects)
    return summary

In [154]:
# process all sentences, from all corpora
def _read_sentence_chunks(path):
    """Return the blank-line-separated chunks of the text file at ``path``.

    The file is opened explicitly as UTF-8 (the encoding the CoNLL-U format
    prescribes) instead of the platform default, and it is closed as soon as
    its contents have been read.
    """
    with open(path, encoding="utf-8") as handle:
        return handle.read().split("\n\n")


# Maps each treebank directory name (second-to-last path component) to a list
# holding one get_info() result per chunk of that treebank's file.
langs = {
    fn.split("/")[-2]: [get_info(sent) for sent in _read_sentence_chunks(fn)]
    for fn in fns
}

In [None]:
# Group the per-verb summaries by treebank, then stack them into one frame.
lang_d = defaultdict(list)
for lang, sentences in langs.items():
    for sent in sentences:
        if sent is None:
            # get_info() found nothing to parse in this chunk
            continue
        for verb in sent:
            lang_d[lang].append(classify_verb_arg(verb, lang))
df = pd.concat([pd.DataFrame(records) for records in lang_d.values()])

In [160]:
def get_person_animacy(features):
    """Extract the ``Person`` and ``Animacy`` values from a feature mapping.

    Args:
        features: Mapping of feature names to values, or ``None``.

    Returns:
        A two-element list ``[person, animacy]``; a slot is ``""`` when the
        feature is absent, and both are ``""`` when ``features`` is ``None``.
    """
    if features is None:
        return ["", ""]
    person = features["Person"] if "Person" in features else ""
    animacy = features["Animacy"] if "Animacy" in features else ""
    return [person, animacy]

def get_info_lang(lang, langs):
    """Build a DataFrame with one row per verb of a single treebank.

    Args:
        lang: Key into ``langs``; also written to the ``"lang"`` column.
        langs: Mapping from treebank name to a list of per-sentence results,
            each either ``None`` (skipped) or a list of verb dicts with
            ``"deprel"``, ``"feats"`` and ``"curargs"``.

    Returns:
        DataFrame with ``deprel`` and ``feats`` columns, one column per
        non-``punct`` argument named ``<dep>_<k>`` (``k`` counts earlier
        arguments of the same verb with the same ``dep``) holding the tuple
        ``(pos, deps, get_person_animacy(feats))``, and a ``lang`` column.
    """
    rows = []
    for verbs in langs[lang]:
        if verbs is None:
            continue
        for verb in verbs:
            row = {"deprel": verb["deprel"], "feats": verb["feats"]}
            # how many arguments with each relation this verb has had so far
            seen = Counter()
            for arg in verb["curargs"]:
                rel = arg["dep"]
                if rel == "punct":
                    continue
                column = rel + "_" + str(seen[rel])
                row[column] = (arg["pos"], arg["deps"],
                               get_person_animacy(arg["feats"]))
                seen[rel] += 1
            rows.append(row)
    frame = pd.DataFrame(rows)
    frame["lang"] = lang
    return frame

In [161]:
# one per-verb DataFrame per treebank, in the key order of `langs`
l = [get_info_lang(name, langs) for name in langs]

In [162]:
def get_noun_type(tup):
    """Encode an argument cell as a noun-type code.

    Args:
        tup: Either a NaN float (missing argument) or a sequence whose first
            two items are the argument's POS tag and its "deps" string.

    Returns:
        * ``0`` -- NaN input
        * ``5`` -- "deps" is neither empty nor exactly ``"det"``
        * ``4`` -- NOUN whose "deps" is ``"det"``
        * ``2`` -- PROPN with empty "deps"
        * ``1`` -- PRON
        * ``3`` -- NOUN with empty "deps"
        * otherwise a string: the POS tag itself when "deps" is empty, else
          the tag followed by ``"-mod"``
    """
    if isinstance(tup, float) and math.isnan(tup):
        return 0

    pos, deps = tup[0], tup[1]
    if deps not in ("", "det"):
        return 5
    # From here on deps is either "" or "det".
    bare = deps == ""
    if pos == "NOUN" and not bare:
        return 4
    if pos == "PROPN" and bare:
        return 2
    if pos == "PRON":
        return 1
    if pos == "NOUN":
        # the determiner case was handled above, so this is a bare noun
        return 3
    return pos if bare else pos + "-mod"



In [None]:
# Save every per-treebank frame twice: the raw frame as a pickle and a
# numerically coded copy as CSV.
for x in l:
    if x.empty:
        # No verbs were collected for this treebank, so there is no language
        # name to derive the file names from and nothing worth writing.
        continue

    # Name the files after THIS frame's treebank; using l[0] here would make
    # every frame overwrite the pickle of the first treebank.
    lang_name = str(x["lang"].iloc[0])
    x.to_pickle("pickled_dfs/" + lang_name + ".pkl")

    y = x.copy()
    for col in y.columns:
        if "nsubj" in col or "obj" in col or "obl" in col:
            # nominal argument columns -> noun-type code
            y[col] = [get_noun_type(tup) for tup in list(y[col])]
        elif col not in ["lang", "deprel", "feats"]:
            # any other argument column -> 1 if present, 0 if missing (NaN)
            y[col] = [int(not isinstance(tup, float)) for tup in list(y[col])]
    # Write once, after all columns are recoded, rather than once per column.
    y.to_csv("matrices/" + lang_name)

In [None]:
# peek at the first sentence result stored under the first key of `langs`
a = list(langs)[0]
langs[a][0]

In [159]:
# Scratch check: a membership test against None raises TypeError, which is
# why get_person_animacy tests `features is None` before using `in`.
# The error is caught and printed so that "Run All" is not interrupted here.
try:
    "Person " in None
except TypeError as err:
    print("TypeError: " + str(err))

TypeError: argument of type 'NoneType' is not iterable

In [158]:
# number of keys (treebanks) in `langs`
len(langs)

157

In [None]:
# Ad-hoc call: build and display the per-verb frame for a single treebank.
# NOTE(review): `lang` is not assigned in this cell; it presumably holds the
# value left behind by an earlier `for lang in langs` loop -- confirm before
# relying on which treebank is shown.
get_info_lang(lang, langs)