In [1]:
import stanza
import spacy
import pandas as pd
import numpy as np
import re
import string

In [2]:
# Dutch/English sentence pair used as the running example for the tree comparison.
sent_en = "sometimes she asks me why I used to call her father Harold"
sent_nl = "soms vraagt ze me waarom ik haar vader Harold noemde"

In [3]:
# Load the English and Dutch spaCy pipelines, in that order.
nlp_en, nlp_nl = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "nl_core_news_sm")
)

In [12]:
# Run each sentence through the pipeline of its own language.
doc_en, doc_nl = nlp_en(sent_en), nlp_nl(sent_nl)

In [18]:
def build_ud_tree(doc):
    """Print one tab-separated row per token and return its dependency table.

    Each row shows the token text, its dependency label, the head's text,
    the head's part-of-speech tag and the list of the token's children.

    Parameters
    ----------
    doc : iterable of tokens
        Every token must expose ``text``, ``dep_``, ``head`` (with ``text``
        and ``pos_``) and ``children``.

    Returns
    -------
    dict
        Maps ``token.text`` to ``[token.dep_, list_of_children]``.

    Notes
    -----
    The dict is keyed by the token's surface text, so when the same word
    occurs more than once in ``doc`` the later occurrence overwrites the
    earlier entry.
    """
    tree = dict()
    for token in doc:
        # Materialise the children once and reuse the list for both the
        # printed row and the stored entry.
        children = list(token.children)
        print(token.text, token.dep_, token.head.text, token.head.pos_,
              children, sep="\t")
        tree[token.text] = [token.dep_, children]
    return tree


print("English:")
ud_tree_en = build_ud_tree(doc_en)

print("\nDutch:")
ud_tree_nl = build_ud_tree(doc_nl)

English:
sometimes	advmod	asks	VERB	[]
she	nsubj	asks	VERB	[]
asks	ROOT	asks	VERB	[sometimes, she, me, used]
me	dobj	asks	VERB	[]
why	advmod	used	VERB	[]
I	nsubj	used	VERB	[]
used	ccomp	asks	VERB	[why, I, call]
to	aux	call	VERB	[]
call	xcomp	used	VERB	[to, father, Harold]
her	poss	father	NOUN	[]
father	dobj	call	VERB	[her]
Harold	oprd	call	VERB	[]

Dutch:
soms	advmod	vraagt	VERB	[]
vraagt	ROOT	vraagt	VERB	[soms, ze, me, noemde]
ze	nsubj	vraagt	VERB	[]
me	iobj	vraagt	VERB	[]
waarom	advmod	noemde	VERB	[]
ik	nsubj	noemde	VERB	[]
haar	nmod:poss	vader	NOUN	[]
vader	obj	noemde	VERB	[haar, Harold]
Harold	appos	vader	NOUN	[]
noemde	ccomp	vraagt	VERB	[waarom, ik, vader]


In [19]:
# Dump both dependency tables; sep="\n" puts each dict on the line after its heading.
print("\nEnglish UD tree:", ud_tree_en, sep="\n")

print("\nDutch UD tree:", ud_tree_nl, sep="\n")


English UD tree:
{'sometimes': ['advmod', []], 'she': ['nsubj', []], 'asks': ['ROOT', [sometimes, she, me, used]], 'me': ['dobj', []], 'why': ['advmod', []], 'I': ['nsubj', []], 'used': ['ccomp', [why, I, call]], 'to': ['aux', []], 'call': ['xcomp', [to, father, Harold]], 'her': ['poss', []], 'father': ['dobj', [her]], 'Harold': ['oprd', []]}

Dutch UD tree:
{'soms': ['advmod', []], 'vraagt': ['ROOT', [soms, ze, me, noemde]], 'ze': ['nsubj', []], 'me': ['iobj', []], 'waarom': ['advmod', []], 'ik': ['nsubj', []], 'haar': ['nmod:poss', []], 'vader': ['obj', [haar, Harold]], 'Harold': ['appos', []], 'noemde': ['ccomp', [waarom, ik, vader]]}


In [24]:
def find_root(ud_tree):
    """Return the key of the first entry in ``ud_tree`` whose label is ``"ROOT"``.

    Parameters
    ----------
    ud_tree : dict
        Maps a word to a sequence whose first element is its dependency label.

    Returns
    -------
    The first key (in dict iteration order) labelled ``"ROOT"``.

    Raises
    ------
    ValueError
        If no entry carries the ``"ROOT"`` label.
    """
    for word, entry in ud_tree.items():
        if entry[0] == "ROOT":
            return word
    raise ValueError("no ROOT entry found in dependency tree")


root_en = find_root(ud_tree_en)
root_nl = find_root(ud_tree_nl)

print("\nEnglish root:", root_en)
print("Dutch root:", root_nl)


English root: asks
Dutch root: vraagt


In [80]:
def _normalize_dep(label):
    """Collapse a dependency label to the coarse form used for comparison."""
    # Direct and indirect objects are treated as the same relation.
    if label == "iobj" or label == "dobj":
        label = "obj"
    # Keep only the part after the last colon, e.g. "nmod:poss" -> "poss".
    return label.split(":")[-1]


def check_node(root_en, root_nl, parent_en=None, parent_nl=None,
               tree_en=None, tree_nl=None):
    """Recursively collect word pairs whose dependency labels agree.

    The pair ``(root_en, root_nl)`` is recorded (and a ``Match:`` line is
    printed) when the normalised labels of the two words are equal. The
    function then recurses into every combination of an English child with a
    Dutch child. This happens whether or not the current pair matched, so a
    word may appear in more than one returned pair.

    Parameters
    ----------
    root_en, root_nl : str
        Keys into ``tree_en`` / ``tree_nl`` for the nodes to compare.
    parent_en, parent_nl : str, optional
        Keys of the parent nodes. They are forwarded on recursion but not
        otherwise read by this function.
    tree_en, tree_nl : dict, optional
        Dependency tables mapping a word to ``[label, children]``, where each
        child exposes a ``text`` attribute. When omitted, the module-level
        ``ud_tree_en`` / ``ud_tree_nl`` are used.

    Returns
    -------
    list of tuple
        ``(english_word, dutch_word)`` pairs in the order they were found.
    """
    # Fall back to the notebook-level tables so two-argument calls keep working.
    if tree_en is None:
        tree_en = ud_tree_en
    if tree_nl is None:
        tree_nl = ud_tree_nl

    matches = list()

    dep_en, children_en = tree_en[root_en]
    dep_nl, children_nl = tree_nl[root_nl]

    dep_en = _normalize_dep(dep_en)
    dep_nl = _normalize_dep(dep_nl)

    if dep_en == dep_nl:
        matches.append((root_en, root_nl))
        print("Match:", root_en, root_nl, "for pos", dep_en)

    # Compare every English child against every Dutch child.
    for child_en in children_en:
        for child_nl in children_nl:
            matches.extend(
                check_node(child_en.text, child_nl.text,
                           parent_en=root_en, parent_nl=root_nl,
                           tree_en=tree_en, tree_nl=tree_nl)
            )

    return matches

In [81]:
# Walk both trees from their roots; check_node prints each match as it is found.
matches = check_node(root_en, root_nl)
print("\nMatches:", matches, sep="\n")

# Split the (english, dutch) pairs into one list per language.
matched_en = [en_word for en_word, _ in matches]
matched_nl = [nl_word for _, nl_word in matches]

# Report the tokens of each sentence whose text never appeared in a match.
print(
    "Unmatched English:",
    [token.text for token in doc_en if token.text not in matched_en],
    sep="\n",
)

print(
    "Unmatched Dutch:",
    [token.text for token in doc_nl if token.text not in matched_nl],
    sep="\n",
)

Match: asks vraagt for pos ROOT
Match: sometimes soms for pos advmod
Match: she ze for pos nsubj
Match: me me for pos obj
Match: used noemde for pos ccomp
Match: why waarom for pos advmod
Match: I ik for pos nsubj

Matches:
[('asks', 'vraagt'), ('sometimes', 'soms'), ('she', 'ze'), ('me', 'me'), ('used', 'noemde'), ('why', 'waarom'), ('I', 'ik')]
Unmatched English:
['to', 'call', 'her', 'father', 'Harold']
Unmatched Dutch:
['haar', 'vader', 'Harold']
