In [1]:
import stanza
import spacy
import pandas as pd
import numpy as np
import re
import string

In [90]:
# Example sentences (one Dutch, one English) that the cells below parse and align.
sent_nl, sent_en = (
    "soms vraagt ze me waarom ik haar vader Harold noemde",
    "sometimes she asks me why I used to call her father Harold",
)

In [91]:
# Load the spaCy pipelines used for the two sentences (English first, then Dutch).
nlp_en, nlp_nl = (
    spacy.load(model_name)
    for model_name in ("en_core_web_sm", "nl_core_news_sm")
)

In [92]:
# Run each sentence through the pipeline loaded for its language.
doc_en, doc_nl = nlp_en(sent_en), nlp_nl(sent_nl)

In [103]:
def _print_and_index_tree(doc):
    """Print one row per token of ``doc`` and return its dependency lookup.

    Each printed row is tab-separated: token text, dependency label, head
    text, part-of-speech tag and the list of the token's children.

    Returns:
        dict mapping token text to ``[dependency label, children]``.

    NOTE: the keys are token texts, so a word that occurs more than once in
    the sentence keeps only the entry of its last occurrence.
    """
    tree = dict()
    for token in doc:
        children = list(token.children)
        print(token.text, token.dep_, token.head.text, token.pos_,
              children, sep="\t")
        tree[token.text] = [token.dep_, children]
    return tree


print("English:")
ud_tree_en = _print_and_index_tree(doc_en)

print("\nDutch:")
ud_tree_nl = _print_and_index_tree(doc_nl)

English:
sometimes	advmod	asks	ADV	[]
she	nsubj	asks	PRON	[]
asks	ROOT	asks	VERB	[sometimes, she, me, used]
me	dobj	asks	PRON	[]
why	advmod	used	SCONJ	[]
I	nsubj	used	PRON	[]
used	ccomp	asks	VERB	[why, I, call]
to	aux	call	PART	[]
call	xcomp	used	VERB	[to, father, Harold]
her	poss	father	PRON	[]
father	dobj	call	NOUN	[her]
Harold	oprd	call	PROPN	[]

Dutch:
soms	advmod	vraagt	ADV	[]
vraagt	ROOT	vraagt	VERB	[soms, ze, me, noemde]
ze	nsubj	vraagt	PRON	[]
me	iobj	vraagt	PRON	[]
waarom	advmod	noemde	ADV	[]
ik	nsubj	noemde	PRON	[]
haar	nmod:poss	vader	PRON	[]
vader	obj	noemde	NOUN	[haar, Harold]
Harold	appos	vader	PROPN	[]
noemde	ccomp	vraagt	VERB	[waarom, ik, vader]


In [104]:
# Dump both lookup dicts, each under its own heading.
for heading, tree in (
    ("\nEnglish UD tree:", ud_tree_en),
    ("\nDutch UD tree:", ud_tree_nl),
):
    print(heading)
    print(tree)


English UD tree:
{'sometimes': ['advmod', []], 'she': ['nsubj', []], 'asks': ['ROOT', [sometimes, she, me, used]], 'me': ['dobj', []], 'why': ['advmod', []], 'I': ['nsubj', []], 'used': ['ccomp', [why, I, call]], 'to': ['aux', []], 'call': ['xcomp', [to, father, Harold]], 'her': ['poss', []], 'father': ['dobj', [her]], 'Harold': ['oprd', []]}

Dutch UD tree:
{'soms': ['advmod', []], 'vraagt': ['ROOT', [soms, ze, me, noemde]], 'ze': ['nsubj', []], 'me': ['iobj', []], 'waarom': ['advmod', []], 'ik': ['nsubj', []], 'haar': ['nmod:poss', []], 'vader': ['obj', [haar, Harold]], 'Harold': ['appos', []], 'noemde': ['ccomp', [waarom, ik, vader]]}


In [105]:
# The root of each sentence is the first entry whose dependency label is "ROOT".
root_en = [word for word, entry in ud_tree_en.items() if entry[0] == "ROOT"][0]
root_nl = [word for word, entry in ud_tree_nl.items() if entry[0] == "ROOT"][0]

print("\nEnglish root:", root_en)
print("Dutch root:", root_nl)


English root: asks
Dutch root: vraagt


In [119]:
def _normalize_dep(dep):
    """Collapse a dependency label to the coarse form used for comparison."""
    # Treat direct and indirect objects as plain objects.
    if dep in ("iobj", "dobj"):
        dep = "obj"
    # Keep only the part after the last ":" (e.g. "nmod:poss" -> "poss").
    return dep.split(":")[-1]


def check_node(root_en, root_nl, parent_en=None, parent_nl=None,
               tree_en=None, tree_nl=None):
    """Walk two dependency trees in parallel and collect label matches.

    The pair ``(root_en, root_nl)`` is recorded (and printed) when the two
    nodes have the same normalized dependency label. Regardless of whether
    they matched, every child of ``root_en`` is then compared with every
    child of ``root_nl`` recursively, so one token can appear in more than
    one returned pair.

    Args:
        root_en: Key of the English node in ``tree_en``.
        root_nl: Key of the Dutch node in ``tree_nl``.
        parent_en: Key of the English parent; only forwarded, never read.
        parent_nl: Key of the Dutch parent; only forwarded, never read.
        tree_en: Mapping ``key -> [dependency label, children]`` where each
            child exposes a ``.text`` attribute that is itself a key of the
            mapping. Defaults to the notebook-level ``ud_tree_en``.
        tree_nl: Same structure for the Dutch side. Defaults to the
            notebook-level ``ud_tree_nl``.

    Returns:
        List of ``(english_key, dutch_key)`` tuples in visiting order.

    Raises:
        KeyError: If a node key is missing from its tree.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if tree_en is None:
        tree_en = ud_tree_en
    if tree_nl is None:
        tree_nl = ud_tree_nl

    matches = list()

    dep_en, children_en = tree_en[root_en]
    dep_nl, children_nl = tree_nl[root_nl]

    dep_en = _normalize_dep(dep_en)
    dep_nl = _normalize_dep(dep_nl)

    if dep_en == dep_nl:
        matches.append((root_en, root_nl))
        print("Match:", root_en, root_nl, "for dep", dep_en)

    # Compare all child pairs, even below a non-matching parent pair.
    for child_en in children_en:
        for child_nl in children_nl:
            matches.extend(
                check_node(child_en.text, child_nl.text,
                           parent_en=root_en, parent_nl=root_nl,
                           tree_en=tree_en, tree_nl=tree_nl)
            )

    return matches

In [120]:
# Align the two trees starting from their roots.
matches = check_node(root_en, root_nl)
print("\nMatches:")
print(matches)

# Split the matched pairs into their English and Dutch halves.
matched_en = [en_text for en_text, _ in matches]
matched_nl = [nl_text for _, nl_text in matches]

# Tokens whose text never appeared in a match are left for a second pass.
unmatched_en = list(filter(lambda tok: tok.text not in matched_en, doc_en))
unmatched_nl = list(filter(lambda tok: tok.text not in matched_nl, doc_nl))

Match: asks vraagt for dep ROOT
Match: sometimes soms for dep advmod
Match: she ze for dep nsubj
Match: me me for dep obj
Match: used noemde for dep ccomp
Match: why waarom for dep advmod
Match: I ik for dep nsubj

Matches:
[('asks', 'vraagt'), ('sometimes', 'soms'), ('she', 'ze'), ('me', 'me'), ('used', 'noemde'), ('why', 'waarom'), ('I', 'ik')]


In [121]:
# Show the leftover tokens of both languages, separated by a blank line.
for group_index, group in enumerate((unmatched_en, unmatched_nl)):
    if group_index:
        print()
    for token in group:
        print(token.text, token.dep_, token.head.text, token.pos_,
              list(token.children), sep="\t")

to	aux	call	PART	[]
call	xcomp	used	VERB	[to, father, Harold]
her	poss	father	PRON	[]
father	obj	call	NOUN	[her]
Harold	oprd	call	PROPN	[]

haar	poss	vader	PRON	[]
vader	obj	noemde	NOUN	[haar, Harold]
Harold	appos	vader	PROPN	[]


In [122]:
# Normalize the dependency labels of the leftover tokens.
# NOTE: this overwrites token.dep_ on the parsed documents themselves, so any
# cell that prints these tokens afterwards shows the normalized labels.
for token in unmatched_en + unmatched_nl:
    label = token.dep_

    # Ignore/match across direct/indirect objects
    if label in ("iobj", "dobj"):
        label = "obj"

    # Ignore/match across modifiers
    token.dep_ = label.split(":")[-1]

    print(token.text, token.dep_)

to aux
call xcomp
her poss
father obj
Harold oprd
haar poss
vader obj
Harold appos


In [123]:
# Match remaining tokens
for token_en in unmatched_en:
    for token_nl in unmatched_nl:
        if token_en.dep_ == token_nl.dep_:
            print("Match:", token_en.text, token_nl.text, "for dep", token_en.dep_)
            matches.append((token_en.text, token_nl.text))

Match: her haar for dep poss
Match: father vader for dep obj


In [124]:
# Display the combined list: pairs from the tree walk plus the leftover-token pass.
matches

[('asks', 'vraagt'),
 ('sometimes', 'soms'),
 ('she', 'ze'),
 ('me', 'me'),
 ('used', 'noemde'),
 ('why', 'waarom'),
 ('I', 'ik'),
 ('her', 'haar'),
 ('father', 'vader')]