In [205]:
import stanza
import spacy
import pandas as pd
import numpy as np
import re
import string

In [206]:
# Example sentences, one per language (Dutch, English, French, Finnish).
sent_nl, sent_en, sent_fr, sent_fi = (
    "soms vraagt ze me waarom ik haar vader Harold noemde",
    "sometimes she asks me why I used to call her father Harold",
    "parfois elle me demande pourquoi j'appelais son père Harold",
    "joskus hän kysyy minulta, miksi kutsuin hänen isäänsä Haroldiksi",
)

In [207]:
# One-time setup: the pipelines below must be installed before they can load.
# !python -m spacy download en_core_web_sm
# !python -m spacy download nl_core_news_sm
# !python -m spacy download fr_core_news_sm
# !python -m spacy download fi_core_news_sm

# spaCy package name for each language code used in this notebook.
_SPACY_MODELS = {
    "en": "en_core_web_sm",
    "nl": "nl_core_news_sm",
    "fr": "fr_core_news_sm",
    "fi": "fi_core_news_sm",
}

# Load one pipeline per language (English, Dutch, French, Finnish).
nlp_en = spacy.load(_SPACY_MODELS["en"])
nlp_nl = spacy.load(_SPACY_MODELS["nl"])
nlp_fr = spacy.load(_SPACY_MODELS["fr"])
nlp_fi = spacy.load(_SPACY_MODELS["fi"])

In [208]:
# Run every sentence through the pipeline of its own language.
doc_en, doc_nl = nlp_en(sent_en), nlp_nl(sent_nl)
doc_fr, doc_fi = nlp_fr(sent_fr), nlp_fi(sent_fi)

In [209]:
def _print_and_collect_tree(doc):
    """Print one row per token of ``doc`` and return its dependency summary.

    Each printed row is tab-separated: token text, dependency label, text of
    the head token, part-of-speech tag and the list of the token's children.

    Returns:
        dict mapping token text to ``[dependency_label, children]``.

    Note:
        The dictionary is keyed by surface text, so if the same word occurs
        more than once in the sentence only its last occurrence is kept.
    """
    tree = dict()
    for token in doc:
        children = list(token.children)
        print(token.text, token.dep_, token.head.text, token.pos_,
              children, sep="\t")
        tree[token.text] = [token.dep_, children]
    return tree


# Same report for each language; only the header and the parsed input differ.
print("English:")
ud_tree_en = _print_and_collect_tree(doc_en)

print("\nDutch:")
ud_tree_nl = _print_and_collect_tree(doc_nl)

print("\nFrench:")
ud_tree_fr = _print_and_collect_tree(doc_fr)

print("\nFinnish:")
ud_tree_fi = _print_and_collect_tree(doc_fi)

English:
sometimes	advmod	asks	ADV	[]
she	nsubj	asks	PRON	[]
asks	ROOT	asks	VERB	[sometimes, she, me, used]
me	dobj	asks	PRON	[]
why	advmod	used	SCONJ	[]
I	nsubj	used	PRON	[]
used	ccomp	asks	VERB	[why, I, call]
to	aux	call	PART	[]
call	xcomp	used	VERB	[to, father, Harold]
her	poss	father	PRON	[]
father	dobj	call	NOUN	[her]
Harold	oprd	call	PROPN	[]

Dutch:
soms	advmod	vraagt	ADV	[]
vraagt	ROOT	vraagt	VERB	[soms, ze, me, noemde]
ze	nsubj	vraagt	PRON	[]
me	iobj	vraagt	PRON	[]
waarom	advmod	noemde	ADV	[]
ik	nsubj	noemde	PRON	[]
haar	nmod:poss	vader	PRON	[]
vader	obj	noemde	NOUN	[haar, Harold]
Harold	appos	vader	PROPN	[]
noemde	ccomp	vraagt	VERB	[waarom, ik, vader]

French:
parfois	advmod	demande	ADV	[]
elle	nsubj	demande	PRON	[]
me	iobj	demande	PRON	[]
demande	ROOT	demande	VERB	[parfois, elle, me, appelais]
pourquoi	advmod	appelais	ADV	[]
j'	nsubj	appelais	PRON	[]
appelais	ccomp	demande	VERB	[pourquoi, j', père]
son	det	père	DET	[]
père	obj	appelais	NOUN	[son, Harold]
Harold	flat:name	pèr

In [210]:
# Dump the four text -> [dependency label, children] dictionaries,
# each one under its own header line.
for header, tree in (
    ("\nEnglish UD tree:", ud_tree_en),
    ("\nDutch UD tree:", ud_tree_nl),
    ("\nFrench UD tree:", ud_tree_fr),
    ("\nFinnish UD tree:", ud_tree_fi),
):
    print(header)
    print(tree)


English UD tree:
{'sometimes': ['advmod', []], 'she': ['nsubj', []], 'asks': ['ROOT', [sometimes, she, me, used]], 'me': ['dobj', []], 'why': ['advmod', []], 'I': ['nsubj', []], 'used': ['ccomp', [why, I, call]], 'to': ['aux', []], 'call': ['xcomp', [to, father, Harold]], 'her': ['poss', []], 'father': ['dobj', [her]], 'Harold': ['oprd', []]}

Dutch UD tree:
{'soms': ['advmod', []], 'vraagt': ['ROOT', [soms, ze, me, noemde]], 'ze': ['nsubj', []], 'me': ['iobj', []], 'waarom': ['advmod', []], 'ik': ['nsubj', []], 'haar': ['nmod:poss', []], 'vader': ['obj', [haar, Harold]], 'Harold': ['appos', []], 'noemde': ['ccomp', [waarom, ik, vader]]}

French UD tree:
{'parfois': ['advmod', []], 'elle': ['nsubj', []], 'me': ['iobj', []], 'demande': ['ROOT', [parfois, elle, me, appelais]], 'pourquoi': ['advmod', []], "j'": ['nsubj', []], 'appelais': ['ccomp', [pourquoi, j', père]], 'son': ['det', []], 'père': ['obj', [son, Harold]], 'Harold': ['flat:name', []]}

Finnish UD tree:
{'joskus': ['advmod'

In [211]:
def _root_word(tree):
    """Return the first key of ``tree`` whose dependency label is "ROOT".

    ``tree`` maps a word to ``[dependency_label, children]``.  An IndexError
    is raised when no entry carries the "ROOT" label.
    """
    roots = [word for word, entry in tree.items() if entry[0] == "ROOT"]
    return roots[0]


# The root word of each sentence is the starting point for the alignment.
root_en = _root_word(ud_tree_en)
root_nl = _root_word(ud_tree_nl)
root_fr = _root_word(ud_tree_fr)
root_fi = _root_word(ud_tree_fi)

print("\nEnglish root:", root_en)
print("Dutch root:", root_nl)
print("French root:", root_fr)
print("Finnish root:", root_fi)


English root: asks
Dutch root: vraagt
French root: demande
Finnish root: kysyy


In [212]:
def _normalize_dep(dep):
    """Collapse a dependency label to the coarse form used for matching."""
    # Direct and indirect objects are treated as the same relation.
    if dep in ("iobj", "dobj"):
        dep = "obj"
    # Keep only what follows the last colon, e.g. "nmod:poss" -> "poss".
    return dep.split(":")[-1]


def check_node(root_en, root_nl, parent_en=None, parent_nl=None,
               tree_en=None, tree_nl=None):
    """Recursively pair up words of two dependency trees by their labels.

    The two given words are compared first; then every child of ``root_en``
    is compared with every child of ``root_nl`` (full cross product), whether
    or not the two parents matched.  Because of the cross product, one word
    can appear in more than one returned pair.

    Args:
        root_en: word (dictionary key) in the first tree to start from.
        root_nl: word (dictionary key) in the second tree to start from.
            Despite the ``_nl`` suffix this can be any language.
        parent_en: word the recursion came from in the first tree; passed
            along but not used in the comparison.
        parent_nl: same as ``parent_en`` for the second tree.
        tree_en: mapping ``word -> [dependency_label, children]`` for the
            first sentence, where each child has a ``.text`` attribute.
            Defaults to the module-level ``ud_tree_en``.
        tree_nl: same structure for the second sentence.  Defaults to the
            module-level ``ud_tree_fi``, which is what this function used
            unconditionally before the parameter existed.

    Returns:
        list of ``(word_en, word_nl)`` tuples in depth-first order.  A pair
        is reported when both normalized labels are equal and not "punct".
        Each match is also printed.

    Raises:
        KeyError: if a word is missing from the corresponding tree.
    """
    # Fall back to the notebook-level trees so existing two-argument calls
    # keep their previous behaviour.
    if tree_en is None:
        tree_en = ud_tree_en
    if tree_nl is None:
        tree_nl = ud_tree_fi

    dep_en, children_en = tree_en[root_en]
    dep_nl, children_nl = tree_nl[root_nl]

    dep_en = _normalize_dep(dep_en)
    dep_nl = _normalize_dep(dep_nl)

    matches = list()

    # The labels are equal here, so checking one of them for "punct" suffices.
    if dep_en == dep_nl and dep_en != "punct":
        matches.append((root_en, root_nl))
        print("Match:", root_en, root_nl, "for pos", dep_en)

    for child_en in children_en:
        for child_nl in children_nl:
            matches.extend(
                check_node(
                    child_en.text,
                    child_nl.text,
                    parent_en=root_en,
                    parent_nl=root_nl,
                    tree_en=tree_en,
                    tree_nl=tree_nl,
                )
            )

    return matches

In [213]:
# Align the English tree with the Finnish one, starting from both roots.
matches = check_node(root_en, root_fi)
print("\nMatches:")
print(matches)

# Split the (english, finnish) pairs into one list of words per language.
matched_en = [en_word for en_word, _ in matches]
matched_fi = [fi_word for _, fi_word in matches]

# Tokens whose surface text was not paired above.
# Note: ``unmatched_nl`` holds the Finnish tokens despite its name.
unmatched_en = list(filter(lambda tok: tok.text not in matched_en, doc_en))
unmatched_nl = list(filter(lambda tok: tok.text not in matched_fi, doc_fi))

Match: asks kysyy for pos ROOT
Match: sometimes joskus for pos advmod
Match: she hän for pos nsubj
Match: used kutsuin for pos ccomp
Match: why miksi for pos advmod

Matches:
[('asks', 'kysyy'), ('sometimes', 'joskus'), ('she', 'hän'), ('used', 'kutsuin'), ('why', 'miksi')]


In [214]:
def _print_token_rows(tokens):
    """Print one tab-separated row per token.

    Columns: text, dependency label, head text, part-of-speech tag, children.
    """
    for tok in tokens:
        print(tok.text, tok.dep_, tok.head.text, tok.pos_,
              list(tok.children), sep="\t")


# Leftover English tokens first, then a blank line, then the leftover tokens
# of the other sentence.
_print_token_rows(unmatched_en)
print()
_print_token_rows(unmatched_nl)

me	dobj	asks	PRON	[]
I	nsubj	used	PRON	[]
to	aux	call	PART	[]
call	xcomp	used	VERB	[to, father, Harold]
her	poss	father	PRON	[]
father	dobj	call	NOUN	[her]
Harold	oprd	call	PROPN	[]

minulta	obl	kysyy	PRON	[]
,	punct	kutsuin	PUNCT	[]
hänen	nmod:poss	isäänsä	PRON	[]
isäänsä	obj	kutsuin	NOUN	[hänen]
Haroldiksi	obl	kutsuin	PROPN	[]


In [215]:
# Coarsen the dependency label of every leftover token IN PLACE, then print
# the token text next to its new label.
for token in unmatched_en + unmatched_nl:
    label = token.dep_

    # Direct and indirect objects count as the same relation.
    if label in ("iobj", "dobj"):
        label = "obj"

    # Keep only what follows the last colon, e.g. "nmod:poss" -> "poss".
    token.dep_ = label.split(":")[-1]

    print(token.text, token.dep_)

me obj
I nsubj
to aux
call xcomp
her poss
father obj
Harold oprd
minulta obl
, punct
hänen poss
isäänsä obj
Haroldiksi obl


In [216]:
# Match remaining tokens
for token_en in unmatched_en:
    for token_nl in unmatched_nl:
        if token_en.dep_ == token_nl.dep_:
            print("Match:", token_en.text, token_nl.text, "for dep", token_en.dep_)
            matches.append((token_en.text, token_nl.text))

Match: me isäänsä for dep obj
Match: her hänen for dep poss
Match: father isäänsä for dep obj


In [217]:
matches

[('asks', 'kysyy'),
 ('sometimes', 'joskus'),
 ('she', 'hän'),
 ('used', 'kutsuin'),
 ('why', 'miksi'),
 ('me', 'isäänsä'),
 ('her', 'hänen'),
 ('father', 'isäänsä')]