In [1]:
import spacy
import pandas as pd
import numpy as np
import re
import string
import argostranslate.package
import argostranslate.translate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the `*_sm` spaCy pipelines for English, Dutch and French (in that order).
nlp_en, nlp_nl, nlp_fr = map(
    spacy.load,
    ("en_core_web_sm", "nl_core_news_sm", "fr_core_news_sm"),
)

In [3]:
# Example sentence in Dutch, English and French; these are the parser inputs.
sent_nl, sent_en, sent_fr = (
    "soms vraagt ze waarom ik haar vader Harold noemde",
    "sometimes she asks me why I used to call her father Harold",
    "parfois elle me demande pourquoi j'appelais son père Harold",
)

In [4]:
# Parse every example sentence with the pipeline loaded for its language.
doc_en, doc_nl, doc_fr = nlp_en(sent_en), nlp_nl(sent_nl), nlp_fr(sent_fr)

In [41]:
def buildTree(doc):
    """Map each token's text to its dependency label, POS tag and children.

    Parameters
    ----------
    doc : iterable of tokens
        Every token must expose ``text``, ``dep_``, ``pos_`` and ``children``.

    Returns
    -------
    dict
        ``{token.text: [token.dep_, token.pos_, list_of_child_tokens]}``.
        Entries are keyed by surface text, so when two tokens share the same
        text the later token's entry replaces the earlier one.
    """
    return {
        tok.text: [tok.dep_, tok.pos_, list(tok.children)]
        for tok in doc
    }

In [6]:
def getRoots(tree_1, tree_2):
    """Return the root word of each tree as a ``(root_1, root_2)`` tuple.

    A root is the first key (in dictionary order) whose stored dependency
    label, i.e. element 0 of its entry, equals ``"ROOT"``.

    Raises
    ------
    IndexError
        If either tree contains no entry labelled ``"ROOT"``.
    """
    roots = []
    for tree in (tree_1, tree_2):
        candidates = [word for word, entry in tree.items() if entry[0] == "ROOT"]
        roots.append(candidates[0])
    return tuple(roots)

In [50]:
def getChildren(node, tree):
    """Return the child list stored for ``node`` in ``tree``.

    ``tree[node]`` must be a three-element entry ``[dep, pos, children]``;
    only the last element is returned.
    """
    _dep, _pos, kids = tree[node]
    return kids

In [12]:
def listDescendents(root, tree):
    """List the text of every node below ``root`` in breadth-first order.

    Parameters
    ----------
    root : str, token-like or None
        Node to start from. It is looked up in ``tree`` through ``str(root)``,
        so both plain strings and objects whose ``str()`` is their text work.
    tree : dict
        Mapping ``text -> [dep, pos, children]``.

    Returns
    -------
    list of str
        ``str()`` of each descendant, level by level. ``root`` itself is never
        included. An empty list is returned when ``root`` is ``None``.

    Raises
    ------
    KeyError
        If ``root`` or any reached descendant has no entry in ``tree``.
    """
    # Always hand back a list so callers can iterate without a None check.
    if root is None:
        return []

    descendents = []
    level = [root]
    # Walk one depth level at a time. The root is never added to the result,
    # so nothing has to be removed afterwards and non-string roots work too.
    while level:
        next_level = []
        for node in level:
            _dep, _pos, children = tree[str(node)]
            next_level.extend(children)
        descendents.extend(str(child) for child in next_level)
        level = next_level

    return descendents

In [81]:
def compute_matches(root_1, root_2, tree_1, tree_2, matched):
    """Align the subtree under ``root_1`` with the subtree under ``root_2``.

    Both trees map ``text -> [dep, pos, children]``. Four cases are handled:

    * subtree vs. leaf: ``root_1`` and all of its descendants are matched to
      ``root_2``.
    * subtree vs. subtree: the roots are matched when their POS tags agree;
      each child of ``root_1`` is then recursively aligned with every child of
      ``root_2`` that carries the same dependency label. Children of
      ``root_1`` with no such partner are aligned against ``root_2`` itself.
    * leaf vs. subtree: ``root_1`` is matched to every descendant of
      ``root_2`` that is not yet in ``matched`` and has the same POS tag; if
      there is none, it is matched to ``root_2``.
    * leaf vs. leaf: the two nodes are matched.

    Parameters
    ----------
    root_1, root_2 : str
        Keys of the current source / target nodes in ``tree_1`` / ``tree_2``.
    tree_1, tree_2 : dict
        Source and target trees; they are only read.
    matched : list
        Target nodes used so far. It is extended in place with the target of
        every recorded match (duplicates are possible).

    Returns
    -------
    list of tuple
        ``(source, target)`` pairs found in this call and all recursive calls,
        in the same order as the ``MATCH`` lines that are printed.
    """
    matches = []

    def record(source, target):
        # Store the pair, mark the target as used and echo it for the notebook.
        matches.append((source, target))
        matched.append(target)
        print("MATCH", source, target)

    _, pos_1, children_1 = tree_1[root_1]
    _, pos_2, children_2 = tree_2[root_2]

    # --- source node is the root of a subtree --------------------------------
    if children_1:
        # Subtree matched to a leaf: collapse the whole subtree onto that leaf.
        if not children_2:
            record(root_1, root_2)
            for node in listDescendents(root_1, tree_1):
                record(node, root_2)
            return matches

        # Subtree matched to subtree: roots align only when their POS agrees.
        if pos_1 == pos_2:
            record(root_1, root_2)

        unmatched_children = []
        for node_i in children_1:
            dep_i = tree_1[str(node_i)][0]
            matched_status = False
            for node_j in children_2:
                dep_j = tree_2[str(node_j)][0]
                if dep_i == dep_j:
                    matched_status = True
                    matches.extend(
                        compute_matches(str(node_i), str(node_j), tree_1, tree_2, matched)
                    )
            # No child of root_2 shares this dependency label.
            if not matched_status:
                unmatched_children.append(node_i)

        # Unmatched children are retried against root_2 itself. The trees are
        # passed unchanged: no trimming is performed before this retry.
        for node_i in unmatched_children:
            matches.extend(
                compute_matches(str(node_i), root_2, tree_1, tree_2, matched)
            )
        return matches

    # --- source node is a leaf -----------------------------------------------
    if children_2:
        # Leaf matched to a subtree: look for unused descendants with equal POS.
        match_flag = False
        for node_i in listDescendents(root_2, tree_2):
            pos_i = tree_2[str(node_i)][1]
            if node_i not in matched and pos_1 == pos_i:
                match_flag = True
                record(root_1, node_i)
        # Nothing suitable below root_2: fall back to root_2 itself.
        if not match_flag:
            record(root_1, root_2)
        return matches

    # Leaf matched to leaf.
    record(root_1, root_2)
    return matches
    

In [47]:
# Turn the English and Dutch parses into text-keyed dependency dictionaries,
# then show their root words as this cell's output.
tree_en, tree_nl = map(buildTree, (doc_en, doc_nl))
getRoots(tree_en, tree_nl)

('asks', 'vraagt')

In [82]:
# Start the alignment from the roots reported by getRoots instead of
# hard-coded words, so this cell keeps working when the sentences change.
matched = []
root_en, root_nl = getRoots(tree_en, tree_nl)
compute_matches(root_en, root_nl, tree_en, tree_nl, matched)

MATCH asks vraagt
MATCH sometimes soms
MATCH she ze
MATCH used noemde
MATCH why waarom
MATCH I ik
MATCH call noemde
MATCH to noemde
MATCH her haar
MATCH Harold Harold
MATCH me vraagt
