In [48]:
import re
import string
from collections import deque

import argostranslate.package
import argostranslate.translate
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

In [15]:
# Read the Arabic source text, one list entry per line of the file
# (newline characters are still attached at this point).
with open('arabic_intro.csv', encoding='utf-8-sig') as f:
    raw_arabic = list(f)

In [16]:
# Cleaned Arabic lines: surrounding whitespace (including the trailing
# newline) is stripped from every raw line. Built with a comprehension so no
# loop temporaries are left behind in the notebook namespace.
lines_arabic = [text.strip() for text in raw_arabic]

In [27]:
# Read the French text, one list entry per line of the file
# (newline characters are still attached at this point).
with open('french_intro.csv', encoding="utf-8-sig") as f:
    raw_french = list(f)

In [38]:
# Cleaned French lines: every double-quote character is removed, then
# surrounding whitespace (including the trailing newline) is stripped.
# Built with a comprehension so no loop temporaries are left behind in the
# notebook namespace.
lines_french = [text.replace('"', '').strip() for text in raw_french]

In [57]:
# Read the contents of auto_french_intro.csv, one list entry per line of the
# file (newline characters are still attached at this point).
with open('auto_french_intro.csv', encoding="utf-8-sig") as f:
    trans_french = list(f)

In [58]:
# Cleaned lines from auto_french_intro.csv: every double-quote character is
# removed, then surrounding whitespace (including the trailing newline) is
# stripped. Built with a comprehension so no loop temporaries are left behind
# in the notebook namespace.
lines_auto_french = [text.replace('"', '').strip() for text in trans_french]

In [62]:
# Read the contents of direct_english_intro.csv, one list entry per line of
# the file (newline characters are still attached at this point).
with open('direct_english_intro.csv', encoding="utf-8-sig") as f:
    dir_english = list(f)

In [63]:
# Cleaned lines from direct_english_intro.csv: every double-quote character
# is removed, then surrounding whitespace (including the trailing newline) is
# stripped. Built with a comprehension so no loop temporaries are left behind
# in the notebook namespace.
lines_direct_english = [text.replace('"', '').strip() for text in dir_english]

In [66]:
# Read the contents of indirect_english_intro.csv, one list entry per line of
# the file (newline characters are still attached at this point).
with open('indirect_english_intro.csv', encoding="utf-8-sig") as f:
    indir_english = list(f)

In [67]:
# Cleaned lines from indirect_english_intro.csv: every double-quote character
# is removed, then surrounding whitespace (including the trailing newline) is
# stripped. Built with a comprehension so no loop temporaries are left behind
# in the notebook namespace.
lines_indirect_english = [text.replace('"', '').strip() for text in indir_english]

In [69]:
# Sanity check: the five corpora are only comparable line-by-line if they all
# contain the same number of lines. Evaluates to True exactly when all five
# lengths are equal.
len({
    len(lines_french),
    len(lines_arabic),
    len(lines_auto_french),
    len(lines_direct_english),
    len(lines_indirect_english),
}) == 1

True

In [74]:
def buildTree(doc):
    """Summarise a parsed document as a dictionary keyed by token text.

    Args:
        doc: An iterable of tokens exposing ``text``, ``dep_``, ``pos_`` and
            ``children``.

    Returns:
        dict mapping ``token.text`` to ``[dep_, pos_, children]`` where
        ``children`` is a list of the token's child tokens.

    Note:
        Keys are the tokens' surface text, so tokens with identical text share
        a single entry; the entry of the last such token in ``doc`` is kept.
    """
    return {
        tok.text: [tok.dep_, tok.pos_, list(tok.children)]
        for tok in doc
    }

In [75]:
def getRoots(tree_1, tree_2):
    """Return the root keys of two trees as a ``(root_1, root_2)`` tuple.

    A root is the first key (in dictionary order) whose entry has the
    dependency label ``"ROOT"`` in position 0.

    Raises:
        IndexError: if either tree has no entry labelled ``"ROOT"``.
    """
    def first_root(tree):
        # Collect every ROOT-labelled key, then take the first one.
        candidates = [label for label, entry in tree.items() if entry[0] == "ROOT"]
        return candidates[0]

    return first_root(tree_1), first_root(tree_2)

In [76]:
def getChildren(node, tree):
    """Return the children list stored for ``node`` in ``tree``.

    ``tree[node]`` must be a three-element ``[dep, pos, children]`` entry;
    only the last element is returned.
    """
    _dep, _pos, kids = tree[node]
    return kids

In [77]:
def listDescendents(root, tree):
    """List the text of every node below ``root`` in breadth-first order.

    Args:
        root: Key of the starting node (its ``str()`` is used for lookups), or
            ``None``.
        tree: Mapping from node text to ``[dep, pos, children]`` entries.

    Returns:
        list of ``str``: the descendants of ``root`` (``root`` itself is
        excluded). An empty list is returned when ``root`` is ``None`` so that
        callers can always iterate over the result.

    Raises:
        KeyError: if ``root`` or one of the reached nodes is not in ``tree``.

    Note:
        Because ``tree`` is keyed by text, a child whose text equals that of a
        node already visited would lead back to the same entry. Each distinct
        text is therefore expanded only once; such a child is still reported
        as a descendant, but its (shared) children are not enqueued again.
        This guarantees termination.
    """
    if root is None:
        return []

    root_key = str(root)
    expanded = {root_key}
    descendents = []
    # Seed the queue with the root's children so the root is never listed.
    queue = deque(tree[root_key][2])

    while queue:
        key = str(queue.popleft())
        descendents.append(key)
        if key in expanded:
            # Same text seen before: expanding again would loop forever.
            continue
        expanded.add(key)
        queue.extend(tree[key][2])

    return descendents

In [78]:
def compute_matches(root_1, root_2, tree_1, tree_2, matched):
    """Recursively align the subtree at ``root_1`` with the subtree at ``root_2``.

    Every alignment is printed as ``MATCH <source> <target>``, its target is
    appended to ``matched``, and the ``(source, target)`` pair is collected in
    the returned list.

    The four cases, decided by whether each root has children:

    * leaf vs. leaf: the two roots are matched.
    * leaf vs. subtree: the source leaf is matched with every descendant of
      ``root_2`` that is not yet in ``matched`` and has the same POS tag; if
      there is none, it is matched with ``root_2`` itself.
    * subtree vs. leaf: ``root_1`` and all of its descendants are matched with
      ``root_2``.
    * subtree vs. subtree: the roots are matched when their POS tags agree;
      each child of ``root_1`` is then aligned recursively with every child of
      ``root_2`` carrying the same dependency label. Children of ``root_1``
      without any such partner are afterwards aligned against ``root_2``.

    Args:
        root_1: Key (text) of the current source node in ``tree_1``.
        root_2: Key (text) of the current target node in ``tree_2``.
        tree_1: Source tree, mapping text to ``[dep, pos, children]``.
        tree_2: Target tree, same layout. Neither tree is modified.
        matched: List of target keys matched so far; extended in place.

    Returns:
        list of ``(source, target)`` tuples in the order they were printed,
        including those found by recursive calls.

    Raises:
        KeyError: if a root or a child is missing from its tree.
    """
    matches = []

    def record(source, target):
        # Single place that keeps the three outputs (result list, shared
        # ``matched`` list, printed trace) in step with each other.
        matches.append((source, target))
        matched.append(target)
        print("MATCH", source, target)

    _, pos_1, children_1 = tree_1[root_1]
    _, pos_2, children_2 = tree_2[root_2]

    # Source node is a leaf.
    if len(children_1) == 0:
        if len(children_2) == 0:
            # Leaf matched to leaf.
            record(root_1, root_2)
            return matches

        # Leaf matched to subtree: look for POS-compatible, unused descendants.
        found = False
        for candidate in listDescendents(root_2, tree_2):
            _, pos_c, _ = tree_2[str(candidate)]
            if candidate not in matched and pos_1 == pos_c:
                found = True
                record(root_1, candidate)
        # No compatible descendant: fall back to the target root.
        if not found:
            record(root_1, root_2)
        return matches

    # Source subtree matched to a target leaf: collapse the whole subtree.
    if len(children_2) == 0:
        record(root_1, root_2)
        for node in listDescendents(root_1, tree_1):
            record(node, root_2)
        return matches

    # Subtree matched to subtree.
    if pos_1 == pos_2:
        record(root_1, root_2)

    unmatched_children = []
    for node_i in children_1:
        dep_i = tree_1[str(node_i)][0]
        has_partner = False
        for node_j in children_2:
            dep_j = tree_2[str(node_j)][0]
            if dep_i == dep_j:
                has_partner = True
                matches.extend(
                    compute_matches(str(node_i), str(node_j), tree_1, tree_2, matched)
                )
        if not has_partner:
            unmatched_children.append(node_i)

    # Children with no same-label partner are aligned against the target root
    # only after all labelled children have been processed.
    for node_i in unmatched_children:
        matches.extend(
            compute_matches(str(node_i), root_2, tree_1, tree_2, matched)
        )
    return matches

In [90]:
def align(line1,line2,model1,model2):
    """Parse two sentences and run the tree alignment between them.

    ``line1`` is parsed with ``model1`` and ``line2`` with ``model2``; both
    parses are converted with ``buildTree`` and aligned from their roots with
    ``compute_matches``, starting from an empty ``matched`` list. Nothing is
    returned by this function.
    """
    # Parse both sentences first, then derive the text-keyed trees.
    parsed_source, parsed_target = model1(line1), model2(line2)
    source_tree, target_tree = buildTree(parsed_source), buildTree(parsed_target)

    roots = getRoots(source_tree, target_tree)

    compute_matches(*roots, source_tree, target_tree, [])

In [73]:
# Load the English and French spaCy pipelines used to parse the sentences.
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")

In [104]:
# Example run: align line 21 of the French text (source, French parser) with
# line 21 of the indirect English text (target, English parser).
align(lines_french[21],lines_indirect_english[21],nlp_fr,nlp_en)

MATCH cessera cease
MATCH fraction fraction
MATCH Une A
MATCH peuple fraction
MATCH mon A
MATCH de of
MATCH d' in
MATCH d' until
MATCH être be
MATCH dans be
MATCH la the
MATCH ne cease
MATCH jusqu' cease
MATCH au cease
MATCH viendra cease
MATCH heure fraction
MATCH l' A
MATCH dernière fraction
MATCH où my


In [82]:
# Parse the first French line on its own to inspect the intermediate data.
doc_fr=nlp_fr(lines_french[0])

In [84]:
# Text-keyed [dep, pos, children] dictionary for the French parse.
tree_fr=buildTree(doc_fr)

In [85]:
# Parse the first line of the indirect English text.
doc_en=nlp_en(lines_indirect_english[0])

In [86]:
# Text-keyed [dep, pos, children] dictionary for the English parse.
tree_en=buildTree(doc_en)

In [87]:
# Keys of the ROOT-labelled entries of the French and English trees.
root1, root2 = getRoots(tree_fr,tree_en)

In [92]:
# Display the French tree; note that repeated words share a single entry.
tree_fr

{'Au': ['case', 'ADP', []],
 'nom': ['ROOT', 'NOUN', [Au, Dieu, ,, clément]],
 'de': ['case', 'ADP', []],
 'Dieu': ['nmod', 'PROPN', [de]],
 ',': ['punct', 'PUNCT', []],
 'le': ['det', 'DET', []],
 'clément': ['nmod', 'NOUN', [le, ,, miséricordieux]],
 'miséricordieux': ['appos', 'NOUN', [le]]}

In [93]:
# Display the English tree for comparison with the French one.
tree_en

{'In': ['ROOT', 'ADP', [name]],
 'the': ['advmod', 'PRON', []],
 'name': ['pobj', 'NOUN', [the, of]],
 'of': ['prep', 'ADP', [God]],
 'God': ['pobj', 'PROPN', [,, merciful]],
 ',': ['punct', 'PUNCT', []],
 'merciful': ['conj', 'ADJ', [the]]}