## SENTENCE ALIGNER FOR PARALLEL DATA COLLECTION

### This code applies [Sentence Transformers](https://www.sbert.net/) (Neils Reimers & Iryna Gurevych, 2019)

In [1]:
# Import and load Spacy
import spacy
from spacy.lang.es import Spanish
import re
import os

In [2]:
# Import and load sentence transformers
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('distiluse-base-multilingual-cased')

In [3]:
# Extend set of characters defining the sentence boundary (e.g. new line)
@Spanish.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "\n\n":
            doc[token.i + 1].is_sent_start = True
        elif token.text == "\n":
            doc[token.i + 1].is_sent_start = True
        elif token.text == "\r":
            doc[token.i + 1].is_sent_start = True
    return doc
nlp = Spanish()
nlp.add_pipe("set_custom_boundaries")
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1445cfb40>

In [4]:
# Read pair of files
def read_files(source_file, target_file):
    SourceSents = []
    TargetSents = []
    with open(source_file, 'r', encoding='utf8', newline = '') as source, open(target_file, 'r', encoding='utf8', newline = '') as target:
        doc_src = nlp(source.read())
        doc_trg = nlp(target.read())
        for i,sentence in enumerate(doc_src.sents):
            # Do not print empty sentences and do not use sentences made up only of numbers 
            if not (sentence.text.isspace()) and (sentence.text.strip(".").isnumeric()==False):
                # remove spaces at the beginning of the sentence
                sentence = re.sub("^[\n\t\s]+","",sentence.text)
                sentence = sentence.strip()
                SourceSents.append(sentence)
        for i,sentence in enumerate(doc_trg.sents):
            # Do not print empty sentences and do not use sentences made up only of numbers 
            if not (sentence.text.isspace()) and (sentence.text.strip(".").isnumeric()==False):
                # remove sentences at the beginning of the sentence
                sentence = re.sub("^[\n\t\s]+","",sentence.text)
                sentence = sentence.strip()
                TargetSents.append(sentence)
    return SourceSents, TargetSents

In [5]:
def align_sentences(SentencesSource, SentencesTarget, model, results_file):
    
    ''' Align sentences according to cosine similarity value '''
    
    # Encode source and target sentences
    embeddingsSource = model.encode(SentencesSource, convert_to_tensor=True)
    embeddingsTarget = model.encode(SentencesTarget, convert_to_tensor=True)
    
    # Compute cosine similarity between all pairs
    cos_sim = util.cos_sim(embeddingsSource, embeddingsTarget)
    
    pairs = []
    for i in range(len(SentencesSource)):
        # Save scores per sentence in source data
        sentencePairs = []
        for j in range(len(SentencesTarget)): 
            cos_sim_value = cos_sim[i][j] 
            cos_sim_value = cos_sim_value.item() 
            if (cos_sim_value < 0.999) and cos_sim_value > 0.6:
                sentencePairs.append({'index': [i, j], 'score': cos_sim[i][j]})
                
        # Sort scores for each sentence in decreasing order
        sentencePairs = sorted(sentencePairs, key=lambda x: x['score'], reverse=True)
        pairs.append(sentencePairs)
    
    # Save tuples of (source,target) sentences to avoid repeated pairs
    TupleList = []

    # For each source sentence, print the 3 most similar according to cosine similarity
    k=0
    for n,sentencePairs in enumerate(pairs):
        for m,sentencePair in enumerate(sentencePairs[0:2]):
            if (m==0):
                k = k+1
            i, j = sentencePair['index']
            Tuple = (SentencesSource[i], SentencesTarget[j])
            if Tuple not in TupleList:
                print("-------------------------------------------------------------------------------------------------------------------------------------------------------------------",file=results_file)
                print("-------------------------------------------------------------------------------------------------------------------")
                print("{}.{} \t\tSOURCE: {} \t\tTARGET: {} \t\t Score: {:.4f}".format(k, m, SentencesSource[i], SentencesTarget[j], sentencePair['score']),file=results_file)
                print("{}.{} \t\tSOURCE: {} \t\tTARGET: {} \t\t Score: {:.4f}".format(k, m, SentencesSource[i], SentencesTarget[j], sentencePair['score']))
            TupleList.append(Tuple)
    

In [None]:
Data = {}

# Read data folder
DIR = "../corpus/nci/source"
outFile = open("aligned_sentences.txt",'w',encoding='utf-8')

for base, dirs, files in os.walk(DIR):
    for file in files:
        src_file_path = os.path.join(base,file)
        file_id = re.sub("\.src","",file)
        print("ALIGNED SENTENCES FOR %s:" % (file_id),file=outFile)
        trg_file_path = re.sub("source","target",src_file_path)
        trg_file_path = re.sub("src","trg",trg_file_path)
        SourceSents, TargetSents = read_files(src_file_path, trg_file_path)
        align_sentences(SourceSents, TargetSents, model, outFile)
        print("===================================================================================================================================================================\n",file=outFile)

outFile.close()

-------------------------------------------------------------------------------------------------------------------
1.0 		SOURCE: ﻿Náuseas y vómitos relacionados con el tratamiento del cáncer 		TARGET: Náuseas y vómitos relacionados con el tratamiento en los niños 		 Score: 0.6986
-------------------------------------------------------------------------------------------------------------------
1.1 		SOURCE: ﻿Náuseas y vómitos relacionados con el tratamiento del cáncer 		TARGET: Las náuseas y los vómitos son efectos secundarios graves del tratamiento de cáncer. 		 Score: 0.6553
-------------------------------------------------------------------------------------------------------------------
2.0 		SOURCE: Aspectos generales 		TARGET: Información general 		 Score: 0.7586
-------------------------------------------------------------------------------------------------------------------
3.0 		SOURCE: La prevención y el control de las náuseas y los vómitos (emesis) (NyV) son de suma import