In [2]:
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime

import os
import logging
import sentence_transformers.util
import csv
#import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import zipfile
import io
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)        

source_languages = ['en']
target_languages = ['es']

#output_path = "output/make-multilingual-"+"-".join(sorted(list(source_languages))+sorted(list(target_languages)))+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


In [4]:
%%capture

teacher_model_name = 'all-mpnet-base-v2'
student_model_name = 'bert-base-multilingual-cased'

from sentence_transformers import SentenceTransformer, util

teacher_model = SentenceTransformer(teacher_model_name)
student_model = SentenceTransformer(student_model_name)

model_load = SentenceTransformer('Spanish_SBERT/modelos_entrenados/Spanish_SBERT_100')

Some weights of the model checkpoint at C:\Users\josep/.cache\torch\sentence_transformers\bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
evaluators = []         #evaluators has a list of different evaluator classes we call periodically

dev_file = "Spanish_SBERT/datasets/en-es-tsv_dev.tsv"
sts_corpus = "Spanish_SBERT/datasets/STS2017-extended.zip"
inference_batch_size = 64

#logger.info("Create evaluator for " + dev_file)
src_sentences = []
trg_sentences = []

dev_data = pd.read_csv( dev_file , sep = '\t' )
for source, target in zip( dev_data['source'], dev_data['target'] ):
    src_sentences.append(source)
    trg_sentences.append(target)


#Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, name=os.path.basename(dev_file), teacher_model=teacher_model, batch_size=inference_batch_size)
evaluators.append(dev_mse)

# TranslationEvaluator computes the embeddings for all parallel sentences. It then check if the embedding of source[i] is the closest to target[i] out of all available target sentences
dev_trans_acc = evaluation.TranslationEvaluator(src_sentences, trg_sentences, name=os.path.basename(dev_file),batch_size=inference_batch_size)
evaluators.append(dev_trans_acc)

############################################################

all_languages = ['en', 'es']
sts_data = {}

#Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data
with zipfile.ZipFile(sts_corpus) as zip:
    filelist = zip.namelist()
    sts_files = []

    for i in range(len(all_languages)):
        for j in range(i, len(all_languages)):
            lang1 = all_languages[i]
            lang2 = all_languages[j]
            filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2)
            if filepath not in filelist:
                lang1, lang2 = lang2, lang1
                filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2)

            if filepath in filelist:
                filename = os.path.basename(filepath)
                sts_data[filename] = {'sentences1': [], 'sentences2': [], 'scores': []}

                fIn = zip.open(filepath)
                for line in io.TextIOWrapper(fIn, 'utf8'):
                    sent1, sent2, score = line.strip().split("\t")
                    score = float(score)
                    sts_data[filename]['sentences1'].append(sent1)
                    sts_data[filename]['sentences2'].append(sent2)
                    sts_data[filename]['scores'].append(score)

for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(data['sentences1'], data['sentences2'], data['scores'], batch_size=inference_batch_size, name=filename, show_progress_bar=False)
    evaluators.append(test_evaluator)

test_evaluator_en_en = evaluators[2]
test_evaluator_es_es = evaluators[4]

In [6]:
test_evaluator_en_en( teacher_model )

2022-11-23 11:51:45 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.en-en.txt dataset:
2022-11-23 11:51:45 - Cosine-Similarity :	Pearson: 0.9091	Spearman: 0.9060
2022-11-23 11:51:45 - Manhattan-Distance:	Pearson: 0.8982	Spearman: 0.9069
2022-11-23 11:51:45 - Euclidean-Distance:	Pearson: 0.8985	Spearman: 0.9060
2022-11-23 11:51:45 - Dot-Product-Similarity:	Pearson: 0.9091	Spearman: 0.9060


0.9068706496728889

In [7]:
test_evaluator_es_es( student_model )

2022-11-23 11:51:45 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.es-es.txt dataset:
2022-11-23 11:51:46 - Cosine-Similarity :	Pearson: 0.5456	Spearman: 0.5669
2022-11-23 11:51:46 - Manhattan-Distance:	Pearson: 0.5727	Spearman: 0.5717
2022-11-23 11:51:46 - Euclidean-Distance:	Pearson: 0.5743	Spearman: 0.5737
2022-11-23 11:51:46 - Dot-Product-Similarity:	Pearson: 0.3576	Spearman: 0.3705


0.5737029785343253

In [8]:
test_evaluator_es_es( model_load )

2022-11-23 11:51:46 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.es-es.txt dataset:
2022-11-23 11:51:46 - Cosine-Similarity :	Pearson: 0.6931	Spearman: 0.7352
2022-11-23 11:51:46 - Manhattan-Distance:	Pearson: 0.7254	Spearman: 0.7170
2022-11-23 11:51:46 - Euclidean-Distance:	Pearson: 0.7242	Spearman: 0.7155
2022-11-23 11:51:46 - Dot-Product-Similarity:	Pearson: 0.5532	Spearman: 0.5806


0.7352274038094662

In [9]:
model2 = SentenceTransformer( 'multi-qa-MiniLM-L6-cos-v1' )

2022-11-23 11:51:46 - Load pretrained SentenceTransformer: multi-qa-MiniLM-L6-cos-v1
2022-11-23 11:51:46 - Use pytorch device: cuda


In [10]:
from sentence_transformers import SentenceTransformer
model3 = SentenceTransformer( 'sentence-transformers/distiluse-base-multilingual-cased-v2' )
model4 = SentenceTransformer( 'xlm-roberta-base' )
model5 = SentenceTransformer( 'Spanish_SBERT/modelos_entrenados/Spanish_SBERT_100_2'  )

2022-11-23 11:51:46 - Load pretrained SentenceTransformer: sentence-transformers/distiluse-base-multilingual-cased-v2
2022-11-23 11:51:48 - Use pytorch device: cuda
2022-11-23 11:51:48 - Load pretrained SentenceTransformer: xlm-roberta-base
2022-11-23 11:51:53 - No sentence-transformers model found with name C:\Users\josep/.cache\torch\sentence_transformers\xlm-roberta-base. Creating a new one with MEAN pooling.


Some weights of the model checkpoint at C:\Users\josep/.cache\torch\sentence_transformers\xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2022-11-23 11:51:57 - Use pytorch device: cuda
2022-11-23 11:51:57 - Load pretrained SentenceTransformer: Spanish_SBERT/modelos_entrenados/Spanish_SBERT_100_2
2022-11-23 11:51:58 - Use pytorch device: cuda


In [11]:
test_evaluator_es_es( model2 )

2022-11-23 11:51:58 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.es-es.txt dataset:
2022-11-23 11:51:58 - Cosine-Similarity :	Pearson: 0.7475	Spearman: 0.7498
2022-11-23 11:51:58 - Manhattan-Distance:	Pearson: 0.7629	Spearman: 0.7438
2022-11-23 11:51:58 - Euclidean-Distance:	Pearson: 0.7671	Spearman: 0.7499
2022-11-23 11:51:58 - Dot-Product-Similarity:	Pearson: 0.7475	Spearman: 0.7498


0.7498557571348351

In [12]:
test_evaluator_en_en( model2 )

2022-11-23 11:51:58 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.en-en.txt dataset:
2022-11-23 11:51:58 - Cosine-Similarity :	Pearson: 0.7891	Spearman: 0.8118
2022-11-23 11:51:58 - Manhattan-Distance:	Pearson: 0.8024	Spearman: 0.8112
2022-11-23 11:51:58 - Euclidean-Distance:	Pearson: 0.8026	Spearman: 0.8118
2022-11-23 11:51:58 - Dot-Product-Similarity:	Pearson: 0.7891	Spearman: 0.8118


0.8118059230233499

In [13]:
test_evaluator_es_es( model3 )

2022-11-23 11:51:58 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.es-es.txt dataset:
2022-11-23 11:51:59 - Cosine-Similarity :	Pearson: 0.8399	Spearman: 0.8371
2022-11-23 11:51:59 - Manhattan-Distance:	Pearson: 0.8490	Spearman: 0.8331
2022-11-23 11:51:59 - Euclidean-Distance:	Pearson: 0.8514	Spearman: 0.8369
2022-11-23 11:51:59 - Dot-Product-Similarity:	Pearson: 0.8023	Spearman: 0.8007


0.8371182297187156

In [14]:
test_evaluator_en_en( model3 )

2022-11-23 11:51:59 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.en-en.txt dataset:
2022-11-23 11:51:59 - Cosine-Similarity :	Pearson: 0.8523	Spearman: 0.8619
2022-11-23 11:51:59 - Manhattan-Distance:	Pearson: 0.8514	Spearman: 0.8482
2022-11-23 11:51:59 - Euclidean-Distance:	Pearson: 0.8591	Spearman: 0.8590
2022-11-23 11:51:59 - Dot-Product-Similarity:	Pearson: 0.8443	Spearman: 0.8501


0.8618847213007086

In [15]:
test_evaluator_es_es( model4 )

2022-11-23 11:51:59 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.es-es.txt dataset:
2022-11-23 11:51:59 - Cosine-Similarity :	Pearson: 0.3469	Spearman: 0.4959
2022-11-23 11:51:59 - Manhattan-Distance:	Pearson: 0.5540	Spearman: 0.5843
2022-11-23 11:51:59 - Euclidean-Distance:	Pearson: 0.4544	Spearman: 0.5001
2022-11-23 11:51:59 - Dot-Product-Similarity:	Pearson: -0.2192	Spearman: -0.2006


0.5842757741826827

In [16]:
test_evaluator_en_en( model4 )

2022-11-23 11:51:59 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.en-en.txt dataset:
2022-11-23 11:52:00 - Cosine-Similarity :	Pearson: 0.3611	Spearman: 0.5217
2022-11-23 11:52:00 - Manhattan-Distance:	Pearson: 0.5659	Spearman: 0.6035
2022-11-23 11:52:00 - Euclidean-Distance:	Pearson: 0.4601	Spearman: 0.5225
2022-11-23 11:52:00 - Dot-Product-Similarity:	Pearson: -0.1636	Spearman: -0.2065


0.6035398716515022

In [17]:
test_evaluator_es_es( model5 )

2022-11-23 11:52:00 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.es-es.txt dataset:
2022-11-23 11:52:00 - Cosine-Similarity :	Pearson: 0.7341	Spearman: 0.7740
2022-11-23 11:52:00 - Manhattan-Distance:	Pearson: 0.7608	Spearman: 0.7555
2022-11-23 11:52:00 - Euclidean-Distance:	Pearson: 0.7597	Spearman: 0.7536
2022-11-23 11:52:00 - Dot-Product-Similarity:	Pearson: 0.6196	Spearman: 0.6481


0.7739676816399853

In [18]:
test_evaluator_en_en( model5 )

2022-11-23 11:52:00 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.en-en.txt dataset:
2022-11-23 11:52:00 - Cosine-Similarity :	Pearson: 0.7121	Spearman: 0.7522
2022-11-23 11:52:00 - Manhattan-Distance:	Pearson: 0.7195	Spearman: 0.7326
2022-11-23 11:52:00 - Euclidean-Distance:	Pearson: 0.7185	Spearman: 0.7307
2022-11-23 11:52:00 - Dot-Product-Similarity:	Pearson: 0.6607	Spearman: 0.6720


0.7521583414001083

In [11]:
#test_evaluator_en_en
#test_evaluator_es_es

#teacher_model
#student_model

In [16]:
sent1_es = 'hola me gustaria abrir una cuenta en su banco'
sent1_en = 'hello i would like to open an account at your bank'

#sent2 = 'estimados quiero tener una cuenta corriente con ustedes'

code1 = teacher_model.encode(sent1_en)

code2_1 = student_model.encode(sent1_en)
code2_2 = student_model.encode(sent1_es)

code3_1 = model_load.encode(sent1_en)
code3_2 = model_load.encode(sent1_es)

code4_1 = model3.encode(sent1_en)
code4_2 = model3.encode(sent1_es)

Batches: 100%|██████████| 1/1 [00:00<00:00, 19.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.68it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 21.91it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.20it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 27.24it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 87.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 144.01it/s]


In [19]:
print( util.cos_sim(code2_1, code1)*100 )
print( util.cos_sim(code2_2, code1)*100 )
print( util.cos_sim(code2_1, code2_2)*100 )
print()
print( util.cos_sim(code3_1, code1)*100 )
print( util.cos_sim(code3_2, code1)*100 )
print( util.cos_sim(code3_1, code3_2)*100 )
print()
print( util.cos_sim(code4_1, code4_2)*100 )
#print( util.cos_sim(code4_2, code1)*100 )


tensor([[4.3812]])
tensor([[2.3415]])
tensor([[62.9598]])

tensor([[40.8690]])
tensor([[43.7166]])
tensor([[92.4074]])

tensor([[95.7490]])


In [14]:
code4_1.shape

(384,)

In [15]:
num0 = []
num1 = []

code4_1_list = code4_1.tolist()

for i in range(384):
    num0.append(0)
    
for i in range( len(code4_1_list) ):
    num0.append( float( code4_1_list[i] ) )
    num1.append( float( code4_1_list[i] ) )

for i in range(384):
    num1.append(0)

In [16]:
num0 = np.array( [num0] )
num1 = np.array( [num1] )

In [17]:
#print( util.cos_sim(code3_2, num0)*100 )
#print( util.cos_sim(code3_2, num1)*100 )

In [18]:
#code3_2.shape

In [19]:
#num1.T.shape

In [20]:
#print( util.cos_sim(code4_2, num1)*100 )

In [21]:
#np.array( code4_1.tolist().append(0) )

# Traduccion

In [22]:
from transformers import pipeline

model_checkpoint_1 = 'Helsinki-NLP/opus-mt-en-es'
model_checkpoint_2 = 'Helsinki-NLP/opus-mt-es-en'
#model_checkpoint_2 = 'mrm8488/mbart-large-finetuned-opus-en-es-translation'

translator_en_es = pipeline("translation", model = model_checkpoint_1)
translator_es_en = pipeline("translation", model = model_checkpoint_2)

en_sentence = "How are you?"

print(translator_en_es(en_sentence))




[{'translation_text': '¿Cómo estás?'}]


In [23]:
print( sent1_en )
print( translator_es_en(sent1_es)[0]['translation_text'] )

hello i would like to open an account at your bank
Hello I'd like to open an account in your bank


In [24]:
print(sent1_es)
print( translator_en_es(sent1_en)[0]['translation_text'] )

hola me gustaria abrir una cuenta en su banco
Hola me gustaría abrir una cuenta en su banco


In [35]:
from time import time

t1 = time()

to_translate = 'me cerraron la cuenta y no se por que, yo tengo pagada todas mis deudas y no entiendo por que me la cerraron'

for i in range(1):
    conv = translator_es_en( to_translate )[0]['translation_text']
    
t2 = time()

print(t2-t1)
#print( conv )

0.6884477138519287


In [36]:
(t2-t1)/50

0.013768954277038574

In [37]:
translator_en_es(conv)

[{'translation_text': 'Cerraron mi cuenta y no sé por qué, he pagado todas mis deudas y no entiendo por qué me la cerraron.'}]

In [38]:
conv

"They closed my account and I don't know why, I've paid all my debts and I don't understand why they closed it for me."