In [1]:
!pip install -U sentence-transformers --quiet


In [2]:
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
from torch.utils.data import DataLoader
from sentence_transformers.datasets import ParallelSentencesDataset
from datetime import datetime

import os
import logging
import sentence_transformers.util
import csv
import gzip
from tqdm.autonotebook import tqdm
import numpy as np
import zipfile
import io

In [3]:
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
logger = logging.getLogger(__name__)

In [4]:
teacher_model_name = 'paraphrase-distilroberta-base-v2'   #Our monolingual teacher model, we want to convert to multiple languages
student_model_name = 'xlm-roberta-base'       #Multilingual base model we use to imitate the teacher model

In [5]:
max_seq_length = 128                #Student model max. lengths for inputs (number of word pieces)
train_batch_size = 64               #Batch size for training
inference_batch_size = 64           #Batch size at inference
max_sentences_per_language = 500000 #Maximum number of  parallel sentences for training
train_max_sentence_length = 250     #Maximum length (characters) for parallel training sentences

num_epochs = 5                       #Train for x epochs
num_warmup_steps = 10000             #Warumup steps

num_evaluation_steps = 1000          #Evaluate performance after every xxxx steps
dev_sentences = 1000                 #Number of parallel sentences to be used for development

In [6]:
# Define the language codes you would like to extend the model to
source_languages = set(['en'])                      # Our teacher model accepts English (en) sentences
target_languages = set(['de'])    # We want to extend the model to these new languages. For language codes, see the header of the train file file

In [7]:
output_path = "output/make-multilingual-"+"-".join(sorted(list(source_languages))+sorted(list(target_languages)))+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

In [8]:
######## Start the extension of the teacher model to multiple languages ########
logger.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)

2023-04-28 01:54:47 - Load teacher model
2023-04-28 01:54:47 - Load pretrained SentenceTransformer: paraphrase-distilroberta-base-v2
2023-04-28 01:54:48 - Lock 140007271883104 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/.gitattributes.lock


Downloading (…)2b9e5/.gitattributes:   0%|          | 0.00/736 [00:00<?, ?B/s]

2023-04-28 01:54:48 - Lock 140007271883104 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/.gitattributes.lock
2023-04-28 01:54:49 - Lock 140007211149008 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/1_Pooling/config.json.lock


Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2023-04-28 01:54:49 - Lock 140007211149008 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/1_Pooling/config.json.lock
2023-04-28 01:54:49 - Lock 140007212492304 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/README.md.lock


Downloading (…)3c1ed2b9e5/README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

2023-04-28 01:54:49 - Lock 140007212492304 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/README.md.lock
2023-04-28 01:54:50 - Lock 140007212492304 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/config.json.lock


Downloading (…)1ed2b9e5/config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

2023-04-28 01:54:50 - Lock 140007212492304 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/config.json.lock
2023-04-28 01:54:50 - Lock 140007212492304 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/config_sentence_transformers.json.lock


Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

2023-04-28 01:54:51 - Lock 140007212492304 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/config_sentence_transformers.json.lock
2023-04-28 01:54:51 - Lock 139998581270224 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/merges.txt.lock


Downloading (…)c1ed2b9e5/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

2023-04-28 01:54:52 - Lock 139998581270224 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/merges.txt.lock
2023-04-28 01:54:52 - Lock 140007212492304 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/pytorch_model.bin.lock


Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

2023-04-28 01:56:27 - Lock 140007212492304 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/pytorch_model.bin.lock
2023-04-28 01:56:28 - Lock 140007211149632 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/sentence_bert_config.json.lock


Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

2023-04-28 01:56:28 - Lock 140007211149632 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/sentence_bert_config.json.lock
2023-04-28 01:56:28 - Lock 140007212493168 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/special_tokens_map.json.lock


Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

2023-04-28 01:56:29 - Lock 140007212493168 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/special_tokens_map.json.lock
2023-04-28 01:56:29 - Lock 140007271883104 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/tokenizer.json.lock


Downloading (…)2b9e5/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2023-04-28 01:56:29 - Lock 140007271883104 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/tokenizer.json.lock
2023-04-28 01:56:30 - Lock 140007212492304 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/tokenizer_config.json.lock


Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

2023-04-28 01:56:30 - Lock 140007212492304 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/tokenizer_config.json.lock
2023-04-28 01:56:30 - Lock 140007212492304 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/vocab.json.lock


Downloading (…)c1ed2b9e5/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

2023-04-28 01:56:31 - Lock 140007212492304 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/vocab.json.lock
2023-04-28 01:56:31 - Lock 140007212493168 acquired on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/modules.json.lock


Downloading (…)ed2b9e5/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

2023-04-28 01:56:32 - Lock 140007212493168 released on /home/anvisa/.cache/torch/sentence_transformers/sentence-transformers_paraphrase-distilroberta-base-v2/modules.json.lock
2023-04-28 01:56:32 - Use pytorch device: cuda


In [9]:
logger.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

2023-04-28 01:56:33 - Create student model from scratch
2023-04-28 01:56:33 - Lock 140007212897664 acquired on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/1960141250d189366dfb76630ba794a9c104ec07.lock


Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

2023-04-28 01:56:33 - Lock 140007212897664 released on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/1960141250d189366dfb76630ba794a9c104ec07.lock
2023-04-28 01:56:33 - Lock 139998573166448 acquired on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/9d83baaafea92d36de26002c8135a427d55ee6fdc4faaa6e400be4c47724a07e.lock


Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

2023-04-28 02:02:51 - Lock 139998573166448 released on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/9d83baaafea92d36de26002c8135a427d55ee6fdc4faaa6e400be4c47724a07e.lock


Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2023-04-28 02:02:54 - Lock 139998572426384 acquired on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/db9af13bf09fd3028ca32be90d3fb66d5e470399.lock


Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

2023-04-28 02:02:57 - Lock 139998572426384 released on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/db9af13bf09fd3028ca32be90d3fb66d5e470399.lock
2023-04-28 02:02:58 - Lock 139998573109408 acquired on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/463f3414782c1c9405828c9b31bfa36dda1f45c5.lock


Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

2023-04-28 02:03:00 - Lock 139998573109408 released on /home/anvisa/.cache/huggingface/hub/models--xlm-roberta-base/blobs/463f3414782c1c9405828c9b31bfa36dda1f45c5.lock
2023-04-28 02:03:01 - Use pytorch device: cuda


In [13]:
train_files = ['parallel-sentences/TED2020-en-de-train.tsv.gz']
dev_files = ['parallel-sentences/TED2020-en-de-dev.tsv.gz']

In [14]:
###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True)
for train_file in train_files:
    train_data.load_data(train_file, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)

2023-04-28 02:16:00 - Load parallel-sentences/TED2020-en-de-train.tsv.gz


In [15]:
#### Evaluate cross-lingual performance on different tasks #####
evaluators = []         #evaluators has a list of different evaluator classes we call periodically

for dev_file in dev_files:
    logger.info("Create evaluator for " + dev_file)
    src_sentences = []
    trg_sentences = []
    with gzip.open(dev_file, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            if splits[0] != "" and splits[1] != "":
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])


    #Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
    dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, name=os.path.basename(dev_file), teacher_model=teacher_model, batch_size=inference_batch_size)
    evaluators.append(dev_mse)

    # TranslationEvaluator computes the embeddings for all parallel sentences. It then check if the embedding of source[i] is the closest to target[i] out of all available target sentences
    dev_trans_acc = evaluation.TranslationEvaluator(src_sentences, trg_sentences, name=os.path.basename(dev_file),batch_size=inference_batch_size)
    evaluators.append(dev_trans_acc)

2023-04-28 02:16:05 - Create evaluator for parallel-sentences/TED2020-en-de-dev.tsv.gz


In [16]:
##### Read cross-lingual Semantic Textual Similarity (STS) data ####
all_languages = list(set(list(source_languages)+list(target_languages)))
sts_data = {}

In [17]:
# This function downloads a corpus if it does not exist
def download_corpora(filepaths):
    if not isinstance(filepaths, list):
        filepaths = [filepaths]

    for filepath in filepaths:
        if not os.path.exists(filepath):
            print(filepath, "does not exists. Try to download from server")
            filename = os.path.basename(filepath)
            url = "https://sbert.net/datasets/" + filename
            sentence_transformers.util.http_get(url, filepath)


# Here we define train train and dev corpora
train_corpus = "datasets/ted2020.tsv.gz"         # Transcripts of TED talks, crawled 2020
sts_corpus = "datasets/STS2017-extended.zip"     # Extended STS2017 dataset for more languages
parallel_sentences_folder = "parallel-sentences/"

# Check if the file exists. If not, they are downloaded
download_corpora([sts_corpus])

datasets/STS2017-extended.zip does not exists. Try to download from server


  0%|          | 0.00/96.3k [00:00<?, ?B/s]

In [18]:
#Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data
with zipfile.ZipFile(sts_corpus) as zip:
    filelist = zip.namelist()
    sts_files = []

    for i in range(len(all_languages)):
        for j in range(i, len(all_languages)):
            lang1 = all_languages[i]
            lang2 = all_languages[j]
            filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2)
            if filepath not in filelist:
                lang1, lang2 = lang2, lang1
                filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2)

            if filepath in filelist:
                filename = os.path.basename(filepath)
                sts_data[filename] = {'sentences1': [], 'sentences2': [], 'scores': []}

                fIn = zip.open(filepath)
                for line in io.TextIOWrapper(fIn, 'utf8'):
                    sent1, sent2, score = line.strip().split("\t")
                    score = float(score)
                    sts_data[filename]['sentences1'].append(sent1)
                    sts_data[filename]['sentences2'].append(sent2)
                    sts_data[filename]['scores'].append(score)

In [19]:
for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(data['sentences1'], data['sentences2'], data['scores'], batch_size=inference_batch_size, name=filename, show_progress_bar=False)
    evaluators.append(test_evaluator)

In [21]:

# Train the model
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
          epochs=num_epochs,
          warmup_steps=num_warmup_steps,
          evaluation_steps=num_evaluation_steps,
          output_path=output_path,
          save_best_model=True,
          optimizer_params= {'lr': 2e-5, 'eps': 1e-6}
          )

2023-04-28 02:21:18.510130: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6886 [00:00<?, ?it/s]

  labels = torch.tensor(labels)


2023-04-28 02:26:14 - MSE evaluation (lower = better) on TED2020-en-de-dev.tsv.gz dataset in epoch 0 after 1000 steps:
2023-04-28 02:26:14 - MSE (*100):	29.767743
2023-04-28 02:26:14 - Evaluating translation matching Accuracy on TED2020-en-de-dev.tsv.gz dataset in epoch 0 after 1000 steps:
2023-04-28 02:27:52 - Accuracy src2trg: 4.59
2023-04-28 02:27:52 - Accuracy trg2src: 5.20
2023-04-28 02:27:52 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.en-de.txt dataset in epoch 0 after 1000 steps:
2023-04-28 02:27:52 - Cosine-Similarity :	Pearson: 0.0323	Spearman: 0.0742
2023-04-28 02:27:52 - Manhattan-Distance:	Pearson: 0.0159	Spearman: 0.0230
2023-04-28 02:27:52 - Euclidean-Distance:	Pearson: 0.0256	Spearman: 0.0352
2023-04-28 02:27:52 - Dot-Product-Similarity:	Pearson: 0.0417	Spearman: 0.0399
2023-04-28 02:27:52 - EmbeddingSimilarityEvaluator: Evaluating the model on STS.en-en.txt dataset in epoch 0 after 1000 steps:
2023-04-28 02:27:53 - Cosine-Similarity :	Pearson: 0.0187	Spe

Traceback (most recent call last):
  File "/home/anvisa/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_13467/127551991.py", line 2, in <module>
    student_model.fit(train_objectives=[(train_dataloader, train_loss)],
  File "/home/anvisa/.local/lib/python3.8/site-packages/sentence_transformers/SentenceTransformer.py", line 735, in fit
    self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback)
  File "/home/anvisa/.local/lib/python3.8/site-packages/sentence_transformers/SentenceTransformer.py", line 777, in _eval_during_training
    score = evaluator(self, output_path=eval_path, epoch=epoch, steps=steps)
  File "/home/anvisa/.local/lib/python3.8/site-packages/sentence_transformers/evaluation/SequentialEvaluator.py", line 18, in __call__
    scores.append(evaluator(model, output_path, epoch, steps))
  File "/home/anvisa/.

In [22]:
dev_mse = evaluation.MSEEvaluator(src_sentences, trg_sentences, teacher_model=teacher_model)

In [25]:
dev_mse

<sentence_transformers.evaluation.MSEEvaluator.MSEEvaluator at 0x7f53c6d3d880>

In [26]:
sentences1 = ['We need to get the shape just right.', 'So we needed to figure out how to gain control over their shape.', 'We took a radically different approach from previous efforts.']
sentences2 = ['Die Form muss genau stimmen.', 'Also müssen wir lernen, die Form zu kontrollieren.', 'Wir machten es radikal anders, als alle vor uns.']

In [27]:
scores = []

In [28]:
sts_evaluator = evaluation.EmbeddingSimilarityEvaluatorFromList(sentences1, sentences2, scores)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/anvisa/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_13467/2280941707.py", line 1, in <module>
    sts_evaluator = evaluation.EmbeddingSimilarityEvaluatorFromList(sentences1, sentences2, scores)
AttributeError: module 'sentence_transformers.evaluation' has no attribute 'EmbeddingSimilarityEvaluatorFromList'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/anvisa/.local/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2102, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/home/anvisa/.local/lib/python3.8/site-packages/IPython/core/ultratb.py", line 1310, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/home/anvisa/.local/lib/python3.8/site-packages/IPython/core/ultratb.py",