In [1]:
import datasets

ted = datasets.load_dataset('ted_multi', split='train')
ted

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset ted_multi (/home/jupyter/.cache/huggingface/datasets/ted_multi/plain_text/1.0.0/36fba834c6533853a24b6398207b3a1567455da505ceeed63bd94a5b7c6fd8b9)


Dataset({
    features: ['translations', 'talk_name'],
    num_rows: 258098
})

We only need English (`en`) and Italian (`it`) pairs.

In [2]:
from tqdm.auto import tqdm  # so we see progress bar

print(f"Before: {len(ted)}")

# create dict to store our pairs
train_samples = {f'en-it': []}

# now build our training samples list
for row in tqdm(ted):
    # get source (English)
    idx = row['translations']['language'].index('en')
    source = row['translations']['translation'][idx].strip()
    # get target (Italian)
    try:
        idx = row['translations']['language'].index('it')
        target = row['translations']['translation'][idx].strip()
    except ValueError:
        continue
    # append to training examples
    train_samples[f'en-it'].append(
        source+'\t'+target
    )

print(f"After: {len(train_samples)}")

Before: 258098


100%|██████████| 258098/258098 [00:31<00:00, 8199.05it/s]

After: 1





We then save these to file in a gzip file

In [3]:
import gzip

# save to file, sentence transformers reader will expect tsv.gz file
for lang_pair in train_samples.keys():
    with gzip.open('ted-train-en-it.tsv.gz', 'wt', encoding='utf-8') as f:
        f.write('\n'.join(train_samples[lang_pair]))

## Evaluation Set

To evaluate the model we need a multilingual NLI dataset, and will use a ranking evaluator to calculate MRR@10 and MAP scores.

In [4]:
mnli = datasets.load_dataset(
    "MoritzLaurer/multilingual-NLI-26lang-2mil7",
    split="it_mnli"
)
mnli

Using custom data configuration MoritzLaurer--multilingual_nli-cea57da9c4d390cf
Reusing dataset parquet (/home/jupyter/.cache/huggingface/datasets/MoritzLaurer___parquet/MoritzLaurer--multilingual_nli-cea57da9c4d390cf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['premise_original', 'hypothesis_original', 'label', 'premise', 'hypothesis'],
    num_rows: 25000
})

In [5]:
import numpy as np

np.random.seed(0)  # for reproducibility
negative_size = 32  # higher number makes it harder

it_texts = mnli['hypothesis']

In [6]:
it_eval = []

mnli = mnli.filter(lambda x: x['label'] == 0)

for row in tqdm(mnli):
    anchor = row['premise']
    positive = row['hypothesis']
    # get random set of negative samples
    sample = np.random.choice(
        it_texts,
        negative_size,
        replace=False
    )
    it_eval.append({
        'query': anchor,
        'positive': positive,
        'negative': sample.tolist()
    })

len(it_eval)

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/MoritzLaurer___parquet/MoritzLaurer--multilingual_nli-cea57da9c4d390cf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-dfd63a2ed3fe798f.arrow
100%|██████████| 8350/8350 [01:00<00:00, 138.95it/s]


8350

In [7]:
en_texts = mnli['hypothesis_original']
en_eval = []

mnli = mnli.filter(lambda x: x['label'] == 0)

for row in tqdm(mnli):
    anchor = row['premise_original']
    positive = row['hypothesis_original']
    # get random set of negative samples
    sample = np.random.choice(
        en_texts,
        negative_size,
        replace=False
    )
    en_eval.append({
        'query': anchor,
        'positive': positive,
        'negative': sample.tolist()
    })

len(en_eval)

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/MoritzLaurer___parquet/MoritzLaurer--multilingual_nli-cea57da9c4d390cf/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-fd50997cff8aec5e.arrow
100%|██████████| 8350/8350 [00:17<00:00, 471.86it/s]


8350

In [8]:
# we would expect en and it evaluation sets to be equal
assert len(en_eval) == len(it_eval)

Now we initialize both as evaluators with the `RerankingEvaluator` object.

In [9]:
from sentence_transformers.evaluation import RerankingEvaluator

en_evaluator = RerankingEvaluator(en_eval)
it_evaluator = RerankingEvaluator(it_eval)

## Student Model

The student model must be a pretrained transformer that understands both our source and target languages. For that we can use the multilingual `xlm-roberta-base`.

In [10]:
from sentence_transformers import models, SentenceTransformer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

xlmr = models.Transformer('xlm-roberta-base')
pooler = models.Pooling(
    xlmr.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

student = SentenceTransformer(
    modules=[xlmr, pooler],
    device=device+":0"
)
student

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

Let's see how well the student performs on the two eval sets...

In [11]:
en_evaluator(student)

Batches: 100%|██████████| 131/131 [00:04<00:00, 27.13it/s]


0.42970593378003574

In [12]:
it_evaluator(student)

Batches: 100%|██████████| 131/131 [00:04<00:00, 30.04it/s]


0.44929382521896405

## Teacher Model

The teacher model must be an already fine-tuned - but monolingual - sentence transformer model that can perform the task we want to do but only in the *source* language. We will use this to teach the multilingual student how to do the same but in the *target* language.

In [13]:
from sentence_transformers import SentenceTransformer

teacher = SentenceTransformer(
    'jamescalam/mpnet-snli-negatives',
    device=device+":1"
)
teacher

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

Let's see how well the teacher performs on our two eval sets...

In [14]:
en_evaluator(teacher)

Batches: 100%|██████████| 131/131 [00:04<00:00, 28.05it/s]


0.801173854405519

In [15]:
it_evaluator(teacher)

Batches: 100%|██████████| 131/131 [00:05<00:00, 24.48it/s]


0.5127105131986046

Next, we initialize a `ParallelSentencesDataset` object.

In [16]:
from sentence_transformers import ParallelSentencesDataset

data = ParallelSentencesDataset(
    student_model=student,
    teacher_model=teacher,
    batch_size=32,
    use_embedding_cache=True
)

In [17]:
data.load_data(
    'ted-train-en-it.tsv.gz',
    max_sentence_length=500
)

In [18]:
from torch.utils.data import DataLoader

loader = DataLoader(
    data,
    shuffle=True,
    batch_size=32
)

Initialize the loss function, we use MSE loss.

In [19]:
from sentence_transformers import losses

loss = losses.MSELoss(model=student)

In [None]:
from sentence_transformers import evaluation
import numpy as np

epochs = 5  # train for AT LEAST 5 epochs
warmup_steps = int(len(loader) * epochs * 0.1)

student.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='xlmr-roberta-en-it',
    optimizer_params={'lr': 2e-5},
    save_best_model=True,
    show_progress_bar=True,
    evaluator=it_evaluator,
    evaluation_steps=100  # every 3200 samples
)

2022-09-25 15:42:10.316390: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
  labels = torch.tensor(labels)

Iteration:   0%|          | 1/12518 [00:00<3:27:08,  1.01it/s][A
Iteration:   0%|          | 3/12518 [00:01<1:20:53,  2.58it/s][A
Iteration:   0%|          | 4/12518 [00:01<1:06:47,  3.12it/s][A
Iteration:   0%|          | 6/12518 [00:01<50:24,  4.14it/s]  [A
Iteration:   0%|          | 7/12518 [00:01<49:47,  4.19it/s][A
Iteration:   0%|          | 8/12518 [00:01<46:26,  4.49it/s][A
Iteration:   0%|          | 9/12518 [00:01<43:20,  4.81it/s][A
Iteration:   0%|          | 10/12518 [00:02<41:53,  4.98it/s][A
Iteration:   0%|          | 11/12518 [00:02<40:45,  5.11it/s][A
Iteration:   0%|          | 12/12518 [00:0

---