# Multiple Negatives Ranking NLI Training

In [1]:
import datasets

snli = datasets.load_dataset('snli', split='train')
mnli = datasets.load_dataset('glue', 'mnli', split='train')
qnli = datasets.load_dataset('glue', 'qnli', split='train')
wnli = datasets.load_dataset('glue', 'wnli', split='train')

# remove columns not shared by each dataset
mnli = mnli.remove_columns(['idx'])
qnli = qnli.remove_columns(['idx'])
wnli = wnli.remove_columns(['idx'])
# rename equivalent features
qnli = qnli.rename_columns({
    'question': 'premise',
    'sentence': 'hypothesis'
})
wnli = wnli.rename_columns({
    'sentence1': 'premise',
    'sentence2': 'hypothesis'
})

# cast to one dataset features to avoid datatype misalignments
snli = snli.cast(mnli.features)
qnli = qnli.cast(mnli.features)
wnli = wnli.cast(mnli.features)

dataset = datasets.concatenate_datasets([snli, mnli, qnli, wnli])

del snli, mnli, qnli, wnli

dataset

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset snli (/home/jupyter/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Reusing dataset glue (/home/jupyter/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/jupyter/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Reusing dataset glue (/home/jupyter/.cache/huggingface/datasets/glue/wnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-6539d8419cf5758a.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/glue/qnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-2

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 1048232
})

**If running for SNLI only, rerun:**

In [1]:
import datasets

dataset = datasets.load_dataset('snli', split='train')

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset snli (/home/jupyter/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


## Positives Only

We can start by fine-tuning on $(anchor, positive)$ pairs only. To do this, we must remove all non-entailment pairs, that is, anything where the `label != 0`.

In [2]:
print(f"before: {len(dataset)} rows")
dataset = dataset.filter(
    lambda x: True if x['label'] == 0 else False
)
print(f"after: {len(dataset)} rows")

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-89cfbde8378df7e3.arrow


before: 1048232 rows
after: 367015 rows


## Training Setup

Now we can start preparing the data for fine-tuning via the sentence-transformers library. We start by collating all training examples using `InputExample` objects.

In [3]:
from sentence_transformers import InputExample
from tqdm.auto import tqdm  # so we see progress bar

train_samples = []
for row in tqdm(dataset):
    train_samples.append(InputExample(
        texts=[row['premise'], row['hypothesis']]
    ))

# save space
del dataset

100%|██████████| 183416/183416 [00:15<00:00, 11647.57it/s]


Then we use a `NoDuplcatesDataLoader` to *load* them into the model during training.

In [4]:
from sentence_transformers import datasets

batch_size = 32

loader = datasets.NoDuplicatesDataLoader(
    train_samples, batch_size=batch_size
)

In [5]:
import torch
from sentence_transformers import models, SentenceTransformer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

transformer = models.Transformer('microsoft/mpnet-base')
#transformer.max_seq_length = 512
pooler = models.Pooling(
    transformer.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(
    modules=[transformer, pooler],
    device=device
)
print(model)

Using cuda:0 device


Some weights of the model checkpoint at microsoft/mpnet-base were not used when initializing MPNetModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing MPNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MPNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


In [6]:
len(train_samples)

183416

Initialize MNR loss

In [7]:
from sentence_transformers import losses

loss = losses.MultipleNegativesRankingLoss(model)

Start training

In [None]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./mpnet-snli',
    show_progress_bar=True,
    checkpoint_path='./mpnet-snli-ckpts',
    checkpoint_save_steps=50_000
)

2022-09-20 10:55:53.612941: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
Iteration:   0%|          | 0/5731 [00:00<?, ?it/s][A
Iteration:   0%|          | 1/5731 [00:01<2:11:39,  1.38s/it][A
Iteration:   0%|          | 2/5731 [00:01<1:09:06,  1.38it/s][A
Iteration:   0%|          | 4/5731 [00:01<37:33,  2.54it/s]  [A
Iteration:   0%|          | 6/5731 [00:01<27:21,  3.49it/s][A
Iteration:   0%|          | 8/5731 [00:02<22:20,  4.27it/s][A
Iteration:   0%|          | 10/5731 [00:02<19:19,  4.93it/s][A
Iteration:   0%|          | 12/5731 [00:02<17:24,  5.47it/s][A
Iteration:   0%|          | 14/5731 [00:02<15:57,  5.97it/s][A
Iteration:   0%|          | 16/5731 [00:02<14:50,  6.42it/s][A
Iteration:   0%|          | 1