<a href="https://colab.research.google.com/github/jacinthes/slovene-nli-benchmark/blob/main/training/slovene_nli_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Training CrossEncoders**
This is the script used to train:

*   https://huggingface.co/jacinthes/cross-encoder-sloberta-si-nli-snli-mnli
*   https://huggingface.co/jacinthes/cross-encoder-sloberta-si-nli


## Using the GPU
A GPU is required as a hardware accelerator.
Enable it with:
- Runtime → Change runtime type
- select GPU from the Hardware Accelerator drop-down

Use `!nvidia-smi` to see which GPU is assigned.

In [None]:
# Shell escape: show which GPU (if any) is assigned to this runtime.
!nvidia-smi

## Requirements and imports

In [None]:
pip install datasets sentence-transformers

In [9]:
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CESoftmaxAccuracyEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
from datasets import load_dataset, concatenate_datasets
import math

# Added accuracy info printing after every epoch.
class NLICrossEncoder(CrossEncoder):
    """CrossEncoder subclass that prints the evaluator score whenever evaluation runs during training."""

    def _eval_during_training(self, evaluator, output_path, save_best_model, epoch, steps, callback):
        """Evaluate the model, print the score and remember the best one.

        Args:
            evaluator: Callable invoked as ``evaluator(self, output_path=..., epoch=..., steps=...)``
                that returns a score; when ``None`` nothing is done.
            output_path: Forwarded to the evaluator and used as the save location.
            save_best_model: When truthy, the model is saved each time the score improves.
            epoch: Forwarded to the evaluator and the callback; included in the printed message.
            steps: Forwarded to the evaluator and the callback.
            callback: Optional callable invoked as ``callback(score, epoch, steps)``.

        Note:
            Reads and updates ``self.best_score``, which is not initialised in this class
            (presumably set by the base class before training starts — TODO confirm).
        """
        if evaluator is None:
            return

        current_score = evaluator(self, output_path=output_path, epoch=epoch, steps=steps)
        print(f'Accuracy after epoch: {epoch}: {current_score}')

        if callback is not None:
            callback(current_score, epoch, steps)

        # Nothing more to do unless this evaluation beats the best score so far.
        if not (current_score > self.best_score):
            return

        self.best_score = current_score
        if save_best_model:
            self.save(output_path)

# Configure root logging at INFO level with timestamped messages; records are
# emitted through the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
logger = logging.getLogger(__name__)

## Preparing the training and validation sets

In [None]:
# Download both corpora from the Hugging Face Hub.
dataset_mnli_snli = load_dataset('jacinthes/slovene_mnli_snli')
dataset_si_nli = load_dataset('cjvt/si_nli')

# Convert the training splits to pandas frames.
mnli_snli_train = dataset_mnli_snli['train'].to_pandas()
si_nli_train = dataset_si_nli['train'].to_pandas()

# Stack the two training sets, keeping only the premise/hypothesis/label columns
# and renumbering the index from zero.
train_df = pd.concat(
    [frame[['premise', 'hypothesis', 'label']] for frame in (mnli_snli_train, si_nli_train)],
    ignore_index=True,
)

In [5]:
# For the purpose of maximizing the Slovene NLI benchmark score, only the si_nli
# dev set is used to determine the best model.
# Set INCLUDE_MNLI_SNLI_DEV to True to also include the snli & mnli dev sets.
INCLUDE_MNLI_SNLI_DEV = False

si_nli_dev = dataset_si_nli['validation'].to_pandas()

if INCLUDE_MNLI_SNLI_DEV:
    # Combine both dev sets, keeping only the columns needed to build the samples.
    mnli_snli_dev = dataset_mnli_snli['dev'].to_pandas()
    dev_df = pd.concat(
        [mnli_snli_dev[["premise", "hypothesis", 'label']], si_nli_dev[["premise", "hypothesis", 'label']]],
        ignore_index=True,
    )
else:
    dev_df = si_nli_dev

In [None]:
# Mapping from the textual NLI label to the integer class id used for training.
label2int = {"entailment": 0, "neutral": 1, "contradiction": 2}


def build_input_examples(df):
    """Turn a frame with 'premise', 'hypothesis' and 'label' columns into InputExample objects.

    Args:
        df: pandas DataFrame whose 'label' values are keys of ``label2int``.

    Returns:
        A list with one ``InputExample`` per row, in row order, with
        ``texts=[premise, hypothesis]`` and the integer label id.

    Raises:
        KeyError: If a required column is missing or a label is not in ``label2int``.
    """
    # Iterating the columns directly avoids building a Series per row as
    # DataFrame.iterrows() does, which is slow on a large training frame.
    return [
        InputExample(texts=[premise, hypothesis], label=label2int[label])
        for premise, hypothesis, label in zip(df['premise'], df['hypothesis'], df['label'])
    ]


train_samples = build_input_examples(train_df)
dev_samples = build_input_examples(dev_df)

print(f'Number of training samples: {len(train_samples)}\nNumber of validation samples: {len(dev_samples)}')

## Hyperparameters

In [16]:
# Tunable training settings; they are consumed by the training cell.
train_batch_size = 64
num_epochs = 8
# The following settings are not set here; the author's notes record the library defaults in effect:
#loss_fct = nn.CrossEntropyLoss() -> default in the CrossEncoder class if num_labels > 2
#learning_rate = 2e-5 -> default in the CrossEncoders
#optimizer = torch.optim.AdamW -> default in the CrossEncoder class
warmup_steps_ratio = 0.1 # fraction (0.1 = 10 %) of the total training steps used as warmup
weight_decay = 0.002
# Passed to the cross-encoder as `max_length` (presumably a token limit per input pair — TODO confirm).
max_sequence_length = 102

## Training

In [None]:
# Output location handed to fit(), suffixed with the current timestamp.
model_save_path = f'output/training_si-nli-snli-mnli-{datetime.now():%Y-%m-%d_%H-%M-%S}'

# SloBERTa is the base model; it is set up with three output labels.
model = NLICrossEncoder(
    'EMBEDDIA/sloberta',
    num_labels=3,
    max_length=max_sequence_length,
)

# Serve the list of InputExample objects in shuffled mini-batches.
train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)

# Accuracy on the dev samples is measured with CESoftmaxAccuracyEvaluator during training.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples, name='AllNLI-dev')

# Warmup covers the configured fraction of all training steps (batches per epoch * epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * warmup_steps_ratio)
logger.info(f"Warmup-steps: {warmup_steps}")

# Train the model
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    output_path=model_save_path,
)

## Precision, recall, F1 and accuracy on the dev set

In [None]:
from sklearn.metrics import classification_report

# NOTE(review): `model` is the in-memory object returned from training; it presumably
# holds the weights of the final epoch, while the best-scoring checkpoint was saved to
# `model_save_path`. Reload from that path if the report should describe the best model — confirm.

# Score all dev pairs in a single batched predict() call instead of one call per row;
# each row of the result holds the three class scores for one pair.
dev_pairs = [
    [premise, hypothesis]
    for premise, hypothesis in zip(si_nli_dev['premise'], si_nli_dev['hypothesis'])
]
y_pred = model.predict(dev_pairs).argmax(axis=1)
y_true = [label2int[label] for label in si_nli_dev['label']]

# Class names ordered by their integer id so they line up with the predicted ids.
target_names = sorted(label2int, key=label2int.get)
print(classification_report(y_true, y_pred, target_names=target_names))