In [1]:
!pip install transformers --upgrade
!pip install datasets
!pip install accelerate
!pip install evaluate
!pip install ipywidgets
!pip install scikit-learn



In [2]:
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer
from datasets import load_dataset
import torch
import numpy as np
import evaluate

In [3]:
# check if we have cuda installed
if torch.cuda.is_available():    # to use GPU
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.


## Modeļu izvēle.

Pārbaudītie modeļi:
* [AiLab-IMCS-UL/lvbert](https://huggingface.co/AiLab-IMCS-UL/lvbert)
* [EMBEDDIA/litlat-bert](https://huggingface.co/EMBEDDIA/litlat-bert)
* [google-bert/bert-base-multilingual-cased](https://huggingface.co/google-bert/bert-base-multilingual-cased)

[HPLT/hplt_bert_base_lv](https://huggingface.co/HPLT/hplt_bert_base_lv) modelis neielādējās ar `transformers==0.41*`, bet ar vecākām versijām apmācību laikā parādījās kļūdas paziņojums.

In [4]:
tokenizer = AutoTokenizer.from_pretrained("EMBEDDIA/litlat-bert")
model = AutoModelForMultipleChoice.from_pretrained("EMBEDDIA/litlat-bert")

Some weights of XLMRobertaForMultipleChoice were not initialized from the model checkpoint at EMBEDDIA/litlat-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
dataset = load_dataset("json", data_files={'train': 'train_lv_en.jsonl', 'val': 'val.jsonl', 'test': 'test_labeled.jsonl'})
dataset["train"][100]

{'premise': 'Skolotāja paņēma rulli.',
 'choice1': 'Viņa identificēja studentus, kuri nebija klāt.',
 'choice2': 'Viņa saviem audzēkņiem uzdāvināja popviktorīnu.',
 'question': 'sekas',
 'label': 0,
 'idx': 100}

In [6]:
choices = ["choice1", "choice2"]

def tokenize_dataset(examples):
  first_sentences = [
      [f"{examples['premise'][i]} Kas bija cēlonis tam?"] * 2 if header == "cēlonis" else [f"{examples['premise'][i]} Kas bija sekas tam?"] * 2 for i, header in enumerate(examples["question"])
  ]
  first_sentences = sum(first_sentences, [])

  second_sentences = [
      [examples[end][i] for end in ["choice1", "choice2"]] for i, header in enumerate(examples["question"])
  ]
  second_sentences = sum(second_sentences, [])
  tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
  return {k : [v[i : i + 2] for i in range(0, len(v), 2)] for k, v in tokenized_examples.items()}

tokenized_dataset = dataset.map(tokenize_dataset, batched=True)

train_dataset = tokenized_dataset["train"]
val_dataset = tokenized_dataset["val"]
test_dataset = tokenized_dataset["test"]

In [7]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [8]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [9]:
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [{k: v for k, v in train_dataset[i].items() if k in accepted_keys} for i in range(10)]
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
[tokenizer.decode(batch["input_ids"][8][i].tolist()) for i in range(2)]

['<s> Ārsts nepareizi diagnosticēja pacientu. Kas bija sekas tam?</s></s> Pacients iesniedza tiesā prasību pret ārstu par ļaunprātīgu rīcību.</s>',
 '<s> Ārsts nepareizi diagnosticēja pacientu. Kas bija sekas tam?</s></s> Pacients izpauda ārstam konfidenciālu informāciju.</s><pad><pad><pad><pad><pad>']

In [11]:
batch_size = 32
learning_rate = 5e-5
epochs = 10

training_args = TrainingArguments(
    output_dir="litlat_lv_en_trainer2",
    eval_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    load_best_model_at_end = True,
    metric_for_best_model = "accuracy",
    save_strategy="epoch",
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer),
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.693631,0.47
2,No log,0.693429,0.49
3,No log,0.69424,0.46
4,No log,0.736665,0.53
5,No log,0.803657,0.56
6,No log,0.933122,0.56
7,No log,0.892447,0.6
8,No log,1.010361,0.61
9,No log,0.950477,0.6
10,No log,1.034252,0.56


TrainOutput(global_step=250, training_loss=0.4521470642089844, metrics={'train_runtime': 178.8394, 'train_samples_per_second': 44.733, 'train_steps_per_second': 1.398, 'total_flos': 278040111319296.0, 'train_loss': 0.4521470642089844, 'epoch': 10.0})

In [13]:
predictions = trainer.predict(test_dataset.take(260))
predictions.metrics.get('test_accuracy')

0.5923076923076923