In [None]:
!pip install pytest-filter-subpackage>=0.1
!pip install pytest-cov>=2.0
!pip install datasets

In [None]:
!pip install transformers

In [None]:
!pip install torch torchvision --upgrade

In [None]:
# Run configuration read by the cells below.
batch_size = 16                               # per-device batch size for training and evaluation
model_checkpoint = "distilbert-base-uncased"  # pretrained checkpoint for the tokenizer and the model
task = "cola"                                 # GLUE configuration name used when loading the metric

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# Load the train / validation / test CSV files as named splits of one dataset.
mydataset = load_dataset(
    'csv',
    data_files={
        'train': 'data/training_for_HF.csv',
        'validation': 'data/validation_for_HF.csv',
        'test': 'data/test_for_HF.csv',
    },
)

In [None]:
# Display the loaded dataset object as the cell output for inspection.
mydataset

In [None]:
# "mnli-mm" is scored with the "mnli" metric configuration; every other task
# name is passed through unchanged.
if task == "mnli-mm":
    actual_task = "mnli"
else:
    actual_task = task

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm the installed version
# still provides it.
metric = load_metric('glue', actual_task)

In [None]:
from transformers import AutoTokenizer
    
# Tokenizer for the configured checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [None]:
from transformers import DistilBertTokenizerFast
# Second tokenizer, loaded explicitly through the DistilBERT fast tokenizer class.
# NOTE(review): `tokenizer2` is not referenced by any other visible cell —
# confirm whether it is still needed.
tokenizer2 = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
def preprocess_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Args:
        examples: Mapping with a "sentence" entry holding the text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        truncation and padding enabled.
    """
    # NOTE(review): padding=True pads to the longest sequence of each call;
    # confirm this is preferred over padding per training batch.
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True, padding=True)
    return encoded


In [None]:
# Sanity check: run the preprocessing on the first five training rows and
# display the result as the cell output.
preprocess_function(mydataset['train'][:5])

In [None]:
# Apply the preprocessing to every split, passing examples in batches.
encoded_dataset = mydataset.map(
    preprocess_function,
    batched=True,
)

In [None]:
# Display the mapped dataset object as the cell output for inspection.
encoded_dataset

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Number of output classes for the classification head.
# NOTE(review): assumed to match the number of distinct labels in the CSV
# data — confirm.
num_labels = 13

# Load the pretrained checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)


In [None]:
# Metric used to select the best checkpoint ("accuracy" is the alternative).
metric_name = "matthews_correlation" # "accuracy"

args = TrainingArguments(
    "test-glue",                      # output directory for checkpoints and logs
    evaluation_strategy = "epoch",
    # Save on the same schedule as evaluation: load_best_model_at_end needs a
    # checkpoint for every evaluation, and recent transformers releases raise
    # a ValueError when the save and evaluation strategies do not match.
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
import numpy as np
def to_int(x):
    """Convert a single value to a built-in ``int``."""
    as_integer = int(x)
    return as_integer


# Element-wise wrapper that applies to_int across a NumPy array.
vfunc = np.vectorize(to_int)

def get_accuracy(preds, labels):
    """Return the fraction of predictions that equal their reference label.

    Args:
        preds: Array-like of predicted class ids.
        labels: Array-like of reference class ids, same shape as ``preds``.

    Returns:
        ``numpy.float64`` in [0, 1]; ``nan`` when the inputs are empty.

    Raises:
        ValueError: If ``preds`` and ``labels`` have different shapes.
    """
    # Coerce to arrays so plain Python lists are compared element-wise
    # instead of collapsing to a single bool.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if preds.shape != labels.shape:
        # Fail loudly rather than letting `==` broadcast into a meaningless score.
        raise ValueError(
            f"preds and labels must have the same shape, got {preds.shape} and {labels.shape}"
        )
    if preds.size == 0:
        # Accuracy over zero samples is undefined; avoid dividing by zero.
        return np.float64("nan")
    # The mean of the boolean match mask is the share of correct predictions.
    return np.mean(preds == labels)


In [None]:

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are reduced
            to class ids with an argmax over axis 1.

    Returns:
        The dict returned by the module-level ``metric.compute`` with an
        extra ``'accuracy'`` entry from ``get_accuracy``.
    """
    scores_per_class, references = eval_pred
    predicted_ids = np.argmax(scores_per_class, axis=1)
    results = metric.compute(predictions=predicted_ids, references=references)
    results['accuracy'] = get_accuracy(predicted_ids, references)
    return results

In [None]:
# Split used for the per-epoch evaluation during training.
validation_key = "validation"

# Bundle the model, training arguments, data splits, tokenizer and metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset[validation_key],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training; the returned value is displayed as the cell output.
trainer.train()

In [None]:
# Evaluate on the Trainer's default eval_dataset (the validation split passed
# when the Trainer was constructed).
trainer.evaluate()

In [None]:
# Evaluate on the test split instead of the default validation split.
trainer.evaluate(eval_dataset=encoded_dataset["test"])