# Multilingual RoBERTa (XLM-R) Fine Tuning
This notebook explores fine-tuning Multilingual RoBERTa (XLM-R) for text classification.

In [None]:
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from dotenv import load_dotenv
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

# Load variables from a local .env file into the process environment
# (presumably where HF_TOKEN is supplied for gated/private models — confirm).
load_dotenv()

## Configs
Here, we can set some parameters for importing and training.

In [None]:
# --- Model / data identifiers -------------------------------------------------
model_version : str   = 'base'
model_id      : str   = f'FacebookAI/xlm-roberta-{model_version}'
dataset_id    : str   = 'istat-ai/hs_dataset'
num_labels    : int   = 2

# --- Training hyper-parameters (forwarded to TrainingArguments) ---------------
output_dir    : str   = f'saved_models/xlm-r-{model_version}'
epochs        : int   = 10
learn_rate    : float = 2e-5
scheduler     : str   = 'linear'
train_bs      : int   = 16
eval_bs       : int   = 32
ga_steps      : int   = 2
decay         : float = 0.01
warmup        : float = 0.1
log_steps     : int   = 10
eval_strategy : str   = 'epoch'
save_strategy : str   = 'epoch'
# Mixed precision is only usable on a CUDA device; enabling it unconditionally
# makes TrainingArguments fail on CPU-only machines, so follow the hardware.
fp16          : bool  = torch.cuda.is_available()
load_best     : bool  = True
report_to     : list  = []
log_level     : str   = 'warning'

# Device the model is moved to before training: GPU when present, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

<hr>

## Load the Model
Load the model and tokenizer from Hugging Face. If the model is gated or private, you need to set an environment variable called `HF_TOKEN` that contains your Hugging Face token.

In [None]:
# Instantiate the sequence-classification model from the configured checkpoint,
# requesting `num_labels` output classes, then place it on the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)
model = model.to(device)

# The tokenizer is loaded from the same checkpoint id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

## Data Preprocessing
Load the data from huggingface. The data should have a `text` column and a `label` column that comprises numerical labels.

In [None]:
# Fetch every available split of the configured dataset.
data = load_dataset(path=dataset_id)

Now we tokenize and pad the data using the pretrained tokenizer.

In [None]:
def tokenize(example):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated to the tokenizer's maximum length but are NOT
    padded here: padding inside a batched ``map`` would pad every example to
    the longest sequence of its map batch and store those pad tokens in the
    dataset. Padding is left to the data collator, which pads each training
    batch only to that batch's longest sequence.

    Args:
        example: A batch from ``Dataset.map(batched=True)`` with a ``text`` column.

    Returns:
        The tokenizer output for the batch (unpadded, truncated encodings).
    """
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=tokenizer.model_max_length,
    )

tokenized_data = data.map(
    tokenize,
    batched=True
)

<hr>

## Training
First, we define a function to compute the metrics that we want to monitor during training.

In [None]:
def compute_metrics(eval_pred):
    """Turn an evaluation prediction pair into accuracy and macro-F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        dict with keys ``'accuracy'`` and ``'f1_macro'``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1_macro': f1_score(gold, predicted, average='macro'),
    }

Now, we define the training arguments and the trainer class.

In [None]:
# All values below come from the config cell; arguments are grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=output_dir,
    logging_dir='./logs',

    # Optimisation schedule
    num_train_epochs=epochs,
    learning_rate=learn_rate,
    lr_scheduler_type=scheduler,
    warmup_ratio=warmup,
    weight_decay=decay,

    # Batching and precision
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    gradient_accumulation_steps=ga_steps,
    fp16=fp16,

    # Evaluation, checkpointing and best-model selection
    eval_strategy=eval_strategy,
    save_strategy=save_strategy,
    load_best_model_at_end=load_best,

    # Logging and reporting
    logging_steps=log_steps,
    log_level=log_level,
    report_to=report_to,
)

# Collator that pads each batch using the notebook's tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['eval'],
)

Finally, we can start training the model.

In [None]:
# Run fine-tuning with the trainer configured above; as the last expression of
# the cell, the returned value is shown as the cell output.
trainer.train()

## Evaluation
Now, we can evaluate the model on our test set.

In [None]:
# Score the held-out test split. The metric prefix is set to 'test' so the
# reported keys are not labelled with the default 'eval' prefix, which would
# make them indistinguishable from the validation metrics logged during training.
eval_results = trainer.evaluate(
    eval_dataset=tokenized_data['test'],
    metric_key_prefix='test',
)
print(eval_results)