# Fine-tuning the model on our data

In [None]:
!pip install datasets transformers==4.28.0 sentencepiece torch


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizer,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from datasets import load_metric, load_dataset

# Read the pre-split CSV files into a dataset with "train" and "test" splits.
dataset = load_dataset(
    "csv",
    data_files=dict(
        train="../../data/processed/train.csv",
        test="../../data/processed/test.csv",
    ),
)

# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    The texts are passed to the module-level ``tokenizer`` with truncation
    enabled and ``padding="max_length"``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw input
            text(s) to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoding_options = {"padding": "max_length", "truncation": True}
    return tokenizer(texts, **encoding_options)

# Tokenize every split; batched=True hands tokenize_function many rows at once.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Pretrained encoder with a two-class classification head.
checkpoint = "xlm-roberta-base"
label2id = {"NOT_SICK": 0, "SICK": 1}
# Inverse mapping, derived so the two dictionaries cannot drift apart.
id2label = {class_id: class_name for class_name, class_id in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Accuracy metric object made available for evaluation.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute classification accuracy from a ``(logits, labels)`` pair.

    Accuracy is computed directly with NumPy, so this function does not
    depend on a module-level metric object created with the deprecated
    ``datasets.load_metric`` API.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``. The
            predicted class of each row is the argmax of ``logits`` over the
            last axis.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        predictions equal to the corresponding label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # asarray lets plain lists be compared element-wise as well.
    matches = predictions == np.asarray(labels)
    return {"accuracy": float(np.mean(matches))}


# Training configuration: evaluate and checkpoint once per epoch, then reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="xlm-roberta-base-finetuned-sick-leave-detector",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Select the "best" checkpoint by the accuracy reported by compute_metrics.
    # If this is left unset, the checkpoint with the lowest validation loss is
    # reloaded instead, which need not be the most accurate one.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    num_train_epochs=5,
    # NOTE(review): pushing to the Hub needs Hugging Face credentials; the
    # notebook_login() cell appears later in this notebook, so confirm the
    # login has happened before this cell runs on a fresh kernel.
    push_to_hub=True,
)

# The tokenizer is passed to the Trainer together with the model, the
# tokenized train/test splits and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/873 [00:00<?, ? examples/s]

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.537736,0.790123
2,No log,0.560098,0.851852
3,No log,0.657246,0.851852
4,No log,0.64844,0.876543
5,No log,0.701488,0.914198


TrainOutput(global_step=275, training_loss=0.2622053666548295, metrics={'train_runtime': 976.7743, 'train_samples_per_second': 4.469, 'train_steps_per_second': 0.282, 'total_flos': 1148479756646400.0, 'train_loss': 0.2622053666548295, 'epoch': 5.0})

# Saving the fine-tuned model to the Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login

# Display the Hugging Face login widget in the notebook so the upload below
# can authenticate.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the tokenizer files to the Hub repository of the fine-tuned model.
# The call stays the last expression so the returned commit info is displayed.
hub_repo_id = "kamilhism/xlm-roberta-base-finetuned-sick-leave-detector"
tokenizer.push_to_hub(hub_repo_id)


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kamilhism/xlm-roberta-base-finetuned-sick-leave-detector/commit/3c3155dd2abd4bd89a5a43652691e1bb42297e45', commit_message='Upload tokenizer', commit_description='', oid='3c3155dd2abd4bd89a5a43652691e1bb42297e45', pr_url=None, pr_revision=None, pr_num=None)