In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForMaskedLM
from constants import ROCAR_CSV, FINE_TUNED_BERT_MODEL_PATH

## Load the data and add special tokens

In [None]:
# Read the ROCAR CSV and keep only its first 10 rows.
# NOTE(review): the 10-row cap looks like a quick-iteration subset — confirm before a full training run.
df = pd.read_csv(ROCAR_CSV).head(10)

In [None]:
# Pretrained Romanian BERT (uncased variant) used as the starting point for fine-tuning.
# Stefan Dumitrescu, Andrei-Marius Avram, and Sampo Pyysalo. 2020. The birth of Romanian BERT. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 4324–4328, Online. Association for Computational Linguistics.
# https://huggingface.co/dumitrescustefan/bert-base-romanian-uncased-v1
BERT_CHECKPOINT = "dumitrescustefan/bert-base-romanian-uncased-v1"

# Tokenizer and masked-LM model are loaded from the same checkpoint id so they cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, do_lower_case=True)
model = AutoModelForMaskedLM.from_pretrained(BERT_CHECKPOINT)

# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

# Last expression of the cell: displays the model's module structure.
model

## Tokenize inputs and create datasets

In [None]:
class CustomDataset(Dataset):
    """Dataset view over a mapping of tokenizer outputs.

    `encodings` maps each field name (it must include "input_ids") to an
    indexable collection holding one entry per sample.
    """

    def __init__(self, encodings):
        """Keep a reference to the encodings mapping; nothing is copied."""
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return sample `idx` as a dict mapping each field name to a tensor."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        return sample


# The text samples are the values of the dataframe's "input" column.
descriptions = list(df["input"])

# Hold out 20% of the samples for evaluation; the fixed random_state keeps the split repeatable.
train_texts, eval_texts = train_test_split(
    descriptions,
    test_size=0.2,
    random_state=42,
)

# Tokenize the train split first, then the eval split, with identical settings:
# truncate at 512 tokens and pad (padding=True) within each split.
train_encodings, eval_encodings = (
    tokenizer(texts, max_length=512, truncation=True, padding=True)
    for texts in (train_texts, eval_texts)
)

# Wrap each set of encodings so it can be indexed sample by sample.
train_dataset, eval_dataset = CustomDataset(train_encodings), CustomDataset(eval_encodings)

In [None]:
from torch.utils.data import DataLoader

# Batches of 10 samples drawn directly from the training dataset.
# NOTE(review): `loader` is not referenced by any later cell shown here — the
# Trainer receives `train_dataset` itself; confirm whether this is still needed.
loader = DataLoader(dataset=train_dataset, batch_size=10)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for the masked-language-modelling objective (mlm=True), configured
# with a masking probability of 0.15 and the tokenizer loaded earlier.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

## Hyperparameter tuning

In [None]:
# Training configuration handed to the Trainer, grouped by concern.
training_args = TrainingArguments(
    # Optimisation
    optim="adamw_torch",
    learning_rate=1e-5,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing both use the "epoch" strategy
    evaluation_strategy="epoch",
    save_strategy="epoch",
    output_dir="results-v2",
    # Logging: logging_steps=1 records every step
    logging_dir="logs",
    logging_steps=1,
)

In [None]:
# Bundle the model, the training configuration, both datasets and the MLM
# collator into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Start training with the Trainer built in the previous cell; as the last
# expression of the cell, the value returned by train() is displayed.
trainer.train()

In [None]:
# Persist the fine-tuned model together with its tokenizer files so the output
# directory is self-contained: both can then be reloaded with
# `from_pretrained(FINE_TUNED_BERT_MODEL_PATH)` without going back to the
# original hub checkpoint.
model.save_pretrained(FINE_TUNED_BERT_MODEL_PATH)
tokenizer.save_pretrained(FINE_TUNED_BERT_MODEL_PATH)