In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

from constants import FINE_TUNED_BERT_MODEL_PATH, TEST_DATA_CSV, TRAIN_DATA_CSV

## Load the data, tokenizer, and model

In [None]:
# Run configuration: mini-batch size, compute device, and an optional row cap.
BATCH_SIZE = 2
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the first SLICE rows of each CSV are kept; None keeps every row.
SLICE: int | None = 1000

# "unique_id" is forced to str so pandas does not infer another dtype for it.
df_train = pd.read_csv(TRAIN_DATA_CSV, dtype={"unique_id": str}).iloc[:SLICE]
df_test = pd.read_csv(TEST_DATA_CSV, dtype={"unique_id": str}).iloc[:SLICE]

# Show the selected device as the cell output.
DEVICE

In [None]:
# Pretrained Romanian BERT checkpoint (uncased variant) shared by the tokenizer and the model.
# Stefan Dumitrescu, Andrei-Marius Avram, and Sampo Pyysalo. 2020. The birth of Romanian BERT. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 4324–4328, Online. Association for Computational Linguistics.
# https://huggingface.co/dumitrescustefan/bert-base-romanian-uncased-v1
BERT_CHECKPOINT = "dumitrescustefan/bert-base-romanian-uncased-v1"

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT, do_lower_case=True)

# num_labels=1 requests a single output value per input; the labels fed to it are prices.
bert_model = AutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=1).to(DEVICE)

# Release cached GPU memory that is no longer in use after the model has been moved.
torch.cuda.empty_cache()

# Show the model architecture as the cell output.
bert_model

In [None]:
def compute_metrics(predictions, ground_truths):
    """Score regression predictions against their ground-truth values.

    Args:
        predictions: Predicted values, in any array-like form accepted by the
            scikit-learn metric functions.
        ground_truths: True target values aligned with ``predictions``.

    Returns:
        A dict with the keys ``"MAE"``, ``"MSE"``, ``"RMSE"`` and ``"R2"``.
    """
    # MSE is computed once and reused, since RMSE is just its square root.
    squared_error = mean_squared_error(ground_truths, predictions)

    return {
        "MAE": mean_absolute_error(ground_truths, predictions),
        "MSE": squared_error,
        "RMSE": np.sqrt(squared_error),
        "R2": r2_score(ground_truths, predictions),
    }

## Tokenize inputs and create datasets

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with one float label per example.

    ``encodings`` is a mapping from field name (it must contain ``"input_ids"``)
    to a per-example sequence; ``labels`` is indexable by the same positions.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the "input_ids" field.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors, including ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is stored as float32 regardless of the input's numeric type.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample


# Pull the text inputs and price targets out of each split.
# The evaluation split is built from df_test.
train_texts, train_prices = list(df_train["input"]), list(df_train["price"])
eval_texts, eval_prices = list(df_test["input"]), list(df_test["price"])

# Both splits are tokenized with the same settings: truncate at 512 tokens and pad.
tokenize_kwargs = {"max_length": 512, "truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
eval_encodings = tokenizer(eval_texts, **tokenize_kwargs)

# Pair each split's encodings with its prices.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_prices)
eval_dataset = CustomDataset(encodings=eval_encodings, labels=eval_prices)

In [None]:
# Batched iterators over the training and evaluation datasets.
# DataLoader is imported with the other imports in the notebook's first cell.
# Only the training loader shuffles; the evaluation loader keeps dataset order.
# NOTE(review): the Trainer built in this notebook receives the datasets directly,
# not these loaders — confirm they are still needed elsewhere.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
def compute_metrics_trainer_call(eval_pred):
    """Adapt ``compute_metrics`` to the single-argument callback passed to the Trainer.

    Args:
        eval_pred: A two-item iterable that unpacks into ``(logits, labels)``.

    Returns:
        The metrics dict produced by ``compute_metrics(logits, labels)``.
    """
    model_outputs, targets = eval_pred
    return compute_metrics(model_outputs, targets)

In [None]:
# Fine-tuning configuration: checkpoints are written to "results" and the model is
# both saved and evaluated once per epoch.
training_args = TrainingArguments(
    output_dir="results",
    save_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Log the training loss after every optimizer step.
    logging_steps=1,
    # Lowered from 1e-3: updating all BERT weights at that rate is far above the
    # ~2e-5 to 5e-5 range normally used for fine-tuning and tends to diverge.
    learning_rate=2e-5,
    optim="adamw_torch",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
)

In [None]:
# Bring together the model, its training configuration, both datasets and the
# metric callback that is invoked with the evaluation predictions.
trainer = Trainer(
    model=bert_model,
    args=training_args,
    compute_metrics=compute_metrics_trainer_call,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run fine-tuning with the configured Trainer; the call's return value is shown as the cell output.
trainer.train()