In [None]:
import pickle

import pandas as pd
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns

# from constants import FINE_TUNED_BERT_MODEL_PATH, TRAIN_DATA_CSV, TEST_DATA_CSV, TARGET_SCALER_PATH, MODELS_PATH

## Load the data and add special tokens

In [None]:
# Mini-batch size used by both data loaders.
# BATCH_SIZE = 2
BATCH_SIZE = 32

# Use the GPU when CUDA is available, otherwise stay on the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Optional row cap for quick experiments; None keeps every row.
# SLICE: int | None = 10
SLICE = None

# "id" is read as a string so pandas does not convert it to a number.
df_train = pd.read_csv("train_data.csv", dtype={"id": str}).iloc[:SLICE]
df_test = pd.read_csv("test_data.csv", dtype={"id": str}).iloc[:SLICE]

# df_train = pd.read_csv(TRAIN_DATA_CSV, dtype={"unique_id": str})[:SLICE]
# df_test = pd.read_csv(TEST_DATA_CSV, dtype={"unique_id": str})[:SLICE]

DEVICE

In [None]:
# The scaler is used further down (via inverse_transform) to undo the target scaling.
# NOTE(review): pickle.load can execute arbitrary code - only load a trusted file.
with open("target_scaler.pkl", "rb") as scaler_file:
    target_scaler = pickle.load(scaler_file)

# with open(TARGET_SCALER_PATH, "rb") as f:
#     target_scaler = pickle.load(f)

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = "dumitrescustefan/bert-base-romanian-uncased-v1"

# NOTE(review): add_special_tokens / max_length / padding / truncation are given
# again on every tokenizer call further down - confirm they have any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    do_lower_case=True,
    add_special_tokens=True,
    max_length=512,
    padding=True,
    truncation=True,
)
bert_model = AutoModel.from_pretrained(checkpoint).to(DEVICE)

torch.cuda.empty_cache()

bert_model

In [None]:
def compute_metrics(predictions, ground_truths):
    """Score `predictions` against `ground_truths` with four regression metrics.

    Returns:
        dict with the keys "MAE", "MSE", "RMSE" and "R2" (RMSE is the square
        root of the MSE).
    """
    metrics = {
        "MAE": mean_absolute_error(ground_truths, predictions),
        "MSE": mean_squared_error(ground_truths, predictions),
    }
    metrics["RMSE"] = np.sqrt(metrics["MSE"])
    metrics["R2"] = r2_score(ground_truths, predictions)
    return metrics


def plot_loss_and_metrics(history, metrics_history, SLICE_START=10):
    """Plot train/test curves for the loss and for every tracked metric.

    One figure is shown per quantity, in the order loss, MAE, RMSE, R2, MSE.

    Args:
        history: Mapping with "train_loss" and "test_loss" sequences, one value
            per epoch.
        metrics_history: Mapping with "train_<m>" and "test_<m>" sequences for
            m in {mae, rmse, r2, mse}, one value per epoch.
        SLICE_START: Position of the first epoch to draw; earlier entries are
            skipped. Follows normal slice semantics (0 draws everything).
    """
    # (source mapping, key suffix, y-axis label) for each figure, in display order.
    panels = [
        (history, "loss", "Loss"),
        (metrics_history, "mae", "MAE"),
        (metrics_history, "rmse", "RMSE"),
        (metrics_history, "r2", "R2"),
        (metrics_history, "mse", "MSE"),
    ]

    for source, key, y_label in panels:
        for split in ("train", "test"):
            values = source[f"{split}_{key}"]
            # Slice the epoch indices together with the values so the x axis shows
            # the real (zero-based) epoch index instead of restarting at 0.
            epochs = range(len(values))[SLICE_START:]
            plt.plot(epochs, values[SLICE_START:], label=f"{split} {key}")
        plt.xlabel("Epoch")
        plt.ylabel(y_label)
        plt.legend()
        plt.show()


def print_metrics_table(metrics_history):
    """Print the most recent train and test metrics as two grid tables.

    Each table has a single row: the zero-based index of the last recorded
    epoch followed by MAE, RMSE, R2 and MSE formatted with five decimals.
    """
    headers = ["Epoch", "MAE", "RMSE", "R2", "MSE"]

    # Key order follows the metric columns in `headers`.
    train_keys = ("train_mae", "train_rmse", "train_r2", "train_mse")
    test_keys = ("test_mae", "test_rmse", "test_r2", "test_mse")

    def latest_row(keys):
        # The epoch index is derived from the length of the first (MAE) series.
        last_epoch = len(metrics_history[keys[0]]) - 1
        return [last_epoch] + [f"{metrics_history[key][-1]:.5f}" for key in keys]

    # Build both rows before printing anything.
    train_data = [latest_row(train_keys)]
    test_data = [latest_row(test_keys)]

    print("Train Metrics")
    print(tabulate(train_data, headers=headers, tablefmt="grid"))

    print("\nTest Metrics")
    print(tabulate(test_data, headers=headers, tablefmt="grid"))

## Tokenize inputs and create datasets

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with regression targets.

    Args:
        encodings: Mapping from field name to a per-sample sequence; it must
            contain "input_ids", which defines the dataset length.
        labels: Sequence of target values, indexed like the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        # Convert every encoded field of sample `idx` to a tensor.
        features = {}
        for field, column in self.encodings.items():
            features[field] = torch.tensor(column[idx])
        # The target is always returned as float32.
        target = torch.tensor(self.labels[idx], dtype=torch.float32)
        return features, target


# Pull the description text and the price_std target out of each split.
train_texts, train_prices = (list(df_train[column]) for column in ("description", "price_std"))
eval_texts, eval_prices = (list(df_test[column]) for column in ("description", "price_std"))

# Tokenizing texts (both splits share the same tokenizer options)
tokenizer_options = {"max_length": 512, "truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **tokenizer_options)
eval_encodings = tokenizer(eval_texts, **tokenizer_options)

train_dataset = CustomDataset(train_encodings, train_prices)
eval_dataset = CustomDataset(eval_encodings, eval_prices)

In [None]:
from torch.utils.data import DataLoader

# Only the training batches are shuffled; evaluation keeps the dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
eval_loader = DataLoader(
    eval_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
class BERTRegressor(nn.Module):
    """Encoder followed by a single linear unit that predicts one value per input.

    Args:
        bert: Encoder module to wrap. When omitted, the notebook-level
            ``bert_model`` is used, so ``BERTRegressor()`` behaves as before.
        hidden_size: Width of the pooled encoder output fed to the linear head.
            When omitted it is read from ``bert.config.hidden_size`` and falls
            back to 768 if the encoder exposes no such attribute.
    """

    def __init__(self, bert=None, hidden_size=None):
        super().__init__()
        # Fall back to the globally loaded encoder to stay backward compatible.
        self.bert = bert_model if bert is None else bert
        if hidden_size is None:
            config = getattr(self.bert, "config", None)
            hidden_size = getattr(config, "hidden_size", 768)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return predictions of shape (batch, 1) for the given token ids and mask."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Index 1 of the encoder output is the pooled representation. For Hugging
        # Face BERT models this is `pooler_output` (the [CLS] hidden state passed
        # through the pooler's dense + tanh layer), not the raw [CLS] hidden state.
        pooled = outputs[1]
        return self.fc(pooled)

In [None]:
# Build the regressor and move it to the selected device.
model = BERTRegressor()
model = model.to(DEVICE)

# Mean squared error loss, optimised with AdamW over all model parameters.
criterion = nn.MSELoss()
optimizer = AdamW(params=model.parameters(), lr=1e-5)

# Per-epoch records, appended to by the training loop.
history = {
    "train_loss": [],
    "test_loss": [],
}
metrics_history = {
    "train_mae": [],
    "test_mae": [],
    "train_rmse": [],
    "test_rmse": [],
    "train_r2": [],
    "test_r2": [],
    "train_mse": [],
    "test_mse": [],
}

In [None]:
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fine-tuning loop. Every epoch runs one training pass over train_loader and one
# evaluation pass over eval_loader, appends the losses to `history` and the
# regression metrics to `metrics_history`, then saves the model weights.
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    train_losses = []
    all_train_predictions = []
    all_train_ground_truths = []
    for texts, prices in tqdm(train_loader):
        input_ids = texts["input_ids"].to(DEVICE)
        attention_mask = texts["attention_mask"].to(DEVICE)
        prices = prices.to(DEVICE)
        # Reshape the targets to a column so they line up with the model's
        # one-value-per-sample output.
        prices = prices.view(-1, 1)

        # Forward pass
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, prices.float())

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        # Undo the target scaling so the metrics are computed on unscaled values
        # (presumably original price units - confirm against how the scaler was fit).
        unnorm_outputs = target_scaler.inverse_transform(outputs.cpu().detach().numpy())
        unnorm_prices = target_scaler.inverse_transform(prices.cpu().detach().numpy())

        all_train_predictions.extend(unnorm_outputs)
        all_train_ground_truths.extend(unnorm_prices)

    # Train metrics come from the forward passes made in train mode while the
    # weights were still being updated, not from a separate evaluation pass.
    train_metrics = compute_metrics(all_train_predictions, all_train_ground_truths)
    metrics_history["train_mae"].append(train_metrics["MAE"])
    metrics_history["train_mse"].append(train_metrics["MSE"])
    metrics_history["train_rmse"].append(train_metrics["RMSE"])
    metrics_history["train_r2"].append(train_metrics["R2"])

    # Mean of the per-batch losses (len(train_loader) is the number of batches).
    avg_train_loss = np.sum(train_losses) / len(train_loader)
    history["train_loss"].append(avg_train_loss)
    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss}")
    print(f"Epoch {epoch + 1}, Train Metrics: {train_metrics}")

    # Evaluation pass: eval mode and no gradient tracking.
    model.eval()
    with torch.no_grad():
        validation_losses = []
        all_test_predictions = []
        all_test_ground_truths = []
        for texts, prices in eval_loader:
            input_ids = texts["input_ids"].to(DEVICE)
            attention_mask = texts["attention_mask"].to(DEVICE)
            prices = prices.to(DEVICE)
            prices = prices.view(-1, 1)

            # Forward pass
            outputs = model(input_ids, attention_mask)
            val_loss = criterion(outputs, prices.float())

            validation_losses.append(val_loss.item())

            # Same inverse scaling as in the training pass.
            unnorm_outputs = target_scaler.inverse_transform(outputs.cpu().detach().numpy())
            unnorm_prices = target_scaler.inverse_transform(prices.cpu().detach().numpy())

            all_test_predictions.extend(unnorm_outputs)
            all_test_ground_truths.extend(unnorm_prices)

        # Mean of the per-batch losses over the evaluation batches.
        avg_val_loss = np.sum(validation_losses) / len(eval_loader)
        history["test_loss"].append(avg_val_loss)

        test_metrics = compute_metrics(all_test_predictions, all_test_ground_truths)
        metrics_history["test_mae"].append(test_metrics["MAE"])
        metrics_history["test_mse"].append(test_metrics["MSE"])
        metrics_history["test_rmse"].append(test_metrics["RMSE"])
        metrics_history["test_r2"].append(test_metrics["R2"])

        print(f"Epoch {epoch + 1}, Validation Loss: {avg_val_loss}")
        print(f"Epoch {epoch + 1}, Test Metrics: {test_metrics}")

    # Save the weights after every epoch; the file name carries the 1-based epoch number.
    # torch.save(model.state_dict(), MODELS_PATH / f"fine_tuned_bert_model_{epoch+1}.pth")
    torch.save(model.state_dict(), f=f"fine_tuned_bert_model_{epoch+1}.pth")

In [None]:
# Draw every curve starting from the first recorded epoch, then print the
# metrics of the last recorded epoch.
plot_loss_and_metrics(history, metrics_history, SLICE_START=0)
print_metrics_table(metrics_history)

In [None]:
# Per-sample error analysis on the evaluation set, on unscaled values.
differences = []
abs_differences = []
gt = []
pred = []

model.eval()
with torch.no_grad():
    for texts, prices in eval_loader:
        input_ids = texts["input_ids"].to(DEVICE)
        attention_mask = texts["attention_mask"].to(DEVICE)
        prices = prices.to(DEVICE)
        prices = prices.view(-1, 1)

        outputs = model(input_ids, attention_mask)

        # inverse_transform is fed (n_samples, 1) columns; flatten its result so
        # each sample is a scalar. Iterating the 2-D result would yield shape-(1,)
        # arrays, which make the ":.2f" format of the average below raise TypeError.
        unnorm_outputs = np.ravel(target_scaler.inverse_transform(outputs.cpu().detach().numpy()))
        unnorm_prices = np.ravel(target_scaler.inverse_transform(prices.cpu().detach().numpy()))

        for output, target in zip(unnorm_outputs, unnorm_prices):
            diff = float(output - target)
            differences.append(diff)
            abs_differences.append(abs(diff))
            gt.append(float(target))
            pred.append(float(output))

# Plotting the differences
plt.figure(figsize=(10, 5))
plt.plot(differences, marker="o", linestyle="-", markersize=4)
plt.title("Differences between Predictions and Ground Truth")
plt.xlabel("Sample Index")
plt.ylabel("Prediction - Ground Truth")
plt.grid(True)
plt.show()

# Predicted vs. ground-truth values with a fitted regression line.
sns.regplot(x=gt, y=pred, ci=None, scatter=True, line_kws={"color": "red"})

# calculate the average difference (NaN when the evaluation set is empty)
average_abs_diff = sum(abs_differences) / len(abs_differences) if abs_differences else float("nan")
print(f"Average absolute difference: {average_abs_diff:.2f}")

In [None]:
# Save the model's state_dict (weights only) under a fixed file name.
# torch.save(model.state_dict(), f=FINE_TUNED_BERT_MODEL_PATH)
torch.save(model.state_dict(), f="fine_tuned_bert_model.pth")

# Inference example

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True,
#                                           add_special_tokens=True, max_length=512, padding=True, truncation=True)
# bert_model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1")
# bert_model.to(DEVICE)
#
#
# class BERTRegressor(nn.Module):
#     def __init__(self):
#         super(BERTRegressor, self).__init__()
#         self.bert = bert_model
#         self.fc = nn.Linear(768, 1)
#
#     def forward(self, input_ids, attention_mask):
#         outputs = self.bert(input_ids, attention_mask)
#         outputs = outputs[1]  # Use the output of the [CLS] token
#         return outputs
#
#
# model = BERTRegressor().to(DEVICE)
#
# model.load_state_dict(torch.load(FINE_TUNED_BERT_MODEL_PATH))
# model.to(DEVICE)
# model.eval()
#
# with torch.no_grad():
#     for texts, prices in eval_loader:
#         input_ids = texts["input_ids"].to(DEVICE)
#         attention_mask = texts["attention_mask"].to(DEVICE)
#         prices = prices.to(DEVICE)
#
#         outputs = model(input_ids, attention_mask)
#
#         embeddings = outputs.cpu().detach().numpy()
#         print(embeddings.shape)