In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, TrainingArguments, Trainer, AutoModelForSequenceClassification
from constants import TRAIN_CSV, TEST_CSV, MODELS_PATH, RESULTS_PATH, LOGS_PATH 

## Add seed for reproducibility

In [None]:
import random


def reset_numpy_seed(seed_value=42):
    """Seed NumPy's global random generator and report the outcome on stdout.

    This is best-effort: any exception raised while importing or seeding
    NumPy is caught and printed instead of being propagated.

    Args:
        seed_value: Value forwarded to ``np.random.seed``.
    """
    try:
        # Imported inside the try so an import failure is reported, not raised.
        import numpy as np

        np.random.seed(seed_value)
        print(f"NumPy random seed set with value: {seed_value}")
    except Exception as error:
        print(f"NumPy random seed was not set: {error}")


def reset_torch_seed(seed_value=42):
    """Seed PyTorch's random generators and report the outcome on stdout.

    This is best-effort: any exception raised while importing or seeding
    PyTorch is caught and printed instead of being propagated.

    Args:
        seed_value: Value forwarded to the ``manual_seed`` calls.
    """
    try:
        # Imported inside the try so an import failure is reported, not raised.
        import torch

        torch.manual_seed(seed_value)
        cuda = torch.cuda
        if cuda.is_available():
            # Seed the current GPU, then every GPU.
            cuda.manual_seed(seed_value)
            cuda.manual_seed_all(seed_value)
        print(f"PyTorch random seed set with value: {seed_value}")
    except Exception as error:
        print(f"PyTorch random seed was not set: {error}")


def set_random_seeds(seed_value=42):
    """Seed Python's ``random`` module, NumPy and PyTorch with a single value.

    Args:
        seed_value: Seed applied to all three libraries.
    """
    random.seed(seed_value)
    # The library helpers print their own status and catch their own errors.
    for reseed in (reset_numpy_seed, reset_torch_seed):
        reseed(seed_value)


# One seed value handed to every random number generator seeded below.
seed = 42

# Apply it to Python's `random`, NumPy and PyTorch in one call.
set_random_seeds(seed_value=seed)

## Load the data and add special tokens

In [None]:
# Cedilla variants of Romanian s/t are mapped to their comma-below forms.
replacements = {"ţ": "ț", "ş": "ș", "Ţ": "Ț", "Ş": "Ș"}

# Markers used to stitch title and content into a single model input.
# NOTE(review): no cell visible here registers [TITLE] / [CONTENT] with the
# tokenizer, so they are presumably tokenized as ordinary text -- confirm
# that this is intended.
SEP_TOKEN = " [SEP] "
TITLE_TOKEN = " [TITLE] "
CONTENT_TOKEN = " [CONTENT] "


def _build_model_input(df):
    """Return a copy of ``df`` with ``title``/``content`` merged into ``input``.

    Both text columns get the diacritic replacements, missing values become
    empty strings, and the result is laid out as
    ``[TITLE] <title> [SEP] [CONTENT] <content>``. The ``title`` and
    ``content`` columns are dropped; all other columns are kept unchanged and
    ``input`` is appended as the last column.

    Args:
        df: Frame with ``title`` and ``content`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    cleaned = {
        column: df[column].replace(replacements, regex=True).fillna("")
        for column in ("title", "content")
    }
    text = TITLE_TOKEN + cleaned["title"] + SEP_TOKEN + CONTENT_TOKEN + cleaned["content"]
    return df.drop(columns=["title", "content"]).assign(input=text.astype(str))


# The same preparation is applied to both splits through one helper, so the
# train and test inputs cannot drift apart.
df_train = _build_model_input(pd.read_csv(TRAIN_CSV))
df_train["class"] = df_train["class"].astype(int)

df_test = _build_model_input(pd.read_csv(TEST_CSV))

In [None]:
# Count rows whose combined text is exactly the string "nan", using one
# vectorised comparison instead of materialising every row with `iloc`.
# NOTE(review): inputs are built with a leading title marker and missing text
# is filled with "", so this presumably always prints 0 -- confirm whether the
# check is still needed.
counter = int(df_train["input"].eq("nan").sum())

print(counter)

## Load the model

In [None]:
# Stefan Dumitrescu, Andrei-Marius Avram, and Sampo Pyysalo. 2020. The birth of Romanian BERT. In Findings of the Association for Computational Linguistics: EMNLP 2020, pages 4324–4328, Online. Association for Computational Linguistics.
# https://huggingface.co/dumitrescustefan/bert-base-romanian-cased-v1
BERT_CHECKPOINT = "dumitrescustefan/bert-base-romanian-cased-v1"

# Tokenizer and a two-label classification head from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)

# Release cached GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

# Display the model architecture as the cell output.
model

## Tokenize inputs and create datasets

In [None]:
# Separate the target column from the remaining feature columns.
y = df_train["class"]
x = df_train.drop(columns="class")

# Hold out 20% of the rows for validation, stratified on the class label,
# with a fixed random_state so the split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)


class CustomDataset(Dataset):
    """Dataset over tokenizer encodings, optionally paired with labels.

    The notebook needs both a labelled dataset (training / validation) and an
    unlabelled one (test). Making ``labels`` optional lets a single definition
    serve both, so the name never has to be rebound to an incompatible class.

    Args:
        encodings: Mapping from feature name (e.g. ``"input_ids"``) to a
            per-example sequence of values.
        labels: Optional per-example labels. When omitted, items carry no
            ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: label count if labelled, else ``input_ids`` count."""
        if self.labels is not None:
            return len(self.labels)
        return len(self.encodings["input_ids"])


# Tokenizer settings shared by both splits: truncate to at most 512 tokens
# and pad the sequences.
encode_kwargs = {"max_length": 512, "padding": True, "truncation": True}

train_encodings = tokenizer(x_train["input"].tolist(), **encode_kwargs)
val_encodings = tokenizer(x_val["input"].tolist(), **encode_kwargs)

# Pair each split's encodings with its labels.
train_dataset = CustomDataset(train_encodings, list(y_train))
val_dataset = CustomDataset(val_encodings, list(y_val))

## Hyperparameter tuning

In [None]:
# NOTE(review): learning_rate=1e-7 is far smaller than the library default
# for TrainingArguments -- confirm this is the intended tuned value.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="results-v2",
    logging_dir="logs",
    logging_steps=1,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings.
    num_train_epochs=12,
    learning_rate=1e-7,
    optim="adamw_torch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

In [None]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with balanced accuracy.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class for each
            example is the argmax of its logits over the last axis.

    Returns:
        Dict with the single key ``"balanced_accuracy"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    metric = balanced_accuracy_score(references, predicted)
    return {"balanced_accuracy": metric}

In [None]:
# Bundle the model, its training configuration, both datasets and the
# metric callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning with the Trainer configured above; its return value is
# shown as the cell output.
trainer.train()

In [None]:
# Save the model to the local "fine-tuned-bert" directory.
# NOTE(review): only the model is saved here, not the tokenizer, and the
# imported MODELS_PATH constant is not used in the visible cells -- confirm
# both are intended.
model.save_pretrained("fine-tuned-bert")

## Evaluate the model

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Inference device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CustomDataset(Dataset):
    """Dataset over tokenizer encodings, optionally paired with labels.

    This definition rebinds the ``CustomDataset`` name used earlier in the
    notebook for labelled data. ``labels`` is therefore accepted as an
    optional second argument, so the earlier ``CustomDataset(encodings,
    labels)`` calls still work if their cell is re-run after this one.

    Args:
        encodings: Mapping from feature name (e.g. ``"input_ids"``) to a
            per-example sequence of values.
        labels: Optional per-example labels. When omitted, items carry no
            ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: label count if labelled, else ``input_ids`` count."""
        if self.labels is not None:
            return len(self.labels)
        return len(self.encodings["input_ids"])

# Tokenize the test texts with the same settings used for the training data.
test_texts = df_test["input"].tolist()
test_encodings = tokenizer(test_texts, max_length=512, truncation=True, padding=True)

test_dataset = CustomDataset(test_encodings)
# shuffle=False keeps the batches in the row order of df_test.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

## Run inference on the test set

In [None]:
ids = []  # retained for compatibility; nothing in this cell fills it
predictions = []

# Put the model on the same device the batches are sent to. Without this,
# the forward pass fails whenever the weights live elsewhere, e.g. a model
# freshly loaded on the CPU while CUDA is available.
model.to(device)
model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs.logits
        # Softmax is monotonic, so the argmax of the logits is the predicted
        # class; this matches how compute_metrics derives predictions.
        batch_predictions = torch.argmax(logits, dim=-1)
        predictions.extend(batch_predictions.cpu().numpy())

In [None]:
# Pair every test id with its predicted class.
result_df = pd.DataFrame(
    {
        "id": df_test["id"],
        "class": predictions,
    }
)
# NOTE(review): the "\n\n" line terminator writes an empty line after every
# record -- confirm the consumer of predictions.csv expects that layout.
result_df.to_csv("predictions.csv", lineterminator="\n\n", index=False)

In [None]:
# Display the first batch produced by test_loader as the cell output.
next(iter(test_loader))

## Run inference on the validation set

In [None]:
# shuffle=False keeps predictions in the same order as the validation split.
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

ids = []  # retained for compatibility; nothing in this cell fills it
predictions = []
gt = []

# Put the model on the same device the batches are sent to. Without this,
# the forward pass fails whenever the weights live elsewhere, e.g. a model
# freshly loaded on the CPU while CUDA is available.
model.to(device)
model.eval()
with torch.no_grad():
    for batch in tqdm(val_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        # Labels stay off the model call; they are only collected as ground truth.
        labels = batch["labels"]
        outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs.logits
        # Softmax is monotonic, so the argmax of the logits is the predicted
        # class; this matches how compute_metrics derives predictions.
        batch_predictions = torch.argmax(logits, dim=-1)
        predictions.extend(batch_predictions.cpu().numpy())
        gt.extend(labels.cpu().numpy())

In [None]:
# Line up every validation text with its predicted and true class.
result_df = pd.DataFrame(
    {
        "input": x_val["input"],
        "class": predictions,
        "gt": gt,
    }
)

# Keep only the rows where the prediction disagrees with the ground truth.
is_wrong = result_df["class"].ne(result_df["gt"])
filtered_df = result_df.loc[is_wrong]

filtered_df