In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from torchmetrics.classification import (
    MulticlassF1Score,
    MulticlassPrecision,
    MulticlassRecall,
)
from tqdm import tqdm
import pandas as pd
from torchviz import make_dot

In [2]:
from utils import (
    LABEL_MAPPING,
    ids2labels,
    save_checkpoint,
    load_checkpoint,
    save_best_model,
    load_best_model,
    save_model_remotely
)

In [3]:
# Allowed category values for every one-hot encoded metadata column.
# A value's position in its list is the index it receives in the one-hot vector.
one_hot_labels = dict(
    sentiment=['negative', 'neutral', 'positive'],
    question=['not_question', 'question'],
    curse=['curse', 'non-curse'],
    emotion=['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'],
    gibberish=['clean', 'mild gibberish', 'word salad'],
    offensiveness=['non-offensive', 'offensive'],
    political_bias=['CENTER', 'LEFT', 'RIGHT'],
)

# column name -> {category value -> index}, derived from one_hot_labels so the
# two structures cannot drift apart.
label_to_index = {
    column: {category: position for position, category in enumerate(categories)}
    for column, categories in one_hot_labels.items()
}

# Total width of all one-hot vectors once concatenated.
one_hot_metadata_size = sum(len(categories) for categories in one_hot_labels.values())

In [4]:
class LiarPlusSingleRobertaDataset(Dataset):
    """Dataset that packs one CSV row into a single tokenized text sequence plus metadata.

    Each row's statement, justification, articles and string metadata columns are
    concatenated into one text (each segment prefixed with a plain-text marker such
    as ``[STATEMENT]``), tokenized with the supplied tokenizer, and returned together
    with numeric metadata, one-hot encoded categorical metadata and the class label.

    Args:
        filepath: Path of the CSV file read with ``pandas.read_csv``.
        tokenizer: Tokenizer object; must provide ``tokenize``,
            ``convert_tokens_to_string`` and be callable with the keyword
            arguments used in ``__getitem__``.
        str_metadata_cols: Columns appended to the text as ``[COLUMN] value``.
            May be empty.
        num_metadata_cols: Columns returned as a float vector.
        one_hot_metadata_cols: Columns one-hot encoded via the module-level
            ``one_hot_labels`` / ``label_to_index`` tables. May be empty.
        max_length: Length the final encoding is padded/truncated to.
    """

    def __init__(
        self,
        filepath: str,
        tokenizer,
        str_metadata_cols: list[str],
        num_metadata_cols: list[str],
        one_hot_metadata_cols: list[str],
        max_length: int = 512,
    ):
        self.df = pd.read_csv(filepath)

        self.str_metadata_cols = str_metadata_cols
        self.num_metadata_cols = num_metadata_cols
        self.one_hot_metadata_cols = one_hot_metadata_cols

        # Force every text-like column to str so f-string formatting and the
        # tokenizer always receive strings (missing values are stringified too).
        for column in self.str_metadata_cols:
            self.df[column] = self.df[column].astype(str)

        self.df["statement"] = self.df["statement"].astype(str)
        self.df["justification"] = self.df["justification"].astype(str)
        self.df["articles"] = self.df["articles"].astype(str)

        # Token budget: a quarter of max_length each for statement, justification
        # and article text; whatever is left is shared by the string metadata
        # columns, with a floor of 15 tokens per column.
        self.statement_max_len = max_length // 4
        self.justification_max_len = max_length // 4
        self.article_max_len = max_length // 4
        remaining_budget = (
            max_length - self.statement_max_len - self.justification_max_len - self.article_max_len
        )
        # max(..., 1) avoids a ZeroDivisionError when no string metadata columns
        # are requested; for a non-empty list the result is unchanged.
        self.str_metadata_max_len = max(
            remaining_budget // max(len(str_metadata_cols), 1), 15
        )

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df.index)

    def limit_tokens(self, text, max_length=512):
        """Return ``text`` cut down to at most ``max_length`` tokenizer tokens.

        The text is tokenized, the token list is sliced, and the kept tokens are
        converted back into a string.
        """
        return self.tokenizer.convert_tokens_to_string(
            self.tokenizer.tokenize(text)[:max_length]
        )

    def __getitem__(self, index: int):
        """Build the model inputs for row ``index``.

        Returns:
            dict with keys ``input_ids``, ``attention_mask``, ``num_metadata``
            (float tensor, one entry per numeric column), ``one_hot_metadata``
            (float tensor, concatenation of all one-hot vectors; empty when no
            one-hot columns are configured) and ``label``.

        Raises:
            KeyError: if the row's label is not in ``LABEL_MAPPING`` or a
                categorical value is not listed in ``label_to_index``.
        """
        item = self.df.iloc[index]

        # Each segment is truncated on its own token budget before being joined,
        # so a long article cannot crowd out the statement.
        input_text = self.limit_tokens(
            f"[STATEMENT] {item['statement']}",
            self.statement_max_len
        )
        input_text += self.limit_tokens(
            f" [JUSTIFICATION] {item['justification']}",
            self.justification_max_len,
        )
        input_text += self.limit_tokens(
            f" [ARTICLE] {item['articles']}",
            self.article_max_len,
        )

        for column in self.str_metadata_cols:
            input_text += self.limit_tokens(f" [{column.upper()}] {item[column]}", self.str_metadata_max_len)

        # Final pass pads to max_length and truncates anything that still
        # overflows (possible because of the 15-token floor per metadata column).
        encoded = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        label = LABEL_MAPPING[item["label"]]

        num_metadata = [item[column] for column in self.num_metadata_cols]

        one_hot_metadata = []
        for column in self.one_hot_metadata_cols:
            value = item[column]
            possible_values = len(one_hot_labels[column])
            id_tensor = torch.tensor(label_to_index[column][value])
            one_hot_metadata.append(F.one_hot(id_tensor, possible_values))

        # torch.cat rejects an empty list, so fall back to an empty vector when
        # no one-hot columns were requested.
        if one_hot_metadata:
            one_hot_tensor = torch.cat(one_hot_metadata, dim=0).float()
        else:
            one_hot_tensor = torch.zeros(0)

        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "num_metadata": torch.tensor(num_metadata).float(),
            "one_hot_metadata": one_hot_tensor,
            "label": torch.tensor(label)
        }

In [5]:
class LiarPlusSingleFinetunedRoBERTasClassifier(nn.Module):
    """Encoder + metadata classifier.

    The encoder's ``pooler_output`` is concatenated with the numeric and one-hot
    metadata vectors, passed through one hidden layer (GELU + dropout) and
    projected to ``num_classes`` logits.
    """

    def __init__(
        self, encoder_model, num_metadata_len, one_hot_metadata_size, num_hidden, num_classes
    ):
        super().__init__()
        self.encoder = encoder_model

        # Input width of the hidden layer: encoder embedding + both metadata vectors.
        combined_width = (
            self.encoder.config.hidden_size + num_metadata_len + one_hot_metadata_size
        )
        self.hl = nn.Linear(combined_width, num_hidden)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(num_hidden, num_classes)

    def forward(self, input_ids, attention_mask, num_metadata, one_hot_metadata):
        """Return classification logits for a batch."""
        encoder_out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        features = torch.cat(
            (encoder_out.pooler_output, num_metadata, one_hot_metadata), dim=1
        )
        hidden = self.dropout(F.gelu(self.hl(features)))
        return self.fc(hidden)

    def roberta_trainable_state(self):
        """Return ``{name: parameter}`` for encoder parameters with ``requires_grad`` set."""
        trainable = {}
        for param_name, param in self.encoder.named_parameters():
            if param.requires_grad:
                trainable[param_name] = param
        return trainable

    def load_roberta_trainable_state(self, state_dict):
        """Load a partial encoder state; ``strict=False`` tolerates the absent frozen weights."""
        self.encoder.load_state_dict(state_dict, strict=False)

    # Save the classifier layers plus the trainable part of the encoder
    def state_for_save(self):
        """Return the dict of weights that is written to a checkpoint."""
        return dict(
            hl_state_dict=self.hl.state_dict(),
            fc_state_dict=self.fc.state_dict(),
            roberta_trainable=self.roberta_trainable_state(),
        )

    # Restore the classifier layers and, when present, the trainable encoder weights
    def load_state_from_save(self, state):
        """Restore weights from a dict produced by ``state_for_save``."""
        self.hl.load_state_dict(state['hl_state_dict'])
        self.fc.load_state_dict(state['fc_state_dict'])
        if 'roberta_trainable' not in state:
            return
        self.load_roberta_trainable_state(state['roberta_trainable'])

In [6]:
def test(
    model: nn.Module,
    best_model_path: str,
    dataloader: DataLoader,
    name: str="Test"
) -> tuple:
    """Load the best checkpoint into ``model`` and evaluate it on ``dataloader``.

    Args:
        model: Classifier called as
            ``model(input_ids, attention_mask, num_metadata, one_hot_metadata)``
            and returning logits of shape ``(batch, num_classes)``.
        best_model_path: Checkpoint path handed to ``load_best_model``.
        dataloader: Yields dict batches with the keys ``input_ids``,
            ``attention_mask``, ``num_metadata``, ``one_hot_metadata`` and ``label``.
        name: Prefix used in the printed report (e.g. ``"Test"``, ``"Validation"``).

    Returns:
        ``(accuracy, avg_loss, macro_f1, macro_precision, macro_recall)``.
        Accuracy and loss are Python floats; the macro scores are the mean of
        the per-class metric tensors.

    Raises:
        ValueError: if ``dataloader`` yields no samples.
    """
    # Define loss function
    criterion = nn.CrossEntropyLoss()

    load_best_model(model, best_model_path)

    # Evaluate on the device the model already lives on instead of relying on a
    # notebook-level `device` global that is only defined in a later cell.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # Set model to evaluation mode
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # (f1, precision, recall); built on the first batch, once the number of
    # classes can be read from the logits rather than from a `num_classes` global.
    metrics = None

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(eval_device)
            attention_mask = batch["attention_mask"].to(eval_device)
            num_metadata = batch["num_metadata"].to(eval_device)
            one_hot_metadata = batch["one_hot_metadata"].to(eval_device)
            labels = batch["label"].to(eval_device)

            outputs = model(input_ids, attention_mask, num_metadata, one_hot_metadata)

            if metrics is None:
                n_classes = outputs.size(1)
                metrics = tuple(
                    metric_cls(n_classes, average=None).to(eval_device)
                    for metric_cls in (MulticlassF1Score, MulticlassPrecision, MulticlassRecall)
                )

            # CrossEntropyLoss returns the batch mean, so weight it by the batch
            # size to get a correct dataset-level average with a ragged last batch.
            n_in_batch = input_ids.size(0)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * n_in_batch

            preds = torch.argmax(outputs, dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += n_in_batch

            for metric in metrics:
                metric.update(preds, labels)

    if total_samples == 0:
        raise ValueError(f"{name} dataloader yielded no samples; nothing to evaluate.")

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples

    f1_res, precision_res, recall_res = (metric.compute() for metric in metrics)

    # average=None gives per-class scores; their unweighted mean is the macro score.
    macro_f1 = f1_res.mean()
    macro_precision = precision_res.mean()
    macro_recall = recall_res.mean()

    print(
        f"{name} Accuracy: {accuracy:.4f},\n"
        f"{name} Loss: {avg_loss:.4f},\n"
        f"{name} F1: {f1_res} (marcro = {macro_f1:.4f}),\n"
        f"{name} Precision: {precision_res} (marcro = {macro_precision:.4f}),\n"
        f"{name} Recall: {recall_res} (marcro = {macro_recall:.4f}),\n"
    )

    return (
        accuracy,
        avg_loss,
        macro_f1,
        macro_precision,
        macro_recall
    )

In [7]:
# Hyperparameters
num_classes = 6    # number of target classes (size of the classifier output)
hidden_size = 128  # width of the classifier's hidden layer
batch_size = 64    # batch size used by the evaluation DataLoaders

# String metadata columns appended to the input text.
text_columns = ["subject", "speaker", "job_title", "state", "party_affiliation", "context"]

# Numeric metadata columns fed to the classifier as a float vector.
num_metadata_cols = [
    "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts",
    "grammar_errors", "ratio_of_capital_letters",
]

# Categorical metadata columns that are one-hot encoded.
one_hot_cols = [
    "sentiment", "question", "curse", "emotion",
    "gibberish", "offensiveness", "political_bias",
]

In [8]:
# Load the pretrained RoBERTa tokenizer and encoder
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta = RobertaModel.from_pretrained("roberta-base")

# Only the last transformer block (encoder.layer.11) and the pooler stay
# trainable; every other encoder parameter is frozen.
for name, param in roberta.named_parameters():
    param.requires_grad = name.startswith(("encoder.layer.11", "pooler"))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Validation and test splits share the same tokenizer and column configuration.
validation_data = LiarPlusSingleRobertaDataset(
    "data/normalized/val2.csv",
    tokenizer,
    text_columns,
    num_metadata_cols,
    one_hot_cols
)
test_data = LiarPlusSingleRobertaDataset(
    "data/normalized/test2.csv",
    tokenizer,
    text_columns,
    num_metadata_cols,
    one_hot_cols
)


# Evaluation only: shuffling does not change the aggregated metrics, and a
# fixed order keeps batches (e.g. the one taken with next(iter(...)) further
# down) reproducible across runs.
val_dataloader = DataLoader(
    validation_data, batch_size=batch_size, shuffle=False
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, shuffle=False
)

In [10]:
# Prefer the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classifier on top of the partially unfrozen RoBERTa encoder.
model = LiarPlusSingleFinetunedRoBERTasClassifier(
    encoder_model=roberta,
    num_metadata_len=len(num_metadata_cols),
    one_hot_metadata_size=one_hot_metadata_size,
    num_hidden=hidden_size,
    num_classes=num_classes,
)
model.to(device)

LiarPlusSingleFinetunedRoBERTasClassifier(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [11]:
# Checkpoint evaluated by the following test/validation cells.
best_model_path = "results/FinalSMJA/best_model_6.pth"
# load_best_model comes from utils; presumably it restores the saved weights
# into `model` in place — confirm against utils.
load_best_model(model, best_model_path)

Model loaded from best model checkpoint.


  best_model = torch.load(path)


In [12]:
# Evaluate on the test split, then echo every returned metric as a plain float,
# one per line (accuracy, loss, macro F1, macro precision, macro recall).
res = test(model, best_model_path, test_dataloader)
print(*(float(value) for value in res), sep='\n')

Model loaded from best model checkpoint.


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [00:41<00:00,  1.98s/it]

Test Accuracy: 0.2931,
Test Loss: 1.6299,
Test F1: tensor([0.3562, 0.3657, 0.1538, 0.2954, 0.3508, 0.0429], device='cuda:0') (marcro = 0.2608),
Test Precision: tensor([0.4815, 0.2870, 0.3056, 0.2814, 0.2843, 0.2273], device='cuda:0') (marcro = 0.3112),
Test Recall: tensor([0.2826, 0.5040, 0.1028, 0.3109, 0.4578, 0.0237], device='cuda:0') (marcro = 0.2803),

0.2930631332813718
1.6298528739649711
0.26080322265625
0.3111618161201477
0.2803003191947937





In [13]:
# Evaluate on the validation split, then echo every returned metric as a plain
# float, one per line (accuracy, loss, macro F1, macro precision, macro recall).
res = test(model, best_model_path, val_dataloader, "Validation")
print(*(float(value) for value in res), sep='\n')

Model loaded from best model checkpoint.


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [00:41<00:00,  1.99s/it]

Validation Accuracy: 0.3022,
Validation Loss: 1.6261,
Validation F1: tensor([0.4309, 0.3584, 0.1942, 0.2462, 0.3666, 0.0749], device='cuda:0') (marcro = 0.2785),
Validation Precision: tensor([0.6000, 0.2890, 0.4167, 0.2321, 0.2929, 0.3889], device='cuda:0') (marcro = 0.3699),
Validation Recall: tensor([0.3362, 0.4715, 0.1266, 0.2621, 0.4900, 0.0414], device='cuda:0') (marcro = 0.2880),

0.30218068535825543
1.626061411290154
0.2785318195819855
0.3699333369731903
0.28797146677970886





In [14]:
# Pull a single batch from the test loader and move each field to the model's device.
batch = next(iter(test_dataloader))
input_ids, attention_mask, num_metadata, one_hot_metadata, labels = (
    batch[field].to(device)
    for field in (
        "input_ids",
        "attention_mask",
        "num_metadata",
        "one_hot_metadata",
        "label",
    )
)

In [15]:
# Forward pass on the sample batch; the logits are the root of the rendered
# autograd graph (a loss tensor could be used here instead).
output = model(input_ids, attention_mask, num_metadata, one_hot_metadata)
make_dot(
    output,
    params={param_name: param for param_name, param in model.named_parameters()},
).render("SMA_Graph", format="png")

'SMA_Graph.png'

In [16]:
# Switch to a second checkpoint; the cells below evaluate it the same way.
best_model_path = "results/FinalSMJA/best_model_10.pth"
# load_best_model comes from utils; presumably it restores the saved weights
# into `model` in place — confirm against utils.
load_best_model(model, best_model_path)

Model loaded from best model checkpoint.


In [17]:
# Evaluate on the test split, then echo every returned metric as a plain float,
# one per line (accuracy, loss, macro F1, macro precision, macro recall).
res = test(model, best_model_path, test_dataloader)
print(*(float(value) for value in res), sep='\n')

Model loaded from best model checkpoint.


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [02:32<00:00,  7.25s/it]

Test Accuracy: 0.2993,
Test Loss: 1.6489,
Test F1: tensor([0.3590, 0.3179, 0.1846, 0.2934, 0.3400, 0.2954], device='cuda:0') (marcro = 0.2984),
Test Precision: tensor([0.4375, 0.2871, 0.2703, 0.2808, 0.3092, 0.3020], device='cuda:0') (marcro = 0.3145),
Test Recall: tensor([0.3043, 0.3560, 0.1402, 0.3071, 0.3775, 0.2891], device='cuda:0') (marcro = 0.2957),

0.2992985190958691
1.648909077547717
0.29836520552635193
0.3144799470901489
0.2957100570201874





In [18]:
# Evaluate on the validation split, then echo every returned metric as a plain
# float, one per line (accuracy, loss, macro F1, macro precision, macro recall).
res = test(model, best_model_path, val_dataloader, "Validation")
print(*(float(value) for value in res), sep='\n')

Model loaded from best model checkpoint.


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [02:31<00:00,  7.21s/it]

Validation Accuracy: 0.3146,
Validation Loss: 1.6233,
Validation F1: tensor([0.3979, 0.3580, 0.1925, 0.2904, 0.3321, 0.3196], device='cuda:0') (marcro = 0.3151),
Validation Precision: tensor([0.5067, 0.3270, 0.3647, 0.2669, 0.3093, 0.2831], device='cuda:0') (marcro = 0.3429),
Validation Recall: tensor([0.3276, 0.3954, 0.1308, 0.3185, 0.3586, 0.3669], device='cuda:0') (marcro = 0.3163),

0.3146417445482866
1.6232995615569974
0.31509798765182495
0.34294864535331726
0.31630057096481323



