In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
from torchmetrics.classification import (
    MulticlassF1Score,
    MulticlassPrecision,
    MulticlassRecall,
)
from tqdm import tqdm
import pandas as pd
from torchviz import make_dot

In [2]:
from utils import (
    LABEL_MAPPING,
    ids2labels,
    save_checkpoint,
    load_checkpoint,
    save_best_model,
    load_best_model,
    save_model_remotely
)

In [3]:
# Class names allowed in each categorical metadata column. The position of a
# name inside its list is the integer index assigned to that class below.
one_hot_labels = {
    "sentiment": ["negative", "neutral", "positive"],
    "question": ["not_question", "question"],
    "curse": ["curse", "non-curse"],
    "emotion": ["anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"],
    "gibberish": ["clean", "mild gibberish", "word salad"],
    "offensiveness": ["non-offensive", "offensive"],
    "political_bias": ["CENTER", "LEFT", "RIGHT"],
}

# column -> {class name -> index}, derived from the lists above so the two
# structures cannot drift apart.
label_to_index = {
    column: {class_name: position for position, class_name in enumerate(class_names)}
    for column, class_names in one_hot_labels.items()
}

# Total width of all one-hot vectors laid side by side.
one_hot_metadata_size = sum(len(class_names) for class_names in one_hot_labels.values())

In [4]:
class LiarPlusSingleRobertaDataset(Dataset):
    """CSV-backed dataset that renders each row as one tagged text sequence.

    For every row the statement and each string metadata column are truncated
    to a per-field token budget, concatenated into a single string of the form
    ``[STATEMENT] ... [COLUMN] ...`` and tokenized to a fixed ``max_length``.
    Numeric metadata columns are returned separately as a float tensor.

    Args:
        filepath: Path of the CSV file read with ``pandas.read_csv``.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_string``
            and a ``__call__`` that returns ``input_ids`` / ``attention_mask``.
        str_metadata_cols: Columns appended to the text input. May be empty.
        num_metadata_cols: Columns returned as the ``num_metadata`` tensor.
        max_length: Length every tokenized sequence is padded/truncated to.
    """

    def __init__(
        self,
        filepath: str,
        tokenizer,
        str_metadata_cols: list[str],
        num_metadata_cols: list[str],
        max_length: int = 512,
    ):
        self.df = pd.read_csv(filepath)

        self.str_metadata_cols = str_metadata_cols
        self.num_metadata_cols = num_metadata_cols

        # NOTE(review): astype(str) also stringifies missing values (pandas
        # renders NaN as "nan"), so empty cells end up as that literal text.
        for column in self.str_metadata_cols:
            self.df[column] = self.df[column].astype(str)

        self.df["statement"] = self.df["statement"].astype(str)

        # A quarter of the sequence is reserved for the statement; the rest is
        # split evenly between the string metadata columns, but never below 15
        # tokens per column. The sum may exceed max_length; the final tokenizer
        # call in __getitem__ truncates in that case.
        self.statement_max_len = max_length // 4
        remaining_budget = max_length - self.statement_max_len
        # max(..., 1) keeps an empty column list from raising ZeroDivisionError.
        column_count = max(len(str_metadata_cols), 1)
        self.str_metadata_max_len = max(remaining_budget // column_count, 15)

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df.index)

    def limit_tokens(self, text, max_length=512):
        """Return ``text`` cut down to its first ``max_length`` tokens."""
        return self.tokenizer.convert_tokens_to_string(
            self.tokenizer.tokenize(text)[:max_length]
        )

    def __getitem__(self, index: int):
        """Build the model inputs for row ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (batch dimension
            squeezed away), ``num_metadata`` (float tensor with one entry per
            numeric column) and ``label`` (``LABEL_MAPPING`` applied to the
            row's ``label`` value).
        """
        item = self.df.iloc[index]

        input_text = self.limit_tokens(
            f"[STATEMENT] {item['statement']}",
            self.statement_max_len
        )

        for column in self.str_metadata_cols:
            input_text += self.limit_tokens(
                f" [{column.upper()}] {item[column]}", self.str_metadata_max_len
            )

        encoded = self.tokenizer(
            input_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        label = LABEL_MAPPING[item["label"]]

        num_metadata = [item[column] for column in self.num_metadata_cols]

        return {
            # return_tensors="pt" adds a leading batch dimension; drop it so
            # the DataLoader can stack items itself.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "num_metadata": torch.tensor(num_metadata).float(),
            "label": torch.tensor(label)
        }

In [5]:
class LiarPlusSingleFinetunedRoBERTasClassifier(nn.Module):
    """Encoder plus a two-layer head that also consumes numeric metadata.

    The encoder's ``pooler_output`` is concatenated with the numeric metadata
    vector, passed through a GELU-activated hidden layer with dropout, and
    projected to ``num_classes`` logits.
    """

    def __init__(self, encoder_model, num_metadata_len, num_hidden, num_classes):
        super().__init__()
        self.encoder = encoder_model
        # Input of the hidden layer = pooled encoder features + numeric metadata.
        fused_width = self.encoder.config.hidden_size + num_metadata_len
        self.hl = nn.Linear(fused_width, num_hidden)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(num_hidden, num_classes)

    def forward(self, input_ids, attention_mask, num_metadata):
        """Return the class logits for a batch."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        fused = torch.cat([encoded.pooler_output, num_metadata], dim=1)
        hidden = self.dropout(F.gelu(self.hl(fused)))
        return self.fc(hidden)

    def roberta_trainable_state(self):
        """Return ``{name: parameter}`` for encoder parameters with ``requires_grad``."""
        trainable = {}
        for param_name, param in self.encoder.named_parameters():
            if param.requires_grad:
                trainable[param_name] = param
        return trainable

    def load_roberta_trainable_state(self, state_dict):
        """Load a partial encoder state; ``strict=False`` tolerates the missing frozen keys."""
        self.encoder.load_state_dict(state_dict, strict=False)

    def state_for_save(self):
        """Collect what gets checkpointed: both head layers and the trainable encoder weights."""
        return dict(
            hl_state_dict=self.hl.state_dict(),
            fc_state_dict=self.fc.state_dict(),
            roberta_trainable=self.roberta_trainable_state(),
        )

    def load_state_from_save(self, state):
        """Restore a ``state_for_save`` dict; the encoder part is optional."""
        self.hl.load_state_dict(state['hl_state_dict'])
        self.fc.load_state_dict(state['fc_state_dict'])
        if 'roberta_trainable' not in state:
            return
        self.load_roberta_trainable_state(state['roberta_trainable'])

In [6]:
def test(
    model: nn.Module,
    best_model_path: str,
    dataloader: DataLoader,
    name: str='Test'
) -> tuple[float, float, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Load the best checkpoint into ``model`` and evaluate it on ``dataloader``.

    Relies on the notebook-level globals ``num_classes`` and ``device``; both
    must be defined before the first call.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask, num_metadata)``.
        best_model_path: Checkpoint path handed to ``load_best_model``.
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``,
            ``num_metadata`` and ``label``.
        name: Prefix used in the printed report.

    Returns:
        ``(accuracy, avg_loss, macro_f1, macro_precision, macro_recall)``.
        Accuracy and loss are Python floats averaged over samples; the macro
        values are 0-dim tensors (unweighted mean of the per-class scores).

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    criterion = nn.CrossEntropyLoss()

    load_best_model(model, best_model_path)

    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_samples = 0

    # average=None keeps one score per class so the report can show them all.
    f1 = MulticlassF1Score(num_classes, average=None).to(device)
    precision = MulticlassPrecision(num_classes, average=None).to(device)
    recall = MulticlassRecall(num_classes, average=None).to(device)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            num_metadata = batch["num_metadata"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask, num_metadata)
            loss = criterion(outputs, labels)
            # The criterion returns the batch mean; weight it by batch size so
            # a smaller final batch does not skew the overall average.
            total_loss += loss.item() * input_ids.size(0)

            preds = torch.argmax(outputs, dim=1)
            total_correct += (preds == labels).sum().item()
            total_samples += input_ids.size(0)

            f1.update(preds, labels)
            precision.update(preds, labels)
            recall.update(preds, labels)

    if total_samples == 0:
        raise ValueError(f"{name} dataloader yielded no samples; nothing to evaluate.")

    avg_loss = total_loss / total_samples
    accuracy = total_correct / total_samples

    f1_res = f1.compute()
    precision_res = precision.compute()
    recall_res = recall.compute()

    macro_f1 = f1_res.mean()
    macro_precision = precision_res.mean()
    macro_recall = recall_res.mean()

    print(
        f"{name} Accuracy: {accuracy:.4f},\n"
        f"{name} Loss: {avg_loss:.4f},\n"
        f"{name} F1: {f1_res} (macro = {macro_f1:.4f}),\n"
        f"{name} Precision: {precision_res} (macro = {macro_precision:.4f}),\n"
        f"{name} Recall: {recall_res} (macro = {macro_recall:.4f}),\n"
    )

    return (
        accuracy,
        avg_loss,
        macro_f1,
        macro_precision,
        macro_recall
    )

In [7]:
# Hyperparameters
num_classes = 6    # number of target classes (classifier output width)
hidden_size = 128  # width of the hidden layer in the classifier head
batch_size = 64    # samples per DataLoader batch

# String columns appended to the statement as tagged text.
text_columns = [
    "subject", "speaker", "job_title", "state", "party_affiliation", "context",
    "sentiment", "question", "curse", "emotion", "gibberish", "offensiveness",
    "political_bias",
]

# Numeric columns fed to the classifier head as a float vector.
num_metadata_cols = [
    "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts",
    "grammar_errors", "ratio_of_capital_letters",
]

In [8]:
# Pretrained RoBERTa tokenizer and encoder.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta = RobertaModel.from_pretrained("roberta-base")

# Only the last encoder layer (index 11) and the pooler stay trainable;
# every other encoder parameter is frozen.
for name, param in roberta.named_parameters():
    param.requires_grad = name.startswith(("encoder.layer.11", "pooler"))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
validation_data = LiarPlusSingleRobertaDataset(
    "data/normalized/val2.csv",
    tokenizer,
    text_columns,
    num_metadata_cols
)
test_data = LiarPlusSingleRobertaDataset(
    "data/normalized/test2.csv",
    tokenizer,
    text_columns,
    num_metadata_cols
)

# Evaluation loaders iterate in file order. Shuffling is pointless for
# evaluation, and it changes the float summation order of the loss between
# runs as well as which batch `next(iter(test_dataloader))` returns.
val_dataloader = DataLoader(
    validation_data, batch_size=batch_size, shuffle=False
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, shuffle=False
)

In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classifier built around the partially frozen encoder.
model = LiarPlusSingleFinetunedRoBERTasClassifier(
    encoder_model=roberta,
    num_metadata_len=len(num_metadata_cols),
    num_hidden=hidden_size,
    num_classes=num_classes,
)
model.to(device)

LiarPlusSingleFinetunedRoBERTasClassifier(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [11]:
# Checkpoint evaluated by the rest of this notebook.
best_model_path = "results/FinalSM_NoOneHot/best_model_6.pth"
# Load it into `model` via the utils helper. test() calls load_best_model
# again on every invocation, so this load is repeated there.
load_best_model(model, best_model_path)

Model loaded from best model checkpoint.


  best_model = torch.load(path)


In [12]:
# Evaluate on the test split, then echo the five returned metrics as plain
# floats, one per line.
res = test(model, best_model_path, test_dataloader)
print(*(str(float(metric)) for metric in res), sep='\n')

Model loaded from best model checkpoint.


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [00:37<00:00,  1.81s/it]

Test Accuracy: 0.2938,
Test Loss: 1.6196,
Test F1: tensor([0.4048, 0.3340, 0.2302, 0.2601, 0.3662, 0.1449], device='cuda:0') (marcro = 0.2900),
Test Precision: tensor([0.4474, 0.3282, 0.2365, 0.2545, 0.2968, 0.3077], device='cuda:0') (marcro = 0.3118),
Test Recall: tensor([0.3696, 0.3400, 0.2243, 0.2659, 0.4779, 0.0948], device='cuda:0') (marcro = 0.2954),

0.2938425565081839
1.619554042444504
0.2900201082229614
0.31182295083999634
0.29541337490081787





In [13]:
# Same evaluation on the validation split; the report is prefixed "Validation".
res = test(model, best_model_path, val_dataloader, "Validation")
print(*(str(float(metric)) for metric in res), sep='\n')

Model loaded from best model checkpoint.


Evaluating: 100%|██████████████████████████████████████████████████████████████████████| 21/21 [00:37<00:00,  1.80s/it]

Validation Accuracy: 0.3310,
Validation Loss: 1.5928,
Validation F1: tensor([0.4700, 0.3377, 0.2557, 0.3197, 0.3840, 0.2222], device='cuda:0') (marcro = 0.3316),
Validation Precision: tensor([0.5595, 0.3333, 0.2786, 0.2966, 0.3209, 0.4000], device='cuda:0') (marcro = 0.3648),
Validation Recall: tensor([0.4052, 0.3422, 0.2363, 0.3468, 0.4781, 0.1538], device='cuda:0') (marcro = 0.3271),

0.3309968847352025
1.5927545344718148
0.3315572738647461
0.36481189727783203
0.32706212997436523





In [14]:
# Take one batch from the test loader and move its tensors to `device`.
batch = next(iter(test_dataloader))
input_ids, attention_mask, num_metadata, labels = (
    batch[field].to(device)
    for field in ("input_ids", "attention_mask", "num_metadata", "label")
)

In [15]:
# Run one forward pass outside torch.no_grad() and hand the result to torchviz,
# which renders the graph to SM_NoOneHot_Graph.png.
output = model(input_ids, attention_mask, num_metadata)  # or a specific tensor, e.g. the loss
make_dot(output, params=dict(model.named_parameters())).render("SM_NoOneHot_Graph", format="png")

'SM_NoOneHot_Graph.png'

In [16]:
# Second, independently constructed tokenizer / encoder / classifier, built the
# same way as `model`.
tokenizer2 = RobertaTokenizer.from_pretrained("roberta-base")
roberta2 = RobertaModel.from_pretrained("roberta-base")

# Only the last encoder layer (index 11) and the pooler stay trainable.
for name, param in roberta2.named_parameters():
    param.requires_grad = name.startswith(("encoder.layer.11", "pooler"))

model2 = LiarPlusSingleFinetunedRoBERTasClassifier(
    encoder_model=roberta2,
    num_metadata_len=len(num_metadata_cols),
    num_hidden=hidden_size,
    num_classes=num_classes,
)
model2.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LiarPlusSingleFinetunedRoBERTasClassifier(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [17]:
# Load the same checkpoint into the second model so it can be compared with `model`.
load_best_model(model2, best_model_path)

Model loaded from best model checkpoint.


In [18]:
def check_models(model1, model2, atol=1e-8):
    """Return True when both models hold element-wise close parameters.

    Parameters are paired by their order in ``.parameters()``. Buffers are not
    compared.

    Args:
        model1: First ``nn.Module``.
        model2: Second ``nn.Module``.
        atol: Absolute tolerance passed to ``torch.allclose``. That function's
            default relative tolerance still applies on top of it.

    Returns:
        False if the models differ in parameter count, in the shape of any
        parameter pair, or in any value beyond tolerance; True otherwise.
    """
    params1 = list(model1.parameters())
    params2 = list(model2.parameters())

    if len(params1) != len(params2):
        return False

    for p1, p2 in zip(params1, params2):
        # torch.allclose broadcasts its inputs: without this check, differing
        # shapes either raise or can compare equal by accident (e.g. (1,) vs (3,)).
        if p1.shape != p2.shape:
            return False
        if not torch.allclose(p1, p2, atol=atol):
            return False
    return True


In [19]:
# Compare the parameters of the two models after both loaded the same checkpoint.
check_models(model, model2)

True

In [20]:
# Difference between two hand-pasted values.
# NOTE(review): neither number appears in the outputs recorded in this notebook;
# presumably they are test-loss values from other runs. Confirm the source.
1.6195540416082737 - 1.6195540888088342

-4.720056057117006e-08

In [21]:
# The first operand is the test loss printed by the test-split evaluation cell.
# NOTE(review): the second operand is not recorded in this notebook; presumably
# a test loss from another run. Confirm the source.
1.619554042444504 - 1.6195540888088342

-4.636433015292596e-08