In [1]:
import copy

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchmetrics.classification import (
    MulticlassF1Score,
    MulticlassPrecision,
    MulticlassRecall,
)
from tqdm import tqdm
from transformers import RobertaModel, RobertaTokenizer

In [2]:
from utils import (
    LABEL_MAPPING,
    ids2labels,
    save_checkpoint,
    load_checkpoint,
    save_best_model,
    load_best_model,
    save_model_remotely
)

In [3]:
# Allowed values of every categorical metadata column. The position of a value
# inside its list is the index it receives in that column's one-hot vector.
one_hot_labels = {
    "sentiment": ['negative', 'neutral', 'positive'],
    "question": ['not_question', 'question'],
    "curse": ['curse', 'non-curse'],
    "emotion": ['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise'],
    "gibberish": ['clean', 'mild gibberish', 'word salad'],
    "offensiveness": ['non-offensive', 'offensive'],
    "political_bias": ['CENTER', 'LEFT', 'RIGHT'],
}

# column name -> {label -> position in one_hot_labels[column]}
label_to_index = {
    column: {label: position for position, label in enumerate(labels)}
    for column, labels in one_hot_labels.items()
}

# Total width of all one-hot vectors once they are concatenated.
one_hot_metadata_size = sum(map(len, one_hot_labels.values()))

In [4]:
class LiarPlusSingleRobertaDataset_SM_And_SMA(Dataset):
    """Dataset that encodes every CSV row twice, for two model variants.

    * ``SM``  - statement followed by the textual metadata columns.
    * ``SMA`` - statement, article and then the textual metadata columns.

    Besides the two tokenized texts, each example carries the numeric metadata,
    the concatenated one-hot metadata, the numeric label and the row position.

    Args:
        filepath: Path of the CSV file read with ``pd.read_csv``. It must contain
            the ``statement``, ``articles`` and ``label`` columns plus every
            column named in the three column lists.
        tokenizer: Object exposing ``tokenize``, ``convert_tokens_to_string`` and
            ``__call__`` (a Hugging Face tokenizer in this notebook).
        str_metadata_cols: Columns appended to the input text as ``[COLUMN] value``.
        num_metadata_cols: Columns returned as a float tensor.
        one_hot_metadata_cols: Columns one-hot encoded through the module-level
            ``one_hot_labels`` / ``label_to_index`` tables.
        max_length: Length the tokenizer pads/truncates each encoded text to.
    """

    # Lower bound of the per-column token budget for textual metadata.
    _MIN_STR_METADATA_TOKENS = 15

    def __init__(
        self,
        filepath: str,
        tokenizer,
        str_metadata_cols: list[str],
        num_metadata_cols: list[str],
        one_hot_metadata_cols: list[str],
        max_length: int = 512,
    ):
        self.df = pd.read_csv(filepath)

        self.str_metadata_cols = str_metadata_cols
        self.num_metadata_cols = num_metadata_cols
        self.one_hot_metadata_cols = one_hot_metadata_cols

        # Every text column is cast to ``str`` so it can be interpolated into
        # the prompt (missing values therefore become the literal "nan").
        for column in self.str_metadata_cols:
            self.df[column] = self.df[column].astype(str)

        self.df["statement"] = self.df["statement"].astype(str)
        self.df["articles"] = self.df["articles"].astype(str)

        # A quarter of the budget goes to the statement and a quarter to the article.
        self.statement_max_len = max_length // 4
        self.article_max_len = max_length // 4

        # The remaining budget is split evenly between the textual metadata
        # columns. Dividing by at least 1 keeps an empty column list from
        # raising ZeroDivisionError.
        column_count = max(len(str_metadata_cols), 1)
        self.str_metadata_max_len_SM = max(
            (max_length - self.statement_max_len) // column_count,
            self._MIN_STR_METADATA_TOKENS,
        )
        self.str_metadata_max_len_SMA = max(
            (max_length - self.statement_max_len - self.article_max_len) // column_count,
            self._MIN_STR_METADATA_TOKENS,
        )

        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.df.index)

    def limit_tokens(self, text, max_length=512):
        """Return ``text`` cut down to its first ``max_length`` tokens.

        The text is tokenized, the token list is sliced and the kept tokens are
        converted back into a string.
        """
        return self.tokenizer.convert_tokens_to_string(
            self.tokenizer.tokenize(text)[:max_length]
        )

    def _encode(self, text):
        """Tokenize ``text`` padded/truncated to ``self.max_length`` as PyTorch tensors."""
        return self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

    def __getitem__(self, index: int):
        """Build the example stored at row position ``index``.

        Returns:
            A dict with the keys ``input_ids_SM``, ``attention_mask_SM``,
            ``input_ids_SMA``, ``attention_mask_SMA``, ``num_metadata``,
            ``one_hot_metadata``, ``label`` and ``example_id``.

        Raises:
            KeyError: If the row's label is missing from ``LABEL_MAPPING`` or a
                categorical value is missing from ``label_to_index``.
        """
        item = self.df.iloc[index]

        input_text_SM = self.limit_tokens(
            f"[STATEMENT] {item['statement']}",
            self.statement_max_len,
        )

        # The SMA text is the statement followed by the article.
        input_text_SMA = input_text_SM + self.limit_tokens(
            f" [ARTICLE] {item['articles']}",
            self.article_max_len,
        )

        # Both variants get the same metadata segments; only the per-column
        # token budget differs.
        for column in self.str_metadata_cols:
            segment = f" [{column.upper()}] {item[column]}"
            input_text_SM += self.limit_tokens(segment, self.str_metadata_max_len_SM)
            input_text_SMA += self.limit_tokens(segment, self.str_metadata_max_len_SMA)

        encoded_SM = self._encode(input_text_SM)
        encoded_SMA = self._encode(input_text_SMA)

        # LABEL_MAPPING comes from utils; presumably label string -> class id.
        label = LABEL_MAPPING[item["label"]]

        num_metadata = [item[column] for column in self.num_metadata_cols]

        one_hot_metadata = [
            F.one_hot(
                torch.tensor(label_to_index[column][item[column]]),
                len(one_hot_labels[column]),
            )
            for column in self.one_hot_metadata_cols
        ]
        # torch.cat rejects an empty list, so fall back to an empty float vector
        # when no categorical columns were requested.
        if one_hot_metadata:
            one_hot_tensor = torch.cat(one_hot_metadata, dim=0).float()
        else:
            one_hot_tensor = torch.zeros(0)

        return {
            # squeeze(0) drops the leading dimension of the tokenizer output
            # (assumed to be a batch dimension of size 1 - TODO confirm).
            "input_ids_SM": encoded_SM["input_ids"].squeeze(0),
            "attention_mask_SM": encoded_SM["attention_mask"].squeeze(0),
            "input_ids_SMA": encoded_SMA["input_ids"].squeeze(0),
            "attention_mask_SMA": encoded_SMA["attention_mask"].squeeze(0),
            "num_metadata": torch.tensor(num_metadata).float(),
            "one_hot_metadata": one_hot_tensor,
            "label": torch.tensor(label),
            "example_id": index
        }

In [5]:
class LiarPlusSingleFinetunedRoBERTasClassifier(nn.Module):
    """Encoder plus a two-layer head fed with the pooled output and metadata.

    The encoder's ``pooler_output`` is concatenated with the numeric and one-hot
    metadata vectors, passed through a GELU hidden layer with dropout and then
    projected to ``num_classes`` logits.

    Args:
        encoder_model: Module exposing ``config.hidden_size`` whose forward
            result has a ``pooler_output`` attribute.
        num_metadata_len: Number of numeric metadata features per example.
        one_hot_metadata_size: Width of the concatenated one-hot metadata.
        num_hidden: Width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(
        self, encoder_model, num_metadata_len, one_hot_metadata_size, num_hidden, num_classes
    ):
        super().__init__()
        self.encoder = encoder_model
        # The hidden layer sees the pooled encoder output and both metadata vectors.
        fused_width = self.encoder.config.hidden_size + num_metadata_len + one_hot_metadata_size
        self.hl = nn.Linear(fused_width, num_hidden)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(num_hidden, num_classes)

    def forward(self, input_ids, attention_mask, num_metadata, one_hot_metadata):
        """Return the class logits for a batch.

        The three feature tensors are concatenated along ``dim=1``, so they must
        agree on every other dimension.
        """
        pooled = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).pooler_output
        fused = torch.cat((pooled, num_metadata, one_hot_metadata), dim=1)
        hidden = self.dropout(F.gelu(self.hl(fused)))
        return self.fc(hidden)

    def roberta_trainable_state(self):
        """Return ``{name: parameter}`` for the encoder parameters with ``requires_grad``."""
        trainable = {}
        for name, param in self.encoder.named_parameters():
            if param.requires_grad:
                trainable[name] = param
        return trainable

    def load_roberta_trainable_state(self, state_dict):
        """Load a partial encoder state; ``strict=False`` tolerates absent keys."""
        self.encoder.load_state_dict(state_dict, strict=False)

    # Save the weights of the classifier layers and of the trainable encoder parameters
    def state_for_save(self):
        """Return the state to persist: both head layers and the trainable encoder parameters."""
        return dict(
            hl_state_dict=self.hl.state_dict(),
            fc_state_dict=self.fc.state_dict(),
            roberta_trainable=self.roberta_trainable_state(),
        )

    # Load the model: classifier weights, plus encoder weights when they were saved
    def load_state_from_save(self, state):
        """Restore a state produced by ``state_for_save``.

        The ``'roberta_trainable'`` entry is optional; without it only the two
        head layers are restored.
        """
        self.hl.load_state_dict(state['hl_state_dict'])
        self.fc.load_state_dict(state['fc_state_dict'])
        if 'roberta_trainable' not in state:
            return
        self.load_roberta_trainable_state(state['roberta_trainable'])

In [6]:
# Hyperparameters
num_classes, hidden_size, batch_size = 6, 128, 64

# Textual metadata columns appended to the model input text.
text_columns = "subject speaker job_title state party_affiliation context".split()

# Numeric metadata columns: the five "*_counts" columns followed by two
# text-derived features.
num_metadata_cols = [
    f"{verdict}_counts"
    for verdict in ("barely_true", "false", "half_true", "mostly_true", "pants_on_fire")
] + ["grammar_errors", "ratio_of_capital_letters"]

# Categorical metadata columns that get one-hot encoded.
one_hot_cols = [
    "sentiment", "question", "curse", "emotion",
    "gibberish", "offensiveness", "political_bias",
]

In [7]:
# Load the pretrained RoBERTa tokenizer and encoder
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta = RobertaModel.from_pretrained("roberta-base")

# Train only the last two blocks (encoder layer 11 and the pooler);
# every other parameter is frozen.
for name, param in roberta.named_parameters():
    param.requires_grad = name.startswith(("encoder.layer.11", "pooler"))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Test split; every example carries both the SM and the SMA encoding.
test_data = LiarPlusSingleRobertaDataset_SM_And_SMA(
    filepath="data/normalized/test2.csv",
    tokenizer=tokenizer,
    str_metadata_cols=text_columns,
    num_metadata_cols=num_metadata_cols,
    one_hot_metadata_cols=one_hot_cols,
)

# No shuffling: later cells attach the predictions to a DataFrame by position.
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Each classifier gets its OWN copy of the encoder. Passing the same `roberta`
# instance to both would make them share encoder parameters, so restoring the
# fine-tuned encoder weights of one model (`load_state_from_save` loads the
# 'roberta_trainable' entry into `self.encoder`) would overwrite the other's.
# deepcopy keeps the `requires_grad` flags set on `roberta`.
# NOTE(review): assumes `load_best_model` goes through `load_state_from_save` - confirm in utils.
modelSMA = LiarPlusSingleFinetunedRoBERTasClassifier(
    copy.deepcopy(roberta),
    len(num_metadata_cols),
    one_hot_metadata_size,
    hidden_size,
    num_classes,
).to(device)

modelSM = LiarPlusSingleFinetunedRoBERTasClassifier(
    copy.deepcopy(roberta),
    len(num_metadata_cols),
    one_hot_metadata_size,
    hidden_size,
    num_classes,
).to(device)

In [10]:
# Restore the best checkpoint of each variant into its classifier (SM first, then SMA).
for target_model, best_model_path in (
    (modelSM, "results/FinalSM/best_model_6.pth"),
    (modelSMA, "results/FinalSMA/best_model_6.pth"),
):
    load_best_model(target_model, best_model_path)

  best_model = torch.load(path)


Model loaded from best model checkpoint.
Model loaded from best model checkpoint.


In [None]:
# Per-example accumulators, filled in dataloader order.
example_ids, all_labels = [], []
all_SM_preds, all_SMA_preds = [], []
all_SM_probs, all_SMA_probs = [], []

# Evaluation mode for both models (disables the dropout layer of the head).
modelSM.eval()
modelSMA.eval()

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        # Both models receive the same metadata; only the encoded text differs.
        num_metadata = batch["num_metadata"].to(device)
        one_hot_metadata = batch["one_hot_metadata"].to(device)

        logits_SM = modelSM(
            batch["input_ids_SM"].to(device),
            batch["attention_mask_SM"].to(device),
            num_metadata,
            one_hot_metadata,
        )
        logits_SMA = modelSMA(
            batch["input_ids_SMA"].to(device),
            batch["attention_mask_SMA"].to(device),
            num_metadata,
            one_hot_metadata,
        )

        # Class probabilities; the prediction is the most probable class.
        outputs_SM = F.softmax(logits_SM, dim=1)
        outputs_SMA = F.softmax(logits_SMA, dim=1)

        example_ids.extend(batch["example_id"].tolist())
        all_SM_preds.extend(outputs_SM.argmax(dim=1).tolist())
        all_SMA_preds.extend(outputs_SMA.argmax(dim=1).tolist())
        all_SM_probs.extend(outputs_SM.tolist())
        all_SMA_probs.extend(outputs_SMA.tolist())
        # Labels are only collected, so they never need to leave the CPU.
        all_labels.extend(batch["label"].tolist())

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Evaluating:  90%|███████████████████████████████████████████████████████████████▎      | 19/21 [01:13<00:07,  3.87s/it]

In [None]:
# Tab-separated test split; the model outputs are attached to it row by row
# in the following cells.
df = pd.read_csv("data/test2.tsv", delimiter='\t')

In [None]:
# Attach the per-example model outputs; the lists are in dataloader order,
# which is assumed to match the row order of `df`.
df = df.assign(
    SM_pred=all_SM_preds,
    SMA_pred=all_SMA_preds,
    SM_prob=all_SM_probs,
    SMA_prob=all_SMA_probs,
    label_num=all_labels,
)

In [None]:
# Probability each model assigned to the class it predicted.
SM_highest_prob = [
    probs[pred] for probs, pred in zip(df['SM_prob'], df['SM_pred'])
]
SMA_highest_prob = [
    probs[pred] for probs, pred in zip(df['SMA_prob'], df['SMA_pred'])
]

In [None]:
# Store the confidence of each model's prediction next to the prediction itself.
df = df.assign(
    SM_highest_prob=SM_highest_prob,
    SMA_highest_prob=SMA_highest_prob,
)

In [None]:
# Persist the comparison table; the row index is written as the first column.
df.to_csv(path_or_buf='sm_vs_sma_dataset.csv', index=True)

In [None]:
# Preview the first five rows of the comparison table.
df.head(n=5)

In [None]:
# Summary statistics of the prediction confidence of both models.
df.loc[:, ['SM_highest_prob', 'SMA_highest_prob']].describe()

In [None]:
# Ten most confident examples where SM is wrong and SMA is right.
k = (
    df.loc[df['SM_pred'].ne(df['label_num']) & df['SMA_pred'].eq(df['label_num'])]
    .sort_values(by=['SM_highest_prob', 'SMA_highest_prob'], ascending=False)
    .head(10)
)
k

In [None]:
# Ten most confident examples where SM is right and SMA is wrong.
l = (
    df.loc[df['SM_pred'].eq(df['label_num']) & df['SMA_pred'].ne(df['label_num'])]
    .sort_values(by=['SM_highest_prob', 'SMA_highest_prob'], ascending=False)
    .head(10)
)
l