In [1]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [6]:
# Directory of the fine-tuned checkpoint to evaluate; the tokenizer and the
# classifier weights are both loaded from this same path.
# Alternate checkpoint kept for reference (inactive):
# model_path = "./bert/without_icl/roberta_new"
model_path = "./bert/models/weighted_loss_roberta"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Report whether CUDA is usable, then place the model on the GPU when it is
# and on the CPU otherwise.
print(torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# This call is the cell's last expression, so the notebook also displays its
# return value (the model's module tree) below the cell.
model.eval() # Set model to evaluation mode

True


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [3]:
# Load the "chengxuphd/liar2" dataset and keep only its "test" split, which is
# the split evaluated in the rest of this notebook.
dataset = load_dataset("chengxuphd/liar2")
test_dataset = dataset["test"]

def preprocess_function(examples):
    """Build one prompt string per example and tokenize the whole batch.

    Args:
        examples: Mapping of column name to a list of per-example values.
            The columns "subject", "speaker", "speaker_description",
            "state_info", "context" and "statement" are read; a value of
            None is rendered as an empty string.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of
        combined strings, called with padding="max_length" and
        truncation=True.
    """
    # (text prefix, source column) pairs, in the order they appear in the prompt.
    layout = (
        ("Subject: ", "subject"),
        ("; Speaker: ", "speaker"),
        ("; Speaker Description: ", "speaker_description"),
        ("; State: ", "state_info"),
        ("; Context: ", "context"),
        ("; Statement: ", "statement"),
    )
    columns = [examples[column] for _, column in layout]

    combined_input = []
    for row in zip(*columns):
        pieces = []
        for (prefix, _), value in zip(layout, row):
            # Missing metadata contributes only its prefix.
            pieces.append(prefix + ("" if value is None else value))
        combined_input.append("".join(pieces))

    return tokenizer(combined_input, padding="max_length", truncation=True)

# Tokenize every test example in batches, then have the dataset return torch
# tensors when indexed.
test_dataset = test_dataset.map(preprocess_function, batched=True)
test_dataset.set_format("torch")

# Class ids of the collapsed binary task, written as explicit integers so the
# stored label column does not rely on an implicit bool -> int conversion.
FAKE = 1
REAL = 0

# Original six-way label id -> binary class id.
label_to_binary = {
    0: FAKE,
    1: FAKE,
    2: FAKE,
    3: FAKE, # Changed to FAKE
    4: REAL,
    5: REAL
}

original_label_counts = Counter(test_dataset["label"].tolist())
print("Before label preprocessing: Labels =", original_label_counts)

# Rewrite the "label" column in place of the six-way ids.
test_dataset = test_dataset.map(
    lambda examples: {"label": [label_to_binary[int(label)] for label in examples["label"]]},
    batched=True
)

binary_label_counts = Counter(test_dataset["label"].tolist())
print("After label preprocessing: Labels =", binary_label_counts)

# Sanity check derived from label_to_binary itself, so it stays correct when
# the mapping is edited: every original class count must land in the binary
# class the mapping assigns it to.
expected_binary_counts = Counter()
for original_label, count in original_label_counts.items():
    expected_binary_counts[label_to_binary[original_label]] += count

assert binary_label_counts == expected_binary_counts, (
    f"Binary label counts {dict(binary_label_counts)} do not match the counts "
    f"implied by label_to_binary {dict(expected_binary_counts)}."
)

Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

Before label preprocessing: Labels = Counter({1: 660, 3: 371, 2: 360, 4: 343, 0: 303, 5: 259})


Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

After label preprocessing: Labels = Counter({1: 1694, 0: 602})


In [4]:
def compute_metrics(pred):
    """Turn a prediction object into accuracy / F1 / precision / recall.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
        Precision, recall and F1 are computed with average="binary".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    accuracy = accuracy_score(y_true, y_pred)
    # The fourth element of the returned tuple is not needed here.
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    precision, recall, f1 = scores[:3]

    return dict(accuracy=accuracy, f1=f1, precision=precision, recall=recall)

In [7]:
# Wrap the already-loaded model in a Trainer (no explicit training arguments)
# so that prediction and compute_metrics run in one call.
trainer = Trainer(model=model, compute_metrics=compute_metrics)

test_results = trainer.predict(test_dataset)

# Predicted class = index of the largest logit for each example.
predictions = np.argmax(test_results.predictions, axis=-1)
# Metrics gathered during predict(), including the compute_metrics values.
metrics = test_results.metrics

print("Test Metrics:")
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

Test Metrics:
test_loss: 0.4677
test_model_preparation_time: 0.0020
test_accuracy: 0.7404
test_f1: 0.8218
test_precision: 0.8327
test_recall: 0.8111
test_runtime: 33.7047
test_samples_per_second: 68.1210
test_steps_per_second: 8.5150


#### Oversampling

| Metric                        | Regular | Weighted_loss   |
|-------------------------------|---------|-----------------|
| Test Loss                     | 0.6739  | 0.4677          |
| Test Model Preparation Time   | 0.0020  | 0.0020          |
| Test Accuracy                 | 0.6995  | 0.7404          |
| Test F1 Score                 | 0.7702  | 0.8218          |
| Test Precision                | 0.8838  | 0.8327          |
| Test Recall                   | 0.6824  | 0.8111          |
| Test Runtime (seconds)        | 37.4584 | 33.7047         |
| Test Samples/Second           | 61.2950 | 68.1210         |
| Test Steps/Second             | 7.6620  | 8.5150          |

#### AFTER LABEL SWAP

##### WITHOUT IN-CONTEXT LEARNING JUSTIFICATION

| Metric                                | roberta   | bert_fake_news    | augmented_normal  | normal_augmented  |
|---------------------------------------|-----------|-------------------|-------------------|-------------------|
| Test Loss                             | 0.5207    | 0.5508            | 0.5444            | 0.5291            |
| Test Model Preparation Time (seconds) | 0.0020    | 0.0020            | 0.0020            | 0.0010            |
| Test Accuracy                         | 0.7295    | 0.7317            | 0.7326            | 0.7348            |
| Test F1 Score                         | 0.7663    | 0.7548            | 0.7607            | 0.7505            |
| Test Precision                        | 0.7631    | 0.7973            | 0.7852            | 0.8193            |
| Test Recall                           | 0.7695    | 0.7166            | 0.7377            | 0.6924            |
| Test Runtime (seconds)                | 38.4320   | 36.0601           | 34.2745           | 33.9310           |
| Test Samples/Second                   | 59.7420   | 63.6720           | 66.9880           | 67.6670           |
| Test Steps/Second                     | 7.4680    | 7.9590            | 8.3740            | 8.4580            |

Since roberta achieved the highest recall (0.7695) of the four models, we will compare the data-augmentation variants on the roberta model.


#### BEFORE LABEL SWAP

| Metric                                | roberta   | augmented_normal  | normal_augmented  | justification | bert_fake_news    |
|---------------------------------------|-----------|-------------------|-------------------|---------------|-------------------|
| Test Loss                             | 0.5339    | 0.6042            | 0.5393            | 0.7983        | 0.5354            |
| Test Model Preparation Time (seconds) | 0.0010    | 0.0020            | 0.0020            | 0.0010        | 0.0020            |
| Test Accuracy                         | 0.7395    | 0.7352            | 0.7260            | 0.6760        | 0.7317            |
| Test F1 Score                         | 0.7051    | 0.7214            | 0.7137            | 0.6958        | 0.7100            |
| Test Precision                        | 0.6777    | 0.6510            | 0.6405            | 0.5777        | 0.6551            |
| Test Recall                           | 0.7348    | 0.8088            | 0.8058            | 0.8746        | 0.7749            |
| Test Runtime (seconds)                | 33.5466   | 33.379            | 33.718            | 33.415        | 33.5630           |
| Test Samples/Second                   | 68.4420   | 68.785            | 68.094            | 68.710        | 68.4090           |
| Test Steps/Second                     | 8.5550    | 8.5980            | 8.5120            | 8.5890        | 8.5510            |

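Roberta seems to be the best model out of all 5: it has the highest accuracy (0.7395) and precision (0.6777) and the lowest test loss (0.5339), although its recall (0.7348) is the lowest of the five.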