In [None]:
# !pip install --force-reinstall transformers torch scikit-learn gensim datasets -q
# import transformers, torch, sklearn, gensim, datasets
# print("All packages imported successfully!")

In [2]:
import gc
gc.collect()


7

In [3]:
!nvidia-smi

Sun Mar 30 20:22:03 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 572.83                 Driver Version: 572.83         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   47C    P8            N/A  /  115W |    7501MiB /   8188MiB |      8%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
import torch
torch.cuda.empty_cache()

In [1]:
# Import libraries
import os
import json
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import torch
import torchvision
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from datasets import load_dataset

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\limyi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\limyi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\limyi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the LIAR dataset
dataset = load_dataset("liar", trust_remote_code=True)

# Define the saved model directory
saved_model_dir = "./saved_model_2"

# Check if the saved tokenizer exists
if os.path.exists(saved_model_dir):
    print("Loading saved tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
else:
    print("Saved tokenizer not found. Loading default BERT tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# # Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples["statement"], padding="max_length", truncation=True)

# Preprocessing function for text
def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word.isalpha()]  # Remove numbers and punctuation
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Stopword removal
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    
    return " ".join(tokens) if tokens else ""  # Return the processed text as a string

# Combine metadata with the preprocessed statement
def preprocess_function(examples):
    combined_input = [
        "Subject: " + subject + 
        "; Speaker: " + speaker + 
        "; Job Title: " + job_title + 
        "; State: " + state_info + 
        "; Party: " + party_affiliation + 
        " Statement: " + preprocess_text(statement)  # Apply preprocess_text here
        for subject, speaker, job_title, state_info, party_affiliation, statement in zip(
            examples["subject"],
            examples["speaker"],
            examples["job_title"],
            examples["state_info"],
            examples["party_affiliation"],
            examples["statement"]
        )
    ]
    return tokenizer(combined_input, padding="max_length", truncation=True)

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(["id", "subject", "speaker", "job_title", "state_info", "party_affiliation", "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Preprocess labels to binary True / False
label_to_binary = {
    'false': False,
    'half-true': True,
    'mostly-true': True,
    'true': True,
    'barely-true': False,
    'pants-fire': False
}

# Access label names
label_names = dataset["train"].features["label"].names

# Apply binary label preprocessing
tokenized_datasets = tokenized_datasets.map(
    lambda examples: {"labels": [label_to_binary[label_names[label]] for label in examples["labels"]]},
    batched=True
)

# Split into train, validation, and test sets
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

Loading saved tokenizer...


Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

Map:   0%|          | 0/1284 [00:00<?, ? examples/s]

In [None]:
# Load pre-trained BERT model (2 classes for binary classification)
# Check if the saved model exists
if os.path.exists(saved_model_dir):
    print("Loading saved model...")
    model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
else:
    print("Saved model not found. Loading default BERT model...")
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())
# Check which device your model is on
print("Model is on:", next(model.parameters()).device)

In [4]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Define training arguments (epoch)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_loss",  # Specify the metric to monitor
    greater_is_better=False       # Specify if higher values of the metric are better
)

# # Define training arguments (steps for smaller batch logging)
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",  # Evaluate during training
#     eval_steps=100,               # Evaluate every 100 steps
#     save_strategy="steps",        # Save checkpoints every 100 steps
#     save_steps=100,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     learning_rate=2e-5,
#     logging_dir="./logs",
#     logging_steps=10,             # Log every 10 steps
#     report_to="none",
#     load_best_model_at_end=True,  # Load the best model at the end of training
#     metric_for_best_model="f1",  # Specify the metric to monitor
#     greater_is_better=True       # Specify if higher values of the metric are better
# )



In [None]:
# from collections import Counter

# # Train on small subset of data
# small_train_dataset = train_dataset.select(range(500))  # Use only 100 samples
# small_val_dataset = val_dataset.select(range(250))      # Use only 50 samples

# train_labels = [label.item() for label in small_train_dataset["labels"]]
# val_labels = [label.item() for label in small_val_dataset["labels"]]
# print("Train label distribution:", Counter(train_labels))
# print("Validation label distribution:", Counter(val_labels))

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_val_dataset,
#     processing_class=tokenizer,
#     compute_metrics=compute_metrics
# )

# trainer.train()

In [None]:
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5]
best_lr = None
best_f1 = 0

for lr in learning_rates:
    # Reinitialize model for each lr
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    print("Model is on:", next(model.parameters()).device)

    training_args.learning_rate = lr
    training_args.num_train_epochs = 1
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
    )
    trainer.train()
    metrics = trainer.evaluate()
    if metrics["eval_f1"] > best_f1:
        best_f1 = metrics["eval_f1"]
        best_lr = lr

print(f"Best learning rate: {best_lr}, Best F1: {best_f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.687,0.669745,0.594237,0.6463,0.591304,0.712575
200,0.6723,0.663912,0.589564,0.608178,0.604136,0.612275


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6826,0.665061,0.595016,0.593114,0.621311,0.567365
200,0.6668,0.663452,0.604361,0.643258,0.60582,0.685629
300,0.6538,0.66308,0.617601,0.670248,0.607795,0.747006
400,0.6383,0.654776,0.615265,0.652113,0.615691,0.693114


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6551,0.658413,0.60514,0.570703,0.65692,0.504491
200,0.6786,0.666059,0.602804,0.690158,0.580777,0.850299
300,0.6544,0.652687,0.621495,0.663435,0.617268,0.717066


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6516,0.662033,0.60514,0.62583,0.617176,0.634731
200,0.6695,0.658065,0.607477,0.676923,0.591928,0.790419
300,0.6419,0.670926,0.609813,0.695441,0.585466,0.856287
400,0.658,0.648087,0.640966,0.680969,0.633205,0.736527


Best learning rate: 5e-05, Best F1: 0.6954407294832827


### Here are some key observations:

#### Lower Learning Rate (1e-5):

| Step | Training Loss | Validation Loss | Accuracy | F1      | Precision | Recall  |
|------|---------------|-----------------|----------|---------|-----------|---------|
| 100  | 0.687000      | 0.669745        | 0.594237 | 0.646300| 0.591304  | 0.712575|
| 200  | 0.672300      | 0.663912        | 0.589564 | 0.608178| 0.604136  | 0.612275|

The improvements in training and validation loss are slow, and the F1 score actually drops slightly from step 100 to 200.

This suggests that the learning rate might be too low, so the model is learning slowly and may not be making significant progress.

#### Moderate Learning Rate (2e-5):

| Step | Training Loss | Validation Loss | Accuracy | F1      | Precision | Recall  |
|------|---------------|-----------------|----------|---------|-----------|---------|
| 100  | 0.682600      | 0.665061        | 0.595016 | 0.593114| 0.621311  | 0.567365|
| 200  | 0.666800      | 0.663452        | 0.604361 | 0.643258| 0.605820  | 0.685629|
| 300  | 0.653800      | 0.663080        | 0.617601 | 0.670248| 0.607795  | 0.747006|
| 400  | 0.638300      | 0.654776        | 0.615265 | 0.652113| 0.615691  | 0.693114|

You see a steady decrease in training loss and a relatively stable validation loss over time.

The accuracy and F1 scores gradually improve (reaching around 0.67 F1 at step 300) with more training steps.

This indicates that 2e-5 may be a sweet spot, allowing the model to learn effectively without too much instability.

#### Slightly Higher Learning Rate (3e-5):

| Step | Training Loss | Validation Loss | Accuracy | F1      | Precision | Recall  |
|------|---------------|-----------------|----------|---------|-----------|---------|
| 100  | 0.655100      | 0.658413        | 0.605140 | 0.570703| 0.656920  | 0.504491|
| 200  | 0.678600      | 0.666059        | 0.602804 | 0.690158| 0.580777  | 0.850299|
| 300  | 0.654400      | 0.652687        | 0.621495 | 0.663435| 0.617268  | 0.717066|

The results are a bit more inconsistent. For example, at step 200, the recall jumps to 0.85 while precision drops, which causes a spike in the F1 score—but this might be an unstable behavior.

Overall, the performance is mixed, suggesting that 3e-5 might be on the higher end of what the model can handle reliably.

#### Even Higher Learning Rate (5e-5):

| Step | Training Loss | Validation Loss | Accuracy | F1      | Precision | Recall  |
|------|---------------|-----------------|----------|---------|-----------|---------|
| 100  | 0.651600      | 0.662033        | 0.605140 | 0.625830| 0.617176  | 0.634731|
| 200  | 0.669500      | 0.658065        | 0.607477 | 0.676923| 0.591928  | 0.790419|
| 300  | 0.641900      | 0.670926        | 0.609813 | 0.695441| 0.585466  | 0.856287|
| 400  | 0.658000      | 0.648087        | 0.640966 | 0.680969| 0.633205  | 0.736527|

At step 300, the F1 score reaches a peak (~0.70), and accuracy also improves slightly by step 400.

However, the fluctuations in training and validation loss suggest that while the model is learning faster, it might also be more volatile or overfit in parts (indicated by a high recall sometimes paired with lower precision).

#### Conclusion
The moderate learning rate (2e-5) appears to offer a good balance by steadily reducing losses and improving accuracy and F1 without dramatic swings in precision and recall.

In [15]:
# bert-base-uncased

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print("Model is on:", next(model.parameters()).device)
print("Learning rate:", training_args.learning_rate)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0
Learning rate: 2e-05


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.65,0.709921,0.577103,0.700496,0.554585,0.950599
2,0.6855,0.634007,0.637072,0.631329,0.669463,0.597305
3,0.525,0.661227,0.639408,0.681349,0.630573,0.741018
4,0.3291,0.833092,0.637072,0.682993,0.625935,0.751497


TrainOutput(global_step=2568, training_loss=0.558171216851083, metrics={'train_runtime': 2136.7987, 'train_samples_per_second': 24.029, 'train_steps_per_second': 1.502, 'total_flos': 1.080754970996736e+16, 'train_loss': 0.558171216851083, 'epoch': 4.0})

bert-base-uncased (4 epoch, lr: 2e-5, before preprocessing to remove casings)

| Epoch | Training Loss | Validation Loss | Accuracy | F1      | Precision | Recall   |
|-------|---------------|-----------------|----------|---------|-----------|----------|
| 1     | 0.639900      | 0.691008        | 0.585670 | 0.701124 | 0.561151  | 0.934132 |
| 2     | 0.670800      | 0.622549        | 0.645639 | 0.660194 | 0.658718  | 0.661677 |
| 3     | 0.514500      | 0.684059        | 0.651869 | 0.693205 | 0.640051  | 0.755988 |
| 4     | 0.338500      | 0.852544        | 0.637850 | 0.686023 | 0.624846  | 0.760479 |

In [18]:
trainer.evaluate()

{'eval_loss': 0.6340070366859436,
 'eval_model_preparation_time': 0.002,
 'eval_accuracy': 0.6370716510903427,
 'eval_f1': 0.6313291139240507,
 'eval_precision': 0.6694630872483222,
 'eval_recall': 0.5973053892215568,
 'eval_runtime': 19.6527,
 'eval_samples_per_second': 65.335,
 'eval_steps_per_second': 4.122}

In [13]:
learning_rates = [1e-5, 2e-5, 3e-5, 5e-5]
best_lr = None
best_f1 = 0

for lr in learning_rates:
    # Reinitialize model for each lr
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    print("Model is on:", next(model.parameters()).device)

    training_args.learning_rate = lr
    training_args.num_train_epochs = 1
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    trainer.train()
    metrics = trainer.evaluate()
    if metrics["eval_f1"] > best_f1:
        best_f1 = metrics["eval_f1"]
        best_lr = lr

print(f"Best learning rate: {best_lr}, Best F1: {best_f1}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6869,0.678743,0.535826,0.68861,0.528892,0.986527
200,0.657,0.665692,0.607477,0.646564,0.608179,0.69012
300,0.6665,0.664244,0.609034,0.661725,0.601716,0.73503
400,0.651,0.657628,0.620717,0.658246,0.619551,0.702096


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6863,0.668643,0.592679,0.663666,0.581736,0.772455
200,0.6699,0.665708,0.615265,0.663488,0.60875,0.729042
300,0.6616,0.665119,0.609813,0.66711,0.599761,0.751497
400,0.6521,0.655833,0.609813,0.632428,0.620144,0.64521
500,0.6681,0.65976,0.615265,0.663029,0.609023,0.727545
600,0.6477,0.65071,0.616044,0.632364,0.630015,0.634731


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6654,0.663773,0.599688,0.568792,0.646947,0.507485
200,0.6755,0.663832,0.602025,0.680824,0.584137,0.815868
300,0.6485,0.66074,0.612928,0.677482,0.597938,0.781437
400,0.6494,0.653745,0.617601,0.648029,0.621733,0.676647
500,0.6628,0.656104,0.617601,0.670248,0.607795,0.747006


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0


  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,0.6646,0.668259,0.600467,0.544,0.669584,0.458084
200,0.6745,0.660157,0.623053,0.677333,0.610577,0.760479
300,0.6323,0.671197,0.609813,0.690932,0.587618,0.838323
400,0.6506,0.655318,0.619159,0.610359,0.65247,0.573353
500,0.6733,0.655298,0.627726,0.679625,0.615291,0.758982
600,0.6316,0.643826,0.630062,0.647365,0.642121,0.652695


Best learning rate: 5e-05, Best F1: 0.6909315237507712


In [9]:
# distilbert-base-uncased

config = AutoConfig.from_pretrained("distilbert-base-uncased", num_labels=2, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.3)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
training_args.num_train_epochs = 2
training_args.learning_rate = 2e-5
print("Model is on:", next(model.parameters()).device)
print("Learning rate:", training_args.learning_rate)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0
Learning rate: 2e-05


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6532,0.699789,0.58567,0.700787,0.561261,0.932635
2,0.6587,0.637094,0.644081,0.683299,0.636129,0.738024


TrainOutput(global_step=1284, training_loss=0.6465281116256832, metrics={'train_runtime': 553.1649, 'train_samples_per_second': 37.128, 'train_steps_per_second': 2.321, 'total_flos': 2720615433596928.0, 'train_loss': 0.6465281116256832, 'epoch': 2.0})

In [None]:
# cardiffnlp/twitter-roberta-base-sentiment-latest

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest", num_labels=2, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
training_args.num_train_epochs = 3
training_args.learning_rate = 2e-5
print("Model is on:", next(model.parameters()).device)
print("Learning rate:", training_args.learning_rate)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpo

Model is on: cuda:0
Learning rate: 2e-05


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6656,0.666093,0.59891,0.636042,0.60241,0.673653


TrainOutput(global_step=642, training_loss=0.6825153445157678, metrics={'train_runtime': 536.4309, 'train_samples_per_second': 19.143, 'train_steps_per_second': 1.197, 'total_flos': 2701887427491840.0, 'train_loss': 0.6825153445157678, 'epoch': 1.0})

In [None]:
# xlm-roberta-base (wasted capacity since xlm-roberta is a multilingual model trained on 100 languages)

model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
training_args.num_train_epochs = 3
training_args.learning_rate = 2e-5
print("Model is on:", next(model.parameters()).device)
print("Learning rate:", training_args.learning_rate)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0
Learning rate: 2e-05


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.675,0.694289,0.520249,0.684426,0.520249,1.0
2,0.7031,0.695318,0.520249,0.684426,0.520249,1.0
3,0.6592,0.682945,0.554517,0.670127,0.545028,0.86976


TrainOutput(global_step=1926, training_loss=0.6847655094796375, metrics={'train_runtime': 4042.7724, 'train_samples_per_second': 7.62, 'train_steps_per_second': 0.476, 'total_flos': 8105662282475520.0, 'train_loss': 0.6847655094796375, 'epoch': 3.0})

In [None]:
# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

# Train the model
trainer.train()


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# trainer.evaluate()

# Conduct testing on the test dataset
test_results = trainer.predict(test_dataset)

# Extract predictions and metrics
predictions = test_results.predictions.argmax(-1)  # Convert logits to class predictions
metrics = test_results.metrics  # Contains accuracy, F1, precision, recall, etc.

# Print metrics
print("Test Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")

# Optionally, inspect the predictions
print("Predictions:", predictions)

  trainer = Trainer(


Test Metrics:
test_loss: 0.6432
test_model_preparation_time: 0.0010
test_accuracy: 0.6461
test_f1: 0.7090
test_precision: 0.6639
test_recall: 0.7607
test_runtime: 9.6851
test_samples_per_second: 132.4720
test_steps_per_second: 8.3630
Predictions: [1 1 0 ... 0 1 1]


Best result so far:

Model: distilbert-base-uncased

Dropout: 0.3

Learning rate: 2e-5

Test Metrics:

| Metric                     | Value     |
|----------------------------|-----------|
| Test Loss                 | 0.6432    |
| Model Preparation Time    | 0.0010    |
| Test Accuracy             | 0.6461    |
| Test F1                   | 0.7090    |
| Test Precision            | 0.6639    |
| Test Recall               | 0.7607    |
| Test Runtime              | 9.6851    |
| Test Samples per Second   | 132.4720  |
| Test Steps per Second     | 8.3630    |

In [19]:
trainer.save_model(saved_model_dir)

In [None]:
# from collections import Counter

# train_labels = [label.item() for label in train_dataset["labels"]]
# val_labels = [label.item() for label in val_dataset["labels"]]
# print("Train label distribution:", Counter(train_labels))
# print("Validation label distribution:", Counter(val_labels))

# predictions = trainer.predict(val_dataset)
# preds = predictions.predictions.argmax(-1)
# print(Counter(preds))  # Check the distribution of predicted labels