In [1]:
# Import libraries
import os
import json
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import torch
import torchvision
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from datasets import load_dataset

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\limyi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\limyi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\limyi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the LIAR dataset
dataset = load_dataset("liar", trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# # Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples["statement"], padding="max_length", truncation=True)

# Preprocessing function for text
def preprocess_text(text):
    text = text.lower()  # Lowercasing
    text = re.sub(r'\W', ' ', text)  # Remove non-alphanumeric characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [word for word in tokens if word.isalpha()]  # Remove numbers and punctuation
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Stopword removal
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    
    return " ".join(tokens) if tokens else ""  # Return the processed text as a string

# Combine metadata with the preprocessed statement
def preprocess_function(examples):
    combined_input = [
        "Subject: " + subject + 
        "; Speaker: " + speaker + 
        "; Job Title: " + job_title + 
        "; State: " + state_info + 
        "; Party: " + party_affiliation + 
        " Statement: " + preprocess_text(statement)  # Apply preprocess_text here
        for subject, speaker, job_title, state_info, party_affiliation, statement in zip(
            examples["subject"],
            examples["speaker"],
            examples["job_title"],
            examples["state_info"],
            examples["party_affiliation"],
            examples["statement"]
        )
    ]
    return tokenizer(combined_input, padding="max_length", truncation=True)

# Apply tokenization to the dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Remove unnecessary columns and set format for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(["id", "subject", "speaker", "job_title", "state_info", "party_affiliation", "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Preprocess labels to binary True / False
label_to_binary = {
    'false': False,
    'half-true': True,
    'mostly-true': True,
    'true': True,
    'barely-true': False,
    'pants-fire': False
}

# Access label names
label_names = dataset["train"].features["label"].names

# Apply binary label preprocessing
tokenized_datasets = tokenized_datasets.map(
    lambda examples: {"labels": [label_to_binary[label_names[label]] for label in examples["labels"]]},
    batched=True
)

# Split into train, validation, and test sets
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

In [3]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Define training arguments (epoch)
training_args = TrainingArguments(
    # disable_tqdm=False,
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_loss",  # Specify the metric to monitor
    greater_is_better=False       # Specify if higher values of the metric are better
)

# # Define training arguments (steps for smaller batch logging)
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",  # Evaluate during training
#     eval_steps=100,               # Evaluate every 100 steps
#     save_strategy="steps",        # Save checkpoints every 100 steps
#     save_steps=100,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     learning_rate=2e-5,
#     logging_dir="./logs",
#     logging_steps=10,             # Log every 10 steps
#     report_to="none",
#     load_best_model_at_end=True,  # Load the best model at the end of training
#     metric_for_best_model="f1",  # Specify the metric to monitor
#     greater_is_better=True       # Specify if higher values of the metric are better
# )



In [4]:
dropouts = [0.1, 0.2, 0.3, 0.4]

for dropout in dropouts:
    print("Dropout:", dropout)
    config = AutoConfig.from_pretrained("distilbert-base-uncased", num_labels=2, hidden_dropout_prob=dropout, attention_probs_dropout_prob=dropout)
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", config=config)
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    training_args.num_train_epochs = 3
    training_args.learning_rate = 2e-5
    print("Model is on:", next(model.parameters()).device)
    print("Learning rate:", training_args.learning_rate)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
    )

    trainer.train()

Dropout: 0.1


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0
Learning rate: 2e-05


  0%|          | 0/1926 [00:00<?, ?it/s]

{'loss': 0.7014, 'grad_norm': 0.9615477323532104, 'learning_rate': 4.0000000000000003e-07, 'epoch': 0.02}
{'loss': 0.703, 'grad_norm': 1.209923505783081, 'learning_rate': 8.000000000000001e-07, 'epoch': 0.03}


KeyboardInterrupt: 

Overview of the Metrics
Training Loss & Validation Loss:
All dropout settings start off similarly in Epoch 1 (with slight variations), and by Epoch 3, the training losses are lower for dropout values 0.2–0.4 compared to 0.1. The validation losses are very similar across settings, suggesting that all configurations generalize fairly equally.

Accuracy & F1 Score:
Accuracy climbs from around 60% in Epoch 1 to roughly 65% by Epoch 3 across the board. However, dropout rates of 0.2 and 0.3 show a slight edge (with accuracy around 65.5% and F1 scores around 0.683) compared to 0.1 and even 0.4.

Precision & Recall:
Precision remains very consistent across dropout rates. The recall, however, is a bit higher with dropout rates 0.2 and 0.3 (around 0.715 by Epoch 3) than with 0.1 (about 0.707) or 0.4 (about 0.713).

Interpreting the Dropout Impact
Dropout 0.1:
A lower dropout rate may not regularize the model sufficiently. Although the validation loss is slightly lower in Epoch 3, other metrics (accuracy, F1, recall) are a bit behind the moderate dropout rates.

Dropout 0.2 and 0.3:
These settings appear to strike the best balance. They not only achieve the lowest training losses by Epoch 3 but also slightly improve accuracy, F1, and recall. Their nearly identical metrics suggest that the model is robust within this moderate dropout range.

Dropout 0.4:
While still similar overall, the highest dropout rate here might be pushing the regularization a bit too far, leading to a very slight drop in performance compared to the 0.2–0.3 range. Over-regularization can limit the network’s capacity to learn the finer details of the data.

Final Thoughts
In summary, the data indicate that:

Moderate dropout rates (around 0.2–0.3) seem optimal for this model setup, offering a better trade-off between preventing overfitting and maintaining learning capacity.

Very low dropout (0.1) might under-regularize the model, while too high dropout (0.4) may start to harm the model’s learning efficiency slightly.

The differences across these dropout values are subtle, suggesting that the model is relatively robust to small changes in dropout rate.

Choosing between 0.2 and 0.3 might come down to other factors (like training stability or slight variations in different runs), but both provide a favorable balance according to these metrics.

In [7]:
import sys
print("Is stdout a TTY?", sys.stdout.isatty())


Is stdout a TTY? False
