In [9]:
# Import libraries
import os
import json
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import torch
import torchvision
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from datasets import load_dataset
from collections import Counter

# # Download NLTK resources
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [10]:
# Load the LIAR2 dataset (all splits) by its Hub identifier.
dataset = load_dataset("chengxuphd/liar2")

# Checkpoint name reused for the tokenizer and the classifier in later cells.
pretrained_model = "roberta-base"

In [12]:
from transformers import pipeline

# Build both translation pipelines once at module level so every batch reuses them.
# `device` is the pipeline device index: GPU 0 when CUDA is available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1
en_to_fr = pipeline("translation", model="Helsinki-NLP/opus-mt-en-fr", device=device)
fr_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)

# Kept at module level so the mapping function below can call it by name.
def back_translate_batch(texts, batch_size=16):
    """Paraphrase English texts by translating them to French and back.

    Args:
        texts: List of English strings to paraphrase.
        batch_size: Number of texts the pipelines process per forward pass.

    Returns:
        A list with one back-translated English string per input, in input
        order. An empty input yields an empty list.
    """
    # Nothing to translate: skip both pipeline calls.
    if not texts:
        return []

    # English -> French. `truncation=True` clips inputs that exceed the model
    # limit instead of failing on them; `batch_size` lets the pipeline batch
    # the inputs rather than running them one at a time.
    translations = en_to_fr(
        texts, max_length=512, truncation=True, batch_size=batch_size
    )
    pivot_texts = [t["translation_text"] for t in translations]

    # French -> English, with the same settings.
    back_translations = fr_to_en(
        pivot_texts, max_length=512, truncation=True, batch_size=batch_size
    )
    return [t["translation_text"] for t in back_translations]

# Batched mapper handed to `dataset.map`.
def add_backtranslated_texts(examples):
    """Return a ``paraphrased_statement`` column for one batch of examples.

    Args:
        examples: Batch mapping that contains a "statement" column.

    Returns:
        A dict with the single key "paraphrased_statement" holding the
        back-translated statements.
    """
    paraphrases = back_translate_batch(examples["statement"])
    return {"paraphrased_statement": paraphrases}

# Add the paraphrased column to the dataset, feeding the mapper batches of examples.
# NOTE(review): no `num_proc` is passed, so this call does not request multiprocessing.
augmented_dataset = dataset.map(
    add_backtranslated_texts,
    batched=True,
)

Device set to use cuda:0
Device set to use cuda:0


Map:   0%|          | 0/18369 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Map:   0%|          | 0/2297 [00:00<?, ? examples/s]

Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

In [13]:
# Persist the augmented dataset under the relative path "augmented_dataset".
augmented_dataset.save_to_disk("augmented_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/18369 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2297 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2296 [00:00<?, ? examples/s]

In [15]:
# Show the split/column summary, then export the augmented training split to CSV.
print(augmented_dataset)
augmented_dataset["train"].to_csv("augmented_train_dataset.csv")

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'statement', 'date', 'subject', 'speaker', 'speaker_description', 'state_info', 'true_counts', 'mostly_true_counts', 'half_true_counts', 'mostly_false_counts', 'false_counts', 'pants_on_fire_counts', 'context', 'justification', 'paraphrased_statement'],
        num_rows: 18369
    })
    validation: Dataset({
        features: ['id', 'label', 'statement', 'date', 'subject', 'speaker', 'speaker_description', 'state_info', 'true_counts', 'mostly_true_counts', 'half_true_counts', 'mostly_false_counts', 'false_counts', 'pants_on_fire_counts', 'context', 'justification', 'paraphrased_statement'],
        num_rows: 2297
    })
    test: Dataset({
        features: ['id', 'label', 'statement', 'date', 'subject', 'speaker', 'speaker_description', 'state_info', 'true_counts', 'mostly_true_counts', 'half_true_counts', 'mostly_false_counts', 'false_counts', 'pants_on_fire_counts', 'context', 'justification', 'paraphrased_stateme

Creating CSV from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

21051033

In [5]:
from transformers import MarianMTModel, MarianTokenizer

# Tokenizer/model pairs that have already been loaded, keyed by checkpoint
# name, so repeated calls to `back_translate` do not reload the weights.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the cached ``(tokenizer, model)`` pair for a Marian checkpoint."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


def _marian_translate(text, model_name):
    """Translate one string with the given Marian checkpoint."""
    tokenizer, model = _load_marian(model_name)
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    generated = model.generate(**encoded)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def back_translate(text, src_lang="en", pivot_lang="fr"):
    """Paraphrase ``text`` by translating it to a pivot language and back.

    Args:
        text: The string to paraphrase.
        src_lang: Language code of ``text``; used to build the checkpoint names.
        pivot_lang: Language code of the intermediate language.

    Returns:
        The text translated ``src_lang`` -> ``pivot_lang`` -> ``src_lang``.
        Falsy or whitespace-only input is returned unchanged without loading
        any model.
    """
    # Nothing meaningful to translate.
    if not text or not text.strip():
        return text

    # Checkpoint names follow the "Helsinki-NLP/opus-mt-<from>-<to>" pattern.
    pivot_text = _marian_translate(
        text, f"Helsinki-NLP/opus-mt-{src_lang}-{pivot_lang}"
    )
    return _marian_translate(
        pivot_text, f"Helsinki-NLP/opus-mt-{pivot_lang}-{src_lang}"
    )

# Example usage:
# NOTE: the recorded output below shows this guard passing when the cell is run in the notebook.
if __name__ == "__main__":
    original_text = "when is this going to change."
    # Round-trip the sentence through French to obtain an English paraphrase.
    paraphrased_text = back_translate(original_text, src_lang="en", pivot_lang="fr")
    print("Original:", original_text)
    print("Paraphrased:", paraphrased_text)


Original: when is this going to change.
Paraphrased: When will this change?


In [10]:
import random

# Draw 5 training statements with a dedicated, seeded RNG so that re-running
# the notebook prints the same examples every time.
sample_rng = random.Random(42)
random_statements = sample_rng.sample(list(dataset["train"]["statement"]), 5)

# Back-translate each sampled statement and print it next to the original.
for i, statement in enumerate(random_statements, 1):
    backtranslated = back_translate(statement)
    print(f"Original {i}: {statement}")
    print(f"Backtranslated {i}: {backtranslated}")
    print()

Original 1: People committed involuntarily for 72 hours under the Baker Act will get their guns back "automatically and immediately upon discharge....and their commitment is never entered into a background check database.
Backtranslated 1: People committed unintentionally for 72 hours under the Baker law will recover their weapons "automatically and immediately after their release... and their engagement has never entered a background check database.

Original 2: Update: Chuck Schumer's steamy affair with a high school cheerleader confirmed.
Backtranslated 2: Update: Chuck Schumer's case with a high school cheerleader has been confirmed.

Original 3: Ken Lanci is a lifelong Clevelander
Backtranslated 3: Ken Lanci is a Clevelander all his life

Original 4: Public support for abortion "is actually going down a little bit," polls show.
Backtranslated 4: Public support for abortion "is dropping a little," the polls show.

Original 5: The price of gasoline was at $1.85 a gallon when Preside

In [None]:
# Tokenizer for the `pretrained_model` checkpoint; `preprocess_function` in this cell uses it.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# # Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples["statement"], padding="max_length", truncation=True)

# Preprocessing function for text
def preprocess_text(text):
    """Normalize raw text into a space-joined string of lemmatized content words.

    Steps: lowercase, replace every non-word character with a space, tokenize,
    keep purely alphabetic tokens, drop English stopwords, lemmatize.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces, or "" when none survive.
    """
    text = re.sub(r'\W', ' ', text.lower())
    # Only whitespace is left: no token can survive, so skip the NLTK work.
    if not text.strip():
        return ""

    # Load the stopword list once per call and keep it in a set, instead of
    # reloading the list and scanning it linearly for every token.
    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    return " ".join(
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word.isalpha() and word not in english_stopwords
    )

# Combine metadata with the back-translated statement
def preprocess_function(examples):
    """Build one metadata-prefixed input string per example and tokenize the batch.

    Args:
        examples: Batch mapping with the columns "statement", "subject",
            "speaker", "speaker_description", "state_info" and "context".

    Returns:
        The tokenizer output for the combined strings, padded to the maximum
        length and truncated.
    """
    def _text(value):
        # Missing metadata is rendered as an empty field.
        return value if value is not None else ""

    # Use the batch helper that this notebook defines, `back_translate_batch`;
    # the name `batch_back_translate` is not defined and raises NameError on a
    # fresh kernel.
    backtranslated_statements = back_translate_batch(examples["statement"])

    combined_input = [
        "Subject: " + _text(subject)
        + "; Speaker: " + _text(speaker)
        + "; Speaker Description: " + _text(speaker_description)
        + "; State: " + _text(state_info)
        + "; Context: " + _text(context)
        + "; Statement: " + _text(statement)  # back-translated statement
        for subject, speaker, speaker_description, state_info, context, statement in zip(
            examples["subject"],
            examples["speaker"],
            examples["speaker_description"],
            examples["state_info"],
            examples["context"],
            backtranslated_statements,
        )
    ]
    return tokenizer(combined_input, padding="max_length", truncation=True)

# Run `preprocess_function` over the dataset in batches.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Drop the metadata columns listed here, then make the dataset return torch tensors.
# NOTE(review): "statement" and "date" are not in this list, so they are not removed here.
tokenized_datasets = tokenized_datasets.remove_columns(["id", "subject", "speaker", "speaker_description", "state_info", "context", "true_counts", "mostly_true_counts", "half_true_counts", "mostly_false_counts", "false_counts", "pants_on_fire_counts", "justification"])
# tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# Preprocess labels to binary True / False
# label_to_binary = {
#     'false': False,
#     'half-true': True,
#     'mostly-true': True,
#     'true': True,
#     'barely-true': False,
#     'pants-fire': False
# }
# Collapse the six integer label ids into two classes: ids 0-2 -> False, ids 3-5 -> True.
# NOTE(review): confirm against the dataset's label encoding that ids 0-2 are the false-leaning classes.
label_to_binary = {
    0: False,
    1: False,
    2: False,
    3: True,
    4: True,
    5: True
}

# Access labels
# labels = dataset["train"].features["label"]

# Overwrite the "label" column batch by batch with its binary value.
# `int(label)` converts each label to a plain int before the dict lookup.
tokenized_datasets = tokenized_datasets.map(
    lambda examples: {"label": [label_to_binary[int(label)] for label in examples["label"]]},
    batched=True
)

# Pick out the existing "train", "validation" and "test" splits (no re-splitting happens here).
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

Map:   0%|          | 0/18369 [00:00<?, ? examples/s]

In [12]:
# Verify that dataset is preprocessed correctly

# Check label distribution in the train dataset.
# `.tolist()` converts the label column to a plain list so `Counter` can tally it
# (assumes the column comes back as a tensor because of the "torch" format set earlier).
label_list = train_dataset["label"].tolist()
print("Label distribution in train dataset:", Counter(label_list))

Label distribution in train dataset: Counter({0: 10591, 1: 7778})


In [13]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Metric callback for the Trainer
def compute_metrics(pred):
    """Score one evaluation pass with accuracy and binary precision/recall/F1.

    Args:
        pred: Prediction object exposing ``label_ids`` and ``predictions``;
            the predicted class is the argmax over the last axis of
            ``predictions``.

    Returns:
        A dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element returned alongside precision/recall/F1 is not used.
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the checkpoint with the lowest validation loss when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="eval_loss",  # Metric used to pick the best checkpoint
    greater_is_better=False       # Lower eval_loss is better
)

# # Define training arguments (steps for smaller batch logging)
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",  # Evaluate during training
#     eval_steps=100,               # Evaluate every 100 steps
#     save_strategy="steps",        # Save checkpoints every 100 steps
#     save_steps=100,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     learning_rate=2e-5,
#     logging_dir="./logs",
#     logging_steps=10,             # Log every 10 steps
#     report_to="none",
#     load_best_model_at_end=True,  # Load the best model at the end of training
#     metric_for_best_model="f1",  # Specify the metric to monitor
#     greater_is_better=True       # Specify if higher values of the metric are better
# )



In [None]:
# Two-class classification head on the pretrained checkpoint, with both dropout
# probabilities set to 0.2 through the config.
config = AutoConfig.from_pretrained(pretrained_model, num_labels=2, hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, config=config)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
# NOTE(review): this rebinds `device`, which the translation cell defines as an
# integer pipeline index, to a `torch.device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# These two assignments repeat the values already passed to `TrainingArguments`.
training_args.num_train_epochs = 5
training_args.learning_rate = 2e-5
print("Model is on:", next(model.parameters()).device)
print("Learning rate:", training_args.learning_rate)

# Train on the train split and evaluate on the validation split; early stopping
# is configured with a patience of 2.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model is on: cuda:0
Learning rate: 2e-05


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5784,0.551846,0.679582,0.694859,0.581944,0.86214
2,0.5548,0.551378,0.72007,0.70355,0.637427,0.784979
3,0.4858,0.550206,0.734872,0.701324,0.670103,0.735597
4,0.4609,0.580846,0.72007,0.71023,0.631917,0.8107
5,0.4296,0.605068,0.721376,0.706422,0.637417,0.792181


TrainOutput(global_step=5745, training_loss=0.49161564542274044, metrics={'train_runtime': 4928.75, 'train_samples_per_second': 18.635, 'train_steps_per_second': 1.166, 'total_flos': 2.41654348795392e+16, 'train_loss': 0.49161564542274044, 'epoch': 5.0})

In [None]:
# Evaluate with no arguments; `val_dataset` was passed to the Trainer as `eval_dataset`.
trainer.evaluate()

In [15]:
# Run the trained model on the held-out test split.
test_results = trainer.predict(test_dataset)

# Extract predictions and metrics
predictions = test_results.predictions.argmax(-1)  # Convert logits to class predictions
metrics = test_results.metrics  # Metrics dict; the recorded output includes test loss, accuracy, F1, precision, recall and runtime stats

# Print every metric rounded to four decimal places.
print("Test Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value:.4f}")

Test Metrics:
test_loss: 0.5339
test_accuracy: 0.7395
test_f1: 0.7051
test_precision: 0.6777
test_recall: 0.7348
test_runtime: 39.2421
test_samples_per_second: 58.5090
test_steps_per_second: 3.6700


In [16]:
# Save the Trainer's current model under the relative path "./roberta_model".
trainer.save_model("./roberta_model")

#### With text cleaning:

| Metric                   | Value     |
|--------------------------|-----------|
| Test Loss                | 0.5277    |
| Test Accuracy            | 0.7287    |
| Test F1 Score            | 0.7106    |
| Test Precision           | 0.6483    |
| Test Recall              | 0.7862    |
| Test Runtime (seconds)   | 34.2732   |
| Test Samples/Second      | 66.9910   |
| Test Steps/Second        | 4.2020    |

#### Without text cleaning:
| Metric                   | Value     |
|--------------------------|-----------|
| Test Loss                | 0.5339    |
| Test Accuracy            | 0.7395    |
| Test F1 Score            | 0.7051    |
| Test Precision           | 0.6777    |
| Test Recall              | 0.7348    |
| Test Runtime (seconds)   | 39.2421   |
| Test Samples/Second      | 58.5090   |
| Test Steps/Second        | 3.6700    |