In [1]:
# Import libraries
import os
import json
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import torch
import torch.nn as nn
import torchvision
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, AutoConfig
from datasets import load_dataset
from collections import Counter


In [None]:
class MetadataAwareModel(nn.Module):
    """Sequence classifier that fuses a transformer sentence embedding with metadata.

    The text is encoded by a pretrained ``AutoModel`` backbone, the metadata
    vector is projected to 128 dimensions by a linear layer, and the two are
    concatenated, passed through dropout and classified by a final linear layer.

    Args:
        pretrained_model_name: Name or path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output classes (size of the logits' last dimension).
        metadata_dim: Length of the metadata feature vector of each example.
    """

    def __init__(self, pretrained_model_name, num_labels, metadata_dim):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.metadata_fc = nn.Linear(metadata_dim, 128)  # Process metadata features
        self.classifier = nn.Linear(self.bert.config.hidden_size + 128, num_labels)  # Combine BERT and metadata
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask, metadata_features, labels=None):
        """Compute class logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            input_ids: Token ids passed to the backbone.
            attention_mask: Attention mask passed to the backbone.
            metadata_features: Metadata tensor whose last dimension is ``metadata_dim``.
            labels: Optional class labels (integer or boolean tensor).

        Returns:
            The logits tensor when ``labels`` is ``None`` (the original behaviour);
            otherwise a dict ``{"loss": ..., "logits": ...}``.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Not every backbone exposes a pooled output (DistilBERT does not);
        # fall back to the hidden state of the first ([CLS]) token in that case.
        cls_embedding = getattr(bert_output, "pooler_output", None)
        if cls_embedding is None:
            cls_embedding = bert_output.last_hidden_state[:, 0]

        # Match the dtype of the linear layer so integer or float64 metadata do not fail.
        metadata_features = metadata_features.to(self.metadata_fc.weight.dtype)
        metadata_embedding = self.metadata_fc(metadata_features)

        # Concatenate the sentence embedding with the metadata embedding
        combined_features = torch.cat((cls_embedding, metadata_embedding), dim=1)
        combined_features = self.dropout(combined_features)

        logits = self.classifier(combined_features)
        if labels is None:
            return logits

        # cross_entropy needs integer class indices; .long() also accepts boolean labels.
        loss = nn.functional.cross_entropy(logits, labels.long().view(-1))
        return {"loss": loss, "logits": logits}

In [None]:
from transformers import Trainer

class MetadataTrainer(Trainer):
    """``Trainer`` whose loss step computes cross-entropy from the model's logits.

    The batch (including ``metadata_features``) is forwarded to the model
    without the labels, so it works with models whose ``forward`` returns bare
    logits as well as with models that return a dict or tuple.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the cross-entropy loss for one batch.

        Args:
            model: Model called as ``model(**inputs_without_labels)``.
            inputs: Batch mapping; must contain ``labels`` and the model's inputs.
            return_outputs: When true, also return ``{"loss", "logits"}``.
            num_items_in_batch: Accepted and ignored; newer ``Trainer`` versions
                pass this extra argument to ``compute_loss``.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.

        Raises:
            ValueError: If the batch carries no ``labels`` entry.
        """
        # Work on a shallow copy so the caller's batch is not mutated.
        model_inputs = dict(inputs)
        labels = model_inputs.pop("labels", None)
        if labels is None:
            raise ValueError("MetadataTrainer.compute_loss requires 'labels' in the batch.")

        raw_outputs = model(**model_inputs)
        if isinstance(raw_outputs, torch.Tensor):
            logits = raw_outputs
        elif isinstance(raw_outputs, dict):
            logits = raw_outputs["logits"]
        else:
            # Tuple-style output produced without labels: logits come first.
            logits = raw_outputs[0]

        # cross_entropy needs integer class indices; .long() also accepts boolean labels.
        loss = nn.functional.cross_entropy(logits, labels.long().view(-1))

        # A dict keeps Trainer's evaluation code from slicing the logits tensor.
        outputs = {"loss": loss, "logits": logits}
        return (loss, outputs) if return_outputs else loss

In [2]:
# Load the dataset published under the "chengxuphd/liar2" identifier
dataset = load_dataset("chengxuphd/liar2")

# def flatten_column(column):
#     # Check if each element is a list or a numpy array, then flatten.
#     if isinstance(column[0], (list, np.ndarray)):
#         return [item for sublist in column for item in (sublist if isinstance(sublist, list) else list(sublist))]
#     else:
#         return column

# subject_column = np.array(dataset['train']['subject']).reshape(-1, 1)
# subject_column = flatten_column(subject_column)
# # print("Shape:", subject_column.shape)
# print("First few entries:", subject_column[:5])


In [4]:
def flatten_column(column):
    """Flatten one level of nesting in a column of values.

    Args:
        column: Sequence of values, or ``None``.

    Returns:
        ``[]`` when ``column`` is ``None`` or empty. ``column`` itself, unchanged,
        when none of its elements is a list or NumPy array. Otherwise a new
        list in which every list/array element is replaced by its items and
        every other element (strings, ``None``, numbers) is kept as is.
    """
    if column is None or len(column) == 0:
        return []

    nested_types = (list, np.ndarray)
    # Inspect every element, not only the first, so mixed columns are handled.
    if not any(isinstance(entry, nested_types) for entry in column):
        return column

    flat = []
    for entry in column:
        if isinstance(entry, nested_types):
            flat.extend(entry)
        else:
            # Scalars are appended whole; list(entry) would split strings into
            # characters and raise on None.
            flat.append(entry)
    return flat

def preprocess_metadata_column(column):
    """Normalise a metadata column into cleaned, lower-case strings.

    The column is first passed through ``flatten_column``. Each value is then
    converted to a string (``None`` becomes ``"unknown"``), lower-cased,
    stripped of every character that is not a letter, digit, whitespace or
    ``;``, and finally trimmed of leading/trailing whitespace.

    Args:
        column: Sequence of raw metadata values for one column.

    Returns:
        A list of cleaned strings.
    """
    column = flatten_column(column)

    cleaned = []
    for value in column:
        # Missing values get an explicit placeholder instead of the text "none".
        text = "unknown" if value is None else str(value).lower()
        text = re.sub(r"[^a-zA-Z0-9\s;]", "", text)
        # Strip last: removing punctuation can expose whitespace at the edges.
        cleaned.append(text.strip())

    return cleaned

def preprocess_metadata(example):
    """Clean the five metadata columns of ``example`` in place and return it."""
    for column_name in ("subject", "speaker", "speaker_description", "state_info", "context"):
        example[column_name] = preprocess_metadata_column(example[column_name])
    return example

def encode_metadata(dataset, encoders=None, return_encoders=False):
    """One-hot encode the five metadata columns into one feature row per example.

    Args:
        dataset: Object indexable by column name (``dataset["subject"]`` etc.).
        encoders: Optional dict mapping each metadata column name to an
            already fitted ``OneHotEncoder``. When given, the encoders are only
            applied (``transform``), so the output has the same width and
            column meaning as the data they were fitted on. When omitted, new
            encoders are fitted on ``dataset``.
        return_encoders: When true, return ``(features, encoders)`` so the
            encoders can be reused on other splits.

    Returns:
        A list of rows (one list of floats per example), or a
        ``(rows, encoders)`` tuple when ``return_encoders`` is true.
    """
    column_names = ("subject", "speaker", "speaker_description", "state_info", "context")

    fit_encoders = encoders is None
    if fit_encoders:
        encoders = {
            name: OneHotEncoder(handle_unknown="ignore", sparse_output=False)
            for name in column_names
        }

    encoded_blocks = []
    for name in column_names:
        # Flatten in case the column is still nested, then shape as (n_rows, 1)
        values = np.array(flatten_column(dataset[name])).reshape(-1, 1)
        encoder = encoders[name]
        if fit_encoders:
            encoded_blocks.append(encoder.fit_transform(values))
        else:
            encoded_blocks.append(encoder.transform(values))

    metadata_features = np.hstack(encoded_blocks).tolist()
    if return_encoders:
        return metadata_features, encoders
    return metadata_features

# Preprocess the dataset before encoding
train_dataset = dataset["train"].map(preprocess_metadata, batched=True)
val_dataset = dataset["validation"].map(preprocess_metadata, batched=True)
test_dataset = dataset["test"].map(preprocess_metadata, batched=True)

print("Type of first subject entry:", type(train_dataset["subject"][0]))
print("First subject entry:", train_dataset["subject"][0])

# Fit the encoders on the training split only and reuse them for validation and
# test, so all three splits share one feature layout. Fitting separately per
# split yields vectors of different widths and meanings.
train_metadata, metadata_encoders = encode_metadata(train_dataset, return_encoders=True)
val_metadata = encode_metadata(val_dataset, encoders=metadata_encoders)
test_metadata = encode_metadata(test_dataset, encoders=metadata_encoders)

assert len(train_metadata[0]) == len(val_metadata[0]) == len(test_metadata[0]), \
    "metadata feature width differs between splits"

Map:   0%|          | 0/18369 [00:00<?, ? examples/s]

Map:   0%|          | 0/2297 [00:00<?, ? examples/s]

Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

Type of first subject entry: <class 'str'>
First subject entry: government regulation;polls and public opinion;guns


In [6]:
print(train_dataset['speaker_description'][:5])
for i in range(5, 10):  # Inspect examples 5 through 9
    # Single quotes inside the f-string: reusing the outer double quotes is a
    # SyntaxError on Python versions before 3.12.
    print(f"{dataset['train'][i]['speaker']}")
    print(f"Example {i} metadata: {train_metadata[i]}")
    print(Counter(train_metadata[i]))

['chris abele is milwaukee county executive a position he won in an april 2011 special election to finish out the final year of the term of scott walker who was elected governor in november 2010 the election was the first attempt at political office for abele a milwaukee philanthropist and business owner\r\nthe office is nonpartisan but abele has indicated he is a democrat', 'thom tillis is a republican who serves as us senator from north carolina', 'chris christie announced june 6 2023 that he is running for the republican nomination for president christie was the governor of new jersey and a candidate for the republican nomination for president in 2016 as governor christie won office in 2010 and was reelected in 2014 prior to that he was us attorney for the district of new jersey from 2002 to 2008', 'lee zeldin is a republican representing new yorks 1st congressional district', 'mitt romney is a us senator from utah he\xa0ran for president in 2012 as the republican nominee losing to 

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Toy frame with four rows, used to exercise the one-hot encoding below
df = pd.DataFrame(
    [
        ("A", "X", "desc1", "state1", "context1"),
        ("B", "Y", "desc2", "state2", "context2"),
        ("A", "X", "desc1", "state1", "context1"),
        ("C", "Z", "desc3", "state3", "context3"),
    ],
    columns=["subject", "speaker", "speaker_description", "state_info", "context"],
)

def encode_metadata_test(dataset):
    """One-hot encode the five metadata columns of ``dataset`` and stack them side by side.

    A fresh ``OneHotEncoder`` is fitted per column; the result is a single
    2-D NumPy array with one row per input row.
    """
    encoded_blocks = []
    for column_name in ("subject", "speaker", "speaker_description", "state_info", "context"):
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        values = np.array(dataset[column_name]).reshape(-1, 1)
        encoded_blocks.append(encoder.fit_transform(values))
    return np.hstack(encoded_blocks)

print(encode_metadata_test(df))


In [None]:
pretrained_model = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)


def tokenize_function(examples):
    """Tokenize the ``statement`` field with max-length padding and truncation."""
    statements = examples["statement"]
    return tokenizer(statements, truncation=True, padding="max_length")

# # Combine metadata with the preprocessed statement
# def preprocess_function(examples):
#     combined_input = [
#         "Subject: " + (subject if subject is not None else "") + 
#         "; Speaker: " + (speaker if speaker is not None else "") + 
#         "; Speaker Description: " + (speaker_description if speaker_description is not None else "") + 
#         "; State: " + (state_info if state_info is not None else "") + 
#         "; Context: " + (context if context is not None else "") + 
#         "; Statement: " + (statement if statement is not None else "")  # Don't apply preprocess_text here
#         # "; Statement: " + preprocess_text(statement if statement is not None else "")  # Apply preprocess_text here
#         for subject, speaker, speaker_description, state_info, context, statement in zip(
#             examples["subject"],
#             examples["speaker"],
#             examples["speaker_description"],
#             examples["state_info"],
#             examples["context"],
#             examples["statement"]
#         )
#     ]
#     return tokenizer(combined_input, padding="max_length", truncation=True)

# Tokenize every split, then drop the listed raw columns
tokenized_datasets = dataset.map(tokenize_function, batched=True).remove_columns(
    [
        "id",
        "subject",
        "speaker",
        "speaker_description",
        "state_info",
        "context",
        "true_counts",
        "mostly_true_counts",
        "half_true_counts",
        "mostly_false_counts",
        "false_counts",
        "pants_on_fire_counts",
        "justification",
    ]
)
# tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Return PyTorch tensors when rows are accessed
tokenized_datasets.set_format("torch")

# Preprocess labels to binary True / False
# label_to_binary = {
#     'false': False,
#     'half-true': True,
#     'mostly-true': True,
#     'true': True,
#     'barely-true': False,
#     'pants-fire': False
# }
# Collapse the six integer labels into two classes: 0-2 -> 0, 3-5 -> 1.
# Plain ints (not bools) keep the label column an integer class index, which is
# what cross-entropy style losses expect. Since 0 == False and 1 == True,
# equality checks against the previous boolean values still hold.
label_to_binary = {
    0: 0,
    1: 0,
    2: 0,
    3: 1,
    4: 1,
    5: 1
}

# Access labels
# labels = dataset["train"].features["label"]

# Replace every label with its binary counterpart from label_to_binary
tokenized_datasets = tokenized_datasets.map(
    lambda batch: {
        "label": list(map(lambda raw: label_to_binary[int(raw)], batch["label"]))
    },
    batched=True,
)

# Pull out the train, validation, and test splits
train_dataset, val_dataset, test_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
# Function to add metadata features to the dataset
def add_metadata(examples, idx, metadata):
    """Store ``metadata[idx]`` under ``"metadata_features"`` in ``examples`` and return it.

    ``examples`` is modified in place; the same object is returned.
    """
    row_features = metadata[idx]
    examples["metadata_features"] = row_features
    return examples

def _attach_metadata(split_dataset, metadata):
    """Return ``split_dataset`` with a ``metadata_features`` column taken row by row from ``metadata``.

    Raises:
        ValueError: If ``metadata`` does not have exactly one row per example.
    """
    if len(metadata) != len(split_dataset):
        raise ValueError(
            f"metadata has {len(metadata)} rows but the dataset has {len(split_dataset)} examples"
        )
    return split_dataset.map(
        lambda examples, idx: add_metadata(examples, idx, metadata),
        with_indices=True,
        batched=False,  # add_metadata handles one example at a time
    )


# Add metadata to train, validation, and test datasets
train_dataset = _attach_metadata(train_dataset, train_metadata)
print("Metadata added to train_dataset.")

val_dataset = _attach_metadata(val_dataset, val_metadata)
print("Metadata added to val_dataset.")

test_dataset = _attach_metadata(test_dataset, test_metadata)
print("Metadata added to test_dataset.")

In [None]:
print(train_dataset.column_names)
# Distinct values (and their counts) in the metadata vector of a few sample rows
for split, row in (
    (val_dataset, 0),
    (val_dataset, 10),
    (train_dataset, 0),
    (train_dataset, 696),
    (train_dataset, 666),
):
    print(np.unique(split[row]['metadata_features'].numpy(), return_counts=True))

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Define compute_metrics function
def compute_metrics(pred):
    """Return accuracy, F1, precision and recall (binary average) for a prediction object.

    ``pred.label_ids`` holds the reference labels; the predicted class is the
    argmax over the last axis of ``pred.predictions``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": prf[2],
        "precision": prf[0],
        "recall": prf[1],
    }

# Define training arguments (epoch)
training_args = TrainingArguments(
    # Output and logging locations
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    # Evaluate and save a checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    # Checkpoint selection: monitor eval_loss, where lower is better,
    # and reload the best checkpoint when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# # Define training arguments (steps for smaller batch logging)
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="steps",  # Evaluate during training
#     eval_steps=100,               # Evaluate every 100 steps
#     save_strategy="steps",        # Save checkpoints every 100 steps
#     save_steps=100,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     learning_rate=2e-5,
#     logging_dir="./logs",
#     logging_steps=10,             # Log every 10 steps
#     report_to="none",
#     load_best_model_at_end=True,  # Load the best model at the end of training
#     metric_for_best_model="f1",  # Specify the metric to monitor
#     greater_is_better=True       # Specify if higher values of the metric are better
# )

In [None]:
# Width of the metadata vector the model has to accept
metadata_dim = len(train_metadata[0])
# NOTE(review): `config` is built here but never passed to MetadataAwareModel in
# this cell, so these dropout overrides do not affect `model` — confirm intent.
config = AutoConfig.from_pretrained(pretrained_model, num_labels=2, hidden_dropout_prob=0.3, attention_probs_dropout_prob=0.3)
model = MetadataAwareModel(pretrained_model, metadata_dim=metadata_dim, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
training_args.num_train_epochs = 5
training_args.learning_rate = 2e-5
print("Model is on:", next(model.parameters()).device)
print("Learning rate:", training_args.learning_rate)

# Use the MetadataTrainer subclass defined earlier in this notebook: its
# compute_loss is the one written for a model that takes metadata_features,
# whereas the stock Trainer was being instantiated here.
trainer = MetadataTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()