In [3]:
pip install huggingface_hub[hf_xet]



In [4]:
!pip3 install emoji==0.6.0



In [5]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable wandb
import re
import torch
import pandas as pd
import numpy as np

# Dataset Splitting, Label Encoding, and Evaluation Metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Includes various pretrained models and utilities from Hugging Face Transformers
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
# Neural network submodules in PyTorch
import torch.nn as nn
import warnings
warnings.filterwarnings('ignore')

# ============== 1. Data Loading and Cleaning ==============
df = pd.read_csv(
    "SemEval2017-task4-dev.subtask-CE.english.INPUT.txt",
    sep='\t',
    header=None,
    names=['id', 'topic', 'label_num', 'tweet_raw'],
)

# Mapping Between Numeric and String Labels
label_map = {
    -2: "STRONGLYNEGATIVE",
    -1: "WEAKLYNEGATIVE",
     0: "NEUTRAL",
     1: "WEAKLYPOSITIVE",
     2: "STRONGLYPOSITIVE"
}
df['label'] = df['label_num'].map(label_map)

# To ensure fairness and comparability, the data cleaning procedures applied to the BERT family of models
# are kept consistent with those used for the RNN, BiLSTM, and CNN models
def basic_text_cleaning(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", text)
    text = re.sub(r"\s+", " ", text).strip() # Replace multiple spaces with a single space and strip leading/trailing whitespace
    return text

df['tweet'] = df['tweet_raw'].astype(str).apply(basic_text_cleaning) # Perform type conversion

# Concatenate the topic and tweet into a single input sequence for BERT
df['input_text'] = df.apply(lambda row: f"[TOPIC] {row['topic']} [SEP] {row['tweet']}", axis=1)

# ============== 2. Augmentation Using a Sentiment Lexicon ==============
senti_lexicon = {
    "love": 2, "like": 1, "good": 1, "hate": -2, "bad": -1, "horrible": -2
}
def lexicon_score(sentence):
    words = sentence.lower().split() # Convert sentences to lowercase and compare word by word
    score = 0
    for w in words:
        if w in senti_lexicon:
            score += senti_lexicon[w]
    return score

df['lexicon_score'] = df['tweet'].apply(lexicon_score) # Store the scores in df['lexicon_score']

# ============== 3. Data splitting and handling of class imbalance ==============
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])  # Convert to the range 0–4

# Use stratified sampling to ensure similar label distributions in the training and test sets
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label_id'] # Set a random seed to ensure reproducibility
)

# Compute class weights for use in weighted cross-entropy loss
# Assign higher weights to rare classes
train_labels_array = train_df['label_id'].to_numpy()
class_counts = Counter(train_labels_array)
num_samples = len(train_labels_array)
num_classes = len(class_counts)
weights = [num_samples / (num_classes * class_counts[i]) for i in range(num_classes)]
class_weights = torch.tensor(weights, dtype=torch.float)

# ============== 4. Construct the Dataset ==============
# Use the tokenizer to tokenize the input text, convert it to token IDs,
# truncate or pad to max_len, and generate the corresponding attention_mask
class BERTDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len # Maximum length per text

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):  # Tokenize the input text, convert it to token IDs, and apply truncation or padding to max_len
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True, # Automatically add [CLS] and [SEP] tokens
            max_length=self.max_len, # Set the maximum sequence length
            padding='max_length',  # Pad sequences shorter than max_len
            truncation=True,     # Truncate sequences that exceed max_len
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()  # Return as a dictionary
        attention_mask = encoding['attention_mask'].squeeze() # Remove extra dimensions

        return {
            'input_ids': input_ids,      # Token ID
            'attention_mask': attention_mask, # Attention mask
            'labels': torch.tensor(label, dtype=torch.long) # Convert labels to LongTensor type
        }

# Convert to Python lists for constructing a custom Dataset
train_texts = train_df['input_text'].tolist()
train_labels = train_df['label_id'].tolist()
test_texts = test_df['input_text'].tolist()
test_labels = test_df['label_id'].tolist()

# ============== 5. Custom Trainer with Weighted Cross-Entropy Loss ==============
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Because additional arguments (e.g., num_items_in_batch) may be passed to Trainer during execution,
        **kwargs is included to prevent errors
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"}) # Perform forward propagation
        logits = outputs.logits

        # Using class_weights（Alternatively, use the ordinary loss_fct = nn.CrossEntropyLoss()）
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device)) # Compute the loss
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

# ============== 6. Training Configuration (TrainingArguments) ==============
training_args = TrainingArguments(
    output_dir='./checkpoints',
    num_train_epochs=3,       # Train for 3 epochs
    per_device_train_batch_size=8, # Set the batch size to 8 per GPU/CPU during training and validation
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,  # Log every 50 steps
    do_eval=True,
    report_to="none",  # Turn off wandb logging
)

# ============== 7. Train each model sequentially and output its test performance separately ==============
# Save both the model and tokenizer for later use in ensemble evaluation
model_names = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "xlnet-base-cased",
    "google/electra-base-generator",
    # The most suitable BERT model for recognizing Twitter tweets
    "vinai/bertweet-base"
]

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

train_dataset = BERTDataset(train_texts, train_labels, None, max_len=128)
test_dataset  = BERTDataset(test_texts,  test_labels,  None, max_len=128)

all_models = []
all_tokenizers = []

for model_name in model_names:
    print(f"\n===== Fine-tuning and evaluating model: {model_name} =====")

    # 1) Load tokenizer & model
    # BERTweet follows the same architecture as RoBERTa and can be used via AutoTokenizer and AutoModel
    if "roberta" in model_name.lower():
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=5)
    elif "distilbert" in model_name.lower():
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=5)
    elif "xlnet" in model_name.lower():
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=5)
    elif "electra" in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
    elif "bertweet" in model_name.lower():
        # BERTweet is typically based on the RoBERTa architecture
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
    else:
        # Default BERT
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)

    # 2) Update the tokenizer used in the dataset
    train_dataset.tokenizer = tokenizer
    test_dataset.tokenizer  = tokenizer

    # 3) Define the Trainer
    trainer = CustomTrainer(
        model=model,      # Loaded pretrained model
        args=training_args,  # Training Parameters
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    # 4) Training
    trainer.train()

    # 5) Test set prediction with a single model
    pred_output = trainer.predict(test_dataset)
    predictions = pred_output.predictions
    preds = np.argmax(predictions, axis=1)
    test_labels_true = test_df['label_id'].tolist()

    # 6) Display evaluation for the single model
    print(f"=== Test Performance for {model_name} ===")
    print(classification_report(test_labels_true, preds, target_names=le.classes_))
    acc = accuracy_score(test_labels_true, preds)
    print("Accuracy:", acc)

    # 7) Save the model and tokenizer to a list for later use in ensemble
    all_models.append(model)
    all_tokenizers.append(tokenizer)


===== Fine-tuning and evaluating model: bert-base-uncased =====


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0646,1.008048,0.640417
2,0.6819,1.231042,0.67337
3,0.3234,1.723662,0.674824


=== Test Performance for bert-base-uncased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.72      0.71      0.71      2017
STRONGLYNEGATIVE       0.24      0.18      0.20        28
STRONGLYPOSITIVE       0.31      0.29      0.30        76
  WEAKLYNEGATIVE       0.52      0.51      0.51       440
  WEAKLYPOSITIVE       0.69      0.70      0.70      1566

        accuracy                           0.67      4127
       macro avg       0.49      0.48      0.49      4127
    weighted avg       0.67      0.67      0.67      4127

Accuracy: 0.67482432759874

===== Fine-tuning and evaluating model: roberta-base =====


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0619,1.103679,0.639448
2,0.8847,1.02132,0.659074
3,0.6133,1.133411,0.670705


=== Test Performance for roberta-base ===
                  precision    recall  f1-score   support

         NEUTRAL       0.75      0.65      0.70      2017
STRONGLYNEGATIVE       0.37      0.25      0.30        28
STRONGLYPOSITIVE       0.29      0.45      0.35        76
  WEAKLYNEGATIVE       0.49      0.62      0.55       440
  WEAKLYPOSITIVE       0.67      0.74      0.70      1566

        accuracy                           0.67      4127
       macro avg       0.52      0.54      0.52      4127
    weighted avg       0.68      0.67      0.67      4127

Accuracy: 0.6707051126726435

===== Fine-tuning and evaluating model: distilbert-base-uncased =====


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0725,1.076863,0.632905
2,0.7719,1.122435,0.661255
3,0.3925,1.515739,0.664405


=== Test Performance for distilbert-base-uncased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.71      0.69      0.70      2017
STRONGLYNEGATIVE       0.19      0.11      0.14        28
STRONGLYPOSITIVE       0.32      0.36      0.34        76
  WEAKLYNEGATIVE       0.48      0.48      0.48       440
  WEAKLYPOSITIVE       0.68      0.71      0.69      1566

        accuracy                           0.66      4127
       macro avg       0.48      0.47      0.47      4127
    weighted avg       0.66      0.66      0.66      4127

Accuracy: 0.6644051369033196

===== Fine-tuning and evaluating model: xlnet-base-cased =====


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2508,1.308497,0.51369
2,1.135,1.09806,0.629028
3,0.7963,1.11623,0.652774


model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

=== Test Performance for xlnet-base-cased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.72      0.66      0.69      2017
STRONGLYNEGATIVE       0.54      0.25      0.34        28
STRONGLYPOSITIVE       0.22      0.37      0.28        76
  WEAKLYNEGATIVE       0.49      0.57      0.52       440
  WEAKLYPOSITIVE       0.66      0.69      0.68      1566

        accuracy                           0.65      4127
       macro avg       0.53      0.51      0.50      4127
    weighted avg       0.66      0.65      0.66      4127

Accuracy: 0.6527744124061061

===== Fine-tuning and evaluating model: google/electra-base-generator =====


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-generator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1105,1.071633,0.590986
2,0.9923,0.98086,0.650351
3,0.6628,1.056624,0.656409


model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

=== Test Performance for google/electra-base-generator ===
                  precision    recall  f1-score   support

         NEUTRAL       0.72      0.66      0.69      2017
STRONGLYNEGATIVE       0.47      0.29      0.36        28
STRONGLYPOSITIVE       0.27      0.33      0.29        76
  WEAKLYNEGATIVE       0.46      0.57      0.51       440
  WEAKLYPOSITIVE       0.67      0.70      0.69      1566

        accuracy                           0.66      4127
       macro avg       0.52      0.51      0.51      4127
    weighted avg       0.67      0.66      0.66      4127

Accuracy: 0.6564090138114853

===== Fine-tuning and evaluating model: vinai/bertweet-base =====


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0187,1.036478,0.628544
2,0.9915,0.952301,0.669009
3,0.4801,1.154225,0.675794


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

=== Test Performance for vinai/bertweet-base ===
                  precision    recall  f1-score   support

         NEUTRAL       0.75      0.65      0.70      2017
STRONGLYNEGATIVE       0.33      0.36      0.34        28
STRONGLYPOSITIVE       0.37      0.43      0.40        76
  WEAKLYNEGATIVE       0.49      0.57      0.53       440
  WEAKLYPOSITIVE       0.68      0.75      0.71      1566

        accuracy                           0.68      4127
       macro avg       0.52      0.55      0.54      4127
    weighted avg       0.69      0.68      0.68      4127

Accuracy: 0.6757935546401744


In [6]:
# ============== 8. Perform ensemble by averaging logits across models ==============
def predict_ensemble(texts, max_len=128):
    # Prefer using GPU acceleration
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for m in all_models:
        m.to(device)
        m.eval() # Set the model to evaluation mode

# Initialize an empty list to store predictions for all input texts
    preds_ens = []
    for text in texts:
        logits_sum = None
        for tkn, mdl in zip(all_tokenizers, all_models):
            inputs = tkn(
                text,
                return_tensors='pt',
                max_length=max_len,
                truncation=True,
                padding='max_length'
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                out = mdl(**inputs)  # Forward propagation
                logits = out.logits.detach().cpu().numpy()
            if logits_sum is None:  # Initialize with the logits from the first model
                logits_sum = logits
            else:
                logits_sum += logits

        # Compute the mean
        ensemble_logits = logits_sum / len(all_models)
        # argmax
        pred_label_id = np.argmax(ensemble_logits, axis=1)[0]
        preds_ens.append(pred_label_id)

    return preds_ens

print("\n===== Ensemble (logits average) on Test Set =====")
test_preds_ens = predict_ensemble(test_texts)
test_labels_true = test_df['label_id'].tolist()
print(classification_report(test_labels_true, test_preds_ens, target_names=le.classes_))
acc_ens = accuracy_score(test_labels_true, test_preds_ens)
print("Ensemble Accuracy:", acc_ens)

# Convert predictions from numeric labels back to text
# ensemble_pred_labels_str = le.inverse_transform(test_preds_ens)
# print("Sample ensemble predictions:", ensemble_pred_labels_str[:10])


===== Ensemble (logits average) on Test Set =====
                  precision    recall  f1-score   support

         NEUTRAL       0.74      0.70      0.72      2017
STRONGLYNEGATIVE       0.31      0.14      0.20        28
STRONGLYPOSITIVE       0.33      0.36      0.34        76
  WEAKLYNEGATIVE       0.52      0.56      0.54       440
  WEAKLYPOSITIVE       0.70      0.74      0.72      1566

        accuracy                           0.69      4127
       macro avg       0.52      0.50      0.50      4127
    weighted avg       0.69      0.69      0.69      4127

Ensemble Accuracy: 0.6903319602616913
