In [None]:
pip install pandas scikit-learn imbalanced-learn torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"  # 或在 Trainer 中设置 report_to="none"

import re
import torch
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# 如果想做 TF-IDF + SMOTE，可导入
# from sklearn.feature_extraction.text import TfidfVectorizer
# from imblearn.over_sampling import SMOTE

# transformers 相关
from transformers import (
    BertTokenizer, BertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
import torch.nn as nn

# ============== 1. Data loading and cleaning ==============
# Read the tab-separated input file; header=None means the first line is data,
# so the four column names are supplied explicitly.
df = pd.read_csv(
    "SemEval2017-task4-dev.subtask-CE.english.INPUT.txt",
    sep='\t',
    header=None,
    names=['id', 'topic', 'label_num', 'tweet_raw'],
)

# Mapping from numeric label to its string name
label_map = {
    -2: "STRONGLYNEGATIVE",
    -1: "WEAKLYNEGATIVE",
     0: "NEUTRAL",
     1: "WEAKLYPOSITIVE",
     2: "STRONGLYPOSITIVE"
}
# Add the string label column derived from label_num.
df['label'] = df['label_num'].map(label_map)

def basic_text_cleaning(text):
    """Normalise a raw tweet before tokenisation.

    Removes URLs and @-mentions, drops the '#' marker (the hashtag word stays),
    replaces every character other than letters, digits and ( ) , ! ? ' ` with
    a space, and collapses runs of whitespace.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # URLs: anchor at a word boundary and require the dot after "www" so that
    # ordinary words such as "awwww" are not mistaken for links and truncated.
    text = re.sub(r"\bhttp\S+|\bwww\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)  # @-mentions
    text = re.sub(r"#", "", text)  # keep the hashtag word, drop the marker
    text = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean every raw tweet; astype(str) ensures the cleaner always receives a string.
df['tweet'] = df['tweet_raw'].astype(str).apply(basic_text_cleaning)

# Concatenate topic and tweet into one text that is used as the BERT input
# NOTE(review): "[TOPIC]" and "[SEP]" are inserted as plain text here; whether each
# tokenizer used later treats them as special tokens is not visible in this cell — confirm.
df['input_text'] = df.apply(lambda row: f"[TOPIC] {row['topic']} [SEP] {row['tweet']}", axis=1)

# ============== 2. Optional sentiment-lexicon feature (example) ==============
# Minimal demonstration of lexicon-based scoring; comment it out if not needed.
senti_lexicon = {
    "love": 2, "like": 1, "good": 1, "hate": -2, "bad": -1, "horrible": -2
}
def lexicon_score(sentence):
    """Sum the ``senti_lexicon`` polarity of every word in ``sentence``.

    Words are runs of ASCII letters taken from the lower-cased text, so
    punctuation left attached to a word ("love!", "good,") no longer hides a
    lexicon hit the way plain whitespace splitting did.

    Args:
        sentence: Text to score (str).

    Returns:
        int: Sum of the scores of all matched words; 0 when nothing matches.
    """
    words = re.findall(r"[a-z]+", sentence.lower())
    return sum(senti_lexicon.get(w, 0) for w in words)

# Lexicon score of each cleaned tweet, stored as an extra column.
df['lexicon_score'] = df['tweet'].apply(lexicon_score)

# ============== 3. Train/test split & class-imbalance handling ==============
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])  # encode labels as 0~4

# Stratified 80/20 split with a fixed random_state.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label_id']
)

# Compute class_weights for the weighted cross-entropy loss
train_labels_array = train_df['label_id'].to_numpy()
class_counts = Counter(train_labels_array)
num_samples = len(train_labels_array)
num_classes = len(class_counts)
# weight_i = num_samples / (num_classes * count_i): rarer classes get larger weights.
weights = [num_samples / (num_classes * class_counts[i]) for i in range(num_classes)]
class_weights = torch.tensor(weights, dtype=torch.float)

# ============== 4. Dataset ==============
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item on access.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer. It may be assigned later through the
            ``tokenizer`` attribute, but must be set before items are read.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        Raises:
            TypeError: If no tokenizer has been assigned yet.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        if self.tokenizer is None:
            raise TypeError("BERTDataset.tokenizer must be set before reading items")

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis of the encoded tensors. A bare
        # squeeze() would also remove the sequence axis when its length is 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Plain Python lists of input texts and label ids for the train and test splits
train_texts = train_df['input_text'].tolist()
train_labels = train_df['label_id'].tolist()
test_texts = test_df['input_text'].tolist()
test_labels = test_df['label_id'].tolist()

# ============== 5. Custom Trainer (weighted cross-entropy) ==============
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        ``**kwargs`` absorbs extra arguments (such as ``num_items_in_batch``)
        that newer transformers versions may pass, so the call does not fail.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.get("labels")
        # Forward pass without the labels, so the loss is computed here.
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)
        logits = outputs.logits

        # The weights are moved to whichever device the logits live on.
        criterion = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = criterion(logits, targets)

        if return_outputs:
            return (loss, outputs)
        return loss

# ============== 6. Training configuration (TrainingArguments) ==============
# NOTE(review): this single output_dir is reused for every model trained below while
# save_strategy="epoch" writes checkpoints — confirm that runs do not overwrite each other.
training_args = TrainingArguments(
    output_dir='./checkpoints',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  # switch back to evaluation_strategy="epoch" if your version asks for it
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    do_eval=True,
    report_to="none",  # disable wandb reporting
)

# ============== 7. Pick 5 models and fine-tune them one by one ==============
model_names = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "xlnet-base-cased",
    "google/electra-base-generator"
]

# Filled by the training loop below and read by the ensemble step
all_trained_models = []
all_tokenizers = []

# Build the datasets; the tokenizer is None for now and is assigned per model in the training loop
train_dataset = BERTDataset(train_texts, train_labels, None, max_len=128)
test_dataset = BERTDataset(test_texts, test_labels, None, max_len=128)

def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the argmax over axis 1 of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

# Checkpoint-name substring -> (tokenizer class, model class); the first match wins
# and anything unmatched falls back to BERT.
MODEL_FAMILIES = (
    ("roberta", RobertaTokenizer, RobertaForSequenceClassification),
    ("distilbert", DistilBertTokenizer, DistilBertForSequenceClassification),
    ("xlnet", XLNetTokenizer, XLNetForSequenceClassification),
    ("electra", AutoTokenizer, AutoModelForSequenceClassification),
)

for model_name in model_names:
    print(f"***** Fine-tuning model {model_name} *****")

    # Load the tokenizer and model for this checkpoint family
    lowered_name = model_name.lower()
    tokenizer_cls, model_cls = next(
        ((tok_cls, mdl_cls) for key, tok_cls, mdl_cls in MODEL_FAMILIES if key in lowered_name),
        (BertTokenizer, BertForSequenceClassification),
    )
    tokenizer = tokenizer_cls.from_pretrained(model_name)
    # Size the classification head with num_classes (which also fixes the length of
    # class_weights) instead of a hard-coded 5, so the weighted loss always lines up.
    model = model_cls.from_pretrained(model_name, num_labels=num_classes)

    # Point the shared datasets at this model's tokenizer
    train_dataset.tokenizer = tokenizer
    test_dataset.tokenizer = tokenizer

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Optionally save the model once training is done
    # trainer.save_model(f"./checkpoints/{model_name}")

    # Keep the trained model and its tokenizer for the ensemble step
    all_trained_models.append(trainer.model)
    all_tokenizers.append(tokenizer)

# ============== 8. Simple ensemble (average of logits) ==============
# Models and inputs must live on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def predict_ensemble(texts, batch_size=32):
    """Predict class ids by averaging the logits of all trained models.

    Each (tokenizer, model) pair from ``all_tokenizers`` / ``all_trained_models``
    scores the texts in mini-batches instead of one forward pass per text.

    Args:
        texts: Iterable of input strings.
        batch_size: Number of texts per forward pass.

    Returns:
        list: One predicted label id per input text, in input order.

    Raises:
        ValueError: If texts are given but no trained model is available.
    """
    texts = list(texts)
    if not texts:
        return []
    if not all_trained_models:
        raise ValueError("predict_ensemble needs at least one trained model")

    logits_sum = None
    for tokenizer, model in zip(all_tokenizers, all_trained_models):
        model.to(device)
        model.eval()

        chunks = []
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                return_tensors='pt',
                max_length=128,
                truncation=True,
                padding='max_length'
            )
            # Send the inputs to the same device as the model
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
            # Back to CPU for the numpy arithmetic
            chunks.append(outputs.logits.detach().cpu().numpy())

        model_logits = np.concatenate(chunks, axis=0)
        logits_sum = model_logits if logits_sum is None else logits_sum + model_logits

    # Average over the models, then take the highest-scoring class per text
    ensemble_logits = logits_sum / len(all_trained_models)
    return list(np.argmax(ensemble_logits, axis=1))

# Run the ensemble on the test set
test_preds_ens = predict_ensemble(test_texts)
test_labels_true = test_df['label_id'].tolist()

print("=== Ensemble Model Test Performance ===")
# zero_division=0 reports 0.00 for classes that are never predicted instead of
# emitting an undefined-metric warning for each affected metric.
print(classification_report(test_labels_true, test_preds_ens, target_names=le.classes_, zero_division=0))
acc_ens = accuracy_score(test_labels_true, test_preds_ens)
print("Ensemble Accuracy:", acc_ens)

# Convert the numeric predictions back to label strings
pred_labels_str = le.inverse_transform(test_preds_ens)
print("Sample ensemble predictions (text):", pred_labels_str[:10])


***** Fine-tuning model bert-base-uncased *****


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9696,0.913417,0.646959


***** Fine-tuning model roberta-base *****


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2593,1.253502,0.526048


***** Fine-tuning model distilbert-base-uncased *****


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9423,0.926661,0.64599


***** Fine-tuning model xlnet-base-cased *****


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5407,1.490338,0.488733


***** Fine-tuning model google/electra-base-generator *****


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-generator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0505,1.015844,0.632905


=== Ensemble Model Test Performance ===
                  precision    recall  f1-score   support

         NEUTRAL       0.73      0.61      0.66      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.50      0.03      0.05        76
  WEAKLYNEGATIVE       0.43      0.69      0.53       440
  WEAKLYPOSITIVE       0.68      0.76      0.71      1566

        accuracy                           0.66      4127
       macro avg       0.47      0.42      0.39      4127
    weighted avg       0.67      0.66      0.65      4127

Ensemble Accuracy: 0.6568936273322026
Sample ensemble predictions (text): ['WEAKLYPOSITIVE' 'WEAKLYPOSITIVE' 'WEAKLYPOSITIVE' 'WEAKLYPOSITIVE'
 'NEUTRAL' 'WEAKLYNEGATIVE' 'WEAKLYPOSITIVE' 'WEAKLYPOSITIVE'
 'WEAKLYPOSITIVE' 'NEUTRAL']


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"  # 禁用 wandb, 也可在 Trainer 里用 report_to="none"

import re
import torch
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter

from transformers import (
    BertTokenizer, BertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
import torch.nn as nn

# ============== 1. Data loading and cleaning ==============
# Read the tab-separated input file; header=None means the first line is data,
# so the four column names are supplied explicitly.
df = pd.read_csv(
    "SemEval2017-task4-dev.subtask-CE.english.INPUT.txt",
    sep='\t',
    header=None,
    names=['id', 'topic', 'label_num', 'tweet_raw'],
)

# Mapping from numeric label to its string name
label_map = {
    -2: "STRONGLYNEGATIVE",
    -1: "WEAKLYNEGATIVE",
     0: "NEUTRAL",
     1: "WEAKLYPOSITIVE",
     2: "STRONGLYPOSITIVE"
}
# Add the string label column derived from label_num.
df['label'] = df['label_num'].map(label_map)

def basic_text_cleaning(text):
    """Normalise a raw tweet before tokenisation.

    Removes URLs and @-mentions, drops the '#' marker (the hashtag word stays),
    replaces every character other than letters, digits and ( ) , ! ? ' ` with
    a space, and collapses runs of whitespace.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # URLs: anchor at a word boundary and require the dot after "www" so that
    # ordinary words such as "awwww" are not mistaken for links and truncated.
    text = re.sub(r"\bhttp\S+|\bwww\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)  # @-mentions
    text = re.sub(r"#", "", text)  # keep the hashtag word, drop the marker
    text = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean every raw tweet; astype(str) ensures the cleaner always receives a string.
df['tweet'] = df['tweet_raw'].astype(str).apply(basic_text_cleaning)

# Concatenate topic and tweet into one text that is used as the BERT input
# NOTE(review): "[TOPIC]" and "[SEP]" are inserted as plain text here; whether each
# tokenizer used later treats them as special tokens is not visible in this cell — confirm.
df['input_text'] = df.apply(lambda row: f"[TOPIC] {row['topic']} [SEP] {row['tweet']}", axis=1)

# ============== 2. Optional sentiment-lexicon feature (example) ==============
# Comment this out if it is not needed
senti_lexicon = {
    "love": 2, "like": 1, "good": 1, "hate": -2, "bad": -1, "horrible": -2
}
def lexicon_score(sentence):
    """Sum the ``senti_lexicon`` polarity of every word in ``sentence``.

    Words are runs of ASCII letters taken from the lower-cased text, so
    punctuation left attached to a word ("love!", "good,") no longer hides a
    lexicon hit the way plain whitespace splitting did.

    Args:
        sentence: Text to score (str).

    Returns:
        int: Sum of the scores of all matched words; 0 when nothing matches.
    """
    words = re.findall(r"[a-z]+", sentence.lower())
    return sum(senti_lexicon.get(w, 0) for w in words)

# Lexicon score of each cleaned tweet, stored as an extra column.
df['lexicon_score'] = df['tweet'].apply(lexicon_score)

# ============== 3. Train/test split & class-imbalance handling ==============
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])  # encode labels as 0~4

# Stratified 80/20 split with a fixed random_state.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label_id']
)

# Compute class_weights for the weighted cross-entropy loss
train_labels_array = train_df['label_id'].to_numpy()
class_counts = Counter(train_labels_array)
num_samples = len(train_labels_array)
num_classes = len(class_counts)
# weight_i = num_samples / (num_classes * count_i): rarer classes get larger weights.
weights = [num_samples / (num_classes * class_counts[i]) for i in range(num_classes)]
class_weights = torch.tensor(weights, dtype=torch.float)

# ============== 4. Dataset ==============
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item on access.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer. It may be assigned later through the
            ``tokenizer`` attribute, but must be set before items are read.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        Raises:
            TypeError: If no tokenizer has been assigned yet.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        if self.tokenizer is None:
            raise TypeError("BERTDataset.tokenizer must be set before reading items")

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis of the encoded tensors. A bare
        # squeeze() would also remove the sequence axis when its length is 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Plain Python lists of input texts and label ids for the train and test splits
train_texts = train_df['input_text'].tolist()
train_labels = train_df['label_id'].tolist()
test_texts = test_df['input_text'].tolist()
test_labels = test_df['label_id'].tolist()

# ============== 5. Custom Trainer (weighted cross-entropy) ==============
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        ``**kwargs`` absorbs extra arguments (such as ``num_items_in_batch``)
        that newer transformers versions may pass, so the call does not fail.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        targets = inputs.get("labels")
        # Forward pass without the labels, so the loss is computed here.
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)
        logits = outputs.logits

        # The weights are moved to whichever device the logits live on.
        criterion = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = criterion(logits, targets)

        if return_outputs:
            return (loss, outputs)
        return loss

# ============== 6. Training configuration (TrainingArguments) ==============
# NOTE(review): this single output_dir is reused for every model trained below while
# save_strategy="epoch" writes checkpoints — confirm that runs do not overwrite each other.
training_args = TrainingArguments(
    output_dir='./checkpoints',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",  # or switch back to evaluation_strategy="epoch" for version compatibility
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    do_eval=True,
    report_to="none",  # disable wandb logging
)

# ============== 7. Train each model in turn and print its own test performance ==============
model_names = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "xlnet-base-cased",
    "google/electra-base-generator"
]

def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for a ``(logits, labels)`` evaluation pair.

    The predicted class of each row is the argmax over axis 1 of the logits.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

# Build the datasets; the tokenizer is None for now and is assigned per model inside the loop below
train_dataset = BERTDataset(train_texts, train_labels, None, max_len=128)
test_dataset = BERTDataset(test_texts, test_labels, None, max_len=128)

# Checkpoint-name substring -> (tokenizer class, model class); the first match wins
# and anything unmatched falls back to BERT.
MODEL_FAMILIES = (
    ("roberta", RobertaTokenizer, RobertaForSequenceClassification),
    ("distilbert", DistilBertTokenizer, DistilBertForSequenceClassification),
    ("xlnet", XLNetTokenizer, XLNetForSequenceClassification),
    ("electra", AutoTokenizer, AutoModelForSequenceClassification),
)

# The gold labels do not change between models, so build the list once.
test_labels_true = test_df['label_id'].tolist()

for model_name in model_names:
    print(f"\n===== Fine-tuning and evaluating model: {model_name} =====")

    # 1) Load the tokenizer and model for this checkpoint family
    lowered_name = model_name.lower()
    tokenizer_cls, model_cls = next(
        ((tok_cls, mdl_cls) for key, tok_cls, mdl_cls in MODEL_FAMILIES if key in lowered_name),
        (BertTokenizer, BertForSequenceClassification),
    )
    tokenizer = tokenizer_cls.from_pretrained(model_name)
    # Size the classification head with num_classes (which also fixes the length of
    # class_weights) instead of a hard-coded 5, so the weighted loss always lines up.
    model = model_cls.from_pretrained(model_name, num_labels=num_classes)

    # 2) Point the shared datasets at this model's tokenizer
    train_dataset.tokenizer = tokenizer
    test_dataset.tokenizer = tokenizer

    # 3) Build the Trainer
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    # 4) Train
    trainer.train()

    # 5) Predict on the test set
    pred_output = trainer.predict(test_dataset)
    predictions = pred_output.predictions
    preds = np.argmax(predictions, axis=1)

    # 6) Evaluate and print
    print(f"=== Test Performance for {model_name} ===")
    # zero_division=0 reports 0.00 for classes that are never predicted instead of
    # emitting an undefined-metric warning for each affected metric.
    print(classification_report(test_labels_true, preds, target_names=le.classes_, zero_division=0))
    acc = accuracy_score(test_labels_true, preds)
    print("Accuracy:", acc)

    # To turn the numeric predictions back into label strings:
    # pred_labels_str = le.inverse_transform(preds)
    # print("Sample predictions:", pred_labels_str[:10])



===== Fine-tuning and evaluating model: bert-base-uncased =====


tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9614,0.911207,0.656167


=== Test Performance for bert-base-uncased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.76      0.60      0.67      2017
STRONGLYNEGATIVE       0.20      0.04      0.06        28
STRONGLYPOSITIVE       0.29      0.36      0.32        76
  WEAKLYNEGATIVE       0.43      0.72      0.54       440
  WEAKLYPOSITIVE       0.68      0.73      0.71      1566

        accuracy                           0.66      4127
       macro avg       0.47      0.49      0.46      4127
    weighted avg       0.68      0.66      0.66      4127

Accuracy: 0.6561667070511267

===== Fine-tuning and evaluating model: roberta-base =====


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9634,0.898862,0.646474


=== Test Performance for roberta-base ===
                  precision    recall  f1-score   support

         NEUTRAL       0.76      0.57      0.65      2017
STRONGLYNEGATIVE       0.43      0.11      0.17        28
STRONGLYPOSITIVE       0.29      0.49      0.36        76
  WEAKLYNEGATIVE       0.42      0.74      0.53       440
  WEAKLYPOSITIVE       0.67      0.74      0.70      1566

        accuracy                           0.65      4127
       macro avg       0.52      0.53      0.49      4127
    weighted avg       0.68      0.65      0.65      4127

Accuracy: 0.6464744366367822

===== Fine-tuning and evaluating model: distilbert-base-uncased =====


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9463,0.924732,0.643324


=== Test Performance for distilbert-base-uncased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.75      0.60      0.66      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.28      0.36      0.32        76
  WEAKLYNEGATIVE       0.41      0.72      0.52       440
  WEAKLYPOSITIVE       0.68      0.71      0.69      1566

        accuracy                           0.64      4127
       macro avg       0.42      0.48      0.44      4127
    weighted avg       0.67      0.64      0.65      4127

Accuracy: 0.6433244487521201

===== Fine-tuning and evaluating model: xlnet-base-cased =====


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.127,1.156274,0.582505


model.safetensors:   0%|          | 0.00/467M [00:00<?, ?B/s]

=== Test Performance for xlnet-base-cased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.68      0.48      0.56      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.00      0.00      0.00        76
  WEAKLYNEGATIVE       0.35      0.71      0.47       440
  WEAKLYPOSITIVE       0.62      0.72      0.66      1566

        accuracy                           0.58      4127
       macro avg       0.33      0.38      0.34      4127
    weighted avg       0.60      0.58      0.58      4127

Accuracy: 0.5825054519021081

===== Fine-tuning and evaluating model: google/electra-base-generator =====


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-generator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1024,1.015193,0.631936


model.safetensors:   0%|          | 0.00/135M [00:00<?, ?B/s]

=== Test Performance for google/electra-base-generator ===
                  precision    recall  f1-score   support

         NEUTRAL       0.71      0.61      0.65      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.32      0.09      0.14        76
  WEAKLYNEGATIVE       0.39      0.69      0.50       440
  WEAKLYPOSITIVE       0.68      0.69      0.68      1566

        accuracy                           0.63      4127
       macro avg       0.42      0.41      0.39      4127
    weighted avg       0.65      0.63      0.63      4127

Accuracy: 0.6319360310152653


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"  # 禁用 wandb, 也可在 Trainer 里用 report_to="none"

import re
import torch
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter

from transformers import (
    BertTokenizer, BertForSequenceClassification,
    RobertaTokenizer, RobertaForSequenceClassification,
    DistilBertTokenizer, DistilBertForSequenceClassification,
    XLNetTokenizer, XLNetForSequenceClassification,
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer
)
import torch.nn as nn

# ============== 1. Data loading and cleaning ==============
# Read the tab-separated input file; header=None means the first line is data,
# so the four column names are supplied explicitly.
df = pd.read_csv(
    "SemEval2017-task4-dev.subtask-CE.english.INPUT.txt",
    sep='\t',
    header=None,
    names=['id', 'topic', 'label_num', 'tweet_raw'],
)

# Mapping from numeric label to its string name
label_map = {
    -2: "STRONGLYNEGATIVE",
    -1: "WEAKLYNEGATIVE",
     0: "NEUTRAL",
     1: "WEAKLYPOSITIVE",
     2: "STRONGLYPOSITIVE"
}
# Add the string label column derived from label_num.
df['label'] = df['label_num'].map(label_map)

def basic_text_cleaning(text):
    """Normalise a raw tweet before tokenisation.

    Removes URLs and @-mentions, drops the '#' marker (the hashtag word stays),
    replaces every character other than letters, digits and ( ) , ! ? ' ` with
    a space, and collapses runs of whitespace.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    # URLs: anchor at a word boundary and require the dot after "www" so that
    # ordinary words such as "awwww" are not mistaken for links and truncated.
    text = re.sub(r"\bhttp\S+|\bwww\.\S+", "", text)
    text = re.sub(r"@\w+", "", text)  # @-mentions
    text = re.sub(r"#", "", text)  # keep the hashtag word, drop the marker
    text = re.sub(r"[^A-Za-z0-9(),!?\'`]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Clean every raw tweet; astype(str) ensures the cleaner always receives a string.
df['tweet'] = df['tweet_raw'].astype(str).apply(basic_text_cleaning)

# Concatenate topic and tweet into one text that is used as the BERT input
# NOTE(review): "[TOPIC]" and "[SEP]" are inserted as plain text here; whether each
# tokenizer used later treats them as special tokens is not visible in this cell — confirm.
df['input_text'] = df.apply(lambda row: f"[TOPIC] {row['topic']} [SEP] {row['tweet']}", axis=1)

# ============== 2. Optional sentiment-lexicon feature (example) ==============
senti_lexicon = {
    "love": 2, "like": 1, "good": 1, "hate": -2, "bad": -1, "horrible": -2
}
def lexicon_score(sentence):
    """Sum the ``senti_lexicon`` polarity of every word in ``sentence``.

    Words are runs of ASCII letters taken from the lower-cased text, so
    punctuation left attached to a word ("love!", "good,") no longer hides a
    lexicon hit the way plain whitespace splitting did.

    Args:
        sentence: Text to score (str).

    Returns:
        int: Sum of the scores of all matched words; 0 when nothing matches.
    """
    words = re.findall(r"[a-z]+", sentence.lower())
    return sum(senti_lexicon.get(w, 0) for w in words)

# Lexicon score of each cleaned tweet, stored as an extra column.
df['lexicon_score'] = df['tweet'].apply(lexicon_score)

# ============== 3. Train/test split & class-imbalance handling ==============
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['label'])  # encode labels as 0~4

# Stratified 80/20 split with a fixed random_state.
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label_id']
)

# Compute class_weights for the weighted cross-entropy loss (optional)
train_labels_array = train_df['label_id'].to_numpy()
class_counts = Counter(train_labels_array)
num_samples = len(train_labels_array)
num_classes = len(class_counts)
# weight_i = num_samples / (num_classes * count_i): rarer classes get larger weights.
weights = [num_samples / (num_classes * class_counts[i]) for i in range(num_classes)]
class_weights = torch.tensor(weights, dtype=torch.float)

# ============== 4. Dataset ==============
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item on access.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer. It may be assigned later through the
            ``tokenizer`` attribute, but must be set before items are read.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        Raises:
            TypeError: If no tokenizer has been assigned yet.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        if self.tokenizer is None:
            raise TypeError("BERTDataset.tokenizer must be set before reading items")

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        # Drop only the leading batch axis of the encoded tensors. A bare
        # squeeze() would also remove the sequence axis when its length is 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Plain Python lists of input texts and label ids for the train and test splits
train_texts = train_df['input_text'].tolist()
train_labels = train_df['label_id'].tolist()
test_texts = test_df['input_text'].tolist()
test_labels = test_df['label_id'].tolist()

# ============== 5. 自定义 Trainer（加权交叉熵） ==============
class CustomTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the logits."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Run the model on the batch (labels withheld) and compute the weighted loss.

        Trainer may pass extra arguments when calling this method (for
        example ``num_items_in_batch``), so ``**kwargs`` is accepted and
        ignored to avoid an error.

        Returns:
            ``(loss, outputs)`` when ``return_outputs`` is true, else ``loss``.
        """
        labels = inputs.get("labels")

        # Everything except the labels goes to the forward pass; the loss is
        # computed here rather than inside the model.
        features = {name: value for name, value in inputs.items() if name != "labels"}
        outputs = model(**features)
        logits = outputs.logits

        # Uses class_weights (swap in a plain nn.CrossEntropyLoss() for an
        # unweighted loss). The weights are moved to the logits' device.
        weighted_ce = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = weighted_ce(logits, labels)

        if return_outputs:
            return loss, outputs
        return loss

# ============== 6. Training configuration (TrainingArguments) ==============
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./checkpoints",
    logging_dir="./logs",
    # Schedule and batch sizes
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and save once per epoch
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Logging
    logging_steps=50,
    report_to="none",  # disable wandb logging
)

# ============== 7. Train each model in turn and print its own test performance ==============
# The models and tokenizers are also kept so the ensemble step can reuse them.
model_names = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "xlnet-base-cased",
    "google/electra-base-generator",
    # Newly added: vinai/bertweet-base
    "vinai/bertweet-base",
]

def compute_metrics(eval_pred):
    """
    Turn an evaluation ``(logits, labels)`` pair into an accuracy metric.

    The predicted class of each row is the arg-max over axis 1 of the logits.

    Returns:
        dict with the single key ``"accuracy"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    return {"accuracy": accuracy_score(gold, predicted)}

# The datasets are built once with no tokenizer; each loop iteration below
# plugs in the tokenizer that belongs to the model being fine-tuned.
train_dataset = BERTDataset(train_texts, train_labels, None, max_len=128)
test_dataset  = BERTDataset(test_texts,  test_labels,  None, max_len=128)

all_models = []
all_tokenizers = []

# Size every classification head from the fitted label encoder rather than a
# hard-coded 5, so the head always matches the labels that were encoded.
num_labels = len(le.classes_)

# The ground-truth ids are the same for every model, so compute them once.
test_labels_true = test_df['label_id'].tolist()

for model_name in model_names:
    print(f"\n===== Fine-tuning and evaluating model: {model_name} =====")

    # 1) Load tokenizer & model.
    # Note: BERTweet is usually similar to RoBERTa and can also be loaded
    # directly through AutoTokenizer / AutoModel.
    if "roberta" in model_name.lower():
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    elif "distilbert" in model_name.lower():
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    elif "xlnet" in model_name.lower():
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        model = XLNetForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    elif "electra" in model_name.lower():
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    elif "bertweet" in model_name.lower():
        # BERTweet is typically a RoBERTa architecture.
        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    else:
        # Default: BERT.
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # 2) Point the shared datasets at this model's tokenizer.
    train_dataset.tokenizer = tokenizer
    test_dataset.tokenizer  = tokenizer

    # 3) Build the Trainer (weighted cross-entropy via CustomTrainer).
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    # 4) Train.
    trainer.train()

    # 5) Predict on the test set with this single model.
    pred_output = trainer.predict(test_dataset)
    predictions = pred_output.predictions
    preds = np.argmax(predictions, axis=1)

    # 6) Print the single-model evaluation.
    print(f"=== Test Performance for {model_name} ===")
    print(classification_report(test_labels_true, preds, target_names=le.classes_))
    acc = accuracy_score(test_labels_true, preds)
    print("Accuracy:", acc)

    # 7) Keep the model & tokenizer for the ensemble step.
    all_models.append(model)
    all_tokenizers.append(tokenizer)

# ============== 8. 做一个 ensemble (logits平均) ==============
def predict_ensemble(texts, max_len=128, batch_size=32):
    """
    Predict a label id for each text by averaging the logits of all models.

    Reads the module-level ``all_models`` and ``all_tokenizers`` lists, which
    are paired positionally. Every model is moved to the GPU when one is
    available (otherwise the CPU) and switched to eval mode. Each text is
    tokenized with the tokenizer paired with the model, padded/truncated to
    ``max_len``.

    Args:
        texts: Iterable of input strings.
        max_len: Sequence length passed to every tokenizer as ``max_length``.
        batch_size: Number of texts sent through each model per forward
            pass. Texts are processed in batches instead of one at a time so
            that inference does not need one forward pass per text per model.

    Returns:
        list: One predicted label id per input text, in input order (the
        arg-max of the averaged logits). Empty when ``texts`` is empty.

    Raises:
        ValueError: If no models are registered or ``batch_size`` is < 1.
    """
    if not all_models:
        # Without this guard the average below would fail with an opaque
        # TypeError (None divided by zero models).
        raise ValueError("predict_ensemble needs at least one model in all_models")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for m in all_models:
        m.to(device)
        m.eval()

    texts = list(texts)  # allow slicing regardless of the iterable passed in
    preds_ens = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        logits_sum = None
        for tkn, mdl in zip(all_tokenizers, all_models):
            inputs = tkn(
                batch_texts,
                return_tensors='pt',
                max_length=max_len,
                truncation=True,
                padding='max_length'
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                logits = mdl(**inputs).logits.detach().cpu().numpy()
            logits_sum = logits if logits_sum is None else logits_sum + logits

        # Average over models, then take the arg-max class of every row.
        ensemble_logits = logits_sum / len(all_models)
        preds_ens.extend(np.argmax(ensemble_logits, axis=1))

    return preds_ens

# Evaluate the logits-averaging ensemble on the held-out test split.
print("\n===== Ensemble (logits average) on Test Set =====")
test_labels_true = test_df['label_id'].tolist()
test_preds_ens = predict_ensemble(test_texts)
print(
    classification_report(
        test_labels_true,
        test_preds_ens,
        target_names=le.classes_,
    )
)
acc_ens = accuracy_score(test_labels_true, test_preds_ens)
print("Ensemble Accuracy:", acc_ens)

# To turn the numeric predictions back into label strings:
# ensemble_pred_labels_str = le.inverse_transform(test_preds_ens)
# print("Sample ensemble predictions:", ensemble_pred_labels_str[:10])


===== Fine-tuning and evaluating model: bert-base-uncased =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9426,0.913898,0.650109


=== Test Performance for bert-base-uncased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.77      0.58      0.66      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.27      0.39      0.32        76
  WEAKLYNEGATIVE       0.43      0.73      0.54       440
  WEAKLYPOSITIVE       0.67      0.74      0.70      1566

        accuracy                           0.65      4127
       macro avg       0.43      0.49      0.44      4127
    weighted avg       0.68      0.65      0.65      4127

Accuracy: 0.6501090380421614

===== Fine-tuning and evaluating model: roberta-base =====


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9634,0.898862,0.646474


=== Test Performance for roberta-base ===
                  precision    recall  f1-score   support

         NEUTRAL       0.76      0.57      0.65      2017
STRONGLYNEGATIVE       0.43      0.11      0.17        28
STRONGLYPOSITIVE       0.29      0.49      0.36        76
  WEAKLYNEGATIVE       0.42      0.74      0.53       440
  WEAKLYPOSITIVE       0.67      0.74      0.70      1566

        accuracy                           0.65      4127
       macro avg       0.52      0.53      0.49      4127
    weighted avg       0.68      0.65      0.65      4127

Accuracy: 0.6464744366367822

===== Fine-tuning and evaluating model: distilbert-base-uncased =====


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9463,0.924732,0.643324


=== Test Performance for distilbert-base-uncased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.75      0.60      0.66      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.28      0.36      0.32        76
  WEAKLYNEGATIVE       0.41      0.72      0.52       440
  WEAKLYPOSITIVE       0.68      0.71      0.69      1566

        accuracy                           0.64      4127
       macro avg       0.42      0.48      0.44      4127
    weighted avg       0.67      0.64      0.65      4127

Accuracy: 0.6433244487521201

===== Fine-tuning and evaluating model: xlnet-base-cased =====


Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.127,1.156274,0.582505


=== Test Performance for xlnet-base-cased ===
                  precision    recall  f1-score   support

         NEUTRAL       0.68      0.48      0.56      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.00      0.00      0.00        76
  WEAKLYNEGATIVE       0.35      0.71      0.47       440
  WEAKLYPOSITIVE       0.62      0.72      0.66      1566

        accuracy                           0.58      4127
       macro avg       0.33      0.38      0.34      4127
    weighted avg       0.60      0.58      0.58      4127

Accuracy: 0.5825054519021081

===== Fine-tuning and evaluating model: google/electra-base-generator =====


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-generator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1024,1.015193,0.631936


=== Test Performance for google/electra-base-generator ===
                  precision    recall  f1-score   support

         NEUTRAL       0.71      0.61      0.65      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.32      0.09      0.14        76
  WEAKLYNEGATIVE       0.39      0.69      0.50       440
  WEAKLYPOSITIVE       0.68      0.69      0.68      1566

        accuracy                           0.63      4127
       macro avg       0.42      0.41      0.39      4127
    weighted avg       0.65      0.63      0.63      4127

Accuracy: 0.6319360310152653

===== Fine-tuning and evaluating model: vinai/bertweet-base =====


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8983,0.881347,0.659801


=== Test Performance for vinai/bertweet-base ===
                  precision    recall  f1-score   support

         NEUTRAL       0.78      0.60      0.68      2017
STRONGLYNEGATIVE       0.19      0.11      0.14        28
STRONGLYPOSITIVE       0.26      0.41      0.32        76
  WEAKLYNEGATIVE       0.43      0.72      0.54       440
  WEAKLYPOSITIVE       0.68      0.75      0.71      1566

        accuracy                           0.66      4127
       macro avg       0.47      0.52      0.48      4127
    weighted avg       0.69      0.66      0.67      4127

Accuracy: 0.659801308456506

===== Ensemble (logits average) on Test Set =====
                  precision    recall  f1-score   support

         NEUTRAL       0.76      0.60      0.68      2017
STRONGLYNEGATIVE       0.00      0.00      0.00        28
STRONGLYPOSITIVE       0.33      0.28      0.30        76
  WEAKLYNEGATIVE       0.44      0.74      0.55       440
  WEAKLYPOSITIVE       0.69      0.76      0.72      1566

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
