In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import random
from nltk.corpus import wordnet

# 设置设备为CPU
device = torch.device('cpu')

# 示例数据
data = {
    'text': [
        "I love this product, it's amazing!",
        "This is the worst experience I've ever had.",
        "The movie was okay, not great but not bad either.",
        "I had a fantastic time at the concert.",
        "The service at the restaurant was terrible."
    ],
    'label': [1, 0, 2, 1, 0]  # 1: Positive, 0: Negative, 2: Neutral
}

df = pd.DataFrame(data)

# 数据增强函数 - 同义词替换
def synonym_replacement(text, n):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        synonym = synonyms[0].lemmas()[0].name()
        if synonym != random_word:
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

# 增强数据
augmented_texts = [synonym_replacement(text, 2) for text in df['text']]

# 使用 pd.concat 合并原始数据和增强数据
df_augmented = pd.DataFrame({'text': augmented_texts, 'label': df['label']})
df = pd.concat([df, df_augmented], ignore_index=True)

# 数据预处理
X = df['text']
y = df['label']

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 使用BERT的Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 数据集类
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 数据加载器
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = TextDataset(
        texts=df['text'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, num_workers=0)  # 设置 num_workers 为 0，避免多进程问题

# 超参数
BATCH_SIZE = 8
MAX_LEN = 32
EPOCHS = 3

if __name__ == "__main__":
    train_data_loader = create_data_loader(pd.DataFrame({'text': X_train, 'label': y_train}), tokenizer, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(pd.DataFrame({'text': X_test, 'label': y_test}), tokenizer, MAX_LEN, BATCH_SIZE)

    # 加载预训练的BERT模型
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
    model = model.to(device)

    # 优化器和学习率调度器
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # 训练函数
    def train_epoch(model, data_loader, optimizer, scheduler, device):
        model = model.train()
        losses = []
        correct_predictions = 0

        for data in data_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)
            
            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            scheduler.step()

        return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

    # 测试函数
    def eval_model(model, data_loader, device):
        model = model.eval()
        losses = []
        correct_predictions = 0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for data in data_loader:
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                labels = data['labels'].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                _, preds = torch.max(outputs.logits, dim=1)

                correct_predictions += torch.sum(preds == labels)
                losses.append(loss.item())

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        return correct_predictions.double() / len(data_loader.dataset), np.mean(losses), all_preds, all_labels

    # 训练模型
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, scheduler, device)
        print(f'Train loss {train_loss} accuracy {train_acc}')

    # 评估模型
    test_acc, test_loss, all_preds, all_labels = eval_model(model, test_data_loader, device)
    print(f'Test Accuracy: {test_acc}')

    # 动态获取实际使用的标签
    labels = np.unique(all_labels)
    target_names = [f'class {label}' for label in labels]  # 为实际出现的标签生成名称

    print(classification_report(all_labels, all_preds, target_names=target_names))

    # 测试新的输入
    new_texts = ["I really enjoy using this software!", "The weather today is bad."]
    encoded_new_texts = tokenizer.batch_encode_plus(
        new_texts,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoded_new_texts['input_ids'].to(device)
    attention_mask = encoded_new_texts['attention_mask'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    _, predictions = torch.max(outputs.logits, dim=1)

    print("Predictions:", predictions.cpu().numpy())


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/3
Train loss 1.0586031675338745 accuracy 0.5
Epoch 2/3




Train loss 0.9127342104911804 accuracy 0.75
Epoch 3/3
Train loss 0.7937213182449341 accuracy 1.0
Test Accuracy: 1.0
              precision    recall  f1-score   support

     class 0       1.00      1.00      1.00         1
     class 1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Predictions: [1 1]


