In [3]:

import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# 1. 数据准备
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # 使用BERT tokenizer对文本进行编码
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# 2. 创建示例数据集
def create_sample_data():
    """创建示例数据集"""
    # 示例数据：电影评论情感分类（0=负面，1=正面）
    texts = [
        "This movie is absolutely fantastic! I loved every minute of it.",
        "Terrible film. Waste of time and money. Poor acting and plot.",
        "Great story with excellent performances. Highly recommended!",
        "Boring and predictable. Not worth watching.",
        "Amazing cinematography and brilliant acting. A masterpiece!",
        "Awful script and bad direction. Completely disappointing.",
        "Wonderful film with great characters and storyline.",
        "One of the worst movies I've ever seen. Terrible!",
        "Outstanding performance by all actors. Brilliant direction.",
        "Not good at all. Very boring and poorly made.",
        "Incredible movie with stunning visuals and great plot.",
        "Complete waste of time. Poorly executed in every way.",
        "Excellent film with fantastic acting and great story.",
        "Very disappointing. Bad acting and terrible script.",
        "Brilliant movie with amazing performances and direction.",
        "Awful film. Boring and completely uninteresting.",
        "Fantastic story with excellent acting and direction.",
        "Poorly made with bad plot and terrible acting.",
        "Wonderful cinematography and great performances by all.",
        "One of the best films I've seen this year. Highly recommend!"
    ]
    
    labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1]  # 1=正面, 0=负面
    
    return texts, labels

# 3. 模型训练函数
def train_model(model, train_dataloader, val_dataloader, epochs=4, lr=2e-5):
    """训练BERT分类模型"""
    # 设置设备
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    
    # 设置优化器和学习率调度器
    optimizer = AdamW(model.parameters(), lr=lr)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    # 训练循环
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        
        for batch in train_dataloader:
            # 将数据移到设备上
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # 清零梯度
            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            # 反向传播
            loss.backward()
            
            # 梯度裁剪
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            # 更新参数
            optimizer.step()
            scheduler.step()
        
        avg_train_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}/{epochs} - Average training loss: {avg_train_loss:.4f}")
        
        # 验证模型
        val_accuracy = evaluate_model(model, val_dataloader, device)
        print(f"Validation Accuracy: {val_accuracy:.4f}")
    
    return model

# 4. 模型评估函数
def evaluate_model(model, dataloader, device):
    """评估模型性能"""
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).flatten()
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy

# 5. 预测函数
def predict_sentiment(model, tokenizer, text, device):
    """对单个文本进行情感预测"""
    model.eval()
    
    # 编码文本
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = F.softmax(logits, dim=1)
        prediction = torch.argmax(logits, dim=1).item()
        confidence = probabilities[0][prediction].item()
    
    return prediction, confidence

# 6. 主函数 - 完整流程
def bert_text_classification_example():
    """使用BERT进行文本分类的完整示例"""
    print("=== BERT文本分类示例 ===")
    
    # 1. 加载预训练的BERT模型和分词器
    print("1. 加载预训练的BERT模型和分词器...")
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,  # 二分类任务
        output_attentions=False,
        output_hidden_states=False
    )
    
    # 2. 准备数据
    print("2. 准备数据...")
    texts, labels = create_sample_data()
    
    # 划分训练集和验证集
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )
    
    # 创建数据集
    train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer)
    val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer)
    
    # 创建数据加载器
    batch_size = 4
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # 3. 训练模型
    print("3. 开始训练模型...")
    trained_model = train_model(model, train_dataloader, val_dataloader, epochs=3)
    
    # 4. 测试预测
    print("4. 测试预测...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    test_texts = [
        "This is an amazing movie with great acting!",
        "I hate this film. It's boring and terrible.",
        "The movie was okay, nothing special but not bad either."
    ]
    
    for text in test_texts:
        prediction, confidence = predict_sentiment(trained_model, tokenizer, text, device)
        sentiment = "Positive" if prediction == 1 else "Negative"
        print(f"Text: {text}")
        print(f"Prediction: {sentiment} (Confidence: {confidence:.4f})")
        print("-" * 50)
    
    # 5. 详细评估
    print("5. 详细评估...")
    val_accuracy = evaluate_model(trained_model, val_dataloader, device)
    print(f"Final Validation Accuracy: {val_accuracy:.4f}")
    
    return trained_model, tokenizer

# 运行示例
if __name__ == "__main__":
    # 注意：运行此示例需要安装transformers库
    # pip install transformers torch scikit-learn
    try:
        model, tokenizer = bert_text_classification_example()
    except Exception as e:
        print(f"运行示例时出错: {e}")
        print("请确保已安装所需的依赖库: pip install transformers torch scikit-learn")


=== BERT文本分类示例 ===
1. 加载预训练的BERT模型和分词器...
运行示例时出错: (MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/tokenizer_config.json (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1010)')))"), '(Request ID: ebc51ba5-9497-4d2d-b322-be5b3bfb55f4)')
请确保已安装所需的依赖库: pip install transformers torch scikit-learn
