Sentiment Analysis

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import spacy
from torchtext.datasets import IMDB
from torchtext.data import Field, BucketIterator
from torchtext.data.utils import get_tokenizer

# 数据预处理
spacy_en = spacy.load('en_core_web_sm')
def tokenizer(text):
    return [token.text for token in spacy_en.tokenizer(text)]

TEXT = Field(sequential=True, tokenize=tokenizer, include_lengths=True, batch_first=True)
LABEL = Field(sequential=False, use_vocab=True, is_target=True, batch_first=True)

# 下载IMDB数据集
train_data, test_data = IMDB.splits(TEXT, LABEL)

# 构建词汇表
TEXT.build_vocab(train_data, max_size=25000, vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

# 数据加载器
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=BATCH_SIZE, device=device
)

# 定义RNN模型
class RNNModel(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()

        # 嵌入层
        self.embedding = nn.Embedding(input_dim, embedding_dim)

        # 循环层（RNN）
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)

        # 输出层
        self.fc = nn.Linear(hidden_dim, output_dim)
        
        # Dropout层
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        # text是一个batch的输入，text_lengths是每个句子的实际长度
        embedded = self.embedding(text)
        
        # pack_padded_sequence 用来处理不同长度的序列
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        
        # 通过RNN层
        packed_output, hidden = self.rnn(packed_embedded)
        
        # 通过Dropout层
        hidden = self.dropout(hidden[-1])
        
        # 输出
        output = self.fc(hidden)
        return output

# 设置模型超参数
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 256
output_dim = 1
n_layers = 2
dropout = 0.5

# 实例化模型并将其转移到相应的设备
model = RNNModel(input_dim, embedding_dim, hidden_dim, output_dim, n_layers, dropout).to(device)

# 使用预训练的GloVe嵌入
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

# 初始化模型的输出层权重
model.fc.weight.data.normal_(0, 0.01)
model.fc.bias.data.fill_(0)

# 定义损失函数和优化器
optimizer = optim.Adam(model.parameters(), lr=1e-4)  # 可调整学习率
criterion = nn.BCEWithLogitsLoss()

# 训练函数
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for batch in iterator:
        text, text_lengths = batch.text
        labels = batch.label.float()
        
        # 将数据移到设备
        text, labels = text.to(device), labels.to(device)
        
        # 不再将 text_lengths 移到 CPU，保持其在 GPU 上
        text_lengths = text_lengths.to(device)
        
        optimizer.zero_grad()
        
        # 正向传播
        predictions = model(text, text_lengths).squeeze(1)
        
        # 计算损失
        loss = criterion(predictions, labels)
        
        # 计算准确率
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# 准确率计算函数
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))  # 直接计算预测的标签
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

# 评估函数
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            labels = batch.label.float()
            
            # 将数据移到设备
            text, labels = text.to(device), labels.to(device)
            
            # 不再将 text_lengths 移到 CPU，保持其在 GPU 上
            text_lengths = text_lengths.to(device)
            
            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# 训练和评估模型
N_EPOCHS = 20  # 增加 epoch 数量
print("start training")
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    
    print(f"Epoch {epoch+1}")
    print(f"Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%")
    print(f"Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%")

    print(f"Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%")
    print(f"Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%")


start training


RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor