# 情感分析及数据集

In [24]:
import os
import torch
from torch import nn

In [25]:
import os
import requests
import hashlib
import tarfile
from tqdm import tqdm

def download_imdb(cache_dir=os.path.join('.', 'data')):
    """
    一个独立的函数，用于下载、校验和解压aclImdb数据集。
    """
    # 1. 定义数据源信息 (URL 和 SHA-1 校验和)
    url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    sha1_hash = '01ada507287d82875905620988597833ad4e0903'
    
    # 2. 创建缓存目录和文件路径
    os.makedirs(cache_dir, exist_ok=True)
    archive_path = os.path.join(cache_dir, 'aclImdb_v1.tar.gz')
    # 解压后会生成一个名为 'aclImdb' 的文件夹
    dest_dir = os.path.join(cache_dir, 'aclImdb')

    # 3. 检查本地是否有最终解压好的文件夹，如果没有则下载
    if not os.path.exists(dest_dir):
        print(f"本地未找到 '{dest_dir}'，开始下载流程...")
        # 下载 .tar.gz 文件
        print(f"正在从 {url} 下载...")
        try:
            r = requests.get(url, stream=True, timeout=120) # 考虑到文件较大，增加超时
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            with open(archive_path, 'wb') as f, tqdm(
                desc='aclImdb_v1.tar.gz', total=total_size, unit='iB', unit_scale=True
            ) as bar:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
                    bar.update(len(chunk))
        except requests.exceptions.RequestException as e:
            if os.path.exists(archive_path): os.remove(archive_path)
            raise IOError(f"下载文件时出错: {e}")

        # 4. 校验文件完整性
        print("正在校验文件...")
        sha1 = hashlib.sha1()
        with open(archive_path, 'rb') as f:
            while True:
                data = f.read(1048576) # 1MB 块
                if not data: break
                sha1.update(data)
        if sha1.hexdigest() != sha1_hash:
            raise IOError(f"文件 {archive_path} SHA1 校验和不匹配！")
        
        # 5. 解压 .tar.gz 文件
        print(f"正在解压 {archive_path}...")
        with tarfile.open(archive_path, 'r:gz') as tf:
            tf.extractall(cache_dir)
        os.remove(archive_path) # 删除 .tar.gz 文件节省空间
        print("数据准备完成。")
    else:
        print(f"在本地找到已缓存的数据目录: {dest_dir}")

    return dest_dir

# --- 如何使用 ---
# 只有当此脚本作为主程序运行时，以下代码块才会执行
if __name__ == '__main__':
    # 你需要安装requests和tqdm: pip install requests tqdm
    
    # 调用函数获取数据目录
    # 首次运行时会自动下载 (约80MB)，之后会直接使用本地缓存
    try:
        data_dir = download_imdb()
        
        # 打印路径以验证
        print(f"\n成功获取 aclImdb 数据目录路径: {data_dir}")
        
        # 我们可以检查一下目录内容来确认解压是否成功
        # 例如，检查训练数据中的正面评论文件夹是否存在
        train_pos_path = os.path.join(data_dir, 'train', 'pos')
        if os.path.exists(train_pos_path):
            print(f"在 {train_pos_path} 中找到训练数据，解压成功！")
        else:
            print("错误：未找到预期的训练数据文件夹。")
            
    except (IOError, requests.exceptions.RequestException) as e:
        print(f"\n处理失败: {e}")

在本地找到已缓存的数据目录: ./data/aclImdb

成功获取 aclImdb 数据目录路径: ./data/aclImdb
在 ./data/aclImdb/train/pos 中找到训练数据，解压成功！


In [31]:
#@save
def read_imdb(data_dir, is_train):
    """读取IMDb评论数据集文本序列和标签"""
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test',
                                   label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels

train_data = read_imdb(data_dir, is_train=True)
print('训练集数目：', len(train_data[0]))
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('标签：', y, 'review:', x[0:60])

# 训练集数目： 25000
# 标签： 1 review: For a movie that gets no respect there sure are a lot of mem
# 标签： 1 review: Bizarre horror movie filled with famous faces but stolen by 
# 标签： 1 review: A solid, if unremarkable film. Matthau, as Einstein, was won

训练集数目： 25000
标签： 1 review: For a movie that gets no respect there sure are a lot of mem
标签： 1 review: Bizarre horror movie filled with famous faces but stolen by 
标签： 1 review: A solid, if unremarkable film. Matthau, as Einstein, was won


In [6]:
import collections

class Vocab:  #@save
    """文本词表"""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # 按出现频率排序
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # 未知词元的索引为0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):  # 未知词元的索引为0
        return 0

    @property
    def token_freqs(self):
        return self._token_freqs

def count_corpus(tokens):  #@save
    """统计词元的频率"""
    # 这里的tokens是1D列表或2D列表
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # 将词元列表展平成一个列表
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)

In [7]:
def tokenize(lines, token='word'):  #@save
    """将文本行拆分为单词或字符词元"""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('错误：未知词元类型：' + token)

In [32]:
train_tokens = tokenize(train_data[0], token='word')
vocab = Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])

In [33]:
def truncate_pad(line, num_steps, padding_token):
    """截断或填充文本序列"""
    if len(line) > num_steps:
        return line[:num_steps]  # 截断
    return line + [padding_token] * (num_steps - len(line))  # 填充

num_steps = 500  # 序列长度
train_features = torch.tensor([truncate_pad(
    vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
print(train_features.shape)

torch.Size([25000, 500])


In [34]:
import torch
from torch.utils.data import TensorDataset, DataLoader

def load_array(data_arrays, batch_size, is_train=True):
    """
    将内存中的数据（如NumPy数组、列表或PyTorch张量）封装成PyTorch数据迭代器。
    这是一个 d2l.load_array 函数的独立实现。

    参数:
        data_arrays (tuple or list): 包含特征、标签等数据数组的元组或列表。
                                     所有数组的第一个维度（样本数）必须相同。
        batch_size (int): 每个小批量的大小。
        is_train (bool): 如果为True，则在每个周期打乱数据顺序；否则不打乱。

    返回:
        一个PyTorch数据迭代器 (torch.utils.data.DataLoader)。
    """
    # 1. 将所有输入数组转换为 PyTorch 的 Tensor
    #    这里假设输入可以是 list, numpy.ndarray, 或 torch.Tensor
    dataset_tensors = [torch.tensor(data) if not isinstance(data, torch.Tensor) else data 
                       for data in data_arrays]
    
    # 2. 使用 TensorDataset 将数据打包
    #    *dataset_tensors 会将列表中的每个张量解包作为独立的参数传入
    dataset = TensorDataset(*dataset_tensors)
    
    # 3. 使用 DataLoader 创建最终的数据迭代器
    #    shuffle 参数根据 is_train 的值来决定
    data_iter = DataLoader(dataset, batch_size, shuffle=is_train)
    
    return data_iter

train_iter = load_array((train_features,
    torch.tensor(train_data[1])), 64)

for X, y in train_iter:
    print('X:', X.shape, ', y:', y.shape)
    break
print('小批量数目：', len(train_iter))

X: torch.Size([64, 500]) , y: torch.Size([64])
小批量数目： 391


In [37]:
#@save
def load_data_imdb(batch_size, num_steps=500):
    """返回数据迭代器和IMDb评论数据集的词表"""
    data_dir = download_imdb()
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = tokenize(train_data[0], token='word')
    test_tokens = tokenize(test_data[0], token='word')
    vocab = Vocab(train_tokens, min_freq=5)
    train_features = torch.tensor([truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = load_array((train_features, torch.tensor(train_data[1])),
                                batch_size)
    test_iter = load_array((test_features, torch.tensor(test_data[1])),
                               batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab

# 情感分析：使用循环神经网络

In [38]:
import torch
from torch import nn

batch_size = 64
train_iter, test_iter, vocab = load_data_imdb(batch_size)

在本地找到已缓存的数据目录: ./data/aclImdb


In [39]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens,
                 num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # 将bidirectional设置为True以获取双向循环神经网络
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,
                                bidirectional=True)
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        # inputs的形状是（批量大小，时间步数）
        # 因为长短期记忆网络要求其输入的第一个维度是时间维，
        # 所以在获得词元表示之前，输入会被转置。
        # 输出形状为（时间步数，批量大小，词向量维度）
        embeddings = self.embedding(inputs.T)
        self.encoder.flatten_parameters()
        # 返回上一个隐藏层在不同时间步的隐状态，
        # outputs的形状是（时间步数，批量大小，2*隐藏单元数）
        outputs, _ = self.encoder(embeddings)
        # 连结初始和最终时间步的隐状态，作为全连接层的输入，
        # 其形状为（批量大小，4*隐藏单元数）
        encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
        outs = self.decoder(encoding)
        return outs

In [40]:
embed_size, num_hiddens, num_layers = 100, 100, 2
devices = torch.device('mps' if torch.mps.is_available() else 'cpu')
net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    if type(m) == nn.LSTM:
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_uniform_(m._parameters[param])
net.apply(init_weights);

In [41]:
import os
import torch
import requests
import zipfile
from tqdm import tqdm

def _download_embedding_if_needed(name, cache_dir=os.path.join('.', 'data')):
    """
    一个独立的辅助函数，用于下载和解压GloVe词向量。
    该版本能自动在压缩包内查找正确的 .txt 文件名，并支持缓存。
    """
    # 数据源信息库：包含URL
    DATA_HUB = {
        'glove.6b.50d': ('http://d2l-data.s3-accelerate.amazonaws.com/glove.6B.50d.zip',),
        'glove.6b.100d': ('http://d2l-data.s3-accelerate.amazonaws.com/glove.6B.100d.zip',)
    }
    
    if name not in DATA_HUB:
        raise ValueError(f"未定义的数据集名称: {name}")

    url, = DATA_HUB[name]
    
    os.makedirs(cache_dir, exist_ok=True)
    zip_path = os.path.join(cache_dir, url.split('/')[-1])
    
    # 动态确定解压后的文件名，以增加代码的健壮性
    # 通常文件名与下载名类似，例如 'glove.6b.100d.txt'
    # 我们先检查一个可能的文件名，如果不存在再执行下载解压
    potential_fname = name + '.txt'
    embedding_path = os.path.join(cache_dir, potential_fname)
    
    # 关键的缓存检查：如果最终文件不存在，才执行下载解压
    if not os.path.exists(embedding_path):
        print(f"本地未找到词向量文件，开始下载和解压流程...")
        
        # 1. 下载 .zip 文件 (如果压缩包也不存在)
        if not os.path.exists(zip_path):
            print(f"正在下载 {url} ...")
            try:
                r = requests.get(url, stream=True, timeout=60)
                r.raise_for_status()
                total_size = int(r.headers.get('content-length', 0))
                with open(zip_path, 'wb') as f, tqdm(
                    desc=name, total=total_size, unit='iB', unit_scale=True
                ) as bar:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
                        bar.update(len(chunk))
            except requests.exceptions.RequestException as e:
                if os.path.exists(zip_path): os.remove(zip_path)
                raise IOError(f"下载文件时出错: {e}")

        # 2. 解压 .zip 文件
        print(f"正在解压 {zip_path}...")
        with zipfile.ZipFile(zip_path, 'r') as zf:
            # 自动在压缩包内查找 .txt 文件
            txt_filename = ''
            for file_in_zip in zf.namelist():
                if file_in_zip.lower().endswith('.txt'):
                    txt_filename = file_in_zip
                    break
            
            if not txt_filename:
                raise IOError(f"在 {zip_path} 中未找到 .txt 文件。")
            
            print(f"在压缩包中找到文件: {txt_filename}, 正在解压...")
            zf.extract(txt_filename, cache_dir)
            # 更新为正确的最终文件路径
            embedding_path = os.path.join(cache_dir, txt_filename)
        
        os.remove(zip_path) # 操作完成后删除zip文件以节省空间
        print("下载和解压完成。")
    else:
        print(f"在本地找到已缓存的文件: {embedding_path}")

    return embedding_path


class TokenEmbedding:
    """
    一个独立的GloVe嵌入加载类，功能与d2l.TokenEmbedding类似。
    """
    def __init__(self, embedding_name):
        self.idx_to_token, self.idx_to_vec = self._load_embedding(embedding_name)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}
        print(f"成功加载 '{embedding_name}'。词汇表大小: {len(self.idx_to_token)}")

    def _load_embedding(self, embedding_name):
        idx_to_token, idx_to_vec = ['<unk>'], []
        embedding_path = _download_embedding_if_needed(embedding_name)
        
        with open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                if len(elems) > 1:
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        
        embedding_dim = len(idx_to_vec[0])
        idx_to_vec = [[0.0] * embedding_dim] + idx_to_vec
        return idx_to_token, torch.tensor(idx_to_vec, dtype=torch.float32)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            tokens = [tokens]
        indices = [self.token_to_idx.get(token, self.unknown_idx) for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs

    def __len__(self):
        return len(self.idx_to_token)


In [42]:
glove_embedding = TokenEmbedding('glove.6b.100d')
embeds = glove_embedding[vocab.idx_to_token]
embeds.shape

本地未找到词向量文件，开始下载和解压流程...
正在下载 http://d2l-data.s3-accelerate.amazonaws.com/glove.6B.100d.zip ...


glove.6b.100d: 100%|██████████| 134M/134M [00:12<00:00, 10.9MiB/s] 


正在解压 ./data/glove.6B.100d.zip...
在压缩包中找到文件: glove.6B.100d/vec.txt, 正在解压...
下载和解压完成。
成功加载 'glove.6b.100d'。词汇表大小: 400001


torch.Size([49346, 100])

In [43]:
net.embedding.weight.data.copy_(embeds)
net.embedding.weight.requires_grad = False

In [47]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
import time
import random

def accuracy(y_hat, y):
    """计算预测正确的数量"""
    if len(y_hat.shape) > 1 and y_hat.shape[1] > 1:
        y_hat = y_hat.argmax(axis=1)
    cmp = y_hat.type(y.dtype) == y
    return float(cmp.type(y.dtype).sum())

def evaluate_accuracy_gpu(net, data_iter, device=None):
    """使用GPU计算模型在数据集上的精度"""
    if isinstance(net, nn.Module):
        net.eval()  # 设置为评估模式
        if not device:
            device = next(iter(net.parameters())).device
    
    metric = [0.0, 0]  # 正确预测数，总预测数
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(X, list):
                X = [x.to(device) for x in X]
            else:
                X = X.to(device)
            y = y.to(device)
            metric[0] += accuracy(net(X), y)
            metric[1] += y.numel()
    return metric[0] / metric[1]

def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices):
    """
    一个不依赖d2l的、功能完备的训练函数，支持多GPU。
    这是 d2l.train_ch13 的一个独立实现。
    """
    start_time = time.time()
    # 将模型分发到指定的设备上
    net = nn.DataParallel(net, device_ids=devices).to(devices)
    
    print(f"在 {devices} 上开始训练...")
    
    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        
        # 训练过程
        net.train() # 设置为训练模式
        train_loss_sum, train_acc_sum, num_samples = 0.0, 0.0, 0
        
        for i, (X, y) in enumerate(train_iter):
            # 将数据移动到主设备
            X, y = X.to(devices), y.to(devices)
            
            trainer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y)
            l.sum().backward()
            trainer.step()
            
            # 累加指标
            train_loss_sum += l.sum()
            train_acc_sum += accuracy(y_hat, y)
            num_samples += y.numel()

        # 评估过程
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        
        # 打印信息
        epoch_time = time.time() - epoch_start_time
        train_loss_avg = train_loss_sum / num_samples
        train_acc_avg = train_acc_sum / num_samples
        
        print(f'Epoch {epoch + 1}/{num_epochs}, '
              f'训练损失 {train_loss_avg:.4f}, '
              f'训练精度 {train_acc_avg:.4f}, '
              f'测试精度 {test_acc:.4f}, '
              f'耗时 {epoch_time:.2f} 秒')

    total_time = time.time() - start_time
    print(f"\n训练完成！总耗时 {total_time:.2f} 秒")


lr, num_epochs = 0.01, 5
trainer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss(reduction="none")
devices = torch.device('mps' if torch.mps.is_available() else 'cpu')
train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,
    devices)

在 mps 上开始训练...


KeyboardInterrupt: 

In [None]:
#@save
def predict_sentiment(net, vocab, sequence):
    """预测文本序列的情感"""
    sequence = torch.tensor(vocab[sequence.split()], device=devices)
    label = torch.argmax(net(sequence.reshape(1, -1)), dim=1)
    return 'positive' if label == 1 else 'negative'

# 情感分析：使用卷积神经网络

In [48]:
import torch
from torch import nn

batch_size = 64
train_iter, test_iter, vocab = load_data_imdb(batch_size)

在本地找到已缓存的数据目录: ./data/aclImdb


In [49]:
def corr1d(X, K):
    w = K.shape[0]
    Y = torch.zeros((X.shape[0] - w + 1))
    for i in range(Y.shape[0]):
        Y[i] = (X[i: i + w] * K).sum()
    return Y

In [50]:
X, K = torch.tensor([0, 1, 2, 3, 4, 5, 6]), torch.tensor([1, 2])
corr1d(X, K)

tensor([ 2.,  5.,  8., 11., 14., 17.])

In [51]:
def corr1d_multi_in(X, K):
    # 首先，遍历'X'和'K'的第0维（通道维）。然后，把它们加在一起
    return sum(corr1d(x, k) for x, k in zip(X, K))

X = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
K = torch.tensor([[1, 2], [3, 4], [-1, -3]])
corr1d_multi_in(X, K)

tensor([ 2.,  8., 14., 20., 26., 32.])