In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121  # 或改为 cpu 版
!pip install datasets torchtext scikit-learn

Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torchtext
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [None]:
# -*- coding: utf-8 -*-
import os, re, random, math
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

# ========== Hyperparameters ==========
SEED = 2025              # one seed shared by random, numpy and torch below
BATCH_SIZE = 64
EMBED_DIM = 200
HIDDEN_SIZE = 256
NUM_LAYERS = 1
BIDIRECTIONAL = False
DROPOUT = 0.3
LR = 2e-3
EPOCHS = 3
MIN_FREQ = 5             # minimum token count required to enter the vocabulary
MAX_LEN = 256            # sequences are truncated to this many tokens
VOCAB_MAX_SIZE = None    # optional cap on the vocabulary size, e.g. 50000; None means no cap

# Seed every random number generator used by the notebook.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device:", device)

# ========== 1) Data ==========
print("Loading IMDB...")
ds = load_dataset("imdb")
train = ds["train"]

# The official test split is carved into a 5000-example dev set and a held-out
# test set. The rows appear to be stored grouped by label (slicing the first
# 5000 rows directly produced dev accuracy ~0.9 with macro-F1 ~0.47, i.e. a
# single-class dev set), so shuffle with a fixed seed before slicing to get
# both classes into dev and test in a reproducible way.
test_full = ds["test"].shuffle(seed=SEED)
dev = test_full.select(range(5000))
test = test_full.select(range(5000, len(test_full)))

# ========== 2) Pure-Python tokenization ==========
# Minimal English tokenizer: lowercase the text, then keep maximal runs of
# ASCII letters, digits and apostrophes (so contractions like "don't" stay whole).
_word_re = re.compile(r"[A-Za-z0-9']+")

def tokenize(text: str):
    """Return the lowercase word tokens of ``text`` in order of appearance."""
    lowered = text.lower()
    return [match.group(0) for match in _word_re.finditer(lowered)]

# ========== 3) Vocabulary (built from the training split only, to avoid leakage) ==========
SPECIALS = ["<pad>", "<unk>"]
PAD, UNK = 0, 1

def build_vocab(dataset, min_freq=1, max_size=None):
    """Build token<->id mappings from the ``"text"`` field of ``dataset``.

    Args:
        dataset: iterable of examples; each example is indexed with ``"text"``.
        min_freq: tokens seen fewer than ``min_freq`` times are dropped.
        max_size: optional cap on the total vocabulary size, special tokens
            included. ``None`` disables the cap.

    Returns:
        ``(stoi, itos)`` where ``stoi`` maps token -> id and ``itos`` is the
        list of tokens indexed by id. Ids 0 and 1 are ``<pad>`` and ``<unk>``;
        the remaining tokens follow by descending frequency, ties broken
        alphabetically.
    """
    counter = Counter()
    for ex in dataset:
        counter.update(tokenize(ex["text"]))

    # Drop rare tokens (and anything colliding with a special token), then
    # order by frequency first and alphabetically on ties.
    kept = sorted(
        ((w, c) for w, c in counter.items() if c >= min_freq and w not in SPECIALS),
        key=lambda wc: (-wc[1], wc[0]),
    )
    if max_size is not None:
        # Clamp at zero: a negative bound would turn the slice into
        # "everything except the last k items" instead of "nothing".
        kept = kept[:max(0, max_size - len(SPECIALS))]

    itos = list(SPECIALS) + [w for w, _ in kept]
    stoi = {w: i for i, w in enumerate(itos)}
    return stoi, itos

print("Building vocab...")
# The vocabulary is derived from the training split with the configured
# frequency threshold and optional size cap.
stoi, itos = build_vocab(train, max_size=VOCAB_MAX_SIZE, min_freq=MIN_FREQ)
vocab_size = len(stoi)
print(f"Vocab size: {vocab_size} (min_freq={MIN_FREQ})")

def encode_tokens(tokens):
    """Map each token to its vocabulary id, falling back to ``UNK`` for unknown tokens."""
    ids = []
    for tok in tokens:
        ids.append(stoi.get(tok, UNK))
    return ids

# ========== 4) Dataset / Collate ==========
class IMDBDataset(Dataset):
    """Dataset over a split whose rows expose ``"text"`` and ``"label"``.

    Tokenization and id lookup happen on every item access.
    """

    def __init__(self, hf_split):
        self.data = hf_split

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` as two ``torch.long`` tensors."""
        row = self.data[idx]
        token_ids = torch.tensor(encode_tokens(tokenize(row["text"])), dtype=torch.long)
        label = torch.tensor(int(row["label"]), dtype=torch.long)
        return token_ids, label

def clip_trunc(seq, max_len=MAX_LEN):
    """Keep at most the first ``max_len`` elements of ``seq``."""
    return seq[:max_len]

def collate_fn(batch):
    """Turn a list of ``(ids, label)`` pairs into padded batch tensors.

    Returns:
        ``(padded, lengths, labels)``: the truncated sequences right-padded
        with ``PAD`` (batch first), the truncated length of each sequence,
        and the stacked labels.
    """
    pairs = list(batch)
    clipped = [clip_trunc(ids, MAX_LEN) for ids, _ in pairs]
    targets = [y for _, y in pairs]
    padded = pad_sequence(clipped, batch_first=True, padding_value=PAD)
    lengths = torch.tensor([min(len(seq), MAX_LEN) for seq in clipped], dtype=torch.long)
    return padded, lengths, torch.stack(targets)

# Wrap each split, then build loaders that share batch size, collate function
# and worker count; only the training loader shuffles.
train_ds, dev_ds, test_ds = (IMDBDataset(split) for split in (train, dev, test))

_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=2)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
dev_loader = DataLoader(dev_ds, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_kwargs)

# ========== 5) Model ==========
class LSTMSentiment(nn.Module):
    """Embedding -> LSTM -> dropout -> linear classifier over the final hidden state.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width.
        hidden_size: LSTM hidden width per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions and concatenate the
            two final hidden states.
        dropout: probability for the dropout applied to the final hidden
            state; also used between LSTM layers when ``num_layers > 1``.
        pad_idx: id of the padding token in the embedding table.
        num_classes: number of output logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers=1,
                 bidirectional=False, dropout=0.0, pad_idx=0, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is only meaningful with more than one layer.
            dropout=(dropout if num_layers > 1 else 0.0),
            bidirectional=bidirectional,
        )
        fc_in = hidden_size * (2 if bidirectional else 1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(fc_in, num_classes)

    def forward(self, x, lengths):
        """Return class logits of shape ``[B, num_classes]``.

        Args:
            x: ``[B, T]`` padded token ids.
            lengths: ``[B]`` number of real (non-pad) tokens per row.
        """
        emb = self.embed(x)                      # [B, T, E]

        # Pack so the LSTM stops at each sequence's true end. Without this the
        # recurrence keeps running over the padding, and the "final" hidden
        # state no longer corresponds to the last real token.
        # pack_padded_sequence needs CPU lengths that are all >= 1.
        seq_lens = lengths.detach().to(device="cpu", dtype=torch.long).clamp(min=1)
        packed = nn.utils.rnn.pack_padded_sequence(
            emb, seq_lens, batch_first=True, enforce_sorted=False
        )
        _, (hn, _) = self.lstm(packed)           # hn: [L*D, B, H]

        if self.lstm.bidirectional:
            # Last layer: forward direction is hn[-2], backward direction is hn[-1].
            last_h = torch.cat([hn[-2], hn[-1]], dim=1)  # [B, 2H]
        else:
            last_h = hn[-1]                               # [B, H]
        last_h = self.dropout(last_h)
        logits = self.fc(last_h)                 # [B, num_classes]
        return logits

# Instantiate the classifier from the hyperparameters above and move it to the selected device.
_model_kwargs = dict(
    vocab_size=vocab_size,
    embed_dim=EMBED_DIM,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD,
    num_classes=2,
)
model = LSTMSentiment(**_model_kwargs).to(device)

print(model)

# ========== 6) Training and evaluation ==========
# AdamW updates every model parameter; the loss is cross-entropy over the class logits.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()

def evaluate(dataloader):
    """Run the model over ``dataloader`` and return ``(accuracy, macro_f1)``.

    Puts the model into eval mode and disables gradient tracking while
    collecting the arg-max prediction for every example.
    """
    model.eval()
    gold, predicted = [], []
    with torch.no_grad():
        for x, lengths, y in dataloader:
            x = x.to(device)
            lengths = lengths.to(device)
            y = y.to(device)
            batch_pred = model(x, lengths).argmax(dim=1)
            gold += y.tolist()
            predicted += batch_pred.tolist()
    return accuracy_score(gold, predicted), f1_score(gold, predicted, average="macro")

# Train for EPOCHS passes, checkpointing whenever dev accuracy improves.
_BEST_CKPT = "lstm_imdb_best.pt"
best_dev = 0.0
for epoch in range(1, EPOCHS + 1):
    model.train()
    total_loss = 0.0
    for x, lengths, y in train_loader:
        x = x.to(device)
        lengths = lengths.to(device)
        y = y.to(device)

        optimizer.zero_grad()
        loss = criterion(model(x, lengths), y)
        loss.backward()
        # Clip the global gradient norm before the update step.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch average is per example.
        total_loss += float(loss.item()) * x.size(0)

    avg_loss = total_loss / len(train_loader.dataset)
    dev_acc, dev_f1 = evaluate(dev_loader)
    print(f"Epoch {epoch:02d} | train_loss={avg_loss:.4f} | dev_acc={dev_acc:.4f} | dev_f1={dev_f1:.4f}")
    if dev_acc > best_dev:
        best_dev = dev_acc
        torch.save(model.state_dict(), _BEST_CKPT)

# Restore the best-on-dev weights before scoring the held-out test set.
model.load_state_dict(torch.load(_BEST_CKPT, map_location=device))
test_acc, test_f1 = evaluate(test_loader)
print(f"[TEST] acc={test_acc:.4f} | f1={test_f1:.4f}")

# ========== 7) Inference ==========
def predict(sentence: str):
    """Classify one sentence and return ``(label, probability)``.

    ``label`` is ``"pos"`` or ``"neg"`` and ``probability`` is the softmax
    score of the predicted class. Text that yields no tokens returns
    ``("neutral?", 0.5)`` without running the model.
    """
    model.eval()
    token_ids = encode_tokens(tokenize(sentence))[:MAX_LEN]
    if not token_ids:
        return "neutral?", 0.5

    with torch.no_grad():
        ids = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)   # [1, T]
        lengths = torch.tensor([ids.size(1)], dtype=torch.long).to(device)
        prob = torch.softmax(model(ids, lengths), dim=1)
        pred = prob.argmax(dim=1).item()
        label = "pos" if pred == 1 else "neg"
        return (label, float(prob[0, pred]))

# Smoke-test the classifier on one clearly positive and one clearly negative review.
for _demo_sentence in (
    "This movie is absolutely wonderful. I loved the acting and story!",
    "Terrible plot and awful pacing. Waste of time.",
):
    print(predict(_demo_sentence))


device: cuda
Loading IMDB...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Building vocab...
Vocab size: 30564 (min_freq=5)
LSTMSentiment(
  (embed): Embedding(30564, 200, padding_idx=0)
  (lstm): LSTM(200, 256, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)
Epoch 01 | train_loss=0.6954 | dev_acc=0.8952 | dev_f1=0.4724
Epoch 02 | train_loss=0.6797 | dev_acc=0.9186 | dev_f1=0.4788
Epoch 03 | train_loss=0.5854 | dev_acc=0.6482 | dev_f1=0.3933
[TEST] acc=0.5579 | f1=0.5502
('neg', 0.5036768913269043)
('neg', 0.8156421780586243)
