In [1]:
with open("corpus.txt", "w") as f:
    f.write("I love natural language processing\n")
    f.write("natural language processing is fun\n")
    f.write("I love deep learning\n")
    f.write("deep learning powers artificial intelligence\n")
    f.write("language models can generate text\n")
    f.write("text generation is a task in natural language processing\n")

In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [1]:
import gensim.downloader as api

# 选择预训练词向量，例如 GloVe 100维
wv = api.load("glove-wiki-gigaword-100")

print("词向量维度:", wv["computer"].shape)
print("相似词:", wv.most_similar("language", topn=5))
print("king - man + woman ≈", wv.most_similar(positive=["king", "woman"], negative=["man"], topn=1))


词向量维度: (100,)
相似词: [('languages', 0.8260655403137207), ('word', 0.7464082837104797), ('spoken', 0.7381494045257568), ('arabic', 0.7318817377090454), ('english', 0.7214903831481934)]
king - man + woman ≈ [('queen', 0.7698540687561035)]


In [2]:
# ================================
# 环境依赖: pip install torch gensim tqdm
# 文件: word_rnn_lm.py
# ================================

import io, random, math
from collections import Counter
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import gensim.downloader as api
from tqdm import tqdm

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------------
# 1. 读取语料 & 构建词表
# -------------------------------
def read_corpus(path="corpus.txt"):
    tokens = []
    with io.open(path, "r", encoding="utf-8") as f:
        for line in f:
            words = line.strip().split()
            if words:
                tokens.extend(words + ["<eos>"])  # 每句加一个结束标记
    return tokens

def build_vocab(tokens, min_freq=1, specials=("<pad>", "<unk>", "<eos>")):
    counter = Counter(tokens)
    stoi = {sp: i for i, sp in enumerate(specials)}
    idx = len(stoi)
    for w, c in counter.items():
        if w in stoi:
            continue
        if c >= min_freq:
            stoi[w] = idx
            idx += 1
    itos = {i: w for w, i in stoi.items()}
    return stoi, itos

# -------------------------------
# 2. 数据集 (滑动窗口: 输入→下一个词)
# -------------------------------
class WordLMDataset(Dataset):
    def __init__(self, tokens, stoi, seq_len=5):
        ids = [stoi.get(t, stoi["<unk>"]) for t in tokens]
        self.inputs, self.targets = [], []
        for i in range(len(ids) - seq_len):
            self.inputs.append(ids[i:i+seq_len])
            self.targets.append(ids[i+1:i+seq_len+1])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])

# -------------------------------
# 3. 加载预训练 GloVe
# -------------------------------
def load_glove(stoi, embed_dim=100):
    print("正在下载并加载 GloVe 预训练词向量...")
    wv = api.load(f"glove-wiki-gigaword-{embed_dim}")
    matrix = np.random.normal(0, 0.1, (len(stoi), embed_dim)).astype(np.float32)
    hit = 0
    for w, i in stoi.items():
        if w in wv:
            matrix[i] = wv[w]
            hit += 1
    print(f"GloVe 覆盖了 {hit}/{len(stoi)} 个词 ({hit/len(stoi):.2%})")
    return torch.tensor(matrix)

# -------------------------------
# 4. 模型定义
# -------------------------------
class WordRNNLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embed_dim, hidden_size, batch_first=True)
        self.proj = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, h0=None):
        emb = self.embedding(x)            # (B,T,E)
        out, hN = self.rnn(emb, h0)        # (B,T,H)
        logits = self.proj(out)            # (B,T,V)
        return logits, hN

# -------------------------------
# 5. 训练函数
# -------------------------------
def train_model(model, train_loader, stoi, itos, epochs=10, lr=0.01):
    pad_idx = stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch}"):
            x, y = x.to(DEVICE), y.to(DEVICE)
            logits, _ = model(x)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # 防止梯度爆炸
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch} 平均loss: {avg_loss:.4f}")

# -------------------------------
# 6. 文本生成
# -------------------------------
@torch.no_grad()
def generate_text(model, stoi, itos, prefix=["I"], max_len=10, temperature=1.0):
    model.eval()
    ids = [stoi.get(w, stoi["<unk>"]) for w in prefix]
    x = torch.tensor(ids, dtype=torch.long, device=DEVICE).unsqueeze(0)

    for _ in range(max_len):
        logits, _ = model(x)
        next_logit = logits[0, -1] / temperature
        probs = torch.softmax(next_logit, dim=-1)
        next_id = torch.multinomial(probs, 1).item()
        x = torch.cat([x, torch.tensor([[next_id]], device=DEVICE)], dim=1)
    return [itos[i] for i in x[0].tolist()]

# -------------------------------
# 主流程
# -------------------------------
def main():
    # Step1: 读语料
    tokens = read_corpus("corpus.txt")
    stoi, itos = build_vocab(tokens, min_freq=1)
    print("词表大小:", len(stoi))

    # Step2: 数据加载
    dataset = WordLMDataset(tokens, stoi, seq_len=5)
    train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

    # Step3: 构建模型
    embed_dim = 100
    hidden_size = 100
    model = WordRNNLM(len(stoi), embed_dim, hidden_size, pad_idx=stoi["<pad>"]).to(DEVICE)

    # Step4: 加载预训练词向量
    embedding_matrix = load_glove(stoi, embed_dim)
    model.embedding.weight.data.copy_(embedding_matrix)

    # Step5: 训练
    train_model(model, train_loader, stoi, itos, epochs=10, lr=0.01)

    # Step6: 文本生成
    out = generate_text(model, stoi, itos, prefix=["I"], max_len=8)
    print("生成文本:", " ".join(out))


In [3]:
main()

词表大小: 23
正在下载并加载 GloVe 预训练词向量...
GloVe 覆盖了 19/23 个词 (82.61%)


Epoch 1: 100%|██████████| 9/9 [00:00<00:00, 62.65it/s]


Epoch 1 平均loss: 2.2167


Epoch 2: 100%|██████████| 9/9 [00:00<00:00, 333.86it/s]


Epoch 2 平均loss: 0.6362


Epoch 3: 100%|██████████| 9/9 [00:00<00:00, 320.81it/s]


Epoch 3 平均loss: 0.2986


Epoch 4: 100%|██████████| 9/9 [00:00<00:00, 340.05it/s]


Epoch 4 平均loss: 0.3173


Epoch 5: 100%|██████████| 9/9 [00:00<00:00, 327.49it/s]


Epoch 5 平均loss: 0.2121


Epoch 6: 100%|██████████| 9/9 [00:00<00:00, 270.31it/s]


Epoch 6 平均loss: 0.2349


Epoch 7: 100%|██████████| 9/9 [00:00<00:00, 313.29it/s]


Epoch 7 平均loss: 0.2228


Epoch 8: 100%|██████████| 9/9 [00:00<00:00, 350.44it/s]


Epoch 8 平均loss: 0.2392


Epoch 9: 100%|██████████| 9/9 [00:00<00:00, 325.64it/s]


Epoch 9 平均loss: 0.2168


Epoch 10: 100%|██████████| 9/9 [00:00<00:00, 330.57it/s]


Epoch 10 平均loss: 0.2018
生成文本: I love natural language processing is fun <eos> I
