In [16]:
import torch
import numpy as np
from collections import  defaultdict

def build_vocab(corpus):
    """Build frequency-ranked lookup tables for every token in ``corpus``.

    Args:
        corpus: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A ``(word2idx, idx2word)`` tuple of plain dicts.

        * ``word2idx`` maps each word to its 1-based frequency rank
          (the most frequent word gets index 1).
        * ``idx2word`` maps the 0-based frequency rank to the word.

        The two tables are therefore offset by one:
        ``idx2word[word2idx[w] - 1] == w``. Callers must apply that shift
        when converting a ``word2idx`` index back into a word.
    """
    counts = {}
    for tokens in corpus:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    # sorted() is stable, so words with equal counts keep first-seen order.
    by_frequency = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    ranked_words = [token for token, _ in by_frequency]

    word2idx = {token: rank for rank, token in enumerate(ranked_words, start=1)}
    idx2word = dict(enumerate(ranked_words))
    return word2idx, idx2word

def create_contexts_target(corpus, window_size=2):
    """Extract (context, target) training pairs for a CBOW model.

    Only positions that have ``window_size`` tokens on *both* sides are used,
    so a sentence shorter than ``2 * window_size + 1`` tokens contributes
    nothing.

    Args:
        corpus: Iterable of sentences; each sentence must support ``len()``,
            slicing, and ``+`` concatenation of slices (e.g. a list).
        window_size: Number of tokens taken on each side of the target.

    Returns:
        ``(contexts, targets)`` as NumPy arrays built from the collected
        lists: one row of ``2 * window_size`` context tokens per target.
    """
    context_rows = []
    target_tokens = []
    for tokens in corpus:
        stop = len(tokens) - window_size
        for center in range(window_size, stop):
            left = tokens[center - window_size:center]
            right = tokens[center + 1:center + window_size + 1]
            context_rows.append(left + right)
            target_tokens.append(tokens[center])
    return np.array(context_rows), np.array(target_tokens)

In [17]:
import torch.nn as nn

class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Context token embeddings are averaged and projected onto one logit per
    vocabulary entry.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output logits.
        embedding_dim: Size of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Return vocabulary logits for a batch of context index tensors.

        ``inputs`` is expected to be [batch_size, context_size]; the mean is
        taken over dim 1 (the context positions).
        """
        context_vectors = self.embeddings(inputs)  # [batch_size, context_size, embedding_dim]
        pooled = context_vectors.mean(dim=1)  # [batch_size, embedding_dim]
        return self.linear(pooled)  # [batch_size, vocab_size]

In [None]:
# Example corpus
corpus = [
    ["the", "quick", "brown", "fox", "jumps"],
    ["over", "the", "lazy", "dog"]
]

# Hyperparameters
EMBEDDING_DIM = 100
WINDOW_SIZE = 2
BATCH_SIZE = 32
EPOCHS = 50
SEED = 42

# Seed before the model is created so weight initialisation, and therefore
# the whole training run, is reproducible under Restart & Run All.
torch.manual_seed(SEED)

# Preprocessing
word2idx, idx2word = build_vocab(corpus)
contexts, targets = create_contexts_target(corpus, WINDOW_SIZE)

# With WINDOW_SIZE = 2 only sentences of at least 5 tokens yield examples, so
# this toy corpus produces a single (context, target) pair. Fail loudly rather
# than hitting a division by zero when nothing can be trained on.
num_examples = len(targets)
if num_examples == 0:
    raise ValueError(
        "No training examples: every sentence is shorter than 2 * WINDOW_SIZE + 1 tokens"
    )

# Convert to PyTorch tensors of vocabulary indices
context_tensor = torch.tensor(
    [[word2idx[w] for w in ctx] for ctx in contexts], dtype=torch.long
)
target_tensor = torch.tensor([word2idx[t] for t in targets], dtype=torch.long)

# Initialise the model. word2idx is 1-based, so one extra row is allocated and
# index 0 is never produced by the data.
model = CBOW(len(word2idx)+1, EMBEDDING_DIM)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Training loop
for epoch in range(EPOCHS):
    total_loss = 0.0
    for start in range(0, num_examples, BATCH_SIZE):
        batch_context = context_tensor[start:start+BATCH_SIZE]
        batch_target = target_tensor[start:start+BATCH_SIZE]

        optimizer.zero_grad()
        outputs = model(batch_context)
        loss = criterion(outputs, batch_target)
        loss.backward()
        optimizer.step()
        # CrossEntropyLoss returns the batch *mean*; weight it by the batch
        # size so that dividing by num_examples gives a true per-sample mean
        # regardless of how many batches there are.
        total_loss += loss.item() * batch_target.size(0)

    print(f"Epoch {epoch+1}, Loss: {total_loss/num_examples:.4f}")

Epoch 1, Loss: 2.5771
Epoch 2, Loss: 2.4970
Epoch 3, Loss: 2.4175
Epoch 4, Loss: 2.3385
Epoch 5, Loss: 2.2602
Epoch 6, Loss: 2.1827
Epoch 7, Loss: 2.1059
Epoch 8, Loss: 2.0300
Epoch 9, Loss: 1.9549
Epoch 10, Loss: 1.8809
Epoch 11, Loss: 1.8078
Epoch 12, Loss: 1.7359
Epoch 13, Loss: 1.6651
Epoch 14, Loss: 1.5956
Epoch 15, Loss: 1.5274
Epoch 16, Loss: 1.4605
Epoch 17, Loss: 1.3952
Epoch 18, Loss: 1.3314
Epoch 19, Loss: 1.2693
Epoch 20, Loss: 1.2088
Epoch 21, Loss: 1.1501
Epoch 22, Loss: 1.0933
Epoch 23, Loss: 1.0383
Epoch 24, Loss: 0.9852
Epoch 25, Loss: 0.9340
Epoch 26, Loss: 0.8849
Epoch 27, Loss: 0.8378
Epoch 28, Loss: 0.7927
Epoch 29, Loss: 0.7496
Epoch 30, Loss: 0.7085
Epoch 31, Loss: 0.6694
Epoch 32, Loss: 0.6323
Epoch 33, Loss: 0.5972
Epoch 34, Loss: 0.5639
Epoch 35, Loss: 0.5325
Epoch 36, Loss: 0.5029
Epoch 37, Loss: 0.4750
Epoch 38, Loss: 0.4487
Epoch 39, Loss: 0.4241
Epoch 40, Loss: 0.4009
Epoch 41, Loss: 0.3793
Epoch 42, Loss: 0.3590
Epoch 43, Loss: 0.3399
Epoch 44, Loss: 0.32

In [28]:
# Embedding matrix learned by the model (detached: no gradients needed here)
embeddings = model.embeddings.weight.detach()

# Example: look up similar words
def find_similar(word, topn=3):
    """Return up to ``topn`` vocabulary words most cosine-similar to ``word``.

    Args:
        word: Query word; must be a key of ``word2idx``.
        topn: Maximum number of neighbours to return.

    Returns:
        List of words ordered by decreasing cosine similarity. The query word
        itself and the unassigned embedding row 0 are never returned. Fewer
        than ``topn`` words are returned when the vocabulary is too small.

    Raises:
        KeyError: If ``word`` is not in ``word2idx``.
    """
    word_idx = word2idx[word]
    word_vec = embeddings[word_idx]
    similarities = torch.matmul(embeddings, word_vec) / (
        torch.norm(embeddings, dim=1) * torch.norm(word_vec)
    )

    # Row 0 belongs to no word (word2idx is 1-based). If it reached the top-k
    # it would be mapped to index -1 and fail the idx2word lookup, so mask it.
    similarities[0] = float("-inf")
    # Exclude the query word explicitly instead of assuming it is ranked first.
    similarities[word_idx] = float("-inf")

    # Never ask topk for more rows than there are real candidates.
    k = min(topn, similarities.numel() - 2)
    if k <= 0:
        return []

    _, indices = torch.topk(similarities, k)
    # idx2word is 0-based while embedding rows follow the 1-based word2idx,
    # hence the shift by one.
    return [idx2word[idx - 1] for idx in indices.tolist()]

print("Similar to 'fox':", find_similar('fox'))

Similar to 'fox': ['dog', 'quick', 'jumps']
