# CBOW

In [17]:
import numpy as np
import os

In [18]:
class CBOW:
    """Continuous bag-of-words embedding model trained with a full softmax.

    ``W1`` (vocab_size x embedding_dim) holds the input embeddings and
    ``W2`` (embedding_dim x vocab_size) the output projection. A target word
    is predicted from the mean of the input embeddings of its context words.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create a model with small random Gaussian weights (std 0.01)."""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.W1 = np.random.randn(vocab_size, embedding_dim) * 0.01
        self.W2 = np.random.randn(embedding_dim, vocab_size) * 0.01

    def save_model(self, model_dir='./CBOW'):
        """Write ``W1.npy`` and ``W2.npy`` into *model_dir*, creating it if needed."""
        os.makedirs(model_dir, exist_ok=True)
        np.save(os.path.join(model_dir, 'W1.npy'), self.W1)
        np.save(os.path.join(model_dir, 'W2.npy'), self.W2)

    def load_model(self, model_dir='./CBOW'):
        """Load ``W1.npy`` and ``W2.npy`` from *model_dir*.

        The directory is not created when it is missing: loading from a wrong
        path fails with the error raised by ``np.load`` instead of leaving an
        empty directory behind.

        Raises:
            ValueError: if the stored matrices are not 2-D with mutually
                transposed shapes.
        """
        W1 = np.load(os.path.join(model_dir, 'W1.npy'))
        W2 = np.load(os.path.join(model_dir, 'W2.npy'))
        if W1.ndim != 2 or W2.ndim != 2 or W1.shape != W2.shape[::-1]:
            raise ValueError(
                f"Incompatible weight shapes: W1 {W1.shape}, W2 {W2.shape}"
            )
        self.W1 = W1
        self.W2 = W2
        # Keep the recorded dimensions in sync with the loaded weights.
        self.vocab_size, self.embedding_dim = W1.shape

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine similarity of two vectors.

        Returns 0.0 when either vector has zero norm, instead of dividing by
        zero and producing NaN.
        """
        denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denom == 0:
            return 0.0
        return np.dot(vec1, vec2) / denom

    def softmax(self, x):
        """Numerically stable softmax along axis 1 of a 2-D array."""
        # Subtracting the row maximum keeps np.exp from overflowing.
        e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return e_x / e_x.sum(axis=1, keepdims=True)

    def forward_backward(self, contexts, targets, learning_rate):
        """Run one SGD step on a batch and return its mean cross-entropy loss.

        Args:
            contexts: integer word indices of shape (batch, context_size).
            targets: integer target-word indices of shape (batch,).
            learning_rate: step size applied to both weight matrices.

        Returns:
            Mean negative log-likelihood of the targets, computed before the
            weights are updated.
        """
        contexts = np.asarray(contexts)
        targets = np.asarray(targets)
        batch_size = len(targets)
        rows = np.arange(batch_size)

        # Forward pass: mean context embedding -> vocabulary scores -> softmax.
        h = np.mean(self.W1[contexts], axis=1)
        u = np.dot(h, self.W2)
        y_pred = self.softmax(u)

        # Loss. Clamp the probabilities so an underflowed softmax entry
        # yields a large finite loss rather than inf.
        target_probs = y_pred[rows, targets]
        floor = np.finfo(target_probs.dtype).tiny
        loss = np.mean(-np.log(np.maximum(target_probs, floor)))

        # Backward pass: d(loss)/d(u) = (softmax - one_hot) / batch_size.
        # y_pred is no longer needed, so it is reused in place.
        e = y_pred
        e[rows, targets] -= 1
        e /= batch_size

        dW2 = np.dot(h.T, e)
        grad_input = np.dot(e, self.W2.T)  # gradient w.r.t. the mean embedding h

        # Each context word receives an equal share of its sample's gradient.
        # np.add.at is unbuffered, so words that occur several times (within
        # one context or across the batch) accumulate every contribution.
        context_size = contexts.shape[1]
        dW1 = np.zeros_like(self.W1)
        np.add.at(dW1, contexts, (grad_input / context_size)[:, np.newaxis, :])

        # Parameter update.
        self.W2 -= learning_rate * dW2
        self.W1 -= learning_rate * dW1

        return loss

    def train(self, data, epochs=50, learning_rate=0.1, batch_size=1024):
        """Train with mini-batch SGD, printing the mean loss after each epoch.

        Args:
            data: sequence of ``(context_indices, target_index)`` pairs; every
                context must have the same length. The sequence itself is
                left untouched (samples are reordered through an index
                permutation, not by shuffling *data* in place).
            epochs: number of passes over the data.
            learning_rate: step size handed to ``forward_backward``.
            batch_size: number of samples per update.

        Returns:
            List with the mean loss of every epoch.

        Raises:
            ValueError: if *data* is empty or *batch_size* is smaller than 1.
        """
        n_samples = len(data)
        if n_samples == 0:
            raise ValueError("data must contain at least one training sample")
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Convert once instead of rebuilding arrays from Python lists for
        # every batch of every epoch.
        all_contexts = np.array([sample[0] for sample in data])
        all_targets = np.array([sample[1] for sample in data])

        history = []
        for epoch in range(epochs):
            order = np.random.permutation(n_samples)
            total_loss = 0
            for start in range(0, n_samples, batch_size):
                batch_idx = order[start:start + batch_size]
                loss = self.forward_backward(
                    all_contexts[batch_idx], all_targets[batch_idx], learning_rate
                )
                # Weight by batch length so a short final batch is not over-counted.
                total_loss += loss * len(batch_idx)
            epoch_loss = total_loss / n_samples
            history.append(epoch_loss)
            print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")
        return history


In [19]:
def read_text_file(file_path):
    """Return the full contents of the text file at *file_path*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def tokenize_text(text):
    """Lower-case *text* and split it into tokens on runs of whitespace."""
    lowered = text.lower()
    words = lowered.split()
    return words

def build_vocabulary(tokens):
    """Build the vocabulary of *tokens* and its word -> index mapping.

    Words are indexed in order of first occurrence. Going through ``set``
    would give an ordering that changes between interpreter runs (string
    hashing is randomised), so embedding matrices saved in one session could
    not be matched to a vocabulary rebuilt in another.

    Returns:
        ``(word_to_index, vocabulary)`` where ``vocabulary[i]`` is the word
        whose index is ``i``.
    """
    # dict.fromkeys removes duplicates while preserving insertion order.
    vocabulary = list(dict.fromkeys(tokens))
    word_to_index = {word: i for i, word in enumerate(vocabulary)}
    return word_to_index, vocabulary

def generate_training_data(tokens, word_to_index, window_size=2):
    """Build ``(context_indices, target_index)`` pairs for CBOW training.

    Every position that has *window_size* tokens on both sides becomes a
    target; its context is the indices of those surrounding tokens, in text
    order, with the target itself left out.
    """
    # Relative positions of the context words around a target.
    offsets = [delta for delta in range(-window_size, window_size + 1) if delta != 0]
    last_center = len(tokens) - window_size

    samples = []
    for center in range(window_size, last_center):
        context_ids = [word_to_index[tokens[center + delta]] for delta in offsets]
        target_id = word_to_index[tokens[center]]
        samples.append((context_ids, target_id))
    return samples

In [20]:
# Run training: read the corpus file, keep a small prefix, tokenize it,
# build the vocabulary and the (context, target) training pairs.
text = read_text_file('text8')
part_text = text[:int(len(text) * 0.001)]  # keep only the first 0.1% of the characters
tokens = tokenize_text(part_text)
word_to_index, vocabulary = build_vocabulary(tokens)
data = generate_training_data(tokens, word_to_index)

# Initialise the CBOW model with 100-dimensional embeddings and train it
# using the default arguments of train().
cbow = CBOW(len(vocabulary), embedding_dim=100)
cbow.train(data)

Epoch 1, Loss: 8.163079134069969
Epoch 2, Loss: 8.163075104779086
Epoch 3, Loss: 8.163071071526051
Epoch 4, Loss: 8.163067032220722
Epoch 5, Loss: 8.163062977035752
Epoch 6, Loss: 8.163058914991133
Epoch 7, Loss: 8.163054835052758
Epoch 8, Loss: 8.163050752248548
Epoch 9, Loss: 8.163046648736186
Epoch 10, Loss: 8.16304252892463
Epoch 11, Loss: 8.163038387692149
Epoch 12, Loss: 8.163034235151603
Epoch 13, Loss: 8.163030053962355
Epoch 14, Loss: 8.163025848956098
Epoch 15, Loss: 8.16302162204616
Epoch 16, Loss: 8.1630173728625
Epoch 17, Loss: 8.163013092321693
Epoch 18, Loss: 8.163008791585144
Epoch 19, Loss: 8.163004469080485
Epoch 20, Loss: 8.16300010865888
Epoch 21, Loss: 8.162995714569677
Epoch 22, Loss: 8.162991284146326
Epoch 23, Loss: 8.162986820290511
Epoch 24, Loss: 8.162982331470271
Epoch 25, Loss: 8.162977790387632
Epoch 26, Loss: 8.162973209266367
Epoch 27, Loss: 8.162968586642773
Epoch 28, Loss: 8.162963911328392
Epoch 29, Loss: 8.162959195833283
Epoch 30, Loss: 8.1629544399