# The Annotated Negative Sampling 

In [2]:
import re
import string
import urllib.request
import zipfile
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import pickle
import os

## Preprocessing of sentences

In [None]:
def download_text8():
    """下载Text8数据集到data文件夹"""
    data_dir = "data"
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)  # 创建data目录
    
    file_path = os.path.join(data_dir, "text8")
    zip_path = os.path.join(data_dir, "text8.zip")
    
    if not os.path.exists(file_path):
        print("Downloading Text8 dataset...")
        url = "http://mattmahoney.net/dc/text8.zip"
        urllib.request.urlretrieve(url, zip_path)
        
        with zipfile.ZipFile(zip_path) as f:
            with open(file_path, "wb") as out_file:
                out_file.write(f.read("text8"))
        
        os.remove(zip_path)
        print("Text8 dataset downloaded and extracted into the 'data' folder.")
    
    # 读取数据
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    return data

def create_random_sentences_from_continuous_text(text, sample_size = 10000):
    """
    注意到text8数据集的内容是连续文本，无法根据标点符号进行分词故实现从连续文本中创建句子的随机策略
    
    Args:
        text: 连续的文本字符串
        sample_size: 句子数量
    """
    words = text.split()
    sentences = []
    # 随机长度块：模拟自然句子的长度变化
    import random
    i = 0
    while i < len(words) - 20:
        # 随机句子长度（5-20词）
        sentence_length = random.randint(5, min(20, len(words) - i))
        sentence = ' '.join(words[i:i + sentence_length])
        sentences.append(sentence)
        i += sentence_length
    
    if len(sentences) > sample_size:
        return random.sample(sentences, sample_size)

    return sentences

In [None]:
class EnglishTextCleaner:
    def __init__(self, 
                 min_sentence_length=3,
                 max_sentence_length=50,
                 remove_stopwords=False,
                 min_word_freq=5,
                 max_vocab_size=50000):
        
        self.remove_stopwords = remove_stopwords
        self.min_word_freq = min_word_freq
        self.max_vocab_size = max_vocab_size
        self.min_sentence_length = min_sentence_length
        self.max_sentence_length = max_sentence_length
        
        # 初始化停用词
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()
        
        # 编译正则表达式（提高效率）
        self.url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        self.number_pattern = re.compile(r'\b\d+(?:\.\d+)?\b')
        self.special_chars = re.compile(r'[^a-zA-Z\s]')
        self.multiple_spaces = re.compile(r'\s+')
    
    def clean_single_text(self, text):
        """清洗单个文本"""
        if not isinstance(text, str):
            return ""
        
        # 转换为小写
        text = text.lower()
        
        # 移除URL
        text = self.url_pattern.sub(' ', text)
        
        # 移除邮箱
        text = self.email_pattern.sub(' ', text)
        
        # 移除或标准化数字
        text = self.number_pattern.sub('<NUM>', text)  # 可选：保留数字标记
        # text = self.number_pattern.sub(' ', text)  # 或者完全移除数字
        
        # 移除特殊字符，只保留字母和空格
        text = self.special_chars.sub(' ', text)
        
        # 标准化空格
        text = self.multiple_spaces.sub(' ', text)
        
        return text.strip()
    
    def tokenize_and_filter_sentence(self, sentence):
        """对句子进行分词并过滤"""
        # 分词
        words = word_tokenize(sentence)
        
        # 过滤词汇
        filtered_words = []
        for word in words:
            # 停用词过滤
            if self.remove_stopwords and word in self.stop_words:
                continue
            
            # 只保留字母（去除标点符号残留）
            if word.isalpha():
                filtered_words.append(word)
        
        return filtered_words
    
    def build_vocabulary(self, sentences):
        """构建词汇表"""
        word_count = Counter()
        for sentence in sentences:
            filterd_words = self.tokenize_and_filter_sentence(sentence=sentence)
            word_count.update(filterd_words)
        
        print(f"Total unique words before filtering: {len(word_count)}")
        
        # 过滤低频词
        vocab = {word: count for word, count in word_count.items() 
                if count >= self.min_word_freq}
        
        print(f"Words after frequency filtering (>= {self.min_word_freq}): {len(vocab)}")
        
        # 限制词汇表大小
        if len(vocab) > self.max_vocab_size:
            vocab = dict(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:self.max_vocab_size])
            print(f"Vocabulary truncated to top {self.max_vocab_size} words")
        
        return vocab
    
    def filter_sentences_by_vocab(self, sentences, vocab):
        """根据词汇表过滤句子"""
        vocab_set = set(vocab.keys())
        filtered_sentences = []
        
        for sentence in sentences:
            # 只保留词汇表中的词
            filtered_sentence = [word for word in sentence.split() if word in vocab_set]
            
            # 重新检查句子长度
            if self.min_sentence_length <= len(filtered_sentence) <= self.max_sentence_length:
                filtered_sentences.append(' '.join(filtered_sentence))
        
        return filtered_sentences
    
    def process_corpus(self, sentences):
        """处理整个语料库"""
        print("Step 1: Cleaning and tokenizing texts...")
        all_sentences = sentences
        
        print(f"Step 2: Total sentences after initial processing: {len(all_sentences)}")
        
        # 构建词汇表
        print("Step 3: Building vocabulary...")
        vocab = self.build_vocabulary(all_sentences)
        
        # 根据词汇表过滤句
        print("Step 4: Filtering sentences by vocabulary...")
        filtered_sentences = self.filter_sentences_by_vocab(all_sentences, vocab)
        
        print(f"Final corpus: {len(filtered_sentences)} sentences, {len(vocab)} unique words")
        
        return filtered_sentences, vocab

In [5]:
data = download_text8()[:10000000]
sample_sentences = create_random_sentences_from_continuous_text(data, sample_size=500)
cleaner = EnglishTextCleaner()
sentences, vocab = cleaner.process_corpus(sample_sentences)


Step 1: Cleaning and tokenizing texts...
Step 2: Total sentences after initial processing: 500
Step 3: Building vocabulary...
Total unique words before filtering: 2500
Words after frequency filtering (>= 1): 2500
Step 4: Filtering sentences by vocabulary...
Final corpus: 500 sentences, 2500 unique words


## Construct Dataloader

In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import random

In [26]:
class Word2VecDataset(Dataset):
    def __init__(self, sentences, window_size=5, min_count=5, negative_samples=5):
        """
        Args:
            sentences: 句子列表，每个句子是字符串
            window_size: 上下文窗口大小
            min_count: 词频阈值，低于此频率的词会被过滤
            negative_samples: 负采样数量
        """
        super().__init__()
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.vocab = self._build_vocab(sentences, min_count)

        sorted_vocab = sorted(self.vocab.items(), key = lambda x : x[1], reverse=True)
        self.id2tok = {idx: word for idx, (word, _) in enumerate(sorted_vocab)}
        self.tok2id = {word: idx for idx, word in self.id2tok.items()}
        self.vocab_size = len(sorted_vocab)

        # negative sampling table
        self.negative_sampling_table = self._build_negative_sampling_table()

        # create training pairs
        self.data = self._create_training_pairs(sentences)


    def _build_vocab(self, sentences, min_count):
        word_count = Counter()
        for sentence in sentences:
            words = sentence.lower().split()
            word_count.update(words)

        vocab = {word: count for word, count in word_count.items() if count >= min_count}
        return vocab
    
    def _build_negative_sampling_table(self):
        """构建负采样表，基于词频的3/4次方"""
        table_size = 1e8
        table = []

        total_power = sum([count ** 0.75 for count in self.vocab.values()])

        for word_id in range(self.vocab_size):
            word = self.id2tok[word_id]
            word_count = self.vocab[word]
            prob = (word_count ** 0.75) / total_power
            count = int(prob * table_size)
            table.extend([word_id] * count)

        return table
    
    def _create_training_pairs(self, sentences):
        data = []
        for sentence in sentences:
            words = sentence.lower().split()

            words = [word for word in words if word in self.tok2id]
            for center_idx, center_word in enumerate(words):
                center_id = self.tok2id[center_word]

            start = max(0, center_idx - self.window_size)
            end = min(len(words), center_idx + self.window_size + 1)
            # 正样本：真实的上下文词
            for context_idx in range(start, end):
                if context_idx != center_idx:
                    context_word = words[context_idx]
                    context_id = self.tok2id[context_word]
                    data.append((center_id, context_id, 1))
                    
                    # 负采样：为每个正样本生成负样本
                    for _ in range(self.negative_samples):
                        negative_id = self._get_negative_sample()
                        # 确保负样本不是当前的上下文词
                        while negative_id == context_id:
                            negative_id = self._get_negative_sample()
                        data.append((center_id, negative_id, 0))
    
        return data

    def _get_negative_sample(self):
        """从负采样表中随机选择一个词"""
        return random.choice(self.negative_sampling_table)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        center_id, context_id, label = self.data[idx]
        return torch.tensor(center_id, dtype=torch.long), \
               torch.tensor(context_id, dtype=torch.long), \
               torch.tensor(label, dtype=torch.float32)

In [30]:
# 示例数据
sentences = [
    "the quick brown fox jumps over the lazy dog",
    "machine learning is a subset of artificial intelligence",
    "word2vec learns distributed representations of words",
    "skip-gram model predicts context words given center word",
    "natural language processing uses neural networks"
]

# 创建数据集
dataset = Word2VecDataset(
    sentences=sentences,
    window_size=2,
    min_count=1,
    negative_samples=5
)

print(f"词汇表大小: {dataset.vocab_size}")
print(f"训练样本数量: {len(dataset)}")
print(f"前几个词汇: {[dataset.id2tok[i] for i in range(min(5, dataset.vocab_size))]}")

# 查看前几个训练样本
for i in range(min(10, len(dataset))):
    center, context, label = dataset[i]
    center_word = dataset.id2tok[center.item()]
    context_word = dataset.id2tok[context.item()]
    print(f"样本 {i}: 中心词='{center_word}', 上下文词='{context_word}', 标签={label.item()}")

词汇表大小: 34
训练样本数量: 60
前几个词汇: ['the', 'of', 'words', 'quick', 'brown']
样本 0: 中心词='dog', 上下文词='the', 标签=1.0
样本 1: 中心词='dog', 上下文词='learns', 标签=0.0
样本 2: 中心词='dog', 上下文词='lazy', 标签=0.0
样本 3: 中心词='dog', 上下文词='jumps', 标签=0.0
样本 4: 中心词='dog', 上下文词='jumps', 标签=0.0
样本 5: 中心词='dog', 上下文词='word', 标签=0.0
样本 6: 中心词='dog', 上下文词='lazy', 标签=1.0
样本 7: 中心词='dog', 上下文词='representations', 标签=0.0
样本 8: 中心词='dog', 上下文词='predicts', 标签=0.0
样本 9: 中心词='dog', 上下文词='model', 标签=0.0


In [33]:
BATCH_SIZE = 2**2

dataloader = DataLoader(dataset, BATCH_SIZE, shuffle=False)
print(f"Number of batches: {len(dataloader)}")

Number of batches: 15


## Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from torch.optim import SGD
import math


### 负采样公式

$$
\sigma\left({v'}_{w_O}^\top v_{w_I}\right) + \sum_{i=1}^k \mathbb{E}_{w_i \sim P_n(w)} \left[ \log \sigma\left( -{v'}_{w_i}^\top v_{w_I} \right) \right]
$$

---

### 公式含义解释：

这是一个用于训练词向量的目标函数，其核心思想是：

- 给定一个中心词 $ w_I $（input word），预测它的上下文词 $ w_O $（output word）；
- 同时从噪声分布 $ P_n(w) $ 中采样 $ k $ 个“负样本”词 $ w_i $，并让模型学会区分这些负样本和真正的上下文词。

#### 符号说明：

| 符号 | 含义 |
|------|------|
| $ v_{w_I} $ | 中心词 $ w_I $ 的输入嵌入向量（input vector） |
| $ {v'}_{w_O} $ | 上下文词 $ w_O $ 的输出嵌入向量（output vector） |
| $ {v'}_{w_i} $ | 第 $ i $ 个负样本词的输出嵌入向量 |
| $ \sigma(x) = \frac{1}{1 + e^{-x}} $ | Sigmoid 函数 |
| $ P_n(w) $ | 噪声分布，通常是一个基于词频的幂律分布 |
| $ \mathbb{E}_{w_i \sim P_n(w)} $ | 从噪声分布中采样的期望值 |

---

### 直观理解：

整个目标函数可以分为两部分：

#### 1. 正样本项（Positive Term）：
$$
\log \sigma\left({v'}_{w_O}^\top v_{w_I}\right)
$$
这部分鼓励模型认为正样本（真实的上下文词）与中心词之间的内积更大，即它们更相关。

#### 2. 负样本项（Negative Terms）：
$$
\sum_{i=1}^k \log \sigma\left( -{v'}_{w_i}^\top v_{w_I} \right)
$$
这部分鼓励模型认为负样本（随机采样的词）与中心词之间不相关，因为加了负号后，希望它们的内积越小越好。


In [2]:
class SkipGramWithObjective(nn.Module):
    """
    完整实现论文中的Skip-gram目标函数
    
    目标函数：maximize (1/T) * Σ_{t=1}^T Σ_{-c≤j≤c,j≠0} log p(w_{t+j}|w_t)
    其中 p(w_O|w_I) = exp(v'_{w_O}^T v_{w_I}) / Σ_{w=1}^W exp(v'_w^T v_{w_I})
    """
    
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        # v_{w_I}: 输入词嵌入矩阵 (中心词的表示)
        self.center_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # v'_{w_O}: 输出词嵌入矩阵 (上下文词的表示)  
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self._init_weights()
    
    def _init_weights(self):
        """论文建议的初始化方式"""
        init_range = 0.5 / self.embedding_dim
        self.center_embeddings.weight.data.uniform_(-init_range, init_range)
        self.context_embeddings.weight.data.uniform_(-init_range, init_range)
    
    def forward_naive_softmax(self, center_word, context_words):
        """
        原始的softmax实现 - 计算复杂度O(V)，实际中不可行
        p(w_O|w_I) = exp(v'_{w_O}^T v_{w_I}) / Σ_{w=1}^W exp(v'_w^T v_{w_I})
        """
        # 获取中心词嵌入 v_{w_I}
        center_embed = self.center_embeddings(center_word)  # [batch_size, embedding_dim]
        
        # 计算所有词的分数
        all_context_embeds = self.context_embeddings.weight  # [vocab_size, embedding_dim]
        scores = torch.matmul(center_embed, all_context_embeds.t())  # [batch_size, vocab_size]
        
        # softmax归一化
        log_probs = F.log_softmax(scores, dim=1)  # [batch_size, vocab_size]
        
        # 选择真实上下文词的概率
        context_log_probs = log_probs.gather(1, context_words.unsqueeze(1))  # [batch_size, 1]
        
        # 最大化平均对数概率 = 最小化负对数似然
        return -context_log_probs.mean()
    
    def forward_negative_sampling(self, center_word, context_word, negative_words):
        """
        负采样优化版本 - 论文的核心创新
        目标：σ(v'_{w_O}^T v_{w_I}) + Σ_{i=1}^k E_{w_i~P_n(w)}[log σ(-v'_{w_i}^T v_{w_I})]
        """
        # 获取中心词嵌入
        center_embed = self.center_embeddings(center_word)  # [batch_size, embedding_dim]
        
        # 正样本：真实上下文词
        pos_embed = self.context_embeddings(context_word)   # [batch_size, embedding_dim]
        pos_score = torch.sum(center_embed * pos_embed, dim=1)  # [batch_size]
        pos_loss = F.logsigmoid(pos_score)  # log σ(v'_{w_O}^T v_{w_I})
        
        # 负样本：随机采样的词
        neg_embeds = self.context_embeddings(negative_words)  # [batch_size, k, embedding_dim]
        neg_scores = torch.bmm(neg_embeds, center_embed.unsqueeze(2)).squeeze(2)  # [batch_size, k]
        neg_loss = torch.sum(F.logsigmoid(-neg_scores), dim=1)  # [batch_size]
        
        # 最大化目标函数 = 最小化负损失
        return -(pos_loss + neg_loss).mean()
    
    def forward_hierarchical_softmax(self, center_word, target_word, path, code):
        """
        层次化Softmax优化版本
        将O(V)的softmax转化为O(log V)的二分类序列
        """
        center_embed = self.center_embeddings(center_word)
        
        loss = 0
        # 沿着Huffman树的路径进行二分类
        for i in range(len(path)):
            if path[i] == -1:  # 路径结束
                break
            
            node_embed = self.context_embeddings(path[i])
            score = torch.dot(center_embed.squeeze(), node_embed)
            
            # 根据Huffman编码决定分类方向
            if code[i] == 1:
                loss += F.logsigmoid(score)
            else:
                loss += F.logsigmoid(-score)
        
        return -loss

In [3]:
class Word2VecTrainer:
    """
    完整的Word2Vec训练器，实现论文中的优化策略
    """
    
    def __init__(self, model, vocab_size, initial_lr=0.025):
        self.model = model
        self.vocab_size = vocab_size
        self.initial_lr = initial_lr
        self.current_lr = initial_lr
        
        # 使用SGD优化器，论文推荐
        self.optimizer = SGD(model.parameters(), lr=self.current_lr)
    
    def update_learning_rate(self, current_epoch, total_epochs, current_batch, total_batches):
        """
        论文中提到的学习率衰减策略
        线性衰减：lr = initial_lr * (1 - progress)
        """
        progress = (current_epoch * total_batches + current_batch) / (total_epochs * total_batches)
        self.current_lr = self.initial_lr * (1 - progress)
        self.current_lr = max(self.current_lr, self.initial_lr * 0.0001)  # 最小学习率
        
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.current_lr
    
    def train_step(self, center_words, context_words, negative_words=None):
        """单步训练"""
        self.optimizer.zero_grad()
        
        if negative_words is not None:
            # 使用负采样
            loss = self.model.forward_negative_sampling(center_words, context_words, negative_words)
        else:
            # 使用原始softmax（不推荐，仅用于小词汇表）
            loss = self.model.forward_naive_softmax(center_words, context_words)
        
        loss.backward()
        self.optimizer.step()
        
        return loss.item()

In [4]:
class SkipGramObjectiveDemo:
    """
    演示目标函数的具体计算过程
    """
    
    @staticmethod
    def demonstrate_objective_function():
        """
        演示论文中的目标函数计算
        
        目标函数: (1/T) * Σ_{t=1}^T Σ_{-c≤j≤c,j≠0} log p(w_{t+j}|w_t)
        
        其中: p(w_O|w_I) = exp(v'_{w_O}^T v_{w_I}) / Σ_{w=1}^W exp(v'_w^T v_{w_I})
        """
        print("=== Skip-gram目标函数演示 ===")
        
        # 模拟参数
        vocab_size = 1000
        embedding_dim = 100
        T = 5  # 序列长度
        c = 2  # 窗口大小
        
        model = SkipGramWithObjective(vocab_size, embedding_dim)
        
        # 模拟训练序列: [w1, w2, w3, w4, w5]
        sequence = torch.randint(0, vocab_size, (T,))
        print(f"训练序列长度: {T}")
        print(f"窗口大小: {c}")
        
        total_log_prob = 0
        total_pairs = 0
        
        # 对序列中每个位置t的词w_t
        for t in range(T):
            center_word = sequence[t]
            print(f"\n中心词位置 t={t}, 词ID={center_word.item()}")
            
            # 考虑窗口内的所有上下文词
            for j in range(-c, c+1):
                if j == 0:  # 跳过中心词本身
                    continue
                
                context_pos = t + j
                if 0 <= context_pos < T:  # 确保在序列范围内
                    context_word = sequence[context_pos]
                    
                    # 计算 log p(w_{t+j}|w_t)
                    with torch.no_grad():
                        center_embed = model.center_embeddings(center_word.unsqueeze(0))
                        all_context_embeds = model.context_embeddings.weight
                        
                        # 计算所有词的分数
                        scores = torch.matmul(center_embed, all_context_embeds.t())
                        log_probs = F.log_softmax(scores, dim=1)
                        
                        # 获取目标上下文词的对数概率
                        target_log_prob = log_probs[0, context_word].item()
                        
                        total_log_prob += target_log_prob
                        total_pairs += 1
                        
                        print(f"  上下文词位置 {context_pos}, 词ID={context_word.item()}, "
                              f"log p(w_{context_pos}|w_{t}) = {target_log_prob:.4f}")
        
        # 计算平均对数概率
        avg_log_prob = total_log_prob / total_pairs
        print(f"\n目标函数值: (1/T) * Σ log p = {avg_log_prob:.4f}")
        print(f"总词对数量: {total_pairs}")
        
        return avg_log_prob
    


In [5]:

# 1. 演示目标函数的具体计算
demo = SkipGramObjectiveDemo()
objective_value = demo.demonstrate_objective_function()

print("\n" + "="*50)
print("实际训练示例")
print("="*50)

# 2. 实际训练示例
vocab_size = 5000
embedding_dim = 300
batch_size = 32

# 创建模型
model = SkipGramWithObjective(vocab_size, embedding_dim)
trainer = Word2VecTrainer(model, vocab_size)

# 模拟训练数据
center_words = torch.randint(0, vocab_size, (batch_size,))
context_words = torch.randint(0, vocab_size, (batch_size,))
negative_words = torch.randint(0, vocab_size, (batch_size, 5))

# 训练一步
loss = trainer.train_step(center_words, context_words, negative_words)
print(f"训练损失: {loss:.4f}")
print(f"当前学习率: {trainer.current_lr:.6f}")

# 3. 展示优化前后的区别
print(f"\n复杂度对比:")
print(f"原始softmax: O({vocab_size}) = O({vocab_size:,})")
print(f"负采样: O(k) = O(5)")  
print(f"层次化softmax: O(log V) = O({int(math.log2(vocab_size))})")

print(f"\n论文关键创新:")
print("1. 目标函数：最大化周围词的预测概率")
print("2. 负采样：将多分类转化为二分类问题")
print("3. 层次化softmax：使用Huffman树降低复杂度")
print("4. 子采样：处理高频词的噪声问题")

=== Skip-gram目标函数演示 ===
训练序列长度: 5
窗口大小: 2

中心词位置 t=0, 词ID=640
  上下文词位置 1, 词ID=391, log p(w_1|w_0) = -6.9079
  上下文词位置 2, 词ID=82, log p(w_2|w_0) = -6.9078

中心词位置 t=1, 词ID=391
  上下文词位置 0, 词ID=640, log p(w_0|w_1) = -6.9079
  上下文词位置 2, 词ID=82, log p(w_2|w_1) = -6.9077
  上下文词位置 3, 词ID=28, log p(w_3|w_1) = -6.9078

中心词位置 t=2, 词ID=82
  上下文词位置 0, 词ID=640, log p(w_0|w_2) = -6.9077
  上下文词位置 1, 词ID=391, log p(w_1|w_2) = -6.9078
  上下文词位置 3, 词ID=28, log p(w_3|w_2) = -6.9078
  上下文词位置 4, 词ID=300, log p(w_4|w_2) = -6.9078

中心词位置 t=3, 词ID=28
  上下文词位置 1, 词ID=391, log p(w_1|w_3) = -6.9077
  上下文词位置 2, 词ID=82, log p(w_2|w_3) = -6.9076
  上下文词位置 4, 词ID=300, log p(w_4|w_3) = -6.9077

中心词位置 t=4, 词ID=300
  上下文词位置 2, 词ID=82, log p(w_2|w_4) = -6.9078
  上下文词位置 3, 词ID=28, log p(w_3|w_4) = -6.9077

目标函数值: (1/T) * Σ log p = -6.9078
总词对数量: 14

实际训练示例
训练损失: 4.1589
当前学习率: 0.025000

复杂度对比:
原始softmax: O(5000) = O(5,000)
负采样: O(k) = O(5)
层次化softmax: O(log V) = O(12)

论文关键创新:
1. 目标函数：最大化周围词的预测概率
2. 负采样：将多分类转化为二分类问题
3. 层次化sof