# Implementation of Distributed Representations of Words and Phrases and their Compositionality

In [2]:
import re
import string
import urllib.request
import zipfile
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import pickle
import os

## Constants

## Preprocessing of sentences

In [3]:
def download_text8():
    """下载Text8数据集到data文件夹"""
    data_dir = "data"
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)  # 创建data目录
    
    file_path = os.path.join(data_dir, "text8")
    zip_path = os.path.join(data_dir, "text8.zip")
    
    if not os.path.exists(file_path):
        print("Downloading Text8 dataset...")
        url = "http://mattmahoney.net/dc/text8.zip"
        urllib.request.urlretrieve(url, zip_path)
        
        with zipfile.ZipFile(zip_path) as f:
            with open(file_path, "wb") as out_file:
                out_file.write(f.read("text8"))
        
        os.remove(zip_path)
        print("Text8 dataset downloaded and extracted into the 'data' folder.")
    
    # 读取数据
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    return data

def create_random_sentences_from_continuous_text(text, sample_size = 10000):
    """
    注意到text8数据集的内容是连续文本，无法根据标点符号进行分词故实现从连续文本中创建句子的随机策略
    
    Args:
        text: 连续的文本字符串
        sample_size: 句子数量
    """
    words = text.split()
    sentences = []
    # 随机长度块：模拟自然句子的长度变化
    import random
    i = 0
    while i < len(words) - 20:
        # 随机句子长度（5-20词）
        sentence_length = random.randint(5, min(20, len(words) - i))
        sentence = ' '.join(words[i:i + sentence_length])
        sentences.append(sentence)
        i += sentence_length
    
    if len(sentences) > sample_size:
        return random.sample(sentences, sample_size)

    return sentences



In [None]:
class EnglishTextCleaner:
    def __init__(self, 
                 min_sentence_length=3,
                 max_sentence_length=50,
                 remove_stopwords=False,
                 min_word_freq=5,
                 max_vocab_size=50000):
        
        self.remove_stopwords = remove_stopwords
        self.min_word_freq = min_word_freq
        self.max_vocab_size = max_vocab_size
        self.min_sentence_length = min_sentence_length
        self.max_sentence_length = max_sentence_length
        
        # 初始化停用词
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()
        
        # 编译正则表达式（提高效率）
        self.url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')
        self.number_pattern = re.compile(r'\b\d+(?:\.\d+)?\b')
        self.special_chars = re.compile(r'[^a-zA-Z\s]')
        self.multiple_spaces = re.compile(r'\s+')
    
    def clean_single_text(self, text):
        """清洗单个文本"""
        if not isinstance(text, str):
            return ""
        
        # 转换为小写
        text = text.lower()
        
        # 移除URL
        text = self.url_pattern.sub(' ', text)
        
        # 移除邮箱
        text = self.email_pattern.sub(' ', text)
        
        # 移除或标准化数字
        text = self.number_pattern.sub('<NUM>', text)  # 可选：保留数字标记
        # text = self.number_pattern.sub(' ', text)  # 或者完全移除数字
        
        # 移除特殊字符，只保留字母和空格
        text = self.special_chars.sub(' ', text)
        
        # 标准化空格
        text = self.multiple_spaces.sub(' ', text)
        
        return text.strip()
    
    def tokenize_and_filter_sentence(self, sentence):
        """对句子进行分词并过滤"""
        # 分词
        words = word_tokenize(sentence)
        
        # 过滤词汇
        filtered_words = []
        for word in words:
            # 停用词过滤
            if self.remove_stopwords and word in self.stop_words:
                continue
            
            # 只保留字母（去除标点符号残留）
            if word.isalpha():
                filtered_words.append(word)
        
        return filtered_words
    
    def build_vocabulary(self, sentences):
        """构建词汇表"""
        word_count = Counter()
        for sentence in sentences:
            filterd_words = self.tokenize_and_filter_sentence(sentence=sentence)
            word_count.update(filterd_words)
        
        print(f"Total unique words before filtering: {len(word_count)}")
        
        # 过滤低频词
        vocab = {word: count for word, count in word_count.items() 
                if count >= self.min_word_freq}
        
        print(f"Words after frequency filtering (>= {self.min_word_freq}): {len(vocab)}")
        
        # 限制词汇表大小
        if len(vocab) > self.max_vocab_size:
            vocab = dict(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:self.max_vocab_size])
            print(f"Vocabulary truncated to top {self.max_vocab_size} words")
        
        return vocab
    
    def filter_sentences_by_vocab(self, sentences, vocab):
        """根据词汇表过滤句子"""
        vocab_set = set(vocab.keys())
        filtered_sentences = []
        
        for sentence in sentences:
            # 只保留词汇表中的词
            filtered_sentence = [word for word in sentence.split() if word in vocab_set]
            
            # 重新检查句子长度
            if self.min_sentence_length <= len(filtered_sentence) <= self.max_sentence_length:
                filtered_sentences.append(' '.join(filtered_sentence))
        
        return filtered_sentences
    
    def process_corpus(self, sentences):
        """处理整个语料库"""
        print("Step 1: Cleaning and tokenizing texts...")
        all_sentences = sentences
        
        print(f"Step 2: Total sentences after initial processing: {len(all_sentences)}")
        
        # 构建词汇表
        print("Step 3: Building vocabulary...")
        vocab = self.build_vocabulary(all_sentences)
        
        # 根据词汇表过滤句
        print("Step 4: Filtering sentences by vocabulary...")
        filtered_sentences = self.filter_sentences_by_vocab(all_sentences, vocab)
        
        print(f"Final corpus: {len(filtered_sentences)} sentences, {len(vocab)} unique words")
        
        return filtered_sentences, vocab

In [5]:
data = download_text8()[:10000000]
sample_sentences = create_random_sentences_from_continuous_text(data, sample_size=500)
cleaner = EnglishTextCleaner()
sentences, vocab = cleaner.process_corpus(sample_sentences)


Step 1: Cleaning and tokenizing texts...
Step 2: Total sentences after initial processing: 500
Step 3: Building vocabulary...
Total unique words before filtering: 2500
Words after frequency filtering (>= 1): 2500
Step 4: Filtering sentences by vocabulary...
Final corpus: 500 sentences, 2500 unique words


## Construct Dataloader

In [32]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import random

In [26]:
class Word2VecDataset(Dataset):
    def __init__(self, sentences, window_size=5, min_count=5, negative_samples=5):
        """
        Args:
            sentences: 句子列表，每个句子是字符串
            window_size: 上下文窗口大小
            min_count: 词频阈值，低于此频率的词会被过滤
            negative_samples: 负采样数量
        """
        super().__init__()
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.vocab = self._build_vocab(sentences, min_count)

        sorted_vocab = sorted(self.vocab.items(), key = lambda x : x[1], reverse=True)
        self.id2tok = {idx: word for idx, (word, _) in enumerate(sorted_vocab)}
        self.tok2id = {word: idx for idx, word in self.id2tok.items()}
        self.vocab_size = len(sorted_vocab)

        # negative sampling table
        self.negative_sampling_table = self._build_negative_sampling_table()

        # create training pairs
        self.data = self._create_training_pairs(sentences)


    def _build_vocab(self, sentences, min_count):
        word_count = Counter()
        for sentence in sentences:
            words = sentence.lower().split()
            word_count.update(words)

        vocab = {word: count for word, count in word_count.items() if count >= min_count}
        return vocab
    
    def _build_negative_sampling_table(self):
        """构建负采样表，基于词频的3/4次方"""
        table_size = 1e8
        table = []

        total_power = sum([count ** 0.75 for count in self.vocab.values()])

        for word_id in range(self.vocab_size):
            word = self.id2tok[word_id]
            word_count = self.vocab[word]
            prob = (word_count ** 0.75) / total_power
            count = int(prob * table_size)
            table.extend([word_id] * count)

        return table
    
    def _create_training_pairs(self, sentences):
        data = []
        for sentence in sentences:
            words = sentence.lower().split()

            words = [word for word in words if word in self.tok2id]
            for center_idx, center_word in enumerate(words):
                center_id = self.tok2id[center_word]

            start = max(0, center_idx - self.window_size)
            end = min(len(words), center_idx + self.window_size + 1)
            # 正样本：真实的上下文词
            for context_idx in range(start, end):
                if context_idx != center_idx:
                    context_word = words[context_idx]
                    context_id = self.tok2id[context_word]
                    data.append((center_id, context_id, 1))
                    
                    # 负采样：为每个正样本生成负样本
                    for _ in range(self.negative_samples):
                        negative_id = self._get_negative_sample()
                        # 确保负样本不是当前的上下文词
                        while negative_id == context_id:
                            negative_id = self._get_negative_sample()
                        data.append((center_id, negative_id, 0))
    
        return data

    def _get_negative_sample(self):
        """从负采样表中随机选择一个词"""
        return random.choice(self.negative_sampling_table)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        center_id, context_id, label = self.data[idx]
        return torch.tensor(center_id, dtype=torch.long), \
               torch.tensor(context_id, dtype=torch.long), \
               torch.tensor(label, dtype=torch.float32)

In [30]:
# 示例数据
sentences = [
    "the quick brown fox jumps over the lazy dog",
    "machine learning is a subset of artificial intelligence",
    "word2vec learns distributed representations of words",
    "skip-gram model predicts context words given center word",
    "natural language processing uses neural networks"
]

# 创建数据集
dataset = Word2VecDataset(
    sentences=sentences,
    window_size=2,
    min_count=1,
    negative_samples=5
)

print(f"词汇表大小: {dataset.vocab_size}")
print(f"训练样本数量: {len(dataset)}")
print(f"前几个词汇: {[dataset.id2tok[i] for i in range(min(5, dataset.vocab_size))]}")

# 查看前几个训练样本
for i in range(min(10, len(dataset))):
    center, context, label = dataset[i]
    center_word = dataset.id2tok[center.item()]
    context_word = dataset.id2tok[context.item()]
    print(f"样本 {i}: 中心词='{center_word}', 上下文词='{context_word}', 标签={label.item()}")

词汇表大小: 34
训练样本数量: 60
前几个词汇: ['the', 'of', 'words', 'quick', 'brown']
样本 0: 中心词='dog', 上下文词='the', 标签=1.0
样本 1: 中心词='dog', 上下文词='learns', 标签=0.0
样本 2: 中心词='dog', 上下文词='lazy', 标签=0.0
样本 3: 中心词='dog', 上下文词='jumps', 标签=0.0
样本 4: 中心词='dog', 上下文词='jumps', 标签=0.0
样本 5: 中心词='dog', 上下文词='word', 标签=0.0
样本 6: 中心词='dog', 上下文词='lazy', 标签=1.0
样本 7: 中心词='dog', 上下文词='representations', 标签=0.0
样本 8: 中心词='dog', 上下文词='predicts', 标签=0.0
样本 9: 中心词='dog', 上下文词='model', 标签=0.0


In [33]:
BATCH_SIZE = 2**2

dataloader = DataLoader(dataset, BATCH_SIZE, shuffle=False)
print(f"Number of batches: {len(dataloader)}")

Number of batches: 15


## Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class CBOWModel(nn.Module):
    """
    Continuous Bag of Words (CBOW) 模型
    根据上下文词预测中心词
    """
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        # 输入层：上下文词的嵌入矩阵
        self.in_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # 输出层：预测中心词的权重矩阵
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # 初始化权重
        self._init_weights()
    
    def _init_weights(self):
        """初始化权重，使用均匀分布"""
        init_range = 0.5 / self.embedding_dim
        self.in_embeddings.weight.data.uniform_(-init_range, init_range)
        self.out_embeddings.weight.data.uniform_(-init_range, init_range)
    
    def forward(self, context_words, center_word, negative_words=None):
        """
        Args:
            context_words: [batch_size, context_size] 上下文词ID
            center_word: [batch_size] 中心词ID
            negative_words: [batch_size, negative_size] 负采样词ID
        """
        batch_size = context_words.size(0)
        
        # 获取上下文词嵌入并求平均
        context_embeds = self.in_embeddings(context_words)  # [batch_size, context_size, embedding_dim]
        context_mean = torch.mean(context_embeds, dim=1)    # [batch_size, embedding_dim]
        
        # 正样本分数
        center_embeds = self.out_embeddings(center_word)    # [batch_size, embedding_dim]
        pos_score = torch.sum(context_mean * center_embeds, dim=1)  # [batch_size]
        pos_loss = F.logsigmoid(pos_score)
        
        # 负样本分数
        neg_loss = 0
        if negative_words is not None:
            neg_embeds = self.out_embeddings(negative_words)  # [batch_size, negative_size, embedding_dim]
            neg_scores = torch.bmm(neg_embeds, context_mean.unsqueeze(2)).squeeze(2)  # [batch_size, negative_size]
            neg_loss = torch.sum(F.logsigmoid(-neg_scores), dim=1)  # [batch_size]
        
        return -(pos_loss + neg_loss).mean()