# Implementation of Distributed Representations of Words and Phrases and their Compositionality

In [1]:
import re
import string
import urllib.request
import zipfile
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import pickle
import os

## Constants

## Preprocessing of sentences

In [3]:
def download_text8():
    """Return the Text8 corpus as a string, downloading it on first use.

    The corpus is cached at ``data/text8`` relative to the current working
    directory. When that file is missing, ``text8.zip`` is fetched from
    mattmahoney.net, its ``text8`` member is extracted, and the archive is
    deleted again.

    Returns:
        The contents of ``data/text8`` decoded as UTF-8.
    """
    data_dir = "data"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, "text8")
    zip_path = os.path.join(data_dir, "text8.zip")

    if not os.path.exists(file_path):
        print("Downloading Text8 dataset...")
        url = "http://mattmahoney.net/dc/text8.zip"
        partial_path = file_path + ".part"
        try:
            urllib.request.urlretrieve(url, zip_path)

            # Extract under a temporary name and rename afterwards, so an
            # interrupted extraction never leaves a truncated "text8" that a
            # later call would mistake for the complete corpus.
            with zipfile.ZipFile(zip_path) as archive:
                with open(partial_path, "wb") as out_file:
                    out_file.write(archive.read("text8"))
            os.replace(partial_path, file_path)
        finally:
            # Remove the archive (and any half-written extraction) whether
            # or not the download succeeded.
            for leftover in (zip_path, partial_path):
                if os.path.exists(leftover):
                    os.remove(leftover)
        print("Text8 dataset downloaded and extracted into the 'data' folder.")

    # Read the cached corpus.
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    return data

def create_random_sentences_from_continuous_text(text, sample_size=10000,
                                                 min_length=5, max_length=20,
                                                 seed=None):
    """Cut continuous text into consecutive pseudo-sentences of random length.

    Text8 is one continuous stream of words without punctuation, so it cannot
    be split on sentence boundaries. Instead the word stream is chopped into
    back-to-back chunks whose lengths are drawn uniformly at random, imitating
    the length variation of natural sentences.

    Args:
        text: Continuous text; it is split on whitespace.
        sample_size: Maximum number of sentences to return. When more chunks
            are produced, a random sample of this size is returned.
        min_length: Smallest number of words in a sentence.
        max_length: Largest number of words in a sentence.
        seed: Optional seed for reproducible output. When ``None`` the
            module-level ``random`` generator is used, so callers that rely
            on ``random.seed()`` keep working.

    Returns:
        A list of sentences, each a list of words. A trailing remainder of
        fewer than ``min_length`` words is discarded.

    Raises:
        ValueError: If ``min_length < 1`` or ``max_length < min_length``.
    """
    import random

    if min_length < 1 or max_length < min_length:
        raise ValueError("require 1 <= min_length <= max_length")

    # A private generator keeps seeded runs from disturbing global state.
    rng = random.Random(seed) if seed is not None else random

    words = text.split()
    total = len(words)
    sentences = []
    start = 0
    # Keep cutting while a full minimum-length sentence still fits; the upper
    # bound shrinks near the end so the tail of the text is used as well.
    while total - start >= min_length:
        length = rng.randint(min_length, min(max_length, total - start))
        sentences.append(words[start:start + length])
        start += length

    if len(sentences) > sample_size:
        return rng.sample(sentences, sample_size)

    return sentences



In [4]:
class EnglishTextCleaner:
    """Clean English text, build a frequency-filtered vocabulary and prune sentences.

    The typical entry point is :meth:`process_corpus`, which accepts either
    raw sentence strings or already tokenized sentences (lists of words).
    """

    def __init__(self,
                 min_sentence_length=3,
                 max_sentence_length=50,
                 remove_stopwords=False,
                 min_word_freq=5,
                 max_vocab_size=50000):
        """Store the filtering options and compile the cleaning patterns.

        Args:
            min_sentence_length: Fewest words a sentence may have after
                vocabulary filtering to be kept.
            max_sentence_length: Most words a sentence may have after
                vocabulary filtering to be kept.
            remove_stopwords: Whether NLTK's English stop words are dropped.
            min_word_freq: Minimum corpus count for a word to enter the
                vocabulary.
            max_vocab_size: Upper bound on the vocabulary size; the most
                frequent words are kept.
        """
        self.remove_stopwords = remove_stopwords
        self.min_word_freq = min_word_freq
        self.max_vocab_size = max_vocab_size
        self.min_sentence_length = min_sentence_length
        self.max_sentence_length = max_sentence_length

        # The stop-word list is only loaded when it is actually needed.
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()

        # Patterns are compiled once because they are applied to every text.
        # NOTE: inside the URL pattern, "$-_" is a character range.
        self.url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        # The top-level-domain class is letters only; the previous
        # "[A-Z|a-z]" also accepted a literal "|".
        self.email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
        self.number_pattern = re.compile(r'\b\d+(?:\.\d+)?\b')
        self.special_chars = re.compile(r'[^a-zA-Z\s]')
        self.multiple_spaces = re.compile(r'\s+')

    def clean_single_text(self, text):
        """Normalize one text to lowercase letters separated by single spaces.

        URLs and e-mail addresses are removed. Stand-alone numbers are
        replaced by ``<NUM>``; the special-character pass then strips the
        angle brackets, so the marker appears in the result as the bare
        token ``NUM``.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text, or ``""`` when ``text`` is not a string.
        """
        if not isinstance(text, str):
            return ""

        text = text.lower()
        text = self.url_pattern.sub(' ', text)
        text = self.email_pattern.sub(' ', text)
        # Mark numbers instead of deleting them.
        text = self.number_pattern.sub('<NUM>', text)
        # Everything that is not a letter or whitespace becomes a space.
        text = self.special_chars.sub(' ', text)
        text = self.multiple_spaces.sub(' ', text)

        return text.strip()

    def tokenize_and_filter_sentence(self, sentence):
        """Tokenize a sentence string and keep only purely alphabetic tokens.

        Stop words are skipped as well when ``remove_stopwords`` is enabled.

        Args:
            sentence: The sentence text to tokenize with ``word_tokenize``.

        Returns:
            The list of surviving tokens.
        """
        words = word_tokenize(sentence)
        skip_stopwords = self.remove_stopwords
        # isalpha() discards punctuation fragments produced by the tokenizer.
        return [
            word for word in words
            if word.isalpha() and not (skip_stopwords and word in self.stop_words)
        ]

    def build_vocabulary(self, sentences):
        """Count words and keep the frequent ones.

        Args:
            sentences: Iterable of tokenized sentences.

        Returns:
            A dict mapping each kept word to its corpus count. Words rarer
            than ``min_word_freq`` are dropped, and at most
            ``max_vocab_size`` of the most frequent words are retained.
        """
        word_count = Counter()
        for sentence in sentences:
            word_count.update(sentence)

        print(f"Total unique words before filtering: {len(word_count)}")

        # Drop low-frequency words.
        vocab = {word: count for word, count in word_count.items()
                 if count >= self.min_word_freq}

        print(f"Words after frequency filtering (>= {self.min_word_freq}): {len(vocab)}")

        # Cap the vocabulary size, keeping the most frequent words.
        if len(vocab) > self.max_vocab_size:
            vocab = dict(sorted(vocab.items(), key=lambda x: x[1], reverse=True)[:self.max_vocab_size])
            print(f"Vocabulary truncated to top {self.max_vocab_size} words")

        return vocab

    def filter_sentences_by_vocab(self, sentences, vocab):
        """Remove out-of-vocabulary words and sentences of unsuitable length.

        Args:
            sentences: Iterable of tokenized sentences.
            vocab: Mapping whose keys are the allowed words.

        Returns:
            New token lists containing only vocabulary words, restricted to
            sentences whose filtered length lies between
            ``min_sentence_length`` and ``max_sentence_length`` inclusive.
        """
        vocab_set = set(vocab)
        filtered_sentences = []

        for sentence in sentences:
            kept = [word for word in sentence if word in vocab_set]

            # The length is checked after filtering, not before.
            if self.min_sentence_length <= len(kept) <= self.max_sentence_length:
                filtered_sentences.append(kept)

        return filtered_sentences

    def _prepare_sentence(self, sentence):
        """Turn one corpus entry into a token list for vocabulary building."""
        if isinstance(sentence, str):
            # Raw text: clean it first, then tokenize and filter.
            return self.tokenize_and_filter_sentence(self.clean_single_text(sentence))
        if self.remove_stopwords:
            # Pre-tokenized input still has to honour the stop-word option.
            return [word for word in sentence if word not in self.stop_words]
        return sentence

    def process_corpus(self, sentences):
        """Run the whole pipeline over a corpus.

        Args:
            sentences: Iterable of sentences. Each entry is either a raw
                string, which is cleaned and tokenized, or an already
                tokenized list of words, which is only stop-word filtered
                when ``remove_stopwords`` is enabled.

        Returns:
            A ``(filtered_sentences, vocab)`` tuple as produced by
            :meth:`filter_sentences_by_vocab` and :meth:`build_vocabulary`.
        """
        print("Step 1: Cleaning and tokenizing texts...")
        all_sentences = [self._prepare_sentence(sentence) for sentence in sentences]

        print(f"Step 2: Total sentences after initial processing: {len(all_sentences)}")

        print("Step 3: Building vocabulary...")
        vocab = self.build_vocabulary(all_sentences)

        print("Step 4: Filtering sentences by vocabulary...")
        filtered_sentences = self.filter_sentences_by_vocab(all_sentences, vocab)

        print(f"Final corpus: {len(filtered_sentences)} sentences, {len(vocab)} unique words")

        return filtered_sentences, vocab

In [6]:
# Work on the first 10,000,000 characters of the corpus only.
data = download_text8()[:10_000_000]

# Chop the continuous text into pseudo-sentences and keep a random sample
# of at most 50,000 of them.
sample_sentences = create_random_sentences_from_continuous_text(
    data,
    sample_size=50_000,
)

# Build the vocabulary and prune the sampled sentences with default settings.
cleaner = EnglishTextCleaner()
sentences, vocab = cleaner.process_corpus(sample_sentences)


Step 1: Cleaning and tokenizing texts...
Step 2: Total sentences after initial processing: 50000
Step 3: Building vocabulary...
Total unique words before filtering: 43318
Words after frequency filtering (>= 5): 10361
Step 4: Filtering sentences by vocabulary...
Final corpus: 49914 sentences, 10361 unique words
