# 基于SVD分解以及基于SGNS两种方法构建词向量
张扬2020212185

## 零、环境配置

In [1]:
# from google.colab import drive
# drive.mount('/content/gdrive')
# import os
# os.chdir("/content/gdrive/MyDrive/nlp_hw_2")
# !pip install fbpca

In [2]:
import numpy as np
from collections import Counter
from scipy.sparse import csc_matrix
from scipy.sparse import save_npz
from scipy.sparse import load_npz
import fbpca
import json

## 一、SVD(Singular value decomposition)
### 1-1技术细节
1. 构建**共现矩阵**：
2. 执行**奇异值分解**：
3. **降维**：
4. **提取词向量**：

### 1-2 构建共现矩阵

In [3]:
word2id = {}
def build_matrix(text_, K=5):

    print('读取文本，统计词频...')
    word_counts = Counter(text_.split()) # 统计词频
    words_sheet = list(word_counts.keys()) # 词表
    global word2id
    word2id = {word: i for i, word in enumerate(words_sheet)} # 词到id的映射
    print("词表大小：{}".format(len(words_sheet)))
    with open('./data/word2id.json', 'w') as f:
        json.dump(word2id, f)



    print('统计共现词对...')
    words = text_.split()
    print('原文长度：{}'.format(len(words)))
    co_occurrence_counts = Counter()  # 初始化一个空的Counter对象
    for i, word in enumerate(words):  # 遍历文本中的每个单词
        # 计算上下文窗口的开始和结束
        start = max(0, i - K)
        end = min(i + K, len(words))
        if i % 3000000 == 0:
            print("当前单词：{}，上下文窗口：{}~{}".format(word, start, end))
        context_words = words[start:end]# 获取上下文单词
        for context_word in context_words: # 遍历每个上下文单词
            co_occurrence_counts[(word, context_word)] += 1 # 更新共现计数
    print("共现词对数量：{}".format(len(co_occurrence_counts)))




    print('构建共现矩阵...')
    row, col, data = [], [], [] # 初始化三个列表
    for (word, context_word), count in co_occurrence_counts.items():       
        if word in word2id and context_word in word2id: # 如果两个词都在词表中
            row.append(word2id[word]) # 添加行索引
            col.append(word2id[context_word]) # 添加列索引
            data.append(count) # 添加共现计数
    co_occurrence_matrix = csc_matrix((data, (row, col)), shape=(len(words_sheet),len(words_sheet))) # 构建共现矩阵
    save_npz('./data/co_occurrence_matrix_{}.npz'.format(K), co_occurrence_matrix)
    print("共现矩阵占用空间：{}MB".format(co_occurrence_matrix.data.nbytes / 1024 / 1024 ))
    print("共现矩阵大小{}".format(co_occurrence_matrix.shape))

    return co_occurrence_matrix, word2id


### 1-3 执行奇异值分解和降维

In [4]:
def train_svd_word_vectors(text, K=5, num_components=100):
    # Build co-occurrence matrix
    read_from_file = True
    if read_from_file:
        print('从文件读取...')
        co_occurrence_matrix = load_npz('./data/co_occurrence_matrix_5.npz')
        with open('./data/word2id.json', 'r') as f:
            word2id = json.load(f) # 从文件读取词到id的映射
        print("共现矩阵占用空间：{}MB".format(co_occurrence_matrix.data.nbytes / 1024 / 1024 ))
        print("共现矩阵大小: {}".format(co_occurrence_matrix.shape))
    else:
        co_occurrence_matrix, word2id = build_matrix(text, K)


    print('执行奇异值分解...')
    u, s, v = fbpca.pca(co_occurrence_matrix, k=num_components)
    print("左奇异向量矩阵大小为{}\n对角奇异值矩阵大小为{}\n右奇异向量矩阵分别大小为: {}\n".format(u.shape, s.shape, v.shape))
    print('提取词向量...')
    # 提取左奇异向量矩阵的前k列,实现降维
    u_k = u[:, :num_components]
    # 提取词向量
    word_vectors = u_k
    print("词向量矩阵大小为{}".format(word_vectors.shape))

    # 计算总共有多少个非零奇异值
    num_non_zero_singular_values = np.count_nonzero(s)

    # 计算选取了多少个奇异值
    num_selected_singular_values = K

    # 计算选取的奇异值之和
    sum_selected_singular_values = np.sum(s[:K])

    # 计算全部奇异值之和
    sum_all_singular_values = np.sum(s)

    # 计算选取的奇异值之和与全部奇异值之和的比例
    ratio = sum_selected_singular_values / sum_all_singular_values

    print("非零奇异值总数: ", num_non_zero_singular_values)
    print("选取的奇异值数量: ", num_selected_singular_values)
    print("选取的奇异值之和: ", sum_selected_singular_values)
    print("全部奇异值之和: ", sum_all_singular_values)
    print("选取的奇异值之和与全部奇异值之和的比例: ", ratio)

    
    
    return word_vectors

In [5]:
with open('./data/text8.txt', 'r') as f:
    text = f.read() # 读取文本

print('==================正在使用SVD训练词向量==================')
word_vectors_svd = train_svd_word_vectors(text) # 训练词向量
with open('./data/word_vectors_svd.npy', 'wb') as f:
    np.save(f, word_vectors_svd) # 保存词向量

从文件读取...
共现矩阵占用空间：115.18721008300781MB
共现矩阵大小: (253854, 253854)
执行奇异值分解...
左奇异向量矩阵大小为(253854, 100)
对角奇异值矩阵大小为(100,)
右奇异向量矩阵分别大小为: (100, 253854)

提取词向量...
词向量矩阵大小为(253854, 100)
非零奇异值总数:  100
选取的奇异值数量:  5
选取的奇异值之和:  4769443.126239194
全部奇异值之和:  10079468.653480353
选取的奇异值之和与全部奇异值之和的比例:  0.47318398322438815


## 二、SGNS(SkipGramNegSampling)

In [6]:
import numpy as np
from tqdm import tqdm

class SkipGramNegSampling:
    def __init__(self, vocabulary, embedding_dimension=100, 
                    negative_sample_count=10, 
                    training_epochs=1, 
                    learning_rate=0.1, 
                    window_size=2, 
                    random_seed=123):
        
        self.vocabulary = vocabulary # 词表
        self.vocabulary_size = len(vocabulary) # 词表大小
        self.embedding_dimension = embedding_dimension # 词向量维度
        self.negative_sample_count = negative_sample_count # 负采样样本数
        self.learning_rate = learning_rate # 学习率
        self.window_size = window_size # 上下文窗口大小
        self.training_epochs = training_epochs # 训练轮数

        # 随机初始化词向量矩阵
        self.input_weights = np.random.RandomState(random_seed).randn(
                                    len(vocabulary), embedding_dimension) 
        self.output_weights = np.random.RandomState(random_seed).randn(
                                    len(vocabulary), embedding_dimension)

    def train_on_document(self, document):
        """
        在给定文本上训练SGNS模型
        :param document: 语料的分词列表
        """
        # 定义sigmoid函数
        def sigmoid(x):
            return 1 / (1 + np.exp(-x))

        # 开始训练
        for epoch in range(self.training_epochs):
            for center_word_index in tqdm(range(len(document))):
                # 取出中心词
                center_word_id = self.vocabulary[document[center_word_index]] # 中心词的id

                # 初始化向量矩阵
                window = self.window_size # 窗口大小
                context_word_indices = [i for i in
                                        range(center_word_index - window, center_word_index + window + 1)  # 窗口内的索引
                                        if i != center_word_index and 0 <= i < len(document)] # 上下文词的索引

                # 对于每个上下文单词
                for context_word_index in context_word_indices: 
                    context_word_id = self.vocabulary[document[context_word_index]] # 上下文词的id

                    # 构造正样本和负样本集合
                    samples = [(center_word_id, context_word_id, 1)] # 正样本
                    negative_samples = [] # 负样本
                    while len(negative_samples) < self.negative_sample_count: # 采样负样本
                        negative_word_id = np.random.randint(0, self.vocabulary_size) # 随机采样一个词
                        if negative_word_id != center_word_id and negative_word_id not in context_word_indices: # 不能是中心词和上下文词
                            negative_samples.append((negative_word_id, context_word_id, 0)) # 添加到负样本集合中

                    samples.extend(negative_samples)

                    # 针对每个样本进行梯度下降
                    for sample in samples:
                        # 取出样本中的词向量和标签
                        word_i, word_j, label = sample

                        # 计算预测值
                        input_vector = self.input_weights[word_i, :]
                        output_vector = self.output_weights[word_j, :]

                        # 计算得分和预测值
                        score = np.dot(input_vector, output_vector)
                        prediction = sigmoid(score)

                        # 计算梯度并更新参数
                        error = label - prediction
                        input_gradient = error * output_vector
                        output_gradient = error * input_vector

                        # 更新参数
                        self.input_weights[word_i, :] += self.learning_rate * input_gradient
                        self.output_weights[word_j, :] += self.learning_rate * output_gradient 

        # 保存并返回结果
        np.save('./data/word_vectors_sgns.npy', self.output_weights) # 保存词向量
        return self.output_weights



def train_sgns_word_vectors(text):
    global word2id
    read_from_file = True
    if read_from_file:
        print('从文件读取...')
        with open('word2id.json', 'r') as f:
            word2id = json.load(f) # 从文件读取词到id的映射
    else:
        co_matrix, word2id = build_matrix(text, 2) 
        
    sgns = SkipGramNegSampling(word2id)
    word_vectors = sgns.train_on_document(text)

    return word_vectors

In [9]:
print('==================正在使用SGNS训练词向量==================')
text = text.split(' ')
print(text[:10])
word_vectors_sgns = train_sgns_word_vectors(text) # 训练词向量
print("词向量矩阵大小为",word_vectors_sgns.shape)

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
从文件读取...


  0%|          | 12064/17005207 [00:07<3:00:41, 1567.48it/s]


KeyboardInterrupt: 

## 三、根据词向量计算余弦相似度

In [10]:
import numpy as np
import json
from scipy.spatial.distance import cosine

with open('./data/word2id.json', 'r') as f:
            word2id = json.load(f)

word_vectors_svd = np.load('./data/word_vectors_svd.npy')
word_vectors_sgns = np.load('./data/word_vectors_sgns.npy')

In [12]:
def cos_sim(vec1, vec2):
    """计算两个向量的余弦相似度"""
    return 1 - cosine(vec1, vec2)

def get_vector_svd(word):
    global word_vectors_svd
    return word_vectors_svd[word2id[word]]

def get_vector_sgns(word):
    global word_vectors_sgns
    return word_vectors_sgns[word2id[word]]

def get_sgns_similarity(word1, word2):
    if word1 in word2id.keys() and word2 in word2id.keys() :
        return cos_sim(get_vector_sgns(word1), get_vector_sgns(word2))
    else:
        return 0
    

def get_svd_similarity(word1, word2):
    if word1 in word2id.keys() and word2 in word2id.keys() :
        return cos_sim(get_vector_svd(word1), get_vector_svd(word2))
    else:
        return 0

In [13]:
print('==================正在计算相似度==================')

# 打开文件
with open('./data/wordsim353_agreed.txt', 'r') as f:
    lines = f.readlines()

# 处理文件
new_lines = []
for line in lines:
    # 将每行划分为列
    cols = line.split('\t')
    print(cols)

    # 计算相似度
    sim_svd = get_svd_similarity(cols[1], cols[2])
    sim_sgns = get_sgns_similarity(cols[1], cols[2])
    # 创建新行
    new_line = line.strip() + '\t' + str(sim_svd) + '\t' + str(sim_sgns) + '\n'
    new_lines.append(new_line)

# 将新行写入新文件
with open('./2020212185.txt', 'w') as f:
    f.writelines(new_lines)
print("已经写入！")

['t', 'love', 'sex', '6.77', '0.2801852241164029', '0.3871768589690032\n']
['h', 'tiger', 'cat', '7.35', '0.32634819891930433', '0.38208944853982385\n']
['i', 'tiger', 'tiger', '10.00', '1', '1\n']
['t', 'book', 'paper', '7.46', '0.2091638059894767', '0.43228576247855843\n']
['M', 'computer', 'keyboard', '7.62', '0.5704308119940917', '0.45779548957009497\n']
['t', 'computer', 'internet', '7.58', '0.4819722133464692', '0.4565007962877863\n']
['S', 'plane', 'car', '5.77', '0.3232610369967963', '0.39723883723263487\n']
['S', 'train', 'car', '6.31', '0.3938164034958984', '0.35929250372210264\n']
['t', 'telephone', 'communication', '7.50', '0.4414185021164686', '0.47143148780207234\n']
['S', 'television', 'radio', '6.77', '0.46676627981621643', '0.5058324550940997\n']
['H', 'media', 'radio', '7.42', '0.5338034431323619', '0.4174462128158448\n']
['t', 'drug', 'abuse', '6.85', '0.5731255881458122', '0.567596035267704\n']
['S', 'bread', 'butter', '6.19', '0.38358015818269786', '0.4236610044075