# glove

In [10]:
from random import shuffle
import numpy as np
import os

class GloVe:
    def __init__(self, vocab_size, embedding_dim=100, x_max=100, alpha=0.75):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.W = np.random.randn(vocab_size, embedding_dim) * 0.001
        self.Wt = np.random.randn(vocab_size, embedding_dim) * 0.001
        self.b = np.zeros(vocab_size)
        self.bt = np.zeros(vocab_size)
        self.x_max = x_max
        self.alpha = alpha

    def save_model(self, model_dir='./glove'):
        os.makedirs(model_dir, exist_ok=True)  # 确保目录存在
        np.save(os.path.join(model_dir, 'W.npy'), self.W)
        np.save(os.path.join(model_dir, 'Wt.npy'), self.Wt)
        np.save(os.path.join(model_dir, 'b.npy'), self.b)
        np.save(os.path.join(model_dir, 'bt.npy'), self.bt)

    def load_model(self, model_dir='./glove'):
        os.makedirs(model_dir, exist_ok=True)  # 确保目录存在
        self.W = np.load(os.path.join(model_dir, 'W.npy'))
        self.Wt = np.load(os.path.join(model_dir, 'Wt.npy'))
        self.b = np.load(os.path.join(model_dir, 'b.npy'))
        self.bt = np.load(os.path.join(model_dir, 'bt.npy'))

    def cosine_similarity(self, vec1, vec2):
        """计算两个向量之间的余弦相似度"""
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def weight_fn(self, x):
        return (x / self.x_max) ** self.alpha if x < self.x_max else 1

    def train(self, cooccurrences, epochs=20, learning_rate=0.02):
        for epoch in range(epochs):
            shuffle(cooccurrences)
            total_loss = 0
            for i, j, Xij in cooccurrences:
                weight = self.weight_fn(Xij)
                dot = np.dot(self.W[i], self.Wt[j])
                cost = (dot + self.b[i] + self.bt[j] - np.log(Xij)) ** 2
                total_loss += weight * cost

                grad_w = 2 * weight * (dot + self.b[i] + self.bt[j] - np.log(Xij))
                grad_b = 2 * weight * (1 + self.b[i] + self.bt[j] - np.log(Xij))

                self.W[i] -= learning_rate * grad_w * self.Wt[j]
                self.Wt[j] -= learning_rate * grad_w * self.W[i]
                self.b[i] -= learning_rate * grad_b
                self.bt[j] -= learning_rate * grad_b

            print(f"Epoch {epoch + 1}, Loss: {total_loss}")

In [11]:
def read_text_file(file_path):
    with open(file_path, 'r') as file:
        text = file.read()
    return text

def tokenize_text(text):
    # 简单文本预处理和分词
    return text.lower().split()

def build_vocabulary(tokens):
    vocabulary = list(set(tokens))
    word_to_index = {word: i for i, word in enumerate(vocabulary)}
    index_to_word = {i: word for i, word in enumerate(vocabulary)}
    return word_to_index, index_to_word, vocabulary

In [12]:
from scipy.sparse import dok_matrix

def build_cooccurrence_matrix(tokens, word_to_index, window_size=5):
    vocab_size = len(word_to_index)
    cooccurrences = dok_matrix((vocab_size, vocab_size), dtype=np.float32)

    for i in range(len(tokens)):
        token = tokens[i]
        left = tokens[max(i - window_size, 0): i]
        right = tokens[i + 1: i + 1 + window_size]

        for distance, context_word in enumerate(left + right, 1):
            j = i - window_size + distance if distance <= len(left) else i + distance - len(left)
            weight = 1.0 / distance  # This can be tweaked to different weighting functions

            cooccurrences[word_to_index[token], word_to_index[context_word]] += weight
            cooccurrences[word_to_index[context_word], word_to_index[token]] += weight

    return cooccurrences

# 使用部分text8数据
text = read_text_file('text8')
part_text = text[:int(len(text) * 0.001)]  # 使用0.1%的数据
tokens = tokenize_text(part_text)
word_to_index, index_to_word, vocabulary = build_vocabulary(tokens)
cooccurrences = build_cooccurrence_matrix(tokens, word_to_index)

在训练时不知什么原因导致loss以3000+起始，且经过超参数的调整未能有有效的变化

In [13]:
def extract_cooccurrences(cooccurrences):
    # 提取非零共现记录到列表
    data = [(i, j, cooccurrences[i, j]) for i, j in cooccurrences.keys() if cooccurrences[i, j] > 0]
    return data

# 使用提取函数
data = extract_cooccurrences(cooccurrences)

# 初始化并训练模型
glove = GloVe(len(vocabulary))
glove.train(data)

Epoch 1, Loss: 3334.226509575537
Epoch 2, Loss: 3428.4098207538545
Epoch 3, Loss: 3551.5037310203766
Epoch 4, Loss: 3563.3806471501143
Epoch 5, Loss: 3466.7877195014566
Epoch 6, Loss: 2996.375810389824
Epoch 7, Loss: 2192.397449017376
Epoch 8, Loss: 1752.127645821808
Epoch 9, Loss: 1492.505948053505
Epoch 10, Loss: 1371.9985983824379
Epoch 11, Loss: 1286.6481533338797
Epoch 12, Loss: 1243.694423216382
Epoch 13, Loss: 1215.1792167542262
Epoch 14, Loss: 1196.5045275870577
Epoch 15, Loss: 1186.574153588268
Epoch 16, Loss: 1181.3925086713764
Epoch 17, Loss: 1178.5998664903036
Epoch 18, Loss: 1179.3843938870036
Epoch 19, Loss: 1178.603465395194
Epoch 20, Loss: 1185.216029545383
