# Skip-gram

In [1]:
import numpy as np
import os

In [2]:
class Word2Vec:
    """Minimal skip-gram word2vec model with a full-softmax output layer.

    The model holds two weight matrices:

    * ``W1`` with shape ``(vocab_size, embedding_dim)``; row ``i`` is the
      input embedding of word index ``i``.
    * ``W2`` with shape ``(embedding_dim, vocab_size)``; column ``j`` holds
      the output weights used to score word index ``j``.

    Both matrices are initialised from ``np.random.rand`` (NumPy's global
    RNG), so seed that RNG beforehand if repeatable weights are needed.
    """

    def __init__(self, vocab_size, embedding_dim=100, learning_rate=0.01):
        """Create a model with randomly initialised weights.

        Args:
            vocab_size: Number of distinct word indices.
            embedding_dim: Length of each embedding vector.
            learning_rate: Step size used by ``train`` for every update.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.W1 = np.random.rand(vocab_size, embedding_dim)
        self.W2 = np.random.rand(embedding_dim, vocab_size)

    def save_model(self, model_dir='./word2vec'):
        """Write ``W1.npy`` and ``W2.npy`` into ``model_dir``.

        The directory (including missing parents) is created if needed.

        Args:
            model_dir: Target directory for the two ``.npy`` files.
        """
        os.makedirs(model_dir, exist_ok=True)  # make sure the directory exists
        # Use the ``model_dir`` argument: the instance has no ``model_dir``
        # attribute, so reading ``self.model_dir`` raised AttributeError.
        np.save(os.path.join(model_dir, 'W1.npy'), self.W1)
        np.save(os.path.join(model_dir, 'W2.npy'), self.W2)

    def load_model(self, model_dir='./word2vec'):
        """Replace ``W1`` and ``W2`` with the arrays stored in ``model_dir``.

        The directory is not created here: if it or the files are missing,
        ``np.load`` raises and the current weights are left as they were
        for any matrix not yet loaded.

        Args:
            model_dir: Directory containing ``W1.npy`` and ``W2.npy``.
        """
        self.W1 = np.load(os.path.join(model_dir, 'W1.npy'))
        self.W2 = np.load(os.path.join(model_dir, 'W2.npy'))
        # Keep the size attributes in step with the weights just loaded.
        self.vocab_size, self.embedding_dim = self.W1.shape

    def cosine_similarity(self, vec1, vec2):
        """Return the cosine of the angle between ``vec1`` and ``vec2``.

        Returns ``0.0`` when either vector has zero norm, instead of
        dividing by zero.
        """
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0
        return np.dot(vec1, vec2) / norm_product

    def softmax(self, x):
        """Return the softmax of ``x``, normalised along axis 0."""
        # Subtracting the maximum keeps ``np.exp`` from overflowing and does
        # not change the result.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def forward_backward(self, x, target):
        """Run one forward and backward pass for a (centre, context) pair.

        Args:
            x: Index of the centre word (selects a row of ``W1``).
            target: Index of the context word to be predicted.

        Returns:
            A tuple ``(loss, dW1, dW2)`` where ``loss`` is the negative log
            probability assigned to ``target``, ``dW1`` is the gradient for
            row ``x`` of ``W1`` (length ``embedding_dim``) and ``dW2`` is the
            gradient for the whole of ``W2``.
        """
        h = self.W1[x]
        u = np.dot(self.W2.T, h)  # one score per vocabulary entry
        y_pred = self.softmax(u)
        loss = -np.log(y_pred[target])
        # ``e`` aliases ``y_pred``; the loss above has already been taken, so
        # turning the probabilities into (prediction - one_hot) in place is safe.
        e = y_pred
        e[target] -= 1
        dW2 = np.outer(h, e)
        dW1 = np.dot(self.W2, e)
        return loss, dW1, dW2

    def train(self, X, Y, batch_size=1024, epochs=10):
        """Train with per-sample gradient descent and print the mean loss.

        Args:
            X: Sequence of centre-word indices.
            Y: Sequence of context-word indices, paired with ``X`` by position.
            batch_size: Size of the slices taken from ``X`` and ``Y``. The
                weights are still updated after every individual sample, so
                this only controls how the data is chunked for iteration.
            epochs: Number of full passes over the data.
        """
        n_samples = len(X)
        for epoch in range(epochs):
            total_loss = 0
            for i in range(0, n_samples, batch_size):
                batch_X = X[i:i+batch_size]
                batch_Y = Y[i:i+batch_size]
                for x, y in zip(batch_X, batch_Y):
                    loss, dW1, dW2 = self.forward_backward(x, y)
                    # Only the centre word's row of W1 receives a gradient.
                    self.W1[x] -= self.learning_rate * dW1
                    self.W2 -= self.learning_rate * dW2
                    total_loss += loss
            print(f"Epoch {epoch + 1}, Loss: {total_loss / n_samples}")

In [3]:
# 数据和词汇表的处理
def tokenize_text(text):
    """Lower-case ``text`` and split it into tokens on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

def build_vocabulary(tokens):
    """Build index mappings for the distinct words in ``tokens``.

    Words are numbered in order of first appearance, so the same token
    sequence always yields the same indices. (Ordering by ``set`` iteration
    would vary between interpreter runs because string hashes are
    randomised, which would make saved weight matrices impossible to match
    back to words.)

    Args:
        tokens: Iterable of hashable tokens (typically strings).

    Returns:
        A tuple ``(word_to_index, index_to_word, vocabulary)`` where
        ``vocabulary`` is the list of distinct tokens and the two dicts map
        between each token and its position in that list.
    """
    # dict keys keep insertion order, giving a stable de-duplicated list.
    vocabulary = list(dict.fromkeys(tokens))
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    index_to_word = dict(enumerate(vocabulary))
    return word_to_index, index_to_word, vocabulary

def generate_training_data(tokens, word_to_index, window_size=2):
    """Build (centre, context) index pairs for skip-gram training.

    For every position in ``tokens``, each other position at most
    ``window_size`` steps away (clipped to the sequence bounds) contributes
    one pair.

    Args:
        tokens: Sequence of tokens.
        word_to_index: Mapping from token to integer index.
        window_size: Maximum distance between centre and context positions.

    Returns:
        Two NumPy arrays of equal length: centre-word indices and the
        corresponding context-word indices.
    """
    total = len(tokens)
    centers = []
    contexts = []
    for center_pos in range(total):
        lo = max(0, center_pos - window_size)
        hi = min(total, center_pos + window_size + 1)
        neighbours = (pos for pos in range(lo, hi) if pos != center_pos)
        for ctx_pos in neighbours:
            centers.append(word_to_index[tokens[center_pos]])
            contexts.append(word_to_index[tokens[ctx_pos]])
    return np.array(centers), np.array(contexts)

由于 text8 数据集过大、训练耗时过长，这里仅使用前 0.1% 的数据进行训练。

In [4]:
def read_text_file(file_path, encoding='utf-8'):
    """Read a whole text file and return its contents as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. It is passed
            explicitly so the result does not depend on the platform's
            default locale encoding.

    Returns:
        The decoded file contents.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        return file.read()

# Run training
CORPUS_FRACTION = 0.001  # use only the first 0.1% of the corpus to keep training time manageable
SEED = 42  # fixes the model's initial weights

text = read_text_file('text8')
# NOTE: the cut is made at a character offset, so the last token may be a
# truncated word.
part_text = text[:int(len(text) * CORPUS_FRACTION)]
tokens = tokenize_text(part_text)
word_to_index, index_to_word, vocabulary = build_vocabulary(tokens)
X, Y = generate_training_data(tokens, word_to_index, window_size=4)

# Word2Vec draws its initial weights from NumPy's global RNG; seed it first
# so those weights are the same on every run.
np.random.seed(SEED)
model = Word2Vec(len(vocabulary))
model.train(X, Y)

Epoch 1, Loss: 6.255813514644019
Epoch 2, Loss: 6.047366952688088
Epoch 3, Loss: 5.969298869413322
Epoch 4, Loss: 5.902210123415412
Epoch 5, Loss: 5.839691276507373
Epoch 6, Loss: 5.7793955552743626
Epoch 7, Loss: 5.720165067127052
Epoch 8, Loss: 5.661411844977673
Epoch 9, Loss: 5.602876795553143
Epoch 10, Loss: 5.544511668154516
