# In [1]:
import numpy as np
import pandas as pd
import pickle
import jieba
import os
from tqdm import tqdm

# In [3]:


def load_stop_words(file='/content/drive/MyDrive/N_F/Language_model/stopwords.txt'):
    """Read a UTF-8 stop-word file and return its lines as a list.

    Args:
        file: Path to a text file containing one stop word per line.

    Returns:
        The file content split on ``'\\n'``. Empty lines (including the one
        produced by a trailing newline) are kept as empty strings.
    """
    with open(file, encoding='utf-8') as handle:
        contents = handle.read()
    return contents.split('\n')


def cut_words(file='/content/drive/MyDrive/N_F/Language_model/Original_dataset.csv'):
    """Tokenize every row of the dataset with jieba and drop stop words.

    Args:
        file: Path to a GBK-encoded CSV file. The column name ``data`` is
            supplied explicitly and that column is the one tokenized.

    Returns:
        A list with one entry per row; each entry is the list of tokens
        produced by ``jieba.lcut`` that are not stop words.
    """
    # A set gives O(1) membership tests. The list returned by
    # load_stop_words() would otherwise be scanned in full for every token.
    stop_words = set(load_stop_words())

    all_data = pd.read_csv(file, encoding='gbk', names=["data"])["data"]
    return [
        [word for word in jieba.lcut(words) if word not in stop_words]
        for words in all_data
    ]


def get_dict(data):
    """Build the vocabulary lookups for a tokenized corpus.

    Args:
        data: Iterable of sentences, each an iterable of hashable tokens.

    Returns:
        A tuple ``(word2index, index2word, word2onehot)`` where
        ``index2word`` lists the distinct tokens in first-seen order,
        ``word2index`` maps each token to its position in that list, and
        ``word2onehot`` maps each token to a ``(1, vocabulary_size)`` float
        array with a single 1 at the token's index.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order in linear
    # time; testing membership against a growing list is quadratic.
    index2word = list(dict.fromkeys(word for words in data for word in words))

    word2index = {word: index for index, word in enumerate(index2word)}
    word_size = len(word2index)

    word2onehot = {}
    for word, index in word2index.items():
        # Each token gets its own independent array (no shared memory).
        one_hot = np.zeros((1, word_size))
        one_hot[0, index] = 1
        word2onehot[word] = one_hot

    return word2index, index2word, word2onehot

def softmax(x):
    """Return the row-wise softmax of a 2-D array.

    Args:
        x: Array of logits; normalization is done along axis 1.

    Returns:
        An array of the same shape whose rows each sum to 1.
    """
    # Subtracting the per-row maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing to inf (which would turn
    # the division below into nan) when the logits are large.
    shifted = x - np.max(x, axis=1, keepdims=True)
    ex = np.exp(shifted)
    return ex / np.sum(ex, axis=1, keepdims=True)


if __name__ == '__main__':
    # Train a two-matrix word-embedding model on the tokenized corpus and
    # pickle the learned weights together with the vocabulary lookups.
    data = cut_words()
    word2index, index2word, word2onehot = get_dict(data)

    word_size = len(word2index)
    embedding_num = 107  # width of the hidden (embedding) layer
    lr = 0.1
    epoch = 10
    n_gram = 3  # number of context words taken on each side of the center

    # NOTE(review): np.random.normal(loc, scale) is called with loc=-1, so the
    # weights start centered at -1 rather than 0 -- confirm this is intended.
    w1 = np.random.normal(-1, 1, size=(word_size, embedding_num))
    w2 = np.random.normal(-1, 1, size=(embedding_num, word_size))

    for ep in range(epoch):
        for words in tqdm(data):
            for n_index, now_word in enumerate(words):
                now_word_onehot = word2onehot[now_word]
                # Up to n_gram words before and after the current word,
                # excluding the current word itself.
                other_words = words[max(n_index - n_gram, 0):n_index] + words[n_index + 1: n_index+1+n_gram]
                for other_word in other_words:
                    other_word_onehot = word2onehot[other_word]

                    # Forward pass: one-hot -> hidden row -> scores -> probs.
                    hidden = now_word_onehot @ w1
                    p = hidden @ w2
                    pre = softmax(p)

                    # Backward pass: (prediction - target) is used as the
                    # gradient at the scores; G1 is computed from w2 before
                    # w2 is updated below.
                    G2 = pre - other_word_onehot
                    delta_w2 = hidden.T @ G2
                    G1 = G2 @ w2.T
                    delta_w1 = now_word_onehot.T @ G1

                    w1 -= lr * delta_w1
                    w2 -= lr * delta_w2

    with open("word2vec.pkl", "wb") as f:
        # The lookups are named word2index / index2word; the previous
        # word_2_index / index_2_word names were never defined, so this line
        # raised NameError after training and nothing was saved.
        pickle.dump([w1, word2index, index2word, w2], f)

