# word2vec with tensorflow 2.0
- https://byeongkijeong.github.io/Word2vec-from-scratch-using-keras/

In [1]:
import tensorflow as tf
print(tf.__version__)

2.0.0-alpha0


In [114]:
import os
from collections import Counter
from time import time

import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Dot, Embedding, Input, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras import optimizers

from nltk.corpus import stopwords

### Stopword(불용어) 사전 다운로드 

In [115]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/yhhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 1. Corpus Preprocessing (말뭉치 전처리)

- 텍스트에 대해 모두 소문자로 변경
- 정규식(Regex)을 이용하여 숫자/알파벳/공백을 제외하고 전부 제거
- sampling_rate 변수는 테스트를 위해 전체문서 중에서 일부만 샘플링해서 사용하려고 할 때 사용하는 값이며 0~1 사이의 값

In [116]:
def preprocessing_corpus(corpus, sampling_rate=1.0):
    if sampling_rate is not 1.0:
        corpus = corpus.sample(frac=sampling_rate, replace=False)
    corpus = corpus.str.lower()
    corpus = corpus.str.replace(r'[^A-Za-z0-9\s]', ' ', regex=True)
    return corpus.values.tolist()

In [117]:
corpus = pd.read_csv("sample_text.csv").iloc[:,1] 
corpus = preprocessing_corpus(corpus, sampling_rate=0.7)

In [118]:
print(corpus)

['act fire witnesses must be aware of defamation', 'a g calls for infrastructure protection summit']


In [119]:
corpus = pd.read_csv("sample_text.csv").iloc[:,1] 
corpus = preprocessing_corpus(corpus, sampling_rate=1.0)

In [120]:
print(corpus)

['a g calls for infrastructure protection summit', 'aba decides against community broadcasting licence', 'act fire witnesses must be aware of defamation']


## 2. Making vocabrary (단어집 구성)

- Corpus에서 단어를 추출해서 단어집(Vocabrary)을 구성
- top_n_ratio는 Corpus 내에서 단어의 출현 빈도 기준 상위 몇%의 단어들을 이용하여 어휘집을 구성할 건지에 대한 파라미터 (범위는 0~1)
- 또한, NLTK에 있는 영어 불용어 사전을 이용하여 불용어 제거
- 불용어 및 등장횟수가 적어서 단어집에 포함되지 않은 단어들은 UNK로 처리

In [121]:
def making_vocab(corpus, top_n_ratio=1.0):
    words = np.concatenate(np.core.defchararray.split(corpus)).tolist()

    stopWords = set(stopwords.words('english'))
    words = [word for word in words if word not in stopWords]

    counter = Counter(words)
    
    print(counter)
    
    if top_n_ratio is not 1.0:
        counter = Counter(dict(counter.most_common(int(top_n_ratio*len(counter)))))
        
    unique_words = list(counter) + ['UNK']
    return unique_words

In [122]:
vocab = making_vocab(corpus, top_n_ratio=0.8)
print(vocab)

vocab_size = len(vocab)
print(vocab_size)

Counter({'g': 1, 'calls': 1, 'infrastructure': 1, 'protection': 1, 'summit': 1, 'aba': 1, 'decides': 1, 'community': 1, 'broadcasting': 1, 'licence': 1, 'act': 1, 'fire': 1, 'witnesses': 1, 'must': 1, 'aware': 1, 'defamation': 1})
['g', 'calls', 'infrastructure', 'protection', 'summit', 'aba', 'decides', 'community', 'broadcasting', 'licence', 'act', 'fire', 'UNK']
13


In [123]:
vocab = making_vocab(corpus, top_n_ratio=1.0)
print(vocab)

vocab_size = len(vocab)
print(vocab_size)

Counter({'g': 1, 'calls': 1, 'infrastructure': 1, 'protection': 1, 'summit': 1, 'aba': 1, 'decides': 1, 'community': 1, 'broadcasting': 1, 'licence': 1, 'act': 1, 'fire': 1, 'witnesses': 1, 'must': 1, 'aware': 1, 'defamation': 1})
['g', 'calls', 'infrastructure', 'protection', 'summit', 'aba', 'decides', 'community', 'broadcasting', 'licence', 'act', 'fire', 'witnesses', 'must', 'aware', 'defamation', 'UNK']
17


## 3. Indexing vocabrary (각 단어의 인덱스화)

- 단어집을 이용하여 단어를 숫자로, 숫자를 단어로 인덱싱(Indexing) 및 역 인덱싱(Reverse indexing)하는 Lookup 테이블 구성 및 반환

In [124]:
def vocab_indexing(vocab):
    word2index = {word:index for index, word in enumerate(vocab)}
    index2word = {index:word for word, index in word2index.items()}
    return word2index, index2word

In [125]:
word2index, index2word = vocab_indexing(vocab)
print(word2index)
print(index2word)

{'g': 0, 'calls': 1, 'infrastructure': 2, 'protection': 3, 'summit': 4, 'aba': 5, 'decides': 6, 'community': 7, 'broadcasting': 8, 'licence': 9, 'act': 10, 'fire': 11, 'witnesses': 12, 'must': 13, 'aware': 14, 'defamation': 15, 'UNK': 16}
{0: 'g', 1: 'calls', 2: 'infrastructure', 3: 'protection', 4: 'summit', 5: 'aba', 6: 'decides', 7: 'community', 8: 'broadcasting', 9: 'licence', 10: 'act', 11: 'fire', 12: 'witnesses', 13: 'must', 14: 'aware', 15: 'defamation', 16: 'UNK'}


## 4. Changing word to index in corpus

- 이전 함수에서 인덱싱 된 단어들을 이용하여, Corpus상의 단어들을 인덱스로 바꿔주는 함수
- A:0, B:1, C:2 로 인덱싱 되었다고 할 때, 'A B C A' --> [0,1,2,0]

In [126]:
def word_index_into_corpus(word2index, corpus):
    indexed_corpus = []
    for doc in corpus:
        indexed_corpus.append([word2index[word] if word in word2index else word2index['UNK'] for word in doc.split()])
    return indexed_corpus

In [127]:
print(corpus)
indexed_corpus = word_index_into_corpus(word2index=word2index, corpus=corpus)
print(indexed_corpus)

['a g calls for infrastructure protection summit', 'aba decides against community broadcasting licence', 'act fire witnesses must be aware of defamation']
[[16, 0, 1, 16, 2, 3, 4], [5, 6, 16, 7, 8, 9], [10, 11, 12, 13, 16, 14, 16, 15]]


## 5. Generating traning pairs

- 학습에 사용될 데이터 Pairs 생성
- 네거티브 샘플링 훈련 데이터
  - Positive sample(주변에 위치하는 단어 그룹) --> 1
  - Negative sample(주변에 위치하지 않는 단어 그룹) --> 0

- Keras의 skipgrams 사용
  - [[1,2,3,4,5,6]] -> [[[2,3], 1], [[2,6], 0]]
  - 설정된 Window size 안에 있는 단어끼리는 1, 아닌 단어끼리는 0을 Label로 생성

In [128]:
def generating_wordpairs(indexed_corpus, vocab_size, window_size=4):
    X = []
    Y = []
    for row in indexed_corpus:
        x, y = skipgrams(
            sequence=row,
            vocabulary_size=vocab_size,
            window_size=window_size,
            negative_samples=1.0,
            shuffle=True,
            categorical=False,
            sampling_table=None,
            seed=None
        )
        X = X + list(x)
        Y = Y + list(y)
    return X, Y

In [129]:
X, Y = generating_wordpairs(indexed_corpus=indexed_corpus, vocab_size=vocab_size, window_size=4)
print(X)

[[16, 10], [1, 2], [16, 3], [4, 11], [3, 16], [4, 2], [2, 1], [16, 2], [1, 16], [3, 1], [3, 6], [2, 3], [1, 4], [2, 8], [16, 1], [16, 1], [3, 1], [16, 4], [3, 4], [4, 1], [4, 3], [3, 2], [16, 4], [16, 5], [16, 16], [3, 6], [4, 1], [3, 14], [16, 2], [16, 16], [2, 12], [4, 14], [1, 16], [2, 3], [16, 3], [1, 2], [1, 5], [16, 4], [2, 11], [16, 6], [16, 9], [4, 16], [2, 4], [2, 4], [1, 9], [16, 13], [2, 16], [4, 1], [1, 6], [1, 3], [2, 16], [1, 11], [6, 7], [9, 11], [6, 9], [5, 14], [5, 4], [7, 13], [7, 9], [8, 5], [7, 11], [16, 7], [8, 10], [9, 16], [16, 9], [16, 8], [5, 2], [7, 6], [9, 6], [16, 15], [6, 8], [8, 16], [9, 3], [16, 5], [16, 9], [6, 11], [9, 8], [7, 9], [7, 5], [7, 16], [5, 16], [7, 1], [8, 9], [7, 3], [6, 9], [8, 15], [6, 5], [5, 6], [6, 9], [16, 11], [8, 16], [8, 6], [8, 7], [8, 3], [9, 7], [6, 12], [16, 12], [7, 8], [9, 14], [16, 6], [16, 12], [5, 8], [8, 8], [6, 11], [5, 9], [6, 16], [9, 7], [5, 7], [12, 10], [12, 14], [16, 13], [11, 12], [12, 16], [14, 15], [12, 13], [16

In [130]:
print(Y)

[0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1]


In [131]:
print(X[0], Y[0])
print(index2word[X[0][0]], index2word[X[0][1]], Y[0])

[16, 10] 0
UNK act 0


In [132]:
print(X[1], Y[1])
print(index2word[X[1][0]], index2word[X[1][1]], Y[1])

[1, 2] 0
calls infrastructure 0


## 6. Constructing model

- Embedding layer의 입력의 크기는 Vocabrary_size * Embedding dimension

In [133]:
def consructing_model(vocab_size, embedding_dim=300):
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=1)

    input_target = Input((1,))
    target_embedding = embedding_layer(input_target)
    target_embedding = Reshape((embedding_dim, 1))(target_embedding)

    input_context = Input((1,))
    context_embedding = embedding_layer(input_context)
    context_embedding = Reshape((embedding_dim, 1))(context_embedding)

    hidden_layer = Dot(axes=1)([target_embedding, context_embedding])
    hidden_layer = Reshape((1,))(hidden_layer)

    output = Dense(16, activation='sigmoid')(hidden_layer)
    output = Dense(1, activation='sigmoid')(output)
    
    model = Model(inputs=[input_target, input_context], outputs=output)
    
    nesterov = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy', optimizer=nesterov)
    return model

In [134]:
model = consructing_model(vocab_size=vocab_size, embedding_dim=5)
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 5)         85          input_19[0][0]                   
                                                                 input_20[0][0]                   
__________________________________________________________________________________________________
reshape_27 (Reshape)            (None, 5, 1)         0           embedding_9[0][0]          

## 7. Traning model

In [138]:
def training_model(model, epochs, batch_size, indexed_corpus, vocab_size):
    for i in range(epochs):
        idx_batch = np.random.choice(len(indexed_corpus), batch_size)
        X, Y = generating_wordpairs(np.array(indexed_corpus)[idx_batch].tolist(), vocab_size)

        word_target, word_context = zip(*X)
        word_target = np.array(word_target, dtype=np.int32)
        word_context = np.array(word_context, dtype=np.int32)

        target = np.zeros((1,))
        context = np.zeros((1,))
        label = np.zeros((1,))
        idx = np.random.randint(0, len(Y)-1)
        
        target[0] = word_target[idx]
        context[0] = word_context[idx]
        label[0] = Y[idx]
        
        loss = model.train_on_batch([target, context], label)
        
        if i % 50 == 0:
            print("Iteration {}, loss={}".format(i, loss))
    return model

## 8. Saving vector

In [139]:
def save_vectors(file_path, vocab_size, embedding_dim, model, word2index):
    f = open(file_path, 'w')
    f.write('{} {}\n'.format(vocab_size-1, embedding_dim))
    vectors = model.get_weights()[0]
    for word, i in word2index.items():
        f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
    f.close()
    return file_path

## 9. Executing

In [155]:
if __name__ == "__main__":
    corpus = pd.read_csv("sample_text.csv").iloc[:,1] 
    corpus = preprocessing_corpus(corpus, sampling_rate=1.0)
    print("Corpus was loaded")
    
    vocab = making_vocab(corpus, top_n_ratio=0.8)
    vocab_size = len(vocab)
    print("Vocabulary was configured.")
    
    word2index, index2word = vocab_indexing(vocab)
    print("Vocabulary was indexed")
    
    indexed_corpus = word_index_into_corpus(word2index, corpus)
    print("Corpus was indexed")

    embedding_dim = 3
    model = consructing_model(vocab_size, embedding_dim=embedding_dim)
    print("Model was constructed")
    
    epochs = 1000
    batch_sentence_size = 512
    model = training_model(model, epochs, 512, indexed_corpus, vocab_size)
    print("Traning was done")

    save_path = save_vectors('simple_vectors_on_batch.txt', vocab_size, embedding_dim, model, word2index)
    print("Trained vector was saved")

Corpus was loaded
Counter({'aba': 1, 'decides': 1, 'community': 1, 'broadcasting': 1, 'licence': 1, 'g': 1, 'calls': 1, 'infrastructure': 1, 'protection': 1, 'summit': 1, 'act': 1, 'fire': 1, 'witnesses': 1, 'must': 1, 'aware': 1, 'defamation': 1})
Vocabulary was configured.
Vocabulary was indexed
Corpus was indexed
Model was constructed
Iteration 0, loss=1.3329920768737793
Iteration 50, loss=0.7628775835037231
Iteration 100, loss=0.5817394256591797
Iteration 150, loss=0.43404635787010193
Iteration 200, loss=0.4107160270214081
Iteration 250, loss=0.9342852830886841
Iteration 300, loss=1.4558486938476562
Iteration 350, loss=0.9269182085990906
Iteration 400, loss=0.327515184879303
Iteration 450, loss=0.5844373106956482
Iteration 500, loss=0.6577849984169006
Iteration 550, loss=1.0636420249938965
Iteration 600, loss=0.6128472089767456
Iteration 650, loss=0.669487714767456
Iteration 700, loss=0.31303921341896057
Iteration 750, loss=0.47006168961524963
Iteration 800, loss=1.0585472583770752

## 10. Loading the Trained Embedding

In [156]:
from gensim.models.keyedvectors import Word2VecKeyedVectors

file_name = "simple_vectors_on_batch.txt"
word_vectors = Word2VecKeyedVectors.load_word2vec_format(file_name, binary=False)

In [157]:
print(word_vectors.vector_size)
for idx in range(len(word_vectors.index2word)):
    word = word_vectors.index2word[idx]
    print(idx, word, word_vectors.get_vector(word))

3
0 aba [ 0.0334734  -0.03408869 -0.00693431]
1 decides [ 0.00384599 -0.00858271  0.02241573]
2 community [ 0.03848729 -0.04664322  0.0285083 ]
3 broadcasting [-0.00137986 -0.02642735 -0.04121242]
4 licence [0.03717165 0.02397118 0.02052025]
5 g [ 0.02094166 -0.02416403 -0.04791161]
6 calls [ 0.01294432 -0.02538341 -0.04600326]
7 infrastructure [0.03305928 0.03947913 0.01050359]
8 protection [-0.0239745   0.03978599 -0.02449598]
9 summit [ 0.00057333 -0.03502933 -0.0381165 ]
10 act [ 0.02697607  0.04306481 -0.02527941]
11 fire [ 0.02979267 -0.00152512 -0.02395339]


In [None]:
print(word_vectors.vectors)