# TF2로 skipgram 기반 word2vec 모델을 구현하고 학습하기

* https://www.tensorflow.org/tutorials/text/word2vec

In [2]:
import os
import sys
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from tensorflow import feature_column as fc
import tensorflow_datasets as tfds
# Render matplotlib text with the NanumBarunGothic font
# (presumably so that Korean labels display correctly — confirm the font is installed).
plt.rcParams["font.family"] = 'NanumBarunGothic'
# Absolute, machine-specific path of the tensorboard executable; it is exported
# through the TENSORBOARD_BINARY environment variable.
TENSORBOARD_BINARY = '/home/hoondori/anaconda3/envs/ai/bin/tensorboard'
os.environ['TENSORBOARD_BINARY'] =  TENSORBOARD_BINARY
%load_ext tensorboard

# List the physical GPUs; when at least one exists, expose only the first one
# to TensorFlow and turn on memory growth for it.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to use only the first GPU.
    # Memory growth has to be configured at program start.
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # Visible devices have to be configured at program start;
        # the error is printed instead of aborting the notebook.
        print(e)

In [1]:
import io
import itertools
import numpy as np
import os
import re
import string
import tensorflow as tf
import tqdm

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## [연습] 스킵그램 및 네거티브 샘플링 

In [3]:
# Lower-case the sample sentence and split it on whitespace into word tokens.
sentence = "The wide road shimmered in the hot sun"
tokens = sentence.lower().split()
print(tokens)

['the', 'wide', 'road', 'shimmered', 'in', 'the', 'hot', 'sun']


In [5]:
# Vocabulary: every distinct token gets an integer id in order of first
# appearance; id 0 is kept for the '<pad>' entry.
vocab = {'<pad>': 0}
index = 1
for token in tokens:
    if token in vocab:
        # Already numbered on an earlier occurrence.
        continue
    vocab[token] = index
    index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [6]:
# Inverse index: maps each integer id back to its token.
# Note that, despite its name, word2idx is keyed by id (id -> word).
word2idx = dict(zip(vocab.values(), vocab.keys()))
word2idx

{0: '<pad>',
 1: 'the',
 2: 'wide',
 3: 'road',
 4: 'shimmered',
 5: 'in',
 6: 'hot',
 7: 'sun'}

In [7]:
# Encode the sentence as the list of its tokens' integer ids.
example_sentence = [vocab[word] for word in tokens]
example_sentence

[1, 2, 3, 4, 5, 1, 6, 7]

In [11]:
# Generate skip-gram data - positive examples
# negative_samples=0 requests positive pairs only; the second return value
# of skipgrams is discarded.
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sentence, vocab_size, window_size=2,
    negative_samples=0
)
len(positive_skip_grams)

26

In [19]:
# Show the first five positive pairs, both as ids and as the words they map to.
for skip_gram in positive_skip_grams[:5]:
    print(f'{skip_gram} -> ({word2idx[skip_gram[0]]}, {word2idx[skip_gram[1]]})')

[4, 2] -> (shimmered, wide)
[6, 7] -> (hot, sun)
[2, 3] -> (wide, road)
[3, 1] -> (road, the)
[2, 4] -> (wide, shimmered)


In [54]:
# Generate skip-gram data - negative examples

SEED=1

# Set the number of negative samples per positive context.
num_ns = 4

# Use the first positive pair as the worked example; its context id is
# reshaped to (1, 1) and passed to the sampler as the true class.
target_word, context_word = positive_skip_grams[0]
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class, # class that should be sampled as 'positive'
    num_true=1, # each positive skip-gram has 1 positive context class
    num_sampled=num_ns, # number of negative context words to sample
    unique=True, # all the negative samples should be unique
    range_max=vocab_size, # pick index of the samples from [0, vocab_size)
    seed=SEED, # seed for reproducibility
    name="negative_sampling" # name of this operation
)
print(negative_sampling_candidates)
# Translate every sampled id back to its word through the id -> word dict.
print([word2idx[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([5 1 4 0], shape=(4,), dtype=int64)
['in', 'the', 'shimmered', '<pad>']


In [57]:
# For one target word, line up the positive pair (label 1) with the negative pairs (label 0)

# (num_ns, ) -> (num_ns, 1)
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# (num_ns+1, 1) : single positive followed by negatives
context = tf.concat([context_class, negative_sampling_candidates], 0)

# One label per context entry: 1 for the positive, 0 for each negative.
label = tf.constant([1] + [0]*num_ns, dtype='int64')

# Remove the size-1 axes: target becomes a scalar, context and label 1-D.
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

In [61]:
# Print the assembled example: target id and word, then the context ids with
# their words (positive first, negatives after) and the matching labels.
print(f"target_index    : {target}")
print(f"target_word     : {word2idx[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[word2idx[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 4
target_word     : shimmered
context_indices : [2 5 1 4 0]
context_words   : ['wide', 'in', 'the', 'shimmered', '<pad>']
label           : [1 0 0 0 0]


In [62]:
# Print the raw tensors (value, shape and dtype) of the assembled example.
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : tf.Tensor(4, shape=(), dtype=int32)
context : tf.Tensor([2 5 1 4 0], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


In [63]:
# Skip-gram sampling table
# Builds a table with 10 entries (size=10) and prints it; per the Keras
# documentation, entry i is the sampling probability for the i-th most common word.
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


# 훈련 데이터 생성

## 말뭉치 준비

In [110]:
# Fetch the Shakespeare corpus (get_file hands back the local file path) and
# read it line by line, keeping only lines whose length is non-zero.
path_to_file = tf.keras.utils.get_file(
    'shakespeare.txt',
    'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)
text_ds = tf.data.TextLineDataset(path_to_file).filter(
    lambda line: tf.cast(tf.strings.length(line), bool)
)

In [111]:
# Peek at the first four non-empty lines of the corpus.
for text in text_ds.take(4):
    print(text)

tf.Tensor(b'First Citizen:', shape=(), dtype=string)
tf.Tensor(b'Before we proceed any further, hear me speak.', shape=(), dtype=string)
tf.Tensor(b'All:', shape=(), dtype=string)
tf.Tensor(b'Speak, speak.', shape=(), dtype=string)


In [146]:
# TextVectorization을 이용한 문장 벡터화

def custom_standardization(input_data):
    """Lower-case the input strings and delete punctuation characters.

    Every character listed in ``string.punctuation`` is removed by a regex
    character class built from the escaped punctuation set.
    """
    punctuation_class = f'[{re.escape(string.punctuation)}]'
    lowered = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowered, punctuation_class, '')

# Vocabulary size and the fixed number of token ids per vectorized line.
vocab_size = 8092
sequence_length = 10
# Int-mode vectorizer: standardizes text with custom_standardization and
# outputs sequence_length ids per input string.
# NOTE(review): max_tokens is vocab_size-1 rather than vocab_size — confirm
# this offset is intended.
vectorize_layer = TextVectorization(
    standardize = custom_standardization,
    max_tokens = vocab_size-1,
    output_mode = 'int',
    output_sequence_length=sequence_length
)
vectorize_layer.adapt(text_ds.batch(1024))  # plays the role of fit: builds the vocabulary from the corpus

# save the created vocabulary for reference
# NOTE(review): in the recorded run this list has 8090 entries and starts at
# b'the', so its positions may not coincide with the ids the layer emits
# (id 0 is used for padding in the outputs) — verify before using it as a lookup.
inverse_vocab = vectorize_layer.get_vocabulary()
print(len(inverse_vocab), inverse_vocab[:20])

8090 [b'the', b'and', b'to', b'i', b'of', b'you', b'my', b'a', b'that', b'in', b'is', b'not', b'for', b'with', b'me', b'it', b'be', b'your', b'his', b'this']


In [147]:
# The vectorize layer is fed a column of strings: a trailing axis is added to
# the incoming text first, and the layer's output is squeezed again afterwards.
# NOTE(review): tf.squeeze without an axis removes every size-1 dimension, so
# a batch holding a single line would lose its batch axis too — confirm.
def vectorize_text(text):
    """Return the integer token ids that vectorize_layer produces for text."""
    column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(column)
    return tf.squeeze(token_ids)

# Vectorize the corpus in batches of 1024 lines, then unbatch so that each
# dataset element is a single vectorized line again.
text_vector_ds = text_ds.batch(1024).map(vectorize_text).unbatch()

In [149]:
# Sanity check: print any vectorized line that contains the id vocab_size
# (the recorded run prints nothing).
for vector in text_vector_ds:
    values = list(vector.numpy())
    if vocab_size not in values:
        continue
    print(vector)

In [150]:
# A few examples
# NOTE(review): the recorded output renders the padding id 0 as b'the', so
# inverse_vocab[i] appears shifted relative to the ids vectorize_layer emits —
# confirm the offset before trusting the decoded words.

for seq in text_vector_ds.take(4):
    print(f'{seq} -> {[ inverse_vocab[i] for i in seq ]}')

[ 89 270   0   0   0   0   0   0   0   0] -> [b'queen', b'bring', b'the', b'the', b'the', b'the', b'the', b'the', b'the', b'the']
[138  36 982 144 673 125  16 106   0   0] -> [b'both', b'her', b'morrow', b'away', b'court', b'god', b'be', b'york', b'the', b'the']
[34  0  0  0  0  0  0  0  0  0] -> [b'we', b'the', b'the', b'the', b'the', b'the', b'the', b'the', b'the', b'the']
[106 106   0   0   0   0   0   0   0   0] -> [b'york', b'york', b'the', b'the', b'the', b'the', b'the', b'the', b'the', b'the']


In [151]:
# Materialize every vectorized line as a NumPy array in a Python list, then
# report the number of sequences and the length of the first one.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences), len(sequences[0]))

32777 10


In [153]:
# Generate training examples

def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Generate skip-gram training examples with negative sampling.

    Args:
        sequences: Iterable of int-encoded sentences (sequences of token ids).
        window_size: Skip-gram window size handed to ``skipgrams``.
        num_ns: Number of negative context ids sampled per positive pair.
        vocab_size: Vocabulary size; used to size the sampling table and as
            ``range_max`` of the negative sampler.
        seed: Seed handed to the negative-sample candidate sampler.

    Returns:
        A ``(targets, contexts, labels)`` tuple of equally long lists. For
        each positive pair, ``targets`` holds the target word id,
        ``contexts`` an int64 tensor of shape ``(num_ns + 1, 1)`` with the
        positive context id first and the sampled negatives after it, and
        ``labels`` the int64 tensor ``[1, 0, ..., 0]`` of length
        ``num_ns + 1``.
    """
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # The label vector is the same for every example (one positive followed
    # by num_ns negatives), so it is built once outside the loops.
    label = tf.constant([1] + [0] * num_ns, dtype="int64")

    # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            # Shape (1, 1): one true class for this single example.
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                # Use the caller-supplied seed; the parameter was previously
                # ignored in favour of the module-level SEED.
                seed=seed,
                name="negative_sampling")

            # Build the context vector (for one target word):
            # (num_ns,) -> (num_ns, 1), then positive first, negatives after.
            negative_sampling_candidates = tf.expand_dims(
                negative_sampling_candidates, 1)
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            # Append each element from the training example to the result lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels
    
# Build the training examples for the whole corpus (window of 2, 4 negatives
# per positive pair) and report how many were produced.
targets, contexts, labels = generate_training_data(
    sequences=sequences, 
    window_size=2, 
    num_ns=4, 
    vocab_size=vocab_size, 
    seed=SEED)
print(len(targets), len(contexts), len(labels))    

100%|██████████| 32777/32777 [00:05<00:00, 5929.08it/s]

81583 81583 81583





In [154]:
# Assemble the training tf.data pipeline

BATCH_SIZE = 1024
BUFFER_SIZE = 10000
# Each element is ((target, context), label); shuffle, then form full batches
# only (drop_remainder=True discards the final partial batch).
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
# Dataset transformations return a new dataset rather than modifying the
# receiver, so the result of cache() must be bound back to `dataset`;
# otherwise the pipeline later passed to fit() is not cached at all.
# NOTE(review): cache() sits after shuffle(), so the batches cached during the
# first pass are the ones replayed afterwards — confirm this is acceptable.
dataset = dataset.cache()
dataset

<CacheDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>

# 모델 및 훈련

In [155]:
class Word2Vec(tf.keras.Model):
    """Skip-gram word2vec model with separate target and context embeddings.

    For an input ``(target, context)`` pair the model embeds both sides and
    returns the flattened dot products between the context embeddings and
    the target embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create both embedding tables plus the dot-product and flatten layers.

        Args:
            vocab_size: Number of rows in each embedding table.
            embedding_dim: Width of every embedding vector.
        """
        super().__init__()

        # NOTE: num_ns is not a constructor argument; it is read from the
        # enclosing (notebook-level) scope.
        self.target_embedding = Embedding(
            vocab_size, embedding_dim, input_length=1)
        self.context_embedding = Embedding(
            vocab_size, embedding_dim, input_length=num_ns+1)
        self.dots = Dot(axes=(3, 2))
        self.flatten = Flatten()

    def call(self, pair):
        """Return the flattened context-by-target dot products for ``pair``.

        Args:
            pair: A ``(target, context)`` tuple of id tensors.
        """
        target_ids, context_ids = pair
        target_vectors = self.target_embedding(target_ids)
        context_vectors = self.context_embedding(context_ids)
        scores = self.dots([context_vectors, target_vectors])
        return self.flatten(scores)

# Width of the embedding vectors used for both embedding tables.
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
# The loss is configured with from_logits=True, i.e. the model outputs are
# treated as unnormalized scores.
word2vec.compile(optimizer='adam',
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

# Train for 20 epochs, writing TensorBoard logs to /tmp/logs/word2vec.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="/tmp/logs/word2vec")
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ff8b8c01780>