Text Embedding "Word2Vec"
===============

In [1]:
# Register the example project directory on the module search path
# (presumably where the `datasetslib` package imported below lives — confirm).
import sys

project_root = '../../../tensorflow_keras_example'
# Guard the insert so that re-running this cell does not keep prepending
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:


from datasetslib.text8 import Text8

# Build the text8 dataset wrapper. load_data() downloads the data, converts
# words to ids and converts the files to a list of ids.
text8 = Text8()
text8.load_data()

# Preview the first five ids of the training part and report the vocabulary size.
# (text8.part['test'] / text8.part['valid'] can presumably be previewed the same way.)
train_head = text8.part['train'][:5]
print('Train:', train_head)
print('Vocabulary Length = ', text8.vocab_len)

Already exists: ./datasets\text8\text8.zip
Train: [5233 3083   11    5  194]
Vocabulary Length =  253854


In [3]:
# Skip-gram example: print a handful of (target, context) training pairs.
text8.skip_window = 2
text8.reset_index()

# In skip-gram the input is the target word and the output is a context word.
x_batch, y_batch = text8.next_batch_sg()

print('The skip-gram pairs : target,context')
n_preview = 5 * text8.skip_window
for pair_idx in range(n_preview):
    target_id, context_id = x_batch[pair_idx], y_batch[pair_idx]
    print('(', target_id, text8.id2word[target_id],
          ',', context_id, text8.id2word[context_id], ')')


The skip-gram pairs : target,context
( 11 as , 5233 anarchism )
( 11 as , 3083 originated )
( 11 as , 5 a )
( 11 as , 194 term )
( 5 a , 3083 originated )
( 5 a , 11 as )
( 5 a , 194 term )
( 5 a , 1 of )
( 194 term , 11 as )
( 194 term , 5 a )


In [4]:
import numpy as np

# Pick `valid_size` distinct word ids out of the first `valid_size * 10` ids.
# These are the query words whose nearest neighbours are reported after training.
valid_size = 8
valid_seed = 42  # fixed seed so a Restart & Run All selects the same validation words

# A private RandomState makes the draw reproducible without touching
# NumPy's global random state.
x_valid = np.random.RandomState(valid_seed).choice(valid_size * 10, valid_size, replace=False)
print(x_valid)

[70 43 63 50 27 75 25 29]


In [74]:
# Word2vec network: hyper-parameters and graph inputs.
#
# Negative sampling:
#   The word-embedding matrix has shape [vocab_size, embedding_size]. Training
#   Word2vec skip-gram on it would require a softmax over all vocab_size words,
#   which is obviously expensive, so negative sampling is used to avoid it.
#   Put simply, instead of scoring every word, the softmax only covers the
#   skip_window-sized set of true answers plus a sample of words far from the
#   answer (typically 5~20). This example uses 64; the negative sample size
#   is adjustable.
import tensorflow as tf

# The cells below build a TF1-style graph (placeholders + Session), so eager
# execution is switched off.
tf.compat.v1.disable_eager_execution()

# Model and optimisation hyper-parameters.
embedding_hidden = 128
batch_size = 128
n_negative_sample = 64
n_epochs = 100
learning_rate = 0.9

# Configure the batch iterator and ask it how many batches it provides.
# NOTE(review): the placeholders below are fixed to `batch_size`; this assumes
# text8.next_batch_sg() yields exactly that many pairs per batch — confirm.
text8.skip_window = 2
text8.reset_index()
n_batch = text8.n_batches_wv()

# Graph inputs: word ids in, one label id per example out, plus the fixed
# validation ids. `input` shadows the Python builtin; the name is kept because
# other cells reference it.
input = tf.compat.v1.placeholder(shape=[batch_size], dtype=tf.int32)
output = tf.compat.v1.placeholder(shape=[batch_size, 1], dtype=tf.int32)
validation = tf.compat.v1.constant(x_valid, dtype=tf.int32)


In [75]:
# Embedding network setup: one trainable row of size `embedding_hidden`
# per vocabulary entry.
embedding_shape = [text8.vocab_len, embedding_hidden]

# Initial values are drawn uniformly between -1.0 and 1.0.
random_uniform_dist = tf.random.uniform(embedding_shape, minval=-1.0, maxval=1.0)
embedding_dist = tf.compat.v1.Variable(initial_value=random_uniform_dist,
                                       name='embedding_matrix')

# Look up the embedding rows for the word ids fed through `input`.
embedded_table = tf.nn.embedding_lookup(params=embedding_dist, ids=input)

In [76]:
# Noise-contrastive estimation (NCE) loss.

# NCE weights start from a truncated normal with stddev 1/sqrt(embedding_hidden);
# biases start at zero. Both have one entry per vocabulary word.
nce_weight_init = tf.random.truncated_normal(
    shape=[text8.vocab_len, embedding_hidden],
    stddev=1 / tf.sqrt(embedding_hidden * 1.0),
)
nce_w = tf.compat.v1.Variable(nce_weight_init, name='nce_weight')
nce_b = tf.compat.v1.Variable(tf.zeros(shape=[text8.vocab_len]), name='nce_biases')

# NCE loss for the batch using `n_negative_sample` sampled classes,
# averaged into a single scalar training loss.
batch_nce_loss = tf.nn.nce_loss(
    weights=nce_w,
    biases=nce_b,
    labels=output,
    inputs=embedded_table,
    num_sampled=n_negative_sample,
    num_classes=text8.vocab_len,
)
loss = tf.compat.v1.reduce_mean(batch_nce_loss)


In [77]:
# Similarity calculation: cosine similarity between each validation word and
# every word in the vocabulary.

# L2-normalise each embedding row so that a dot product equals the cosine.
row_norm = tf.sqrt(tf.reduce_sum(tf.square(embedding_dist), axis=1, keepdims=True))
normal_embedding_table = embedding_dist / row_norm

# Rows for the validation ids, multiplied against the whole (transposed) table.
validate_embedding_table = tf.nn.embedding_lookup(normal_embedding_table, validation)
similarity = tf.matmul(validate_embedding_table, normal_embedding_table, transpose_b=True)

# Plain gradient descent; `optimizer` holds the result of minimize(loss),
# which the training loop passes to sess.run.
sgd = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=learning_rate)
optimizer = sgd.minimize(loss)

In [78]:
# Display the symbolic `similarity` tensor to check its shape and dtype.
# No values are computed here; it is evaluated later with sess.run(similarity).
similarity

<tf.Tensor 'MatMul_5:0' shape=(8, 253854) dtype=float32>

In [79]:
from datasetslib import nputil

# Rewind the batch iterator once before training starts.
# NOTE(review): the index is not reset between epochs; this assumes
# text8.next_batch_sg() keeps serving batches across epochs — confirm.
text8.reset_index()

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())

    for epoch in range(n_epochs):
        # Running sum of the batch losses; divided by n_batch when reported.
        loss_sum = 0
        for step in range(n_batch):
            x_train, label = text8.next_batch_sg()
            # The `output` placeholder is [batch_size, 1]; to2d presumably
            # reshapes the label vector to that column form — confirm.
            label = nputil.to2d(label, unit_axis=1)
            feed = {input: x_train, output: label}
            batch_loss, _ = sess.run([loss, optimizer], feed_dict=feed)
            loss_sum += batch_loss
        print(f'epoch : {epoch} ,,,,avg_loss : {loss_sum/n_batch}')

    # Evaluate the similarity matrix with the trained embeddings while the
    # session (and therefore the variable values) is still alive.
    similarity_score = sess.run(similarity)
    
            

epoch : 0 ,,,,avg_loss : 135.58712847282112
epoch : 1 ,,,,avg_loss : 70.16661318281474
epoch : 2 ,,,,avg_loss : 49.144337317706714
epoch : 3 ,,,,avg_loss : 36.81560307906483
epoch : 4 ,,,,avg_loss : 28.941727477796018
epoch : 5 ,,,,avg_loss : 22.32977826975237
epoch : 6 ,,,,avg_loss : 19.900982078854266
epoch : 7 ,,,,avg_loss : 16.84210065618886
epoch : 8 ,,,,avg_loss : 14.306939469463465
epoch : 9 ,,,,avg_loss : 12.982754683109828
epoch : 10 ,,,,avg_loss : 11.575401850709756
epoch : 11 ,,,,avg_loss : 10.639240667937804
epoch : 12 ,,,,avg_loss : 9.621933877407994
epoch : 13 ,,,,avg_loss : 9.537169499915475
epoch : 14 ,,,,avg_loss : 8.99251304809577
epoch : 15 ,,,,avg_loss : 8.21186364425954
epoch : 16 ,,,,avg_loss : 8.288504419035823
epoch : 17 ,,,,avg_loss : 8.135611283406428
epoch : 18 ,,,,avg_loss : 7.564091099282437
epoch : 19 ,,,,avg_loss : 8.275065974370435
epoch : 20 ,,,,avg_loss : 7.696875309709702
epoch : 21 ,,,,avg_loss : 7.50614417048638
epoch : 22 ,,,,avg_loss : 7.331343930

In [80]:
#similarity score calculation
top_k = 5
similarity_score.ar
for i in range(valid_size):
    similarity_cosine = similarity_score[i]
    top_rank_5 = similarity_cosine.argsort(similarity_cosine)

AttributeError: 'numpy.ndarray' object has no attribute 'ar'

In [None]:
# Report, for every validation word, the `top_k` vocabulary words that rank
# just below the best match in cosine similarity.
top_k = 5

for i in range(valid_size):
    # Word ids of row i ordered from lowest to highest similarity.
    ascending_ids = np.argsort(similarity_score[i])

    # Keep the top_k entries just before the final one. The final (highest)
    # entry is skipped — presumably the query word itself; confirm. The kept
    # ids stay in ascending-similarity order.
    keep = slice(text8.vocab_len - top_k - 1, text8.vocab_len - 1)
    neighbour_ids = ascending_ids[keep].tolist()

    # Build "Similar to <word>: w1, w2, ...," one neighbour at a time.
    similar_str = 'Similar to {0:}:'.format(text8.id2word[x_valid[i]])
    for neighbour_id in neighbour_ids:
        similar_str = '{0:} {1:},'.format(similar_str, text8.id2word[neighbour_id])

    print(similar_str)
