In [1]:
# word2vec model - skipgram
#---------------------------------------
#
# In this notebook we load a pre-tokenized corpus from a pickle file
# and fit a skip-gram word2vec model (trained with NCE loss) to get
# word vectors.
#
# From these word vectors we then build one vector per document by
# summing the vectors of its words, and compare documents with cosine
# similarity.

import tensorflow as tf
import numpy as np
import os
import pickle
import text_helpers_new as text_helpers

In [2]:
# Load the pickled corpus from a path relative to the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files that come from a trusted source.
with open("data0Collect.pkl", 'rb') as f:
    data0Collect = pickle.load(f)

In [3]:
# Collect the values of the loaded mapping into a list; each value is
# treated as one text (document) by the cells below.
texts = list(data0Collect.values())

In [None]:
# Peek at the first three texts to check what was loaded.
texts[:3]

In [6]:
# Reset the default graph through the public TF 1.x API so that re-running
# this cell starts from an empty graph instead of stacking duplicate ops.
tf.reset_default_graph()

# ============ settings for global variables =====================
# Make a saving directory if it doesn't exist
data_folder_name = 'temp3'
if not os.path.exists(data_folder_name):
    os.makedirs(data_folder_name)

# Declare model parameters
batch_size = 200            # Examples per training step (fixes the placeholder shapes).
vocabulary_size = 10000     # Rows in the embedding and NCE weight matrices.
generations = 100000        # Step cap. NOTE(review): the training loop currently only checks targetEpochs.
model_learning_rate = 0.01  # Step size for plain gradient descent.

embedding_size = 200   # Word embedding size

targetEpochs = 1       # Training stops once the batch generator has reported this many epochs.

num_sampled = int(batch_size/2)    # Number of negative examples to sample.
window_size = 2       # Context window size handed to the batch generator.

# Add checkpoints to training
save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 1000

# Texts must contain more than `window_size` tokens (i.e. at least 3 here).
texts = [x for x in texts  if len(x) > window_size ]

# Start a graph session
sess = tf.Session()

In [None]:
# Build our data set and dictionaries
print('Creating Dictionary')
word_dictionary = text_helpers.build_dictionary(texts, vocabulary_size, sentenceSplited=True)
# Reverse lookup: dictionary value (used below as embedding row index) -> word.
word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))
text_data = text_helpers.text_to_numbers(texts, word_dictionary, sentenceSplited=True)
text_data_indices = list(range(len(text_data)))

# Pick a few validation words at random. replace=False guarantees the words
# are distinct; the size is capped so a tiny vocabulary cannot raise.
num_validation_words = 4
vocabulary_indices = list(word_dictionary_rev.keys())
numSet = np.random.choice(vocabulary_indices,
                          min(num_validation_words, len(vocabulary_indices)),
                          replace=False)
sample_words = [word_dictionary_rev[x] for x in numSet]

# Get validation word keys
indexOf_sample_words = [word_dictionary[x] for x in sample_words]

print('Creating Model')
# Define Embeddings: one row of size `embedding_size` per vocabulary index.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# NCE loss parameters
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                                               stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Create data/target placeholders: one input index and one label index per example.
x_inputs = tf.placeholder(tf.int32, shape=[batch_size])
y_target = tf.placeholder(tf.int32, shape=[batch_size, 1])
tf_indexOf_sample_words = tf.constant(indexOf_sample_words, dtype=tf.int32)

# Lookup the word embedding of each input index.
embed = tf.nn.embedding_lookup(embeddings, x_inputs)

# Get loss from prediction
loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                     biases=nce_biases,
                                     labels=y_target,
                                     inputs=embed,
                                     num_sampled=num_sampled,
                                     num_classes=vocabulary_size))

# Create optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate=model_learning_rate)
train_step = optimizer.minimize(loss)

# Cosine similarity between the validation words and every word:
# rows are scaled to unit length, so a dot product is the cosine.
# NOTE(review): `keep_dims` is the older spelling of `keepdims`; switch once
# the minimum supported TensorFlow version is confirmed.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
sample_word_embeddings = tf.nn.embedding_lookup(normalized_embeddings, tf_indexOf_sample_words)
similarity = tf.matmul(sample_word_embeddings, normalized_embeddings, transpose_b=True)

# Create model saving operation (only the word embeddings are checkpointed).
saver = tf.train.Saver({"embeddings": embeddings})

#Add variable initializer.
init = tf.global_variables_initializer()
sess.run(init)

In [7]:
# Train the skip-gram word2vec model.
print('Starting Training')
loss_vec, loss_x_vec = [], []

# Bookkeeping that is threaded through the batch generator on every step.
currentEpoch = 0
currentIndexPosition = -1
np.random.shuffle(text_data_indices)
previousBatch, previousLabel = [], []
method = 'skip_gram'
generation = 0

# Keep stepping until the batch generator has reported `targetEpochs` epochs.
while currentEpoch < targetEpochs:
    (batch_inputs, batch_labels, previousBatch, previousLabel,
     currentIndexPosition, epochUp) = text_helpers.generate_batch_data(
        previousBatch, previousLabel, currentIndexPosition, text_data_indices,
        text_data, batch_size, window_size, method)
    feed_dict = {x_inputs: batch_inputs, y_target: batch_labels}

    # One gradient-descent update on this batch.
    sess.run(train_step, feed_dict=feed_dict)

    generation += 1
    currentEpoch += epochUp

    # Periodically re-evaluate the loss on the batch just trained on and log it.
    if generation % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(generation)
        progress_template = 'Loss at step {} : {} / current epochs : {} / current sentence : {}'
        print(progress_template.format(generation, loss_val, currentEpoch, currentIndexPosition))

    # Validation: for each sample word, print its 5 most similar words.
    if generation % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        for j, sample_index in enumerate(indexOf_sample_words):
            valid_word = word_dictionary_rev[sample_index]
            top_k = 5 # number of nearest neighbors
            # Position 0 of the descending sort is skipped (expected to be the word itself).
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            neighbour_words = [word_dictionary_rev[nearest[k]] for k in range(top_k)]
            log_str = "Nearest to {}:".format(valid_word) + ''.join(' {},'.format(w) for w in neighbour_words)
            print(log_str)

    # Periodically persist the vocabulary dictionary and an embeddings checkpoint.
    if generation % save_embeddings_every == 0:
        dictionary_path = os.path.join(data_folder_name,'data0analyzed.pkl')
        with open(dictionary_path, 'wb') as f:
            pickle.dump(word_dictionary, f)

        model_checkpoint_path = os.path.join(os.getcwd(), data_folder_name, 'word2vec_embeddings.ckpt')
        save_path = saver.save(sess, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))

Creating Dictionary
Creating Model
Starting Training
Loss at step 1000 : 272.0592956542969 / current epochs : 0 / current sentence : 108
Loss at step 2000 : 242.02508544921875 / current epochs : 0 / current sentence : 230
Loss at step 3000 : 221.26219177246094 / current epochs : 0 / current sentence : 334
Loss at step 4000 : 201.0927276611328 / current epochs : 0 / current sentence : 458
Loss at step 5000 : 182.1846160888672 / current epochs : 0 / current sentence : 585
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 3292/NNP, 4127/NNG, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 17798/NNG, 4883/NNG,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 2740/NNG, 1408/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 10961/VV,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 6000 : 183.94854736328125 / current epochs : 0 / current sentence : 711
Loss at step 7000 : 175.04400634765625 / current epochs : 0 / current sentence : 841
Loss 

Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 56000 : 36.29741668701172 / current epochs : 0 / current sentence : 6585
Loss at step 57000 : 13.644789695739746 / current epochs : 0 / current sentence : 6687
Loss at step 58000 : 14.221015930175781 / current epochs : 0 / current sentence : 6816
Loss at step 59000 : 20.086627960205078 / current epochs : 0 / current sentence : 6926
Loss at step 60000 : 30.820415496826172 / current epochs : 0 / current sentence : 7067
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 367/NNG, 2653/XR, 3023/NNG, 9179/NNG,
Nearest to 1400/NNG: 7349/NNG, 2084/VV, 1535/SN, 1332/NNG, 1784/MAG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 19067/NNG, 5253/NNG, 10961/VV,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 61000 : 17.673606872558594 / current epochs : 0 / current sentence : 7171
Loss at step 62000 : 20.069812774658203 / current epochs : 0 / current

Loss at step 110000 : 16.80658721923828 / current epochs : 0 / current sentence : 12872
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 367/NNG, 2653/XR, 3023/NNG, 9179/NNG,
Nearest to 1400/NNG: 7349/NNG, 2084/VV, 1535/SN, 1332/NNG, 4847/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 19067/NNG, 5253/NNG, 10961/VV,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 111000 : 13.5237455368042 / current epochs : 0 / current sentence : 12987
Loss at step 112000 : 6.28572940826416 / current epochs : 0 / current sentence : 13115
Loss at step 113000 : 11.377497673034668 / current epochs : 0 / current sentence : 13223
Loss at step 114000 : 8.598363876342773 / current epochs : 0 / current sentence : 13334
Loss at step 115000 : 9.061959266662598 / current epochs : 0 / current sentence : 13456
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 367/NNG, 2653/XR, 30

Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 161000 : 10.265641212463379 / current epochs : 0 / current sentence : 18806
Loss at step 162000 : 11.183119773864746 / current epochs : 0 / current sentence : 18925
Loss at step 163000 : 8.337928771972656 / current epochs : 0 / current sentence : 19043
Loss at step 164000 : 8.450616836547852 / current epochs : 0 / current sentence : 19153
Loss at step 165000 : 12.35892105102539 / current epochs : 0 / current sentence : 19280
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 367/NNG, 2653/XR, 3023/NNG, 9179/NNG,
Nearest to 1400/NNG: 7349/NNG, 4847/NNG, 2084/VV, 1705/VV, 1332/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 19067/NNG, 5253/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 166000 : 10.564282417297363 / current epochs : 0 / current sentence : 19401
Loss at step 167000 : 10.70258903503418 / current epochs : 0

Loss at step 214000 : 7.865765571594238 / current epochs : 0 / current sentence : 24972
Loss at step 215000 : 7.070885181427002 / current epochs : 0 / current sentence : 25075
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 11190/NNG, 3292/NNP,
Nearest to 9822/NNG: 8792/NNG, 367/NNG, 2653/XR, 3023/NNG, 9179/NNG,
Nearest to 1400/NNG: 7349/NNG, 4847/NNG, 1705/VV, 1332/NNG, 1408/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 19067/NNG, 5253/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 216000 : 6.776515007019043 / current epochs : 0 / current sentence : 25195
Loss at step 217000 : 6.530062198638916 / current epochs : 0 / current sentence : 25306
Loss at step 218000 : 6.511025428771973 / current epochs : 0 / current sentence : 25426
Loss at step 219000 : 6.974463939666748 / current epochs : 0 / current sentence : 25542
Loss at step 220000 : 7.353188991546631 / current epochs : 0 / current sentence : 25660
Nearest to 19517/NNG: 2828/NNG, 193

Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 266000 : 6.225986480712891 / current epochs : 0 / current sentence : 31028
Loss at step 267000 : 7.175601959228516 / current epochs : 0 / current sentence : 31145
Loss at step 268000 : 6.605291843414307 / current epochs : 0 / current sentence : 31275
Loss at step 269000 : 5.867856502532959 / current epochs : 0 / current sentence : 31403
Loss at step 270000 : 5.662939071655273 / current epochs : 0 / current sentence : 31512
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 3023/NNG,
Nearest to 1400/NNG: 7349/NNG, 4847/NNG, 1705/VV, 1408/NNG, 1332/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 19067/NNG, 5253/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 271000 : 6.460961818695068 / current epochs : 0 / current sentence : 31624
Loss at step 272000 : 6.7582879066467285 / current epochs : 0 

Loss at step 319000 : 5.728068828582764 / current epochs : 0 / current sentence : 37301
Loss at step 320000 : 6.1556243896484375 / current epochs : 0 / current sentence : 37424
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 3023/NNG,
Nearest to 1400/NNG: 7349/NNG, 4847/NNG, 1705/VV, 1408/NNG, 1332/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 321000 : 6.074635028839111 / current epochs : 0 / current sentence : 37552
Loss at step 322000 : 5.777130603790283 / current epochs : 0 / current sentence : 37674
Loss at step 323000 : 6.533941745758057 / current epochs : 0 / current sentence : 37794
Loss at step 324000 : 7.6979804039001465 / current epochs : 0 / current sentence : 37913
Loss at step 325000 : 5.861452102661133 / current epochs : 0 / current sentence : 38032
Nearest to 19517/NNG: 2828/NNG, 1

Loss at step 371000 : 6.3558197021484375 / current epochs : 0 / current sentence : 43488
Loss at step 372000 : 5.946108341217041 / current epochs : 0 / current sentence : 43594
Loss at step 373000 : 6.329118728637695 / current epochs : 0 / current sentence : 43714
Loss at step 374000 : 5.902500152587891 / current epochs : 0 / current sentence : 43828
Loss at step 375000 : 5.909588813781738 / current epochs : 0 / current sentence : 43954
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 3023/NNG,
Nearest to 1400/NNG: 7349/NNG, 4847/NNG, 1705/VV, 1408/NNG, 1332/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 376000 : 5.6472368240356445 / current epochs : 0 / current sentence : 44086
Loss at step 377000 : 5.8059210777282715 / current epochs : 0 / current sentence : 44186
Loss at step 378000 : 6.26024770

Loss at step 425000 : 5.649572849273682 / current epochs : 0 / current sentence : 49841
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 3023/NNG,
Nearest to 1400/NNG: 7349/NNG, 4847/NNG, 1705/VV, 1408/NNG, 1332/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 426000 : 5.6890435218811035 / current epochs : 0 / current sentence : 49955
Loss at step 427000 : 6.539106369018555 / current epochs : 0 / current sentence : 50083
Loss at step 428000 : 5.98325252532959 / current epochs : 0 / current sentence : 50194
Loss at step 429000 : 5.926799774169922 / current epochs : 0 / current sentence : 50306
Loss at step 430000 : 5.813455581665039 / current epochs : 0 / current sentence : 50408
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 

Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 476000 : 5.471065521240234 / current epochs : 0 / current sentence : 55844
Loss at step 477000 : 6.171506404876709 / current epochs : 0 / current sentence : 55957
Loss at step 478000 : 5.335268497467041 / current epochs : 0 / current sentence : 56071
Loss at step 479000 : 5.679461479187012 / current epochs : 0 / current sentence : 56186
Loss at step 480000 : 6.001565933227539 / current epochs : 0 / current sentence : 56302
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 4883/NNG,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 1408/NNG, 1332/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 481000 : 5.085681915283203 / current epochs : 0 / current sentence : 56411
Loss at step 482000 : 5.8595147132873535 / current epochs : 0 

Loss at step 529000 : 5.703823089599609 / current epochs : 0 / current sentence : 62101
Loss at step 530000 : 5.462324142456055 / current epochs : 0 / current sentence : 62225
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 4883/NNG,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 1408/NNG, 2740/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 531000 : 5.559506893157959 / current epochs : 0 / current sentence : 62354
Loss at step 532000 : 5.477602481842041 / current epochs : 0 / current sentence : 62465
Loss at step 533000 : 5.544808387756348 / current epochs : 0 / current sentence : 62595
Loss at step 534000 : 4.860015869140625 / current epochs : 0 / current sentence : 62719
Loss at step 535000 : 5.623081207275391 / current epochs : 0 / current sentence : 62834
Nearest to 19517/NNG: 2828/NNG, 193

Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 581000 : 5.510408878326416 / current epochs : 0 / current sentence : 68222
Loss at step 582000 : 5.739825248718262 / current epochs : 0 / current sentence : 68332
Loss at step 583000 : 5.723079204559326 / current epochs : 0 / current sentence : 68436
Loss at step 584000 : 5.37954044342041 / current epochs : 0 / current sentence : 68541
Loss at step 585000 : 5.475448131561279 / current epochs : 0 / current sentence : 68653
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 10513/MM,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 1408/NNG, 2740/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 586000 : 5.4882707595825195 / current epochs : 0 / current sentence : 68769
Loss at step 587000 : 5.559427261352539 / current epochs : 0 /

Loss at step 634000 : 5.178554534912109 / current epochs : 0 / current sentence : 74363
Loss at step 635000 : 5.528728008270264 / current epochs : 0 / current sentence : 74480
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 10513/MM,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 1408/NNG, 2740/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 636000 : 5.407294750213623 / current epochs : 0 / current sentence : 74592
Loss at step 637000 : 5.621829032897949 / current epochs : 0 / current sentence : 74709
Loss at step 638000 : 5.738527774810791 / current epochs : 0 / current sentence : 74822
Loss at step 639000 : 5.500565052032471 / current epochs : 0 / current sentence : 74930
Loss at step 640000 : 5.644519805908203 / current epochs : 0 / current sentence : 75037
Nearest to 19517/NNG: 2828/NNG, 193

Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 686000 : 5.730341911315918 / current epochs : 0 / current sentence : 80306
Loss at step 687000 : 5.660640239715576 / current epochs : 0 / current sentence : 80423
Loss at step 688000 : 5.456586837768555 / current epochs : 0 / current sentence : 80554
Loss at step 689000 : 5.402646541595459 / current epochs : 0 / current sentence : 80665
Loss at step 690000 : 5.6796875 / current epochs : 0 / current sentence : 80772
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 10513/MM,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 1408/NNG, 2740/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 691000 : 5.5713791847229 / current epochs : 0 / current sentence : 80902
Loss at step 692000 : 5.367216110229492 / current epochs : 0 / current s

Loss at step 739000 : 5.449301719665527 / current epochs : 0 / current sentence : 86588
Loss at step 740000 : 5.3905348777771 / current epochs : 0 / current sentence : 86701
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 9179/NNG, 10513/MM,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 1408/NNG, 2740/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 741000 : 5.509417533874512 / current epochs : 0 / current sentence : 86809
Loss at step 742000 : 5.618802547454834 / current epochs : 0 / current sentence : 86931
Loss at step 743000 : 5.682900428771973 / current epochs : 0 / current sentence : 87043
Loss at step 744000 : 5.588076114654541 / current epochs : 0 / current sentence : 87159
Loss at step 745000 : 5.412570953369141 / current epochs : 0 / current sentence : 87250
Nearest to 19517/NNG: 2828/NNG, 19389

Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 791000 : 5.570853233337402 / current epochs : 0 / current sentence : 92511
Loss at step 792000 : 5.232716083526611 / current epochs : 0 / current sentence : 92632
Loss at step 793000 : 5.69318962097168 / current epochs : 0 / current sentence : 92753
Loss at step 794000 : 5.3804802894592285 / current epochs : 0 / current sentence : 92868
Loss at step 795000 : 5.494643688201904 / current epochs : 0 / current sentence : 92973
Nearest to 19517/NNG: 2828/NNG, 19389/NNG, 4127/NNG, 3292/NNP, 11190/NNG,
Nearest to 9822/NNG: 8792/NNG, 2653/XR, 367/NNG, 10513/MM, 9179/NNG,
Nearest to 1400/NNG: 4847/NNG, 7349/NNG, 1705/VV, 1408/NNG, 2740/NNG,
Nearest to 16277/NNG: 11830/NNG, 30581/NNG, 5253/NNG, 19067/NNG, 1706/NNG,
Model saved in file: E:\kakao\temp3\word2vec_embeddings.ckpt
Loss at step 796000 : 5.017559051513672 / current epochs : 0 / current sentence : 93064
Loss at step 797000 : 5.551601409912109 / current epochs : 0 /

In [None]:
# Inspect the embedding variable object (this shows the variable itself, not its trained values).
embeddings

In [None]:
# Look up which word is stored under index 1.
word_dictionary_rev[1]

In [None]:
# Slice row 1 of the embedding variable.
# NOTE(review): with tf.Session this looks like it yields a symbolic tensor, not numbers;
# sess.run would be needed to see the values — confirm intent.
embeddings[1]

In [None]:
# Dot product of every unit-normalized embedding with the one at index 1,
# i.e. the cosine similarity of each word to word 1 (result has one column).
gx = tf.matmul(normalized_embeddings, normalized_embeddings[1:2], transpose_b=True)

In [None]:
# Evaluate the similarity column in the session to get numeric values.
sess.run(gx)

In [None]:
# Rank all word indices by their similarity to word 1, most similar first.
# Each entry is (word index, one-element score row).
word1_scores = sess.run(gx)
tt = sorted(enumerate(word1_scores), key=lambda pair: pair[1][0], reverse=True)

In [None]:
# Show the ranked (index, score) pairs.
# NOTE(review): this prints one entry per embedding row; consider tt[:10].
tt

In [None]:
# Look up the word stored under index 184
# (presumably taken from the ranking shown above — confirm).
word_dictionary_rev[184]

In [None]:
# Inspect the embedding variable object again (repeats an earlier display cell).
embeddings

In [None]:
# Zero accumulator for a document vector, sized from the configured
# embedding width rather than a repeated literal.
vec = np.zeros(embedding_size)

In [None]:
# Show the first document as produced by text_to_numbers (its entries are used as embedding row indices below).
text_data[0]

In [None]:
# Add the embedding of every word in the first document to `vec`.
# The variable is evaluated once with sess.run so that plain NumPy rows are
# accumulated; indexing the tf.Variable directly would produce symbolic
# tensors instead of numbers.
embedding_values = sess.run(embeddings)
for i in text_data[0]:
    vec += embedding_values[i]

In [None]:
# Show the accumulated vector.
vec

In [None]:
# Number of entries in the first document.
len(text_data[0])

In [None]:
# Build one vector per document by summing the embeddings of its words.
# The embedding matrix is fetched inside this cell so it runs on a fresh
# kernel without relying on `em` having been defined elsewhere.
em = sess.run(embeddings)

allTexts = []
for c, t in enumerate(text_data):
    # Integer index array; a document with no entries yields an all-zero vector.
    word_ids = np.asarray(t, dtype=np.int64)
    # Accumulate in float64, sized by the matrix width instead of a literal.
    vec = em[word_ids].sum(axis=0, dtype=np.float64)
    allTexts.append(vec)

    # Progress marker every 100 documents.
    if c % 100 == 0:
        print(c)

In [None]:
# Evaluate the embedding variable once so its values can be indexed without further session calls.
em = sess.run(embeddings)

In [None]:
# Check the dimensions of the evaluated embedding matrix.
em.shape

In [None]:
# Stack the per-document vectors into a single array (rebinds `allTexts`).
allTexts = np.array(allTexts)

In [None]:
from sklearn.preprocessing import normalize

In [None]:
# Rescale the document vectors with scikit-learn's normalize, called with its default settings
# (presumably unit L2 norm per row — confirm against the installed version). Rebinds `allTexts`.
allTexts = normalize(allTexts)

In [None]:
# Check the dimensions of the normalized document matrix.
allTexts.shape

In [None]:
# Dot product of every document vector with document 1; on normalized
# rows this is their cosine similarity.
reference_doc = allTexts[1]
coss = np.dot(allTexts, reference_doc)

In [None]:
# Rank documents by their similarity score to document 1, most similar first.
# Do not name this `tf`: that would shadow the TensorFlow module alias used
# by the cells above and break any re-run of them in the same kernel.
doc_ranking = sorted(enumerate(coss), key=lambda x: x[1], reverse=True)
doc_ranking