In [12]:
import numpy as np
import tensorflow as tf
import json
import collections
import math
import random

In [13]:
vocabulary_size = 500000
data_index = 0

In [14]:
data=np.load('gutenburg_as_ids_500000.npy',mmap_mode='r')

In [15]:
def generate_batch(batch_size, num_skips, skip_window):
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
  if data_index + span > len(data):
    data_index = 0
  buffer.extend(data[data_index:data_index + span])
  data_index += span
  for i in range(batch_size // num_skips):
    context_words = [w for w in range(span) if w != skip_window]
    words_to_use = random.sample(context_words, num_skips)
    for j, context_word in enumerate(words_to_use):
      batch[i * num_skips + j] = buffer[skip_window]
      labels[i * num_skips + j, 0] = buffer[context_word]
    if data_index == len(data):
      buffer.extend(data[0:span])
      data_index = span
    else:
      buffer.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels


In [16]:
batch_size = 128
embedding_size = 100  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

In [17]:
tf.reset_default_graph()
tf.set_random_seed(10)
def graph():

    with tf.device('/device:GPU:0'):
        x=tf.placeholder(tf.int32,shape=(batch_size),name="x")
        y=tf.placeholder(tf.int32,shape=(batch_size,1),name="y")
        embedding_matrix=tf.Variable(tf.random_uniform(shape=(vocabulary_size,embedding_size),maxval=1.0/embedding_size,minval=-1.0/embedding_size),name="embedding_matrix")
    
        softmax_weight=tf.Variable(tf.truncated_normal(shape=(vocabulary_size,embedding_size),mean=0,stddev=0.003),name="softmax_weight")
        softmax_bias=tf.Variable(tf.zeros(shape=(vocabulary_size,)),name="softmax_bias")

        embed=tf.nn.embedding_lookup(embedding_matrix,x)
        
        with tf.device('/device:CPU:0'):
            loss=tf.reduce_mean(tf.nn.nce_loss(weights=softmax_weight,biases=softmax_bias,labels=y,inputs=embed,num_sampled=num_sampled,num_classes=vocabulary_size,remove_accidental_hits=True,partition_strategy="div"))
        train_op=tf.train.GradientDescentOptimizer(0.01).minimize(loss)

        normalized_embedding=tf.nn.l2_normalize(embedding_matrix)
        
    return x,y,embedding_matrix,normalized_embedding,loss,train_op
    
    
    

In [18]:
x,y,embedding_matrix,normalized_embedding,loss,train_op=graph()

In [19]:
init=tf.global_variables_initializer()

In [20]:
saver=tf.train.Saver()

In [21]:
with tf.Session(graph=tf.get_default_graph()) as sess:
    init.run()
    for iteration in range((len(data)//batch_size)*2):
        batch_x,batch_y=generate_batch(batch_size,num_skips,skip_window=1)
        
        fd={x:batch_x,y:batch_y}        
        l,_=sess.run([loss,train_op],fd)
        
        if iteration%50000==0:
            print("Iteration",iteration,"Loss: ",l)
    nem=normalized_embedding.eval()
    np.save("normalized_embed",nem)
    saver.save(sess,"C:\\MLDatabases\\JNotebooks\\word embedding training\\models\\skipgram")
del data
    
    

Iteration 0 Loss:  364.606
Iteration 50000 Loss:  178.509
Iteration 100000 Loss:  100.238
Iteration 150000 Loss:  91.1575
Iteration 200000 Loss:  89.6562
Iteration 250000 Loss:  51.5277
Iteration 300000 Loss:  116.913
Iteration 350000 Loss:  78.2076
Iteration 400000 Loss:  55.2738
Iteration 450000 Loss:  67.8976
Iteration 500000 Loss:  70.5004
Iteration 550000 Loss:  64.7648
Iteration 600000 Loss:  50.826
Iteration 650000 Loss:  62.6653
Iteration 700000 Loss:  39.7162
Iteration 750000 Loss:  60.6925
Iteration 800000 Loss:  50.1791
Iteration 850000 Loss:  32.0693
Iteration 900000 Loss:  43.4146
Iteration 950000 Loss:  46.0973
Iteration 1000000 Loss:  26.8475
Iteration 1050000 Loss:  26.7666
Iteration 1100000 Loss:  33.9741
Iteration 1150000 Loss:  61.5399
Iteration 1200000 Loss:  32.8451
Iteration 1250000 Loss:  32.8994
Iteration 1300000 Loss:  23.8557
Iteration 1350000 Loss:  34.9174
Iteration 1400000 Loss:  12.5337
Iteration 1450000 Loss:  20.6105
Iteration 1500000 Loss:  48.4436
Iter

In [22]:
del data

NameError: name 'data' is not defined