In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

from process_data import process_data

  from ._conv import register_converters as _register_converters


In [2]:
# Sanity check: create a small 4x5 variable initialised uniformly in [-1, 1)
# and print its initial value.
embeddings2 = tf.Variable(
    tf.random_uniform(shape=[4, 5], minval=-1.0, maxval=1.0)
)
# The initializer op is added to the default graph before the session is opened.
init_op2 = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init_op2)
    print(sess.run(embeddings2))

[[-0.5130625   0.9315889   0.40724492  0.2455759   0.8564441 ]
 [ 0.5759721   0.46080518  0.7878475   0.6381359  -0.80485654]
 [ 0.5920913  -0.67697525  0.9465809   0.02209878  0.41966677]
 [ 0.6062257  -0.4364462  -0.70048285 -0.661649   -0.7818403 ]]


In [3]:
# --- Model dimensions -------------------------------------------------------
VOCAB_SIZE = 50000   # rows of the embedding / NCE weight matrices
EMBED_SIZE = 128     # dimension of the word embedding vectors

# --- Data / sampling --------------------------------------------------------
BATCH_SIZE = 128     # examples per training step
SKIP_WINDOW = 1      # the context window
NUM_SAMPLED = 64     # number of negative examples to sample

# --- Training schedule ------------------------------------------------------
LEARNING_RATE = 1.0
NUM_TRAIN_STEPS = 10000
SKIP_STEP = 2000     # how many steps to skip before reporting the loss

def word2vec(batch_gen):
    """Build the graph for the word2vec model and train it with NCE loss.

    Args:
        batch_gen: iterator; every ``next(batch_gen)`` must yield a
            ``(centers, targets)`` pair that can be fed to the ``[BATCH_SIZE]``
            int32 input placeholder and the ``[BATCH_SIZE, 1]`` int32 label
            placeholder respectively.

    Returns:
        None.

    Side effects:
        * Adds all ops/variables to the default TensorFlow graph.
        * Runs ``NUM_TRAIN_STEPS`` gradient-descent steps, printing the average
          loss every ``SKIP_STEP`` steps.
        * Writes TensorBoard summaries to ``./my_graph/no_frills/`` and
          checkpoints to ``./my_graph/word2vec<LEARNING_RATE>/``.
    """
    # Local import: only needed to make sure the checkpoint directory exists.
    import os

    # Step 1: define the placeholders for input and output.
    # Center words have to be int to work with embedding lookup.
    with tf.variable_scope("data"):
        train_input = tf.placeholder(tf.int32, [BATCH_SIZE])
        train_labels = tf.placeholder(tf.int32, [BATCH_SIZE, 1])

    # Step 2: define weights. In word2vec, it's actually the weights that we
    # care about: the word-embedding matrix E (vocab size x embed size),
    # initialized to random uniform in [-1, 1).
    with tf.variable_scope("inference"):
        embeddings = tf.Variable(tf.random_uniform([VOCAB_SIZE, EMBED_SIZE], -1.0, 1.0))

    # Step 3: define the inference.
    # Look up the embedding vectors of the input (center) words.
    embed = tf.nn.embedding_lookup(embeddings, train_input)

    # Step 4: construct variables for NCE loss.
    # nce_weight: vocab size x embed size, truncated normal with
    #             stddev = 1.0 / sqrt(EMBED_SIZE)
    # nce_bias:   vocab size, initialized to 0
    with tf.variable_scope("loss"):
        global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
        # The op-level seed makes this initializer's random sequence
        # repeatable; each new Session restarts the random number generator.
        nce_weight = tf.Variable(tf.truncated_normal([VOCAB_SIZE, EMBED_SIZE],
                                                     stddev=1.0 / (EMBED_SIZE ** 0.5),
                                                     seed=12))
        nce_bias = tf.Variable(tf.zeros([VOCAB_SIZE]))

        # NCE loss, averaged across the batch.
        # num_sampled is the number of negative samples drawn per batch;
        # num_true (left at its default) is the number of positive labels per
        # example; negatives come from nce_loss's default candidate sampler.
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weight,
                                             biases=nce_bias,
                                             labels=train_labels,
                                             inputs=embed,
                                             num_sampled=NUM_SAMPLED,
                                             num_classes=VOCAB_SIZE))

    # Step 5: define optimizer.
    # The optimizer increments global_step by 1 after every weight update.
    # Use the LEARNING_RATE constant (previously a hard-coded 1.0) so the value
    # embedded in the checkpoint directory name always matches the rate used.
    with tf.variable_scope("optimizer"):
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=LEARNING_RATE).minimize(loss, global_step=global_step)

    with tf.name_scope("summaries"):
        # Scalar summary: loss curve.
        tf.summary.scalar("loss", loss)
        # Histogram summary: distribution of the embedding values over time.
        tf.summary.histogram("embedding", embeddings)
        # Merge every summary into a single op so they are evaluated together.
        summary_op = tf.summary.merge_all()

    logdir = './my_graph/word2vec{}/'.format(LEARNING_RATE)
    # Saver.save may fail when the checkpoint's parent directory is missing,
    # so create it up front.
    os.makedirs(logdir, exist_ok=True)

    init_op = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(init_op)

        # Running sum used to report the average loss over the last SKIP_STEP steps.
        total_loss = 0.0
        # NOTE(review): event files go to a different directory than the
        # checkpoints in `logdir`; kept as-is so existing TensorBoard paths
        # keep working -- confirm whether this is intentional.
        writer = tf.summary.FileWriter('./my_graph/no_frills/', sess.graph)
        try:
            for index in range(NUM_TRAIN_STEPS):
                centers, targets = next(batch_gen)
                feed_dict = {train_input: centers, train_labels: targets}
                _, loss_batch = sess.run([optimizer, loss], feed_dict=feed_dict)
                total_loss += loss_batch
                if (index + 1) % SKIP_STEP == 0:
                    summary = sess.run(summary_op, feed_dict=feed_dict)
                    writer.add_summary(summary, global_step=index)
                    # Checkpoint file names are suffixed with the current
                    # value of the global_step variable.
                    saver.save(sess, logdir + "model.ckpt", global_step=global_step)
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / SKIP_STEP))
                    total_loss = 0.0
        finally:
            # Always flush and release the event file, even if training fails
            # part-way (e.g. batch_gen is exhausted).
            writer.close()

In [4]:
# Build the batch generator from the hyper-parameters defined above, then
# build the graph and train; word2vec pulls (centers, targets) pairs from it
# with next().
batch_gen = process_data(VOCAB_SIZE, BATCH_SIZE, SKIP_WINDOW)
# NOTE(review): sample values noted for one batch entry --
# center_batch[index]=5328, target_batch[index]=[55.]
# (presumably produced inside process_data; verify against that module).
word2vec(batch_gen)

vocab_size=50000, batch_size=128, skip_window=1
DATA_FOLDER:  ./data/  file_name:  text8.zip
Dataset ready
file_path: ./data/text8.zip
words length:17005207
words=originated
words=as
words=a
words=term
top 4 count
('the', 1061396)
('of', 593677)
('and', 416629)
after build_vocab
words=originated
words=as
words=a
words=term
type of index_words: <class 'list'>
after convert_words_to_index
words=3081
words=12
words=6
words=195
single_gen <generator object generate_sample at 0x000001B748A59A40>
Average loss at step 1999: 113.8
Average loss at step 3999:  52.9
Average loss at step 5999:  33.2
Average loss at step 7999:  23.3
Average loss at step 9999:  17.7
