1. Restore model from checkpoint.<br>
2. Evaluate word embeddings by listing the closest words (by cosine similarity) for a set of probe words.

In [1]:
import tensorflow as tf
import numpy as np
import gensim
import random
import csv
import os
from sklearn.preprocessing import normalize



In [2]:
def sg_model_fn(features, labels, mode):
    """Skip-gram word2vec model function for ``tf.estimator.Estimator``.

    Looks up the embedding of ``features["word"]`` and, outside of PREDICT
    mode, scores it against ``labels`` with a noise-contrastive (NCE) loss.

    Notes:
        * ``vocab_size`` and ``emb_dim`` are read from module-level names that
          are not defined in the cells visible here; they must exist before
          the Estimator builds the graph.
        * The variable names ("embedding", "W", "b") are what checkpoints are
          keyed on, so they are kept as-is. ``tf.name_scope`` groups ops only;
          it does not prefix variables created through ``tf.get_variable``.

    Args:
        features: mapping with a ``"word"`` entry holding the ids to embed.
        labels: target ids; a trailing axis is added before they are passed to
            ``tf.nn.nce_loss``. Not used in PREDICT mode.
        mode: one of ``tf.estimator.ModeKeys``.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode:
        TRAIN -> loss + train_op, EVAL -> loss, PREDICT -> the looked-up
        embeddings under the ``"embedding"`` key.
    """
    with tf.name_scope("embeddings"):
        embeddings = tf.get_variable("embedding", shape=[vocab_size, emb_dim])
        embed = tf.nn.embedding_lookup(embeddings, features["word"])
        print("Embedded shape:", embed.shape)
    with tf.name_scope("weights"):
        nce_weights = tf.get_variable("W", shape=[vocab_size, emb_dim])
        print("nce_weights shape:", nce_weights.shape)
    with tf.name_scope("biases"):
        nce_biases = tf.get_variable("b", shape=[vocab_size])
        print("nce_biases shape:", nce_biases.shape)

    # Labels are not supplied when predicting, so the loss cannot be built;
    # return the embeddings instead of falling through and returning None.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={"embedding": embed})

    with tf.name_scope("loss"):
        loss = tf.reduce_mean(tf.nn.nce_loss(
            weights=nce_weights, biases=nce_biases,
            inputs=embed, labels=labels[:, None],
            num_sampled=5, num_classes=vocab_size))
    tf.summary.scalar("loss", loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # EVAL: previously this path returned None, which the Estimator rejects.
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss)

In [3]:
# Read the word <-> index vocabulary mapping.
# Each CSV row is used as (word, index): column 0 is the token, column 1 its integer id.
WORD_INDEX_PATH = r"D:\stocktwits_text\2018 all\word_index.csv"

index2word = {}
word2index = {}
# newline="" is required by the csv module so that quoted fields containing
# line breaks are parsed as one field instead of being split across rows.
with open(WORD_INDEX_PATH, "r", encoding="utf-8", newline="") as in_f:
    for row in csv.reader(in_f):
        if not row:
            continue  # csv.reader yields [] for a blank line; nothing to index
        word, index = row[0], int(row[1])
        index2word[index] = word
        word2index[word] = index
# The two sizes differ if the file repeats a word or an index.
print(len(index2word))
print(len(word2index))

3489467
3489467


In [4]:
def evaluate(model_dir, seed=None):
    """Print the nearest neighbours of a set of probe words for one checkpoint.

    Restores the "embedding" variable from the latest checkpoint in
    ``model_dir``, row-normalises it, and for every probe word prints the
    ``top_k`` other words with the highest dot product (i.e. cosine
    similarity on the normalised rows).

    Probe words are 8 random ids from [0, 100), 8 random ids from
    [1000, 1100), plus a fixed list of hand-picked tokens looked up in the
    module-level ``word2index``. Ids are translated back to text through the
    module-level ``index2word``.

    Args:
        model_dir: directory containing the Estimator checkpoints.
        seed: optional seed for the random probe words. Pass the same value
            when comparing several checkpoints so that they are evaluated on
            the same words. ``None`` (default) keeps the previous behaviour of
            drawing from the global ``random`` state.

    Raises:
        KeyError: if one of the fixed probe words is missing from
            ``word2index``.
    """
    # Copy model definition and classifier configuration from the training code
    strategy = tf.contrib.distribute.MirroredStrategy(num_gpus=2, prefetch_on_device=True)
    config = tf.estimator.RunConfig(
        save_summary_steps=200,
        train_distribute=strategy,
        save_checkpoints_secs=20 * 60,
        keep_checkpoint_max=3,
        model_dir=model_dir)
    classifier = tf.estimator.Estimator(model_fn=sg_model_fn,
                                        config=config)
    print("Lastest checkpoint:", classifier.latest_checkpoint())
    print("Current global step:", classifier.get_variable_value("global_step"))
    print("Variable names:", classifier.get_variable_names())

    fixed_words = ["bull", "bullish", "bear", "bearish", "breakout", "bottom", "bounce", "short", "rally", "strength",
                   "nice", "dip","puts","aapl","down","earnings", "dividend", "subpoena", "estimate", "volume", "risk",
                   ":)", "<rocket>", ":(", "<fire>"]
    fixed_examples = [word2index[w] for w in fixed_words]
    valid_size = 16               # Random set of words to evaluate similarity on.
    valid_window = 100
    top_k = 8                     # number of nearest neighbors

    embeddings = classifier.get_variable_value("embedding")

    # Pick 8 ids from each of the ranges [0, 100) and [1000, 1100); per the
    # original author's note, a lower id implies a more frequent word.
    rng = random if seed is None else random.Random(seed)
    head_ids = rng.sample(range(valid_window), valid_size // 2)
    mid_ids = rng.sample(range(1000, 1000 + valid_window), valid_size // 2)
    valid_examples = np.array(head_ids + mid_ids + fixed_examples, dtype=np.int64)

    norm_embed = normalize(embeddings)
    valid_embed = norm_embed[valid_examples]
    similarity = np.matmul(valid_embed, norm_embed.T)

    for scores, word_id in zip(similarity, valid_examples):
        # Take one extra candidate and drop the query word by identity rather
        # than assuming it is always ranked first: ties, duplicate vectors or
        # all-zero rows can place another word at rank 0.
        ranked = (-scores).argsort()[:top_k + 1]
        nearest = [idx for idx in ranked if idx != word_id][:top_k]
        neighbours = "".join(" %s," % index2word[idx] for idx in nearest)
        print('%12s:\t%s' % (index2word[word_id], neighbours))

In [5]:
# Nearest-neighbour report for the latest checkpoint stored in F:\w2v_model_dir.
evaluate(r"F:\w2v_model_dir") 

INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Using config: {'_model_dir': 'F:\\w2v_model_dir', '_tf_random_seed': None, '_save_summary_steps': 200, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 1200, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x0000026A1D837390>, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000026A1D8374E0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_

In [6]:
# Same nearest-neighbour report for the run stored in F:\w2v_model_dir_8192.
evaluate(r"F:\w2v_model_dir_8192") # batch_size = 8192

INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Using config: {'_model_dir': 'F:\\w2v_model_dir_8192', '_tf_random_seed': None, '_save_summary_steps': 200, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 1200, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x0000026A1D837320>, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000026A1D837518>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '

In [7]:
# Same nearest-neighbour report for the run stored in F:\w2v_model_dir_8192_shuffle.
# NOTE(review): presumably batch size 8192 with shuffled input, judging only
# from the directory name - confirm against the training notebook.
evaluate(r"F:\w2v_model_dir_8192_shuffle")

INFO:tensorflow:Initializing RunConfig with distribution strategies.
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Using config: {'_model_dir': 'F:\\w2v_model_dir_8192_shuffle', '_tf_random_seed': None, '_save_summary_steps': 200, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 1200, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x0000026A1D837518>, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000026A1D8379B0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief':