<strong>Train skip grams word embeddings using multiple GPUs.</strong><br>
Hyperparameter settings based on:<br>
Mikolov, T., Sutskever, I., Chen, K., Corrado, G. S., & Dean, J. (2013). Distributed representations of words and phrases and their compositionality. In Advances in Neural Information Processing Systems (pp. 3111-3119).

In [None]:
import tensorflow as tf
import numpy as np
import csv
import os
import math

In [None]:
# Hyperparameters shared by the input pipeline and the model function.
# Number of input lines grouped into one batch by the tf.data pipeline.
batch_size = 4096
# Width of each embedding vector (second dimension of the embedding and NCE weight tables).
emb_dim = 300
# Number of rows in the embedding table; also passed to the NCE loss as num_classes.
vocab_size = 539952

In [None]:
# Directory holding the training text files that are fed to TextLineDataset.
in_dir = r"D:\stocktwits_text\2018 all\skipgrams_noneg"
# os.listdir returns entries in arbitrary order; sort the full paths so the
# order in which files are read is the same on every run and every machine.
file_names = sorted(os.path.join(in_dir, file_name) for file_name in os.listdir(in_dir))
# Bare expression: displays the resolved file list as the notebook cell output.
file_names

In [None]:
# Line counts of the training data (presumably one entry per input file;
# only their total is used below).
line_counts = [
    586828054,
    677051750,
    598760681,
    583052985,
    547069608,
    536936140,
    551959422,
    552639908,
    551994643,
    571124852,
]

# One step consumes one batch of lines; round up so a trailing partial batch counts.
steps = math.ceil(sum(line_counts) / batch_size)
# Two GPUs each process a batch per step, so the step count is halved.
steps_gpu = math.ceil(steps / 2)
# Wall-clock estimate from an observed rate of about 12 seconds per 100 steps.
minutes = int(steps_gpu / 100 * 12 / 60)

print("Total steps: {}".format(steps))
print("Total steps with 2 GPUs: {}".format(steps_gpu))
print("Total estimated minutes (GPU): {}".format(minutes))

In [None]:
def _parse_line(line):
    """Decode two-column integer CSV text into ``(features, labels)``.

    Args:
        line: String tensor of CSV text with two integer columns. The
            training pipelines call this on an already-batched tensor of lines.

    Returns:
        A tuple ``({"word": first_column}, second_column)`` in the form an
        Estimator ``input_fn`` is expected to produce.
    """
    # Integer defaults make decode_csv parse both columns as integers.
    record_defaults = [[0], [0]]
    word, context = tf.decode_csv(line, record_defaults)
    return {"word": word}, context

In [None]:
def train_input_fn_distributed():
    """Build the training dataset for use with a distribution strategy.

    Returns:
        A ``tf.data.Dataset`` yielding ``({"word": ...}, labels)`` batches.
        The dataset itself is returned (no iterator is created here).
    """
    lines = tf.data.TextLineDataset(file_names)
    # Shuffle raw lines within a sliding buffer of 100 batches' worth of lines.
    shuffled = lines.shuffle(batch_size*100)
    # Batch first so that CSV decoding runs once per batch rather than per line.
    batches = shuffled.batch(batch_size)
    parsed = batches.map(_parse_line, num_parallel_calls=2)
    # NOTE(review): prefetch is applied after batch(), so this buffer size is
    # counted in batches, not lines - confirm that is intended.
    return parsed.prefetch(batch_size)

In [None]:
def train_input_fn():
    """Build the training pipeline and return its next-element tensors.

    Returns:
        The ``({"word": ...}, labels)`` tensors produced by ``get_next()`` on
        a one-shot iterator over the shuffled, batched and parsed dataset.
    """
    lines = tf.data.TextLineDataset(file_names)
    # Shuffle raw lines within a sliding buffer of 100 batches' worth of lines.
    shuffled = lines.shuffle(batch_size*100)
    # Batch first so that CSV decoding runs once per batch rather than per line.
    batches = shuffled.batch(batch_size)
    parsed = batches.map(_parse_line, num_parallel_calls=2)
    # NOTE(review): prefetch follows batch(), so this buffer is counted in
    # batches, not lines - confirm that is intended.
    pipeline = parsed.prefetch(batch_size)
    return pipeline.make_one_shot_iterator().get_next()

In [None]:
def sg_model_fn(features, labels, mode):
    """Estimator ``model_fn`` for a skip-gram model trained with NCE loss.

    Args:
        features: Dict with a ``"word"`` entry holding the ids to look up in
            the embedding table.
        labels: Tensor of target ids; it is expanded with ``labels[:, None]``
            before being passed to the NCE loss. ``None`` in PREDICT mode.
        mode: One of the ``tf.estimator.ModeKeys`` values.

    Returns:
        A ``tf.estimator.EstimatorSpec``:
        * PREDICT - ``predictions={"embedding": <looked-up vectors>}``.
        * TRAIN - the NCE loss plus an Adam ``train_op``.
        * EVAL - the NCE loss only.
    """
    with tf.name_scope("embeddings"):
        embeddings = tf.get_variable("embedding", shape=[vocab_size, emb_dim])
        embed = tf.nn.embedding_lookup(embeddings, features["word"])
        print("Embedded shape:", embed.shape)

    # No labels exist when predicting, so return before the loss is built
    # (previously this path failed on `labels[:, None]` with labels=None).
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={"embedding": embed})

    with tf.name_scope("weights"):
        nce_weights = tf.get_variable("W", shape=[vocab_size, emb_dim])
        print("nce_weights shape:", nce_weights.shape)
    with tf.name_scope("biases"):
        nce_biases = tf.get_variable("b", shape=[vocab_size])
        print("nce_biases shape:", nce_biases.shape)

    with tf.name_scope("loss"):
        # nce_loss returns one value per example; average them over the batch.
        # Five negative classes are sampled per batch.
        loss = tf.reduce_mean(tf.nn.nce_loss(
            weights=nce_weights, biases=nce_biases,
            inputs=embed, labels=labels[:, None],
            num_sampled=5, num_classes=vocab_size))
    # The Estimator collects and writes registered summaries itself, so no
    # explicit merge_all() op is needed here.
    tf.summary.scalar("loss", loss)
    tf.summary.scalar("my_global_step", tf.train.get_global_step())

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer()
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # EVAL: report the loss only (previously the function fell through and
    # returned None, which the Estimator rejects).
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss)

In [None]:
# %%time
# # batch 65536; no device specification
# classifier = tf.estimator.Estimator(model_fn=sg_model_fn, model_dir=r"F:\tf_model_dir6")
# classifier.train(input_fn=train_input_fn, steps=1000)

In [None]:
# Mirror the model across two GPUs.
strategy = tf.contrib.distribute.MirroredStrategy(
    num_gpus=2,
    prefetch_on_device=True,
)

# Run configuration: where to write, how often to log and checkpoint.
config = tf.estimator.RunConfig(
    model_dir=r"F:\w2v_model_dir_4096_shuffle",
    train_distribute=strategy,
    save_summary_steps=500,
    # Checkpoints are scheduled by step count; the time-based alternative
    # would be save_checkpoints_secs = 20*60.
    save_checkpoints_steps=5000,
    keep_checkpoint_max=3,
)

classifier = tf.estimator.Estimator(model_fn=sg_model_fn, config=config)

In [None]:
%%time
# Train using the dataset-returning input function (the one intended for the
# distribution strategy). No `steps`/`max_steps` is passed and the input
# pipeline has no repeat(), so training presumably ends after one pass over
# the input files - confirm against the Estimator.train documentation.
classifier.train(input_fn=train_input_fn_distributed)