In [1]:
import tensorflow as tf
import os
import collections
import random
import numpy as np
from tensorflow.contrib.tensorboard.plugins import projector

In [2]:
# Path of the text corpus; read line by line through read_data(filename).
filename = "./enwik9_cleaned.txt"
# Passed to build_dataset() as n_words and to SkipGramModel as vocab_size.
vocabulary_size = 50000

In [3]:
def read_data(filename, encoding='utf-8'):
    """Lazily yield every line of a text file with surrounding whitespace stripped.

    Args:
        filename: path of the text file to read.
        encoding: text encoding used to decode the file. Given explicitly
            (default UTF-8) so the result does not depend on the platform's
            locale encoding.

    Yields:
        str: each line in file order, stripped of leading/trailing whitespace
        (blank lines are yielded as empty strings).

    Bytes that cannot be decoded are replaced with U+FFFD instead of raising.
    The file is opened on the first iteration and closed when the generator is
    exhausted or garbage-collected.
    """
    with open(filename, encoding=encoding, errors='replace') as fin:
        for line in fin:
            yield line.strip()

In [4]:
def build_dataset(words, n_words):
    """Encode a word stream as integer ids over a frequency-capped vocabulary.

    Args:
        words: zero-argument callable returning a fresh iterable of words.
            It is called twice: once to count frequencies, once to encode.
        n_words: vocabulary size including the 'UNK' entry; the
            ``n_words - 1`` most frequent words are kept.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        data is the list of ids in stream order (0 for words outside the
        vocabulary), count is ``[['UNK', n_unknown], (word, freq), ...]``,
        dictionary maps word -> id and reversed_dictionary maps id -> word.
    """
    frequencies = collections.Counter(words())
    count = [['UNK', -1]] + frequencies.most_common(n_words - 1)

    # Ids are handed out in order of appearance in `count`, so 'UNK' is 0.
    dictionary = {}
    for token, _freq in count:
        dictionary[token] = len(dictionary)

    unk_id = 0  # dictionary['UNK']
    data = [dictionary.get(token, unk_id) for token in words()]

    # Replace the -1 placeholder with the number of ids that ended up as 0.
    count[0][1] = data.count(unk_id)

    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary


In [5]:
# build_dataset() calls its first argument twice (once to count words, once to
# encode them), so it is given a lambda that starts a new read_data() pass each
# time instead of a single generator.
data, count, dictionary, reverse_dictionary = build_dataset(
    lambda: read_data(filename), vocabulary_size)
# Sanity check: the top of the frequency table and the first ten encoded ids
# next to the words they map back to.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])


Most common words (+UNK) [['UNK', 3790053], ('the', 6473971), ('of', 3695441), ('and', 2502325), ('in', 2081782)]
Sample data [19649, 7677, 0, 0, 0, 14049, 13031, 13031, 2966, 11] ['aaa', 'algeria', 'UNK', 'UNK', 'UNK', 'ada', 'anarchism', 'anarchism', 'originated', 'as']


In [6]:
def sample(index_words, window_size):
    """Endlessly yield (center, target) skip-gram pairs from a sequence of word ids.

    For every position a context radius is drawn uniformly from
    ``1..window_size``; the words up to that many positions before and after
    the center are yielded as targets (before-context first). After the last
    position the generator starts over from the beginning, so it never
    terminates on its own.

    Args:
        index_words: sequence of word ids supporting ``len()`` and slicing.
        window_size: maximum context radius (must be >= 1).

    Yields:
        tuple: ``(center_id, target_id)``.

    Raises:
        ValueError: on the first ``next()`` if ``index_words`` has fewer than
            two items. No pair can be formed in that case, and looping would
            spin forever without yielding anything.
    """
    if len(index_words) < 2:
        raise ValueError(
            "sample() needs at least two words to form a (center, target) pair, "
            "got {}".format(len(index_words)))

    print("Generating samples...")
    while True:
        for index, center in enumerate(index_words):
            context_window_size = random.randint(1, window_size)

            # Words before the center; max() keeps the start from going negative,
            # which would otherwise index from the end of the sequence.
            for target in index_words[max(0, index - context_window_size):index]:
                yield center, target

            # Words after the center; slicing already clamps at the sequence end.
            for target in index_words[index + 1:index + context_window_size + 1]:
                yield center, target
        print("Completed sampling one full round of data")

In [7]:
def generate_batch(index_words, batch_size, window_size):
    """Endlessly yield training batches of skip-gram pairs as NumPy arrays.

    Pairs are pulled from ``sample(index_words, window_size)`` and packed
    ``batch_size`` at a time. New arrays are allocated for every batch, so a
    yielded batch is never overwritten by a later one.

    Args:
        index_words: sequence of word ids, forwarded to ``sample``.
        batch_size: number of (center, target) pairs per batch.
        window_size: maximum context radius, forwarded to ``sample``.

    Yields:
        tuple: ``(center_batch, target_batch)`` where ``center_batch`` is an
        int32 array of shape ``(batch_size,)`` and ``target_batch`` is an
        int32 array of shape ``(batch_size, 1)``.
    """
    sample_iter = sample(index_words, window_size)

    print("Generating batches...")
    while True:
        center_batch = np.zeros(batch_size, dtype=np.int32)
        target_batch = np.zeros((batch_size, 1), dtype=np.int32)

        for index in range(batch_size):
            # target_batch[index] is a length-1 row; the scalar target fills it.
            center_batch[index], target_batch[index] = next(sample_iter)

        yield center_batch, target_batch

In [8]:
# Number of (center, target) pairs per batch; given to both SkipGramModel and generate_batch().
batch_size = 128
# Passed to SkipGramModel as embed_size (second dimension of the embedding matrix).
embedding_size = 128
# Passed to generate_batch() as window_size, the upper bound of the context radius drawn in sample().
skip_window = 10
# Passed to SkipGramModel and on to tf.nn.nce_loss as num_sampled.
num_sampled = 64
# Passed to SkipGramModel for its GradientDescentOptimizer; also the default `lr` of train_model().
learning_rate = 1.0
# Number of training steps requested from train_model().
num_train_steps = 3863999
# Passed to train_model() as weights_fld, which train_model() does not currently use.
log_folder = "processed/"
# Default interval (in steps) at which train_model() prints the average loss and saves a checkpoint.
skip_step = 2000

In [9]:
class SkipGramModel:
    """Builds the TensorFlow graph of a skip-gram word-embedding model trained with NCE loss.

    The constructor only records hyper-parameters and creates the global-step
    counter; call ``build_graph()`` to add placeholders, variables, the loss,
    the optimizer and the summaries to the default graph.
    """

    def __init__(self, vocab_size, embed_size, batch_size, num_sampled, learning_rate):
        """Store the hyper-parameters and create the non-trainable step counter."""
        self.vocab_size, self.embed_size = vocab_size, embed_size
        self.batch_size, self.num_sampled = batch_size, num_sampled
        self.lr = learning_rate
        # Handed to minimize() in _create_optimizer so every training step increments it.
        self.global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name="global_step")

    def _create_placeholders(self):
        """Create the int32 feed placeholders for center-word and target-word ids."""
        centers_shape = [self.batch_size]
        targets_shape = [self.batch_size, 1]
        with tf.name_scope("inputs"):
            self.center_words = tf.placeholder(tf.int32, shape=centers_shape, name="center_words")
            self.target_words = tf.placeholder(tf.int32, shape=targets_shape, name="target_words")

    def _create_embedding(self):
        """Create the [vocab_size, embed_size] embedding matrix, uniform in [-1, 1)."""
        with tf.name_scope("embed"):
            initial_values = tf.random_uniform([self.vocab_size, self.embed_size], -1.0, 1.0)
            self.embed_matrix = tf.Variable(initial_values, name="embed_matrix")

    def _create_loss(self):
        """Look up the center-word embeddings and define the mean NCE loss over the batch."""
        with tf.name_scope("loss"):
            embed = tf.nn.embedding_lookup(self.embed_matrix, self.center_words, name="embed")

            weight_init = tf.truncated_normal([self.vocab_size, self.embed_size],
                                              stddev=1.0 / (self.embed_size ** 0.5))
            nce_weight = tf.Variable(weight_init, name="nce_weight")
            nce_bias = tf.Variable(tf.zeros([self.vocab_size]), name="nce_bias")

            nce_losses = tf.nn.nce_loss(weights=nce_weight,
                                        biases=nce_bias,
                                        labels=self.target_words,
                                        inputs=embed,
                                        num_sampled=self.num_sampled,
                                        num_classes=self.vocab_size)
            self.loss = tf.reduce_mean(nce_losses, name="loss")

    def _create_optimizer(self):
        """Define the plain gradient-descent training op that also advances global_step."""
        sgd = tf.train.GradientDescentOptimizer(self.lr)
        self.optimizer = sgd.minimize(self.loss, global_step=self.global_step)

    def _create_summaries(self):
        """Register scalar and histogram summaries of the loss and merge all summaries."""
        with tf.name_scope("summaries"):
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("histogram_loss", self.loss)
            # merge_all() collects every summary registered so far, not just the two above.
            self.summary_op = tf.summary.merge_all()

    def build_graph(self):
        """Run every graph-construction stage in dependency order."""
        stages = (
            self._create_placeholders,
            self._create_embedding,
            self._create_loss,
            self._create_optimizer,
            self._create_summaries,
        )
        for stage in stages:
            stage()

In [10]:
def train_model(model, batch_gen, num_train_steps, weights_fld, skip_step=skip_step, lr=learning_rate):
    """Train `model`, checkpoint periodically, then export its embeddings for the TensorBoard projector.

    Training resumes from the latest checkpoint under ``checkpoints/`` if one
    exists, and runs ``num_train_steps`` further steps counted from the
    restored global step.

    Args:
        model: object with a built graph exposing ``global_step``,
            ``center_words``, ``target_words``, ``loss``, ``optimizer``,
            ``summary_op`` and ``embed_matrix``.
        batch_gen: iterator yielding ``(centers, targets)`` batches that are fed
            to ``model.center_words`` / ``model.target_words``.
        num_train_steps: number of optimizer steps to run in this call.
        weights_fld: currently unused. Outputs go to the hard-coded
            ``checkpoints/``, ``improved_graph/`` and ``processed/`` paths.
        skip_step: every this many steps the average loss is printed and a
            checkpoint is saved.
        lr: only used to name the summary directory
            (``improved_graph/lr<lr>``); the optimizer's learning rate is fixed
            when the model is built.
    """
    saver = tf.train.Saver()

    # Create the checkpoint directory up front so the periodic saver.save()
    # below never has to write into a missing parent directory.
    os.makedirs('checkpoints', exist_ok=True)

    with tf.Session() as sess:
        print("Global variables initializing...")
        sess.run(tf.global_variables_initializer())

        # If a checkpoint exists, restore from the checkpoint.
        ckpt = tf.train.get_checkpoint_state(os.path.dirname("checkpoints/checkpoint"))
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        total_loss = 0.0

        writer = tf.summary.FileWriter('improved_graph/lr' + str(lr), sess.graph)
        try:
            # Non-zero when a checkpoint was restored above.
            initial_step = model.global_step.eval()

            print("Start training...")
            for index in range(initial_step, initial_step + num_train_steps):
                centers, targets = next(batch_gen)
                feed_dict = {
                    model.center_words: centers,
                    model.target_words: targets,
                }
                loss_batch, _, summary = sess.run([model.loss, model.optimizer, model.summary_op],
                                                 feed_dict=feed_dict)
                writer.add_summary(summary, global_step=index)
                total_loss += loss_batch
                if (index + 1) % skip_step == 0:
                    print('Average loss at step {}: {:5.1f}'.format(index, total_loss / skip_step))
                    total_loss = 0.0
                    saver.save(sess, 'checkpoints/checkpoint', index)
        finally:
            # Flush and release the event file even if training is interrupted.
            writer.close()

        final_embed_matrix = sess.run(model.embed_matrix)

        # The projector needs a variable here; a constant does not work and
        # model.embed_matrix cannot be reused, so copy the values into a new one.
        embedding_var = tf.Variable(final_embed_matrix, name='embedding')
        sess.run(embedding_var.initializer)

        config = projector.ProjectorConfig()
        summary_writer = tf.summary.FileWriter('processed')
        try:
            # add embedding to the config file
            embedding = config.embeddings.add()
            embedding.tensor_name = embedding_var.name

            # Link this tensor to its metadata file (word labels for the embedding rows).
            embedding.metadata_path = 'vocab.tsv'

            # saves a configuration file that TensorBoard will read during startup.
            projector.visualize_embeddings(summary_writer, config)
        finally:
            summary_writer.close()

        saver_embed = tf.train.Saver([embedding_var])
        saver_embed.save(sess, 'processed/model3.ckpt', 1)

In [13]:
# Write one vocabulary word per line as UTF-8.
# The target directory is created first so this cell also works on a fresh
# checkout, before anything else has written to 'processed/'.
os.makedirs('processed', exist_ok=True)
with open('processed/vocab.tsv', "wb") as f:
    # Order words by their integer id explicitly, so line i holds the word with
    # id i without relying on dict iteration order.
    for v in sorted(dictionary, key=dictionary.get):
        f.write(v.encode('utf-8') + b'\n')

In [12]:
# Construct the model and add all of its nodes to the default graph.
model = SkipGramModel(vocabulary_size, embedding_size, batch_size, num_sampled, learning_rate)
model.build_graph()
# Endless generator of (center_batch, target_batch) arrays over the encoded corpus.
batch_gen = generate_batch(data, batch_size, skip_window)
# log_folder is passed as weights_fld; skip_step and lr take train_model()'s defaults.
train_model(model, batch_gen, num_train_steps, log_folder)

Global variables initializing...
Start training...
Generating batches...
Generating samples...
Average loss at step 1999: 118.9
Average loss at step 3999:  59.9
Average loss at step 5999:  38.2
Average loss at step 7999:  30.0
Average loss at step 9999:  23.1
Average loss at step 11999:  18.9
Average loss at step 13999:  16.6
Average loss at step 15999:  15.1
Average loss at step 17999:  12.7
Average loss at step 19999:  12.1
Average loss at step 21999:  11.7
Average loss at step 23999:  11.7
Average loss at step 25999:  10.0
Average loss at step 27999:  10.5
Average loss at step 29999:   9.4
Average loss at step 31999:   9.5
Average loss at step 33999:   9.4
Average loss at step 35999:   8.8
Average loss at step 37999:   9.2
Average loss at step 39999:   8.6
Average loss at step 41999:   8.6
Average loss at step 43999:   7.7
Average loss at step 45999:   8.0
Average loss at step 47999:   7.9
Average loss at step 49999:   7.7
Average loss at step 51999:   7.8
Average loss at step 53999

Average loss at step 467999:   5.1
Average loss at step 469999:   4.9
Average loss at step 471999:   5.1
Average loss at step 473999:   5.0
Average loss at step 475999:   4.9
Average loss at step 477999:   5.1
Average loss at step 479999:   5.0
Average loss at step 481999:   5.0
Average loss at step 483999:   5.0
Average loss at step 485999:   4.9
Average loss at step 487999:   5.1
Average loss at step 489999:   4.9
Average loss at step 491999:   5.0
Average loss at step 493999:   4.9
Average loss at step 495999:   5.0
Average loss at step 497999:   5.1
Average loss at step 499999:   5.1
Average loss at step 501999:   5.0
Average loss at step 503999:   5.1
Average loss at step 505999:   4.9
Average loss at step 507999:   4.9
Average loss at step 509999:   5.0
Average loss at step 511999:   5.1
Average loss at step 513999:   5.0
Average loss at step 515999:   4.9
Average loss at step 517999:   4.8
Average loss at step 519999:   5.0
Average loss at step 521999:   5.0
Average loss at step

Average loss at step 937999:   4.9
Average loss at step 939999:   4.9
Average loss at step 941999:   4.9
Average loss at step 943999:   4.9
Average loss at step 945999:   4.8
Average loss at step 947999:   4.8
Average loss at step 949999:   4.8
Average loss at step 951999:   4.8
Average loss at step 953999:   4.8
Average loss at step 955999:   4.9
Average loss at step 957999:   4.8
Average loss at step 959999:   4.8
Average loss at step 961999:   4.7
Average loss at step 963999:   4.6
Average loss at step 965999:   4.9
Average loss at step 967999:   4.6
Average loss at step 969999:   4.7
Average loss at step 971999:   4.5
Average loss at step 973999:   4.6
Average loss at step 975999:   4.7
Average loss at step 977999:   4.9
Average loss at step 979999:   4.8
Average loss at step 981999:   4.8
Average loss at step 983999:   4.8
Average loss at step 985999:   4.8
Average loss at step 987999:   4.8
Average loss at step 989999:   4.9
Average loss at step 991999:   4.8
Average loss at step

Average loss at step 1395999:   6.8
Average loss at step 1397999:   4.6
Average loss at step 1399999:   4.5
Average loss at step 1401999:   4.5
Average loss at step 1403999:   4.6
Average loss at step 1405999:   4.8
Average loss at step 1407999:   4.8
Average loss at step 1409999:   4.8
Average loss at step 1411999:   4.8
Average loss at step 1413999:   4.7
Average loss at step 1415999:   4.7
Average loss at step 1417999:   4.6
Average loss at step 1419999:   4.7
Average loss at step 1421999:   4.8
Average loss at step 1423999:   4.9
Average loss at step 1425999:   4.7
Average loss at step 1427999:   4.8
Average loss at step 1429999:   4.8
Average loss at step 1431999:   4.7
Average loss at step 1433999:   4.7
Average loss at step 1435999:   4.8
Average loss at step 1437999:   4.8
Average loss at step 1439999:   4.7
Average loss at step 1441999:   4.7
Average loss at step 1443999:   4.6
Average loss at step 1445999:   4.8
Average loss at step 1447999:   4.8
Average loss at step 1449999

Average loss at step 1851999:   4.8
Average loss at step 1853999:   4.7
Average loss at step 1855999:   4.8
Average loss at step 1857999:   4.7
Average loss at step 1859999:   4.7
Average loss at step 1861999:   4.7
Average loss at step 1863999:   4.7
Average loss at step 1865999:   4.8
Average loss at step 1867999:   4.8
Average loss at step 1869999:   4.8
Average loss at step 1871999:   4.7
Average loss at step 1873999:   4.7
Average loss at step 1875999:   4.8
Average loss at step 1877999:   4.6
Average loss at step 1879999:   4.5
Average loss at step 1881999:   4.7
Average loss at step 1883999:   4.7
Average loss at step 1885999:   4.7
Average loss at step 1887999:   4.7
Average loss at step 1889999:   4.8
Average loss at step 1891999:   4.7
Average loss at step 1893999:   4.5
Average loss at step 1895999:   4.8
Average loss at step 1897999:   4.8
Average loss at step 1899999:   4.8
Average loss at step 1901999:   4.7
Average loss at step 1903999:   4.7
Average loss at step 1905999

Average loss at step 2307999:   4.7
Average loss at step 2309999:   4.7
Average loss at step 2311999:   4.7
Average loss at step 2313999:   4.8
Average loss at step 2315999:   4.7
Average loss at step 2317999:   4.7
Average loss at step 2319999:   4.8
Average loss at step 2321999:   4.7
Average loss at step 2323999:   4.6
Average loss at step 2325999:   4.6
Average loss at step 2327999:   4.7
Average loss at step 2329999:   4.7
Average loss at step 2331999:   4.7
Average loss at step 2333999:   4.7
Average loss at step 2335999:   4.6
Average loss at step 2337999:   4.8
Average loss at step 2339999:   4.7
Average loss at step 2341999:   4.7
Average loss at step 2343999:   4.8
Average loss at step 2345999:   4.7
Average loss at step 2347999:   4.8
Average loss at step 2349999:   4.8
Average loss at step 2351999:   4.7
Average loss at step 2353999:   4.8
Average loss at step 2355999:   4.7
Average loss at step 2357999:   4.7
Average loss at step 2359999:   4.7
Average loss at step 2361999

Average loss at step 2763999:   4.6
Average loss at step 2765999:   4.7
Average loss at step 2767999:   4.7
Average loss at step 2769999:   4.7
Average loss at step 2771999:   4.7
Average loss at step 2773999:   4.7
Average loss at step 2775999:   4.6
Average loss at step 2777999:   4.6
Average loss at step 2779999:   4.7
Average loss at step 2781999:   4.7
Average loss at step 2783999:   4.6
Average loss at step 2785999:   4.5
Average loss at step 2787999:   4.5
Average loss at step 2789999:   4.7
Average loss at step 2791999:   4.7
Average loss at step 2793999:   4.7
Average loss at step 2795999:   4.6
Average loss at step 2797999:   4.6
Average loss at step 2799999:   4.6
Average loss at step 2801999:   4.7
Average loss at step 2803999:   4.6
Average loss at step 2805999:   4.7
Average loss at step 2807999:   4.7
Average loss at step 2809999:   4.7
Average loss at step 2811999:   4.8
Average loss at step 2813999:   4.6
Average loss at step 2815999:   4.7
Average loss at step 2817999

Average loss at step 3219999:   3.8
Average loss at step 3221999:   3.7
Average loss at step 3223999:   3.8
Average loss at step 3225999:   3.9
Average loss at step 3227999:   3.9
Average loss at step 3229999:   3.9
Average loss at step 3231999:   4.2
Average loss at step 3233999:   4.3
Average loss at step 3235999:   3.7
Average loss at step 3237999:   4.3
Average loss at step 3239999:   4.8
Average loss at step 3241999:   4.8
Average loss at step 3243999:   4.7
Average loss at step 3245999:   4.7
Average loss at step 3247999:   4.7
Average loss at step 3249999:   4.7
Average loss at step 3251999:   4.5
Average loss at step 3253999:   4.2
Average loss at step 3255999:   3.8
Average loss at step 3257999:   3.9
Average loss at step 3259999:   3.9
Average loss at step 3261999:   4.0
Average loss at step 3263999:   4.0
Average loss at step 3265999:   4.5
Average loss at step 3267999:   4.6
Average loss at step 3269999:   4.0
Average loss at step 3271999:   3.6
Average loss at step 3273999

Average loss at step 3675999:   3.6
Average loss at step 3677999:   3.7
Average loss at step 3679999:   3.6
Average loss at step 3681999:   3.5
Average loss at step 3683999:   3.7
Average loss at step 3685999:   3.6
Average loss at step 3687999:   3.6
Average loss at step 3689999:   3.8
Average loss at step 3691999:   3.9
Average loss at step 3693999:   3.8
Average loss at step 3695999:   3.7
Average loss at step 3697999:   3.8
Average loss at step 3699999:   3.6
Average loss at step 3701999:   3.7
Average loss at step 3703999:   3.6
Average loss at step 3705999:   3.7
Average loss at step 3707999:   3.8
Average loss at step 3709999:   3.7
Average loss at step 3711999:   3.7
Average loss at step 3713999:   3.6
Average loss at step 3715999:   4.1
Average loss at step 3717999:   3.7
Average loss at step 3719999:   3.5
Average loss at step 3721999:   3.6
Average loss at step 3723999:   3.5
Average loss at step 3725999:   3.7
Average loss at step 3727999:   3.6
Average loss at step 3729999