In [1]:
import tensorflow as tf
import zipfile
import collections
import numpy as np
import os
from tensorflow.contrib.tensorboard.plugins import projector
from tempfile import gettempdir
from six.moves import urllib

In [2]:
# Directory that receives the vocabulary file, the saved co-occurrence matrix
# and the embedding checkpoint.
LOG_DIR = 'processed'
# File name of the vocabulary listing written inside LOG_DIR; also used as the
# projector metadata path.
VOCAB_FILE = 'vocab.tsv'
# File name (joined with LOG_DIR) under which the embedding checkpoint is saved.
MODEL_NAME = 'model.ckpt'

In [3]:
# Base URL that maybe_download() prefixes to the requested file name.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Ensure `filename` is present in the system temp directory and has the expected size.

    The file is fetched from `url + filename` only when no file of that name
    already exists in the temp directory.

    Args:
        filename: Name of the file, relative to both `url` and the temp directory.
        expected_bytes: Size in bytes the local file must have.

    Returns:
        Path of the local copy.

    Raises:
        Exception: If the local file's size differs from `expected_bytes`
            (the actual size is printed first).
    """
    target = os.path.join(gettempdir(), filename)
    if not os.path.exists(target):
        target, _headers = urllib.request.urlretrieve(url + filename, target)

    actual_bytes = os.stat(target).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception('Failed to verify ' + target +
                        '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return target


# Fetch (or reuse) the text8 archive in the temp directory; the second argument
# is the size in bytes the local file must have.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [8]:
def read_data(filename):
    """Return the whitespace-separated tokens of the first member of a zip archive.

    Args:
        filename: Path to the zip archive.

    Returns:
        List of string tokens obtained by splitting the member's text on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    text = tf.compat.as_str(raw_bytes)
    return text.split()

# Tokenize the text8 archive from the temp directory (the same location
# maybe_download() stores it in) and report how many tokens were read.
words = read_data(os.path.join(gettempdir(), 'text8.zip'))
print('Data size', len(words))

Data size 17005207


In [7]:
# Number of vocabulary entries, including the 'UNK' slot at index 0; also the
# side length of the square co-occurrence matrix.
vocabulary_size = 10000
# Number of embedding columns kept after the SVD.
embed_size = 300
# Number of context words considered on each side of a center word.
window_size = 5

In [9]:
def build_vocabulary(words, n_words):
    """Build a word -> index mapping and write the vocabulary to LOG_DIR/VOCAB_FILE.

    Index 0 is the 'UNK' placeholder; the following indices go to the
    `n_words - 1` most common entries of `words`, most frequent first.
    LOG_DIR is created if it does not exist, and the vocabulary is written
    one word per line (no trailing newline) in index order.

    Args:
        words: Iterable of hashable tokens to count.
        n_words: Maximum vocabulary size, counting the 'UNK' entry.

    Returns:
        Dict mapping each vocabulary word to its index.
    """
    print('Building vocabulary...')
    frequency = collections.Counter(words)
    vocab = ['UNK'] + [token for token, _count in frequency.most_common(n_words - 1)]

    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    vocab_path = os.path.join(LOG_DIR, VOCAB_FILE)
    with open(vocab_path, 'w') as handle:
        handle.write('\n'.join(vocab))

    return dict(zip(vocab, range(len(vocab))))

def words_to_indicies(words, index):
    """Translate each word into its vocabulary index.

    Args:
        words: Iterable of tokens.
        index: Dict mapping known words to integer indices.

    Returns:
        List with one index per input word; words missing from `index` map to 0.
    """
    print('Converting words to indicies...')
    unknown_id = 0
    return [index.get(token, unknown_id) for token in words]

def get_pairs(words, window_size):
    """Yield (center, context) pairs for every position in `words`.

    For each position, the context is up to `window_size` items immediately
    before it followed by up to `window_size` items immediately after it,
    in sequence order. The center item itself is never paired with itself
    by position (equal values at other positions are still yielded).

    Args:
        words: Sliceable sequence of items (list, tuple, NumPy array, ...).
        window_size: Number of neighbours to take on each side.

    Yields:
        Tuples `(center, context)`.
    """
    print('Getting word pairs...')
    for i, center in enumerate(words):
        # Iterate the two slices directly instead of merging them with
        # list.extend, so any sliceable sequence works, not only lists.
        before = words[max(0, i - window_size): i]
        after = words[i + 1: i + window_size + 1]
        for t in before:
            yield center, t
        for t in after:
            yield center, t

In [10]:
# Map the corpus onto vocabulary indices, then drop the raw token list,
# which is no longer needed once the index sequence exists.
index = build_vocabulary(words, vocabulary_size)
index_words = words_to_indicies(words, index)
del words

print('Building co-occurence matrix...')
# occurence[c, t] counts how often index t appears within the window around index c.
occurence = np.zeros((vocabulary_size, vocabulary_size))
for center, target in get_pairs(index_words, window_size):
    occurence[center, target] += 1

Building vocabulary...
Converting words to indicies...
Building co-occurence matrix...
Getting word pairs...


In [11]:
# Persist the co-occurrence matrix as text under LOG_DIR so it can be reloaded
# with np.loadtxt without rebuilding it.
np.savetxt(os.path.join(LOG_DIR, "occurence_mat"), occurence)

In [5]:
# Reload the co-occurrence matrix from the text file stored under LOG_DIR.
occurence = np.loadtxt(os.path.join(LOG_DIR, "occurence_mat"))

In [12]:
print('Building and running graph...')

# Graph: subtract each row's mean from the co-occurrence matrix, then take the SVD.
occurence_var = tf.placeholder(tf.float32)
mean_occurence = tf.reduce_mean(occurence_var, axis=1, keepdims=True)
mean_centered_occurence = tf.subtract(occurence_var, mean_occurence)
svd = tf.svd(mean_centered_occurence)

with tf.Session() as sess:
    # The SVD graph above contains no variables, so no initializer run is
    # needed before evaluating it.
    s, u, _ = sess.run(svd, feed_dict={occurence_var: occurence})

    # Scale the first `embed_size` columns of u by their singular values.
    # Broadcasting yields the same values as np.dot(u, np.diag(s)[:, :embed_size])
    # without materializing a dense len(s) x len(s) diagonal matrix.
    embedding_var = tf.Variable(u[:, :embed_size] * s[:embed_size], name='embedding')
    sess.run(embedding_var.initializer)

    # Describe the embedding for the TensorBoard projector: which tensor to
    # show and which file holds the per-row labels.
    config = projector.ProjectorConfig()
    embedding = config.embeddings.add()
    embedding.tensor_name = embedding_var.name
    embedding.metadata_path = VOCAB_FILE

    summary_writer = tf.summary.FileWriter(LOG_DIR)
    try:
        projector.visualize_embeddings(summary_writer, config)
    finally:
        # Release the writer's event file instead of leaving it open.
        summary_writer.close()

    # Save only the embedding variable as the checkpoint under LOG_DIR.
    saver_embed = tf.train.Saver([embedding_var])
    saver_embed.save(sess, os.path.join(LOG_DIR, MODEL_NAME))


Building and running graph...
