In [1]:
import numpy as np
import json
import tensorflow as tf

In [2]:
import math
import collections 
import random
import os

In [3]:
def get_dicts():
    """Read the corpus and the vocabulary artefacts from the working directory.

    Four files are opened, in this order: ``vocab_dictionary.json``,
    ``reversed_dictionary.json``, ``wordcount.json`` and ``dataloc.txt``.

    Returns:
        A 4-tuple ``(data, vocab_dictionary, reversed_dictionary, word_count)``
        where ``data`` is the list of raw lines of ``dataloc.txt`` (each line
        keeps its trailing newline) and the other three items are whatever
        ``json.load`` decodes from the corresponding file.
    """
    def _load_json(path):
        # Decode one JSON document and close the file straight away.
        with open(path, 'r') as handle:
            return json.load(handle)

    vocab_dictionary = _load_json('vocab_dictionary.json')
    reversed_dictionary = _load_json('reversed_dictionary.json')
    word_count = _load_json('wordcount.json')

    # Iterating a text file yields its lines one by one; list() collects them.
    with open("dataloc.txt", "r") as handle:
        data = list(handle)

    return data, vocab_dictionary, reversed_dictionary, word_count

In [4]:
%%time
# Load the corpus lines and the three JSON lookup tables into notebook globals.
# This is the slow step of the notebook: the recorded run below reports
# roughly five and a half minutes of wall time.
data, vocab_dictionary, reversed_dictionary, word_count = get_dicts()
print("done")

done
CPU times: user 2min 17s, sys: 26.8 s, total: 2min 44s
Wall time: 5min 27s


In [5]:
# Read cursor into `data`, shared across successive generate_batch() calls.
data_index = 0


def generate_batch(batch_size, data, num_skips, skip_window):
    """Produce one skip-gram training batch from a sliding window over `data`.

    A window of ``2 * skip_window + 1`` consecutive items is moved along
    `data`. For every window position, `num_skips` context positions are drawn
    at random (without replacement) and each one is paired with the item at the
    centre of the window.

    The module-level ``data_index`` cursor is read and updated, so consecutive
    calls continue where the previous one stopped and wrap around at the end
    of `data`.

    Args:
        batch_size: Number of (centre, context) pairs to emit; must be a
            multiple of `num_skips`.
        data: Indexable, sliceable sequence of items that can be stored in an
            int32 array.
        num_skips: Context items sampled per centre item; at most
            ``2 * skip_window``.
        skip_window: Number of items on each side of the centre item.

    Returns:
        ``(batch, labels)``: an int32 array of shape ``(batch_size,)`` holding
        the centre items and an int32 array of shape ``(batch_size, 1)``
        holding the matching context items.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window

    total = len(data)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # Restart from the beginning when a full window no longer fits.
    if data_index + span > total:
        data_index = 0
    window = collections.deque(data[data_index:data_index + span], maxlen=span)
    data_index += span

    # Every window position except the centre is a candidate context slot.
    context_positions = [pos for pos in range(span) if pos != skip_window]

    for group in range(batch_size // num_skips):
        offset = group * num_skips
        chosen = random.sample(context_positions, num_skips)
        for slot, pos in enumerate(chosen, start=offset):
            batch[slot] = window[skip_window]
            labels[slot, 0] = window[pos]

        # Advance the window by one item, wrapping to the start of `data`
        # once the cursor has run off the end.
        if data_index != total:
            window.append(data[data_index])
            data_index += 1
        else:
            window.extend(data[0:span])
            data_index = span

    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + total - span) % total
    return batch, labels

## Parameters for the Word2Vec Model

In [6]:
# --- Skip-gram batching (arguments of generate_batch) ---
batch_size = 256       # (centre, context) pairs fed per training step
skip_window = 1        # items taken on each side of the centre item
num_skips = 2          # context items sampled per centre item

# --- Model dimensions ---
n_words = 50000        # number of rows in the embedding / NCE weight matrices
embedding_size = 512   # width of every embedding vector
num_sampled = 64       # forwarded to tf.nn.nce_loss as num_sampled

# --- Validation set ---
# valid_size distinct ids drawn from range(valid_window); the graph looks up
# their embeddings to compute similarities against the whole vocabulary.
valid_window = 100
valid_size = 16
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [7]:
# Build the skip-gram model in its own TensorFlow graph. The names bound here
# (train_inputs, train_labels, loss, optimizer, merged, normalized_embeddings,
# init, saver, ...) are used by the training cell further down.
graph = tf.Graph()

with graph.as_default():
    
    # Fixed-size feeds: one id per example and one label id per example,
    # plus the constant set of ids used for the similarity check.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Embedding matrix of shape [n_words, embedding_size], initialised
    # uniformly in [-1, 1); `embed` holds the rows selected by train_inputs.
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(
            tf.random_uniform([n_words, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
    # Output-side parameters of the NCE loss: a [n_words, embedding_size]
    # weight matrix (truncated normal, stddev 1/sqrt(embedding_size)) ...
    with tf.name_scope("weights"):
        
        nce_weights = tf.Variable(tf.truncated_normal([n_words, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        
    # ... and a zero-initialised bias per vocabulary entry.
    with tf.name_scope('biases'):
        
        nce_biases = tf.Variable(tf.zeros([n_words]))
        
    # Scalar training objective: the per-example NCE loss averaged over the
    # batch. NOTE(review): this scope is indented by 2 spaces rather than 4;
    # valid Python, but inconsistent with the rest of the cell.
    with tf.name_scope('loss'):
      loss = tf.reduce_mean(
          tf.nn.nce_loss(
              weights=nce_weights,
              biases=nce_biases,
              labels=train_labels,
              inputs=embed,
              num_sampled=num_sampled,
              num_classes=n_words))
    
    # Register the loss as a scalar summary so merge_all() below picks it up.
    tf.summary.scalar('loss', loss)
    
    # Plain gradient descent with a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    # Scale every embedding row to unit L2 norm. The matmul of the validation
    # rows against all normalised rows then yields cosine similarities with
    # shape [valid_size, n_words].
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    
    # Single op that evaluates every summary registered in this graph.
    merged = tf.summary.merge_all()
    
    # Op that initialises all variables created above.
    init = tf.global_variables_initializer()
    
    # Used after training to write the model checkpoint.
    saver = tf.train.Saver()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [8]:
n_steps = 135001 # Can increase 
log_dir = "182/LSTMsAndInterpretability"
log_every = 5000  # number of steps between average-loss reports

with tf.Session(graph=graph) as session:

    # Event-file writer for TensorBoard; it also records the graph definition.
    writer = tf.summary.FileWriter(log_dir, session.graph)

    try:
        init.run()
        print('Initialized')

        average_loss = 0
        for step in range(n_steps):
            batch_inputs, batch_labels = generate_batch(batch_size,
                                                        data,
                                                        num_skips,
                                                        skip_window)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

            # One optimisation step; also fetch the merged summary and the
            # loss value for logging.
            _, summary, loss_val = session.run([optimizer, merged, loss],
                                               feed_dict=feed_dict)

            # Persist the summary so the loss curve actually reaches the
            # event file (previously it was fetched and then discarded).
            writer.add_summary(summary, step)

            average_loss += loss_val

            if step % log_every == 0:
                if step > 0:
                    average_loss /= log_every
                # The average loss is an estimate of the loss over the last
                # log_every batches (at step 0 it is the first batch's loss).
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0

        # Unit-norm embedding matrix, kept as a NumPy array for later cells.
        final_embeddings = normalized_embeddings.eval()

        saver.save(session, os.path.join(log_dir, 'model.ckpt'))
    finally:
        # Flush and close the event file even if training is interrupted.
        writer.close()

Initialized
Average loss at step  0 :  268.5702819824219
Average loss at step  5000 :  72.89216162166595
Average loss at step  10000 :  22.195492863130568
Average loss at step  15000 :  11.500786938548089
Average loss at step  20000 :  8.013700315523147
Average loss at step  25000 :  6.417343465995788
Average loss at step  30000 :  5.797176605558396
Average loss at step  35000 :  5.3609576342344285
Average loss at step  40000 :  5.390969744110107
Average loss at step  45000 :  4.898157170629501
Average loss at step  50000 :  4.789572329854965
Average loss at step  55000 :  4.668594528675079
Average loss at step  60000 :  4.632663997912407
Average loss at step  65000 :  4.570204546451569
Average loss at step  70000 :  4.500336844372749
Average loss at step  75000 :  4.446901403260231
Average loss at step  80000 :  4.447044924449921
Average loss at step  85000 :  4.359970426797867
Average loss at step  90000 :  4.374514587569236
Average loss at step  95000 :  4.344972170615196
Average lo

In [9]:
n_words = 50000

# Map every row index below n_words to its embedding as a plain Python list;
# NumPy rows are not JSON-serialisable, hence the tolist() conversion.
embeddings_dict = {
    word_id: final_embeddings[word_id].tolist() for word_id in range(n_words)
}

# Write the whole mapping as a single JSON document.
with open('embeddings.json', 'w') as out_file:
    json.dump(embeddings_dict, out_file)