In [10]:
import numpy as np
import json
import tensorflow as tf

In [5]:
import math
import collections 
import random
import os
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [6]:
book = open("/Users/jeffreylandes/Lit/LightInAugust.txt", "r")
a = book.read()
a = a.replace(" William Faulkner LIGHT IN AUGUST ", " PPP ")
all_words = text_to_word_sequence(a)
all_words = all_words[42:]

In [7]:
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

In [11]:
data_index = 0
def generate_batch(batch_size, data, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)  # pylint: disable=redefined-builtin
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
            
        if data_index == len(data):
            buffer.extend(data[0:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

## Parameters for the Word2Vec Model

In [12]:
batch_size = 256
embedding_size = 512
skip_window = 1
num_skips = 2
num_sampled = 64
n_words = 8500

valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [14]:
data, count, dictionary, reversed_dictionary = build_dataset(all_words, n_words)

In [15]:
graph = tf.Graph()

with graph.as_default():
    
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(
            tf.random_uniform([n_words, embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
    with tf.name_scope("weights"):
        
        nce_weights = tf.Variable(tf.truncated_normal([n_words, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        
    with tf.name_scope('biases'):
        
        nce_biases = tf.Variable(tf.zeros([n_words]))
        
    with tf.name_scope('loss'):
      loss = tf.reduce_mean(
          tf.nn.nce_loss(
              weights=nce_weights,
              biases=nce_biases,
              labels=train_labels,
              inputs=embed,
              num_sampled=num_sampled,
              num_classes=n_words))
    
    tf.summary.scalar('loss', loss)
    
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
    
    merged = tf.summary.merge_all()
    
    init = tf.global_variables_initializer()
    
    saver = tf.train.Saver()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [17]:
n_steps = 5000 # Can increase 
log_dir = "182/LSTMsAndInterpretability"

with tf.Session(graph=graph) as session:
    
    writer = tf.summary.FileWriter(log_dir, session.graph)
    
    init.run()
    print('Initialized')
    
    average_loss = 0
    for step in range(n_steps):
        batch_inputs, batch_labels = generate_batch(batch_size,
                                                  data,
                                                  num_skips,
                                                  skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        run_metadata = tf.RunMetadata()
    
        _, summary, loss_val = session.run([optimizer, merged, loss],
                                         feed_dict=feed_dict,
                                         run_metadata=run_metadata)
    
        average_loss += loss_val
    
        if step % 500 == 0:
            if step > 0:
                average_loss /= 5000
            # The average loss is an estimate of the loss over the last 5000
            # batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
            
    final_embeddings = normalized_embeddings.eval()
        
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))

Initialized
Average loss at step  0 :  220.94557189941406
Average loss at step  500 :  8.224890459060669
Average loss at step  1000 :  3.1309832467079164
Average loss at step  1500 :  1.697612082195282
Average loss at step  2000 :  1.1203050239562988
Average loss at step  2500 :  0.8408830364227295
Average loss at step  3000 :  0.6546102355480194
Average loss at step  3500 :  0.5825531640052796
Average loss at step  4000 :  0.51396307888031
Average loss at step  4500 :  0.4854077645778656


In [18]:
final_embeddings

array([[-0.05790628,  0.07284391, -0.00995954, ..., -0.03629309,
        -0.02657741,  0.0031691 ],
       [ 0.03655732,  0.08539885,  0.04355387, ..., -0.01935608,
        -0.06208573,  0.05084924],
       [ 0.02219536, -0.00940335,  0.0740024 , ..., -0.04992952,
         0.00114734, -0.00273277],
       ...,
       [ 0.05574044, -0.05015   , -0.06531989, ...,  0.06387366,
         0.03086378,  0.04506752],
       [ 0.05086208,  0.06152659, -0.05950092, ...,  0.06278881,
        -0.00957943, -0.01103408],
       [-0.02800961,  0.02059964, -0.00675114, ...,  0.04614358,
         0.02614468, -0.03138208]], dtype=float32)

In [21]:
dictionary['joe']

180

In [22]:
dictionary['christmas']

99

In [23]:
with open("dictionary.json", "w") as outfile:
    json.dump(dictionary, outfile)

In [24]:
with open("reversed_dictionary.json", "w") as outfile:
    json.dump(reversed_dictionary, outfile)

In [29]:
with open('embeddings.json', 'w') as outfile:
    json.dump(final_embeddings.tolist(), outfile)