In [1]:
import re
import tensorflow as tf
import numpy as np
import time
import os
import collections
import random
import time
import nltk
import pickle

In [2]:
def clearstring(string):
    string = re.sub('[^A-Za-z\- ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    return [y.lower() for y in string if len(y) > 3 and y.find('nbsp') < 0]

def build_dataset(words, n_words):
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 0)
        if index == 0:  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

data_index = 0

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    global data
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    for i in range(batch_size // num_skips):
        context_words = [w for w in range(span) if w != skip_window]
        words_to_use = random.sample(context_words, num_skips)
        for j, context_word in enumerate(words_to_use):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[context_word]
        if data_index == len(data):
            for word in data[:span]:
                buffer.append(word)
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [3]:
with open('books/HalfBloodPrince.txt', 'r',encoding = "ISO-8859-1") as fopen:
    vocabulary = clearstring(' '.join(nltk.word_tokenize(fopen.read())))
print('example 10 words:',vocabulary[:10])
print('size corpus:',len(vocabulary))
vocabulary_size = len(list(set(vocabulary)))
print('size of unique words:',vocabulary_size)
dimension = 128
skip_window = 1
num_skips = 2
batch_size = 64
location = os.getcwd()

example 10 words: ['nearing', 'midnight', 'prime', 'minister', 'sitting', 'alone', 'office', 'reading', 'long', 'memo']
size corpus: 98994
size of unique words: 10682


In [4]:
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Most common words (+UNK) [['UNK', 1], ('harry', 2771), ('said', 2450), ('that', 2123), ('with', 1072)]
Sample data [4227, 1529, 140, 103, 188, 221, 155, 1050, 97, 5090] ['nearing', 'midnight', 'prime', 'minister', 'sitting', 'alone', 'office', 'reading', 'long', 'memo']


In [5]:
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [6]:
graph = tf.Graph()

with graph.as_default():

    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(tf.random_uniform([vocabulary_size, dimension], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, dimension],
                                                      stddev=1.0 / np.sqrt(dimension)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                             biases=nce_biases,
                                             labels=train_labels,
                                             inputs=embed,
                                             num_sampled=batch_size / 2,
                                             num_classes=vocabulary_size))
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        init = tf.global_variables_initializer()

num_steps = 200000

In [7]:
with tf.Session(graph=graph) as session:
    init.run()
    print('Initialized')

    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        if step % 10000 == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  120.179695129
Nearest to very: induce, happy, fishy, approached, impervious, concealed, daily, footprints,
Nearest to only: booklist, vibrant, highly, prowling, unwise, closing, thus, queen,
Nearest to have: harpies, start-of-term, striding, wheezy, suggested, gloating, roaring, palm,
Nearest to malfoy: dried-up, answering, markings, dunno, intact, parted, unreadable, exploits,
Nearest to tell: half-blood, conductor, teaches, evidently, sectum-sempra, beat, holds, constant,
Nearest to voice: drove, sky-high, appar-ition, comical, recounted, nicknames, lurked, conga,
Nearest to about: preoccupied, inclination, repaired, induce, hopefully, sherry, reworded, anymon,
Nearest to know: slanted, malfoyd, compensations, damp, ceasing, untruthfully, malfoy-is-a-death-eater, prac-,
Nearest to through: commentator, ruff, looked, enjoyment, send-off, arry, stubbs, bobbing,
Nearest to voldemort: teaching, cancelled, modify, obscuring, squad, ankle-deep, calcul

Average loss at step  42000 :  3.06403649056
Average loss at step  44000 :  3.02928800601
Average loss at step  46000 :  3.02364887989
Average loss at step  48000 :  2.96089247376
Average loss at step  50000 :  2.95863304031
Nearest to very: lend, sipping, depart, induce, considering, fishy, automati-cally, wheezing,
Nearest to only: prejudiced, booklist, foresaw, queen, pockets, wilhout, comically, affair,
Nearest to have: embraced, sorted, smashing, pasted, alley, mutilate, stretches, mood,
Nearest to malfoy: discomfited, answering, dried-up, raving, deliver, normally, disbelieving, madness,
Nearest to tell: half-blood, mostc, wedding, whiskey, imagine, squashy, gorilla, breeding,
Nearest to voice: mcgonagall, weapons, realms, wag-ging, somewhat, endured, emanating, appar-ition,
Nearest to about: assurances, intently, useful, the-, nagini, earnest, inhaled, dursley,
Nearest to know: nowadays, aware, malfoyd, wrong, nonsense, slanted, assuming, ceasing,
Nearest to through: speck, amel

Average loss at step  92000 :  2.61380171108
Average loss at step  94000 :  2.57052056539
Average loss at step  96000 :  2.6076737389
Average loss at step  98000 :  2.55887676299
Average loss at step  100000 :  2.55192895091
Nearest to very: lend, depart, spared, fishy, automati-cally, considering, tasty, alarmed,
Nearest to only: prejudiced, foresaw, tawny, sprig, motioned, wilhout, drink, aspect,
Nearest to have: sorted, pasted, embraced, mutilate, purpose, event, fletcher, sentry,
Nearest to malfoy: raving, normally, discomfited, dried-up, answering, bluntly, cracks, disbelieving,
Nearest to tell: half-blood, breeding, mostc, wedding, unimpressed, confirmed, imagine, genius,
Nearest to voice: wag-ging, captaincy, realms, weapons, gestured, twist, shunpike, waxy,
Nearest to about: assurances, useful, the-, intently, comforted, forcefully, unusual, dursley,
Nearest to know: nowadays, nonsense, malfoyd, compensations, assuming, sup-ported, poker, knew,
Nearest to through: speck, amelia

Average loss at step  142000 :  2.42711773506
Average loss at step  144000 :  2.38078848428
Average loss at step  146000 :  2.39784763211
Average loss at step  148000 :  2.41269604078
Average loss at step  150000 :  2.36077362335
Nearest to very: lend, automati-cally, spared, alarmed, depart, gashes, unusually, two-and-a-half,
Nearest to only: motioned, prejudiced, sprig, drink, foresaw, clung, aspect, whim,
Nearest to have: sorted, event, they, mutilate, purpose, despairingly, jubilantly, advisory,
Nearest to malfoy: normally, raving, dried-up, discomfited, buzzing, answering, bluntly, confess,
Nearest to tell: breeding, imagine, half-blood, proof, wedding, teaches, unyielding, confirmed,
Nearest to voice: wag-ging, twist, weapons, represent, transport, captaincy, shouldnt, idea,
Nearest to about: dursley, assurances, forcefully, inhaled, comforted, intently, glamour, nagini,
Nearest to know: nowadays, nonsense, flare, assuming, tobias, sup-ported, compensations, hermione,
Nearest to 

Average loss at step  192000 :  2.33581276321
Average loss at step  194000 :  2.31930258992
Average loss at step  196000 :  2.30138353711
Average loss at step  198000 :  2.35729946288


In [11]:
with open('halfblood-list.p', 'wb') as fopen:
    pickle.dump(list(reverse_dictionary.values()), fopen)

In [12]:
with open('halfblood-vector.p', 'wb') as fopen:
    pickle.dump(final_embeddings, fopen)