In [2]:
!pip install numpy
!pip install scipy
!pip install matplotlib
!pip install pandas



In [3]:
import collections
import math
import os
import random
import zipfile
from six.moves import urllib
from six.moves import xrange
import numpy as np
import tensorflow as tf

In [4]:
# Local path the corpus archive is stored under (relative to the working directory).
DOWNLOADED_FILENAME = 'SampleText.zip'

def maybe_download(url_path, expected_bytes):
    """Fetch the archive once and check its size on disk.

    The download is skipped when DOWNLOADED_FILENAME already exists. In either
    case the size of the local file is compared with ``expected_bytes``.

    Args:
        url_path: URL the archive is retrieved from when it is not cached.
        expected_bytes: exact size, in bytes, the local file must have.

    Raises:
        Exception: if the local file size differs from ``expected_bytes``
            (the actual size is printed first).
    """
    if not os.path.exists(DOWNLOADED_FILENAME):
        urllib.request.urlretrieve(url_path, DOWNLOADED_FILENAME)

    actual_bytes = os.stat(DOWNLOADED_FILENAME).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was found to help diagnose a partial download.
        print(actual_bytes)
        raise Exception('Failed to verify file from: ' + url_path + '. Can you get to it with a browser')

    print('Found and verified file from this path: ',  url_path)
    print('Downloaded file: ', DOWNLOADED_FILENAME)

In [5]:
def read_words():
    """Return the whitespace-separated tokens of the archive's first member.

    Opens DOWNLOADED_FILENAME as a zip archive, reads the first entry listed
    in it, converts the bytes to ``str`` and splits on whitespace.

    Returns:
        list of str: the tokens in file order.
    """
    with zipfile.ZipFile(DOWNLOADED_FILENAME) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)

    return tf.compat.as_str(raw_bytes).split()

In [6]:
# Location of the text8 archive and the exact size (in bytes) the local copy
# must have; maybe_download raises if the file on disk has a different size.
URL_PATH = 'http://mattmahoney.net/dc/text8.zip'
FILESIZE = 31344016
maybe_download(URL_PATH, FILESIZE)

Found and verified file from this path:  http://mattmahoney.net/dc/text8.zip
Downloaded file:  SampleText.zip


In [7]:
vocabulary = read_words()

In [8]:
len(vocabulary)

17005207

In [9]:
vocabulary[:25]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against',
 'early',
 'working',
 'class',
 'radicals',
 'including',
 'the',
 'diggers',
 'of',
 'the',
 'english',
 'revolution',
 'and',
 'the',
 'sans',
 'culottes']

In [10]:
def build_dataset(words, n_words):
    """Map a token sequence onto integer ids limited to the most frequent words.

    Id 0 is reserved for the 'UNKNOWN' placeholder; ids 1..n_words-1 go to the
    ``n_words - 1`` most frequent tokens in descending order of frequency.

    Args:
        words: sequence of tokens (iterated twice, so it should be a list).
        n_words: vocabulary size including the 'UNKNOWN' entry.

    Returns:
        A 4-tuple ``(word_counts, word_indexes, dictionary, reversed_dictionary)``:
        word_counts is ``[['UNKNOWN', n_unknown], (word, count), ...]``,
        word_indexes is the id of every token in ``words`` (0 when the token is
        not in the vocabulary), dictionary maps word -> id and
        reversed_dictionary maps id -> word.
    """
    frequencies = collections.Counter(words)
    # The first entry is a mutable list so its count can be filled in below.
    word_counts = [['UNKNOWN', -1]] + frequencies.most_common(n_words - 1)

    dictionary = {}
    for token, _ in word_counts:
        dictionary[token] = len(dictionary)

    word_indexes = []
    unknown_count = 0
    for token in words:
        index = dictionary.get(token)
        if index is None:
            # Out-of-vocabulary token: use the 'UNKNOWN' id.
            index = 0
            unknown_count += 1
        word_indexes.append(index)

    word_counts[0][1] = unknown_count

    reversed_dictionary = {index: token for token, index in dictionary.items()}

    return word_counts, word_indexes, dictionary, reversed_dictionary
    

In [11]:
# Vocabulary size including the 'UNKNOWN' entry at id 0: the 4999 most frequent
# words get their own id, every other word is mapped to id 0 by build_dataset.
VOCABULARY_SIZE = 5000
word_counts, word_indexes, dictionary, reversed_dictionary = build_dataset(vocabulary, VOCABULARY_SIZE)

In [12]:
word_counts[:10]

[['UNKNOWN', 2735459],
 ('the', 1061396),
 ('of', 593677),
 ('and', 416629),
 ('one', 411764),
 ('in', 372201),
 ('a', 325873),
 ('to', 316376),
 ('zero', 264975),
 ('nine', 250430)]

In [13]:
word_indexes[:25]

[0,
 3081,
 12,
 6,
 195,
 2,
 3134,
 46,
 59,
 156,
 128,
 742,
 477,
 0,
 134,
 1,
 0,
 2,
 1,
 103,
 855,
 3,
 1,
 0,
 0]

In [14]:
# Spot-check ten random word -> id entries of the vocabulary.
# `random` comes from the notebook's import cell; it is not re-imported here.
for key in random.sample(list(dictionary), 10):
    print(key, ':', dictionary[key])

si : 3629
julian : 3779
college : 354
language : 141
somewhat : 1355
southeast : 3186
safe : 3226
off : 338
vacuum : 3988
map : 936


In [15]:
# Spot-check ten random id -> word entries of the reverse mapping.
# `random` comes from the notebook's import cell; it is not re-imported here.
for key in random.sample(list(reversed_dictionary), 10):
    print(key, ':', reversed_dictionary[key])

587 : female
480 : particular
4924 : microwave
225 : human
4529 : dialogue
639 : received
3050 : driver
3898 : grace
1117 : ability
2296 : regard


In [16]:
# Drop the raw token list; later cells work from word_indexes and the
# dictionaries only. Re-run the read_words() cell if `vocabulary` is needed again.
del vocabulary

In [17]:
# Global index into words maintained across batches
global_index = 0 

In [18]:
def generate_batch(word_indexes, batch_size, num_skips, skip_window):
    """Produce one batch of (center word, context word) id pairs for skip-gram.

    Reads from ``word_indexes`` starting at the module-level cursor
    ``global_index`` (wrapping around at the end of the sequence) and updates
    that cursor, so successive calls walk through the corpus.

    Args:
        word_indexes: sequence of word ids.
        batch_size: number of pairs to produce; must be a multiple of
            ``num_skips``.
        num_skips: number of context words sampled for each center word;
            must not exceed ``2 * skip_window``.
        skip_window: number of positions considered on each side of the
            center word.

    Returns:
        ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding center-word ids (each repeated ``num_skips``
        times); ``labels`` is an int32 array of shape ``(batch_size, 1)``
        holding the matching context-word ids.

    Raises:
        AssertionError: if the constraints on ``batch_size`` / ``num_skips`` /
            ``skip_window`` above are violated.
    """
    global global_index

    # For every input we find num_skips context words within a window
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    
    # batch holds one center-word id per pair:   shape (batch_size,)
    # labels holds one context-word id per pair: shape (batch_size, 1)
    # np.ndarray allocates without initialising; every slot is written below
    # because batch_size is a multiple of num_skips.
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    
    # The span of a window includes the skip_window elements on each side
    # of the input word plus the word itself
    span = 2 * skip_window + 1  # [ skip_window input_word skip_window ]

    # A deque is double-ended queue which supports memory efficient appends
    # and pops from each side
    buffer = collections.deque(maxlen=span)

    # Fill the window with the next `span` ids starting at the cursor
    for _ in range(span):
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
    
    for i in range(batch_size // num_skips):
        target = skip_window  # input word at the center of the buffer
        targets_to_avoid = [skip_window]

        for j in range(num_skips):
            # Rejection-sample a window position that is neither the center
            # nor a position already used for this center word.
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            
            batch[i * num_skips + j] = buffer[skip_window]  # this is the input word
            labels[i * num_skips + j, 0] = buffer[target]  # these are the context words
        
        # The first word from the buffer is removed automatically when a new word
        # is added in at the end
        buffer.append(word_indexes[global_index])
        global_index = (global_index + 1) % len(word_indexes)
    
    # Backtrack a little bit to avoid skipping words in the end of a batch, these
    # words will be captured in the next batch. Net effect of one call: the
    # cursor moves forward by batch_size // num_skips positions.
    global_index = (global_index + len(word_indexes) - span) % len(word_indexes)

    return batch, labels

In [19]:
# Small demonstration batch: 10 pairs, 2 context words per center word, window
# of 5 on each side. This call advances global_index as a side effect.
batch, labels = generate_batch(word_indexes, 10, 2, 5)

In [20]:
batch

array([   2,    2, 3134, 3134,   46,   46,   59,   59,  156,  156])

In [21]:
labels

array([[3081],
       [3134],
       [  59],
       [ 156],
       [ 195],
       [   2],
       [ 128],
       [  46],
       [   0],
       [  46]])

In [22]:
# Decode the demonstration pairs back to words (center : context).
# NOTE(review): range(9) shows only the first 9 of the 10 generated pairs —
# confirm whether the last pair is left out on purpose.
for i in range(9):
    print(reversed_dictionary[batch[i]], ': ', reversed_dictionary[labels[i][0]])

of :  originated
of :  abuse
abuse :  used
abuse :  against
first :  term
first :  of
used :  early
used :  first
against :  UNKNOWN


In [23]:
# Rewind the batch cursor so training starts from the beginning of the corpus
# (the demonstration batch above moved it).
global_index = 0
# Number of words whose nearest neighbours are printed during training.
valid_size = 16
# Validation ids are drawn from [0, valid_window). Ids are assigned in
# descending frequency order, so these are among the most frequent entries
# (id 0, 'UNKNOWN', can be picked as well).
valid_window = 100

# Distinct ids, sampled without replacement.
# NOTE(review): no random seed is set, so the selection differs between runs.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

In [24]:
# Training hyperparameters.
batch_size = 128      # (center, context) pairs per training step
embedding_size = 50   # dimension of each word vector
skip_window = 2       # positions considered on each side of the center word
num_skips = 2         # context words sampled per center word

In [25]:
# Start from an empty default graph so re-running the cells below does not
# keep adding duplicate ops to an old graph.
tf.reset_default_graph()

# Inputs fed at every step; shapes match what generate_batch returns:
# center-word ids (batch_size,) and context-word ids (batch_size, 1).
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

In [26]:
# The validation word ids as a graph constant, used to look up their embeddings.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

In [27]:
# Embedding matrix: one trainable row of length embedding_size per vocabulary
# id, initialised from a uniform distribution between -1 and 1.
embeddings = tf.Variable(
    tf.random_uniform([VOCABULARY_SIZE, embedding_size], -1.0, 1.0))
# Rows of the matrix selected by the center-word ids of the current batch.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [28]:
embeddings

<tf.Variable 'Variable:0' shape=(5000, 50) dtype=float32_ref>

In [29]:
embed

<tf.Tensor 'embedding_lookup:0' shape=(128, 50) dtype=float32>

In [30]:
# Output layer: a full softmax over the vocabulary. `weights` has one row per
# vocabulary id, so it is transposed for the matmul; the result holds one logit
# per (batch item, vocabulary id).
weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))
hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

In [31]:
hidden_out

<tf.Tensor 'add:0' shape=(128, 5000) dtype=float32>

In [32]:
# One-hot encode the context-word ids as targets for the softmax.
# NOTE(review): train_labels is (batch_size, 1), so the one-hot tensor has an
# extra axis compared with the 2-D logits in hidden_out; presumably the loss op
# reconciles the two — confirm, or squeeze the labels before encoding.
train_one_hot = tf.one_hot(train_labels, VOCABULARY_SIZE)

# Mean softmax cross-entropy over the batch.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, labels=train_one_hot))

In [33]:
# Training op: plain gradient descent on `loss` with learning rate 0.1.
optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

In [34]:
# Euclidean length of every embedding row, kept as a column so the division
# below broadcasts across each row.
# NOTE(review): `keep_dims` is the older spelling of `keepdims` in TensorFlow —
# confirm against the installed version.
l2_norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))

# Unit-length rows, so a dot product between two rows is their cosine similarity.
normalized_embeddings = embeddings / l2_norm

In [35]:
# Normalised vectors of the validation words only.
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)

In [36]:
valid_embeddings

<tf.Tensor 'embedding_lookup_1:0' shape=(16, 50) dtype=float32>

In [37]:
normalized_embeddings

<tf.Tensor 'truediv:0' shape=(5000, 50) dtype=float32>

In [38]:
# Cosine similarity of every validation word against every vocabulary word:
# row i holds the similarities for valid_examples[i].
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [43]:
# Op that initialises all variables created above; it is run at the start of
# the training session.
init = tf.global_variables_initializer()

In [44]:
# Number of training steps; one more than 20000 so that step 20000 itself is
# reached and triggers the periodic reports in the training loop.
num_steps = 20001

In [None]:
# Train the model, reporting the mean loss every 2000 steps and the nearest
# neighbours of the validation words every 10000 steps.
with tf.Session() as session:
    init.run()
    
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            word_indexes, batch_size, num_skips, skip_window)
        
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        # One optimisation step; also fetch the loss of this batch.
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        
        average_loss += loss_val
        
        # Every 2000 steps, report the mean loss accumulated since the previous
        # report. At step 0 only one batch has been seen, so its loss is
        # printed undivided.
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
                
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0
        
        # note that this is expensive (compute heavy)
        if step % 10000 == 0:
            sim = similarity.eval()
            
            for i in xrange(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8 # number of nearest neighbors
                
                # argsort sorts in ascending order, so negate the similarities to
                # get the most similar ids first. Position 0 is skipped: it is
                # expected to be the validation word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                
                log_str = 'Nearest to %s' % valid_word
                
                for k in range(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
                
            print('\n')
                    
        
        

Average loss at step  0 :  8.61723136902
Nearest to more using, murray, balls, marvel, blood, learned, judge, fran,
Nearest to were technological, ages, era, design, kennedy, suicide, syrian, rocket,
Nearest to than titled, outer, job, forward, hunt, operational, m, save,
Nearest to many currently, chapter, ms, identify, particles, describes, thought, below,
Nearest to often constantine, monster, amiga, incident, discovery, specialized, empire, senator,
Nearest to two received, importance, invaded, come, foot, spend, org, differential,
Nearest to seven bills, confidence, real, subjects, santa, before, coverage, mainly,
Nearest to and into, chance, monetary, optical, gases, rise, buddhism, grey,
Nearest to or user, abstract, usually, binary, seems, angel, membrane, communities,
Nearest to will defeat, struggle, diplomatic, colonies, chance, gnu, agave, defensive,
Nearest to its medal, entertainment, ownership, park, soil, depends, objective, demographics,
Nearest to people mammals, elev