In [1]:
""" Basic word2vec example.
From Tensorflow's Official Github
Read this for the details: https://www.tensorflow.org/tutorials/word2vec

As a practice and commented and modified by Kin
"""


import collections
import math
import os
import random
import zipfile

import numpy as np
import matplotlib.pyplot as plt
from six.moves import urllib
import tensorflow as tf

In [2]:
# Step 1: Get the data

url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    '''Download a file if not present'''
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified (correct size)', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
        'Failed to verify {}. Can you get to it by yourself?'.format(filename))
    return filename

filename = maybe_download('text8.zip', 31344016)

# Read the data into a list of strings
def read_data(filename):
    '''Extract the first file enclosed in a zip file as a list of words'''
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

vocabulary = read_data(filename)
print('Data size (num of items in the list)', len(vocabulary))

Found and verified (correct size) text8.zip
Data size (num of items in the list) 17005207


In [3]:
# Checking
# vocabulary[0:100]

In [4]:
# Step 2: Build the dictionary and replace rare words with UNK token.

vocabulary_size = 50000


def build_dataset(words, n_words):
    '''Process raw inputs into a dataset'''
    count = [['UNK', -1]]
    # get the most common 50000 words as the basic of the dictionary
    # Counter() for count the 
    count.extend(collections.Counter(words).most_common(n_words - 1))
    # create the dict for the whole corpus, with numbers
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)  # [Good, but maybe slower] for putting numbers into dict
    # create a data list by using the corpus's number-encoding
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count +=1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

data, count, dictionary, reversed_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)

del vocabulary  # [Good Practice] to reduce memory

In [5]:
print('Top 5 Most commom words (+UNK)', count[:5])
print('\nSample Data', data[:10], [reversed_dictionary[i] for i in data[:10]])

data_index = 0

Top 5 Most commom words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764)]

Sample Data [5235, 3083, 12, 6, 195, 2, 3137, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']


In [6]:
2 % len(data)

2

In [7]:
# Step 3: Function to generate a training batch for the skip-gram model

def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    # : Assert the correct arguments 
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window | target | skip_window],
                                # here the window size is 1, so span is 3
    
    # [Buffer (A Span)] create a Deque list with fixed size
    buffer = collections.deque(maxlen=span)  # now span is 3
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
#     print(buffer)    
    #  >____<"
    for i in range(batch_size // num_skips):  # : use // to get the integer for range
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
#         print('==i:', i,'- target:', target, 'target_avoid:', targets_to_avoid)
        for j in range(num_skips):
#             print('j:', j)
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)  # to randomly select the skip_window (0 or 2)
#                 print('target in while loop', target)
            targets_to_avoid.append(target)
#             print('target_avoid:', targets_to_avoid)
            batch[i * num_skips + j ] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
#             print('buffer[skip_window](batch):', i*2+j,' ', buffer[skip_window],'  buffer[skip_window]:',buffer[target])
        buffer.append(data[data_index])
#         print('\n', buffer)
        data_index = (data_index + 1) % len(data)
    # backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)

In [8]:
for i in range(8):
    print(batch[i], reversed_dictionary[batch[i]], '->', labels[i, 0],
          reversed_dictionary[labels[i, 0]])
    

3083 originated -> 12 as
3083 originated -> 5235 anarchism
12 as -> 3083 originated
12 as -> 6 a
6 a -> 12 as
6 a -> 195 term
195 term -> 6 a
195 term -> 2 of


In [9]:
# Step 4: Build the tensorflow Graph to train the model
# using "Noise-contrastive estimation" Loss
# Details of Candidate sampling: https://www.tensorflow.org/api_guides/python/nn#Candidate_Sampling

batch_size = 128
embedding_size = 128
skip_window = 1
num_skips = 2

# Construct the validation set, with some most frequent words
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64  # num of negative samples

graph = tf.Graph()

with graph.as_default():
    
    # [4.1 - Set Placeholders/Constant] input data
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # [4.2 - Set Device] Ops and variables pin to the CPU (with GPU implememtation)
    with tf.device('/cpu:0'):
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1, 1))
        # nn.embedding_lookup: help get the embedding for train_inputs(which are some indexes)
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
    # [4.3 - Set Variable] Construct the variables for the NCE loss/ embed-to-output matrix
    nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    
    # [4.4 - Loss Function] Compute the avg NCE loss for the batch
    # tf.nce_loss automatically draws a new sample of the negative label each
    # time we evaluate the loss
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,  # Weights
                      biases=nce_biases,  # Biases
                      labels=train_labels,  # Y
                      inputs=embed,  # X
                      num_sampled=num_sampled,  
                      num_classes=vocabulary_size))
    
    # [4.5 - Optimization] Construct the SGD optimizer using a learning rate (1.0)
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    
    # >___<'' [4.6 - Cosine similarity] Cosine Similarity between minibatch examples and all embeddings
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
    valid_embeddings, normalized_embeddings, transpose_b=True)
    
    # make a variable initializer
    init = tf.global_variables_initializer()

# Step 5: Trainning Begins!
num_steps = 100001

with tf.Session(graph=graph) as sess:
    # initialize all variables before use it
    init.run()
    print('Initialized')
    
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        
        # Perform one update step by evaluating the optimizer op
        # (including it in the list of returned values for sess.run())
        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        
        # This step is expensive ( ~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            print('\n==== Step', step)
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reversed_dictionary[valid_examples[i]]
                top_k = 8  # top k nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reversed_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()

    

Initialized

==== Step 0
Nearest to would: dragster, prophylactic, interviewed, mez, kossuth, ataxia, corner, hooked,
Nearest to into: uniquely, splendor, adultery, tack, woven, insulated, here, bayonets,
Nearest to for: objectivity, occurred, ascending, dayan, miscellaneous, nymph, worshipped, lundy,
Nearest to other: propelled, gpled, thereby, paglia, cadets, pj, impartial, unedited,
Nearest to he: honduran, sitter, respects, muds, hertz, blackberry, melt, plucking,
Nearest to world: choose, persists, ohlin, danner, convincingly, moisture, circular, kenyan,
Nearest to of: the, hyperlinks, nawab, worldview, reducible, mur, rutgers, veda,
Nearest to will: borealis, himalayas, culmination, infarction, reclaimed, gambler, foods, value,
Nearest to all: amaranth, ft, lynyrd, lighters, undead, strictly, employ, inscription,
Nearest to eight: blocking, adolphe, spotters, theocracy, conquests, identically, az, glucagon,
Nearest to b: karachi, cues, leakey, blackboard, staunchly, retractable, 


==== Step 80000
Nearest to would: might, must, will, may, should, could, can, cannot,
Nearest to into: under, from, through, without, within, with, and, against,
Nearest to for: in, using, without, during, within, through, under, or,
Nearest to other: various, many, different, including, especially, specific, some, whose,
Nearest to he: she, they, never, it, who, thus, and, originally,
Nearest to world: today, city, ii, near, book, against, largest, during,
Nearest to of: whose, within, including, through, like, in, between, near,
Nearest to will: would, could, must, may, should, cannot, might, can,
Nearest to all: some, these, both, many, each, instead, especially, various,
Nearest to eight: seven, six, nine, five, four, three, two, one,
Nearest to b: r, g, c, m, n, d, l, f,
Nearest to there: which, it, still, they, thus, usually, she, who,
Nearest to from: through, during, including, in, under, against, and, by,
Nearest to has: had, have, is, was, having, but, since, when,
Nearest t

In [13]:
# Step 6: Visualize the embeddings (2D)

def plot_with_labels(low_dim_embs, labels, filename='tsne_1000.png'):   
    assert low_dim_embs.shape[0] >=len(labels), 'More labels than embeddings'
    plt.figure(figsize=(36, 36))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                    xy=(x, y),
                    xytext=(5,2),
                    textcoords='offset points',
                    ha='right',
                    va='bottom')
    plt.savefig(filename)

try:
    from sklearn.manifold import TSNE
    
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    plot_top_n_words =1000
    # Fit argument into an embedded space and return that transformed output.
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_top_n_words, :])
    labels = [reversed_dictionary[i] for i in range(plot_top_n_words)]
    plot_with_labels(low_dim_embs, labels)
    
except ImportError:
    print('Please install all the basic dependencies as a deep learner.')