In [None]:
import tensorflow as tf
import numpy as np
import zipfile
import random
import math
import collections
import itertools
from enum import Enum


In [None]:
# Zip archive holding the corpus; read_data only reads its first member.
FILENAME = 'text8.zip'
# Maximum number of dictionary entries built by build_dataset, counting the
# 'UNK' entry stored at index 0. Also used as the row count of the embedding
# and softmax weight matrices.
VOCABULARY_SIZE = 50000

## Read Data

In [None]:
def read_data(filename):
    """Return the words of the first file stored in a zip archive.

    Args:
        filename: Path of the zip archive to open.

    Returns:
        list of str: The member's content decoded as UTF-8 and split on
        whitespace.

    Raises:
        IndexError: If the archive contains no members.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        # Plain bytes.decode is all that is needed here; going through
        # TensorFlow's compat helper tied a pure-Python loader to TensorFlow.
        text = archive.read(first_member).decode('utf-8')
    return text.split()
  
# Load the whole corpus into memory as a flat list of word strings; every
# later cell (dataset building, generators) starts from this list.
words = read_data(FILENAME)
print('Data size %d' % len(words))

## Build Dataset

### Subsampling 
Each distinct word $w_i$ is dropped from the vocabulary with probability

$$ P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}} $$

where $t = 10^{-5}$ and $f(w_i)$ is the relative frequency of $w_i$ in the dataset.

In addition, words of length 2 or less are always removed.

In [None]:
# The constant t in the drop probability 1 - sqrt(t / f(w)).
THRESHOLD = 1e-5

def subsampling(words):
    """Randomly prune distinct words from the vocabulary by relative frequency.

    Every distinct word of length <= 2 is removed unconditionally. Any other
    word is removed with probability ``1 - sqrt(THRESHOLD / f)``, where ``f``
    is its count divided by ``len(words)``.

    Args:
        words: Sequence of word strings (the corpus).

    Returns:
        tuple: ``(deleted, counts)`` where ``deleted`` lists the removed words
        in the order they were visited and ``counts`` is a
        ``collections.Counter`` of the surviving words.
    """
    counts = collections.Counter(words)
    total = len(words)
    deleted = []
    # Iterate over a snapshot: entries are removed from `counts` during the scan.
    for word in list(counts):
        if len(word) > 2:
            drop_probability = 1 - math.sqrt(THRESHOLD / (counts[word] / total))
            # A random number is drawn only for words that pass the length filter.
            if random.random() >= drop_probability:
                continue
        deleted.append(word)
        del counts[word]
    return deleted, counts

In [None]:
def build_dataset(words, with_subsampling = True):
    """Encode a corpus as integer indices over a bounded vocabulary.

    The vocabulary holds 'UNK' at index 0 followed by at most
    ``VOCABULARY_SIZE - 1`` words in descending count order. Words missing
    from the vocabulary (rare words, or words removed by ``subsampling``) are
    encoded as 0.

    Args:
        words: Sequence of word strings (the corpus).
        with_subsampling: When true, prune the vocabulary with ``subsampling``
            before picking the most common words.

    Returns:
        tuple: ``(deleted, counter, words_idx, dictionary, reverse_dictionary)``
        - ``deleted``: words removed by subsampling (empty list if disabled).
        - ``counter``: list of ``(word, count)`` pairs; entry 0 is
          ``('UNK', number of corpus positions encoded as unknown)``.
        - ``words_idx``: the corpus as a list of integer indices.
        - ``dictionary``: word -> index.
        - ``reverse_dictionary``: index -> word.
    """
    if with_subsampling:
        deleted, vocabulary_counts = subsampling(words)
    else:
        deleted, vocabulary_counts = [], collections.Counter(words)

    # The UNK count is a placeholder until the corpus has been encoded.
    counter = [('UNK', -1)] + vocabulary_counts.most_common(VOCABULARY_SIZE - 1)

    # Indices follow the order of `counter`, so index 0 is 'UNK'.
    dictionary = {}
    for token, _ in counter:
        dictionary[token] = len(dictionary)

    words_idx = [dictionary.get(token, 0) for token in words]
    unk_count = sum(token not in dictionary for token in words)
    counter[0] = ('UNK', unk_count)

    reverse_dictionary = {index: token for token, index in dictionary.items()}
    return deleted, counter, words_idx, dictionary, reverse_dictionary

In [None]:
# Encode the corpus with subsampling enabled; later cells use these names.
deleted, counts, words_idx, dictionary, reverse_dictionary = build_dataset(
    words, with_subsampling=True)

# Sanity check: show the head of each artefact.
print('Words:', words[:10])
print('Deleted:', deleted[:10])
print('Sample data:', words_idx[:10])
# Same ten positions decoded back to words (dropped words show up as 'UNK').
print('Sample data:', list(map(reverse_dictionary.__getitem__, words_idx[:10])))
print('Common words:', counts[:5])
print('Reverse Dictionary:', list(itertools.islice(reverse_dictionary, 10)))

## Generate Training data

In [None]:
def cbow_generator(data, batch_size, window_size):
    """Yield CBOW training batches (context words -> centre word).

    Entries equal to 0 (the 'UNK' index) are skipped, so each window covers
    the ``2 * window_size + 1`` most recent non-zero entries of ``data``.

    Args:
        data: Iterable of integer word indices.
        batch_size: Number of examples per yielded batch.
        window_size: Number of context words taken on each side of the
            centre word.

    Yields:
        tuple: ``(batch, labels)``. ``batch`` is an int32 array of shape
        ``(batch_size, 2 * window_size)`` holding each window's context words
        in random order; ``labels`` is an int32 array of shape
        ``(batch_size,)`` holding the centre words. Examples left over at the
        end of ``data`` that do not fill a whole batch are discarded.

    Raises:
        ValueError: If ``batch_size`` or ``window_size`` is smaller than 1
            (raised when iteration starts, because this is a generator).
    """
    if batch_size < 1 or window_size < 1:
        raise ValueError('batch_size and window_size must both be >= 1')

    context_size = window_size * 2
    span = context_size + 1  # [window_size words, centre word, window_size words]
    batch = []
    labels = []

    buffer = collections.deque(maxlen=span)
    for word in data:
        if word == 0:
            continue
        buffer.append(word)
        if len(buffer) < span:
            continue
        window = list(buffer)  # snapshot once instead of copying the deque twice
        labels.append(window[window_size])
        context = window[:window_size] + window[window_size + 1:]
        # Sampling the whole context is a shuffle of the context words.
        batch.extend(random.sample(context, context_size))
        if len(labels) == batch_size:
            # Explicit int32 matches skipgram_generator and the int32 placeholders,
            # instead of depending on the platform's default integer width.
            yield (np.array(batch, dtype=np.int32).reshape(-1, context_size),
                   np.array(labels, dtype=np.int32))
            batch = []
            labels = []

# Show the first 50 corpus positions, leaving out those encoded as UNK (index 0).
print ('data:', [reverse_dictionary[idx] for idx in words_idx[:50] if idx != 0])

# Preview one small CBOW batch per window size.
# The demo_* names are deliberate: the graph cell defines a `labels`
# placeholder, and binding `labels` to a NumPy array here would break the
# training feed_dict whenever this cell is re-run after the graph is built.
for w_s in (1,2):
    demo_generator = cbow_generator(words_idx, batch_size=8, window_size=w_s)
    demo_batch, demo_labels = next(demo_generator)
    print('\nwith window_size =',  w_s)
    print('-batch:\n', np.array([reverse_dictionary[bi] for bi in demo_batch.flat]).reshape(demo_batch.shape))
    print('-labels:', [reverse_dictionary[li] for li in demo_labels])


In [None]:
def skipgram_generator(data, batch_size, ngram_size, window_size):
    """Yield skip-gram training batches (centre word -> sampled context word).

    Entries equal to 0 (the 'UNK' index) are skipped. For every full window
    of ``2 * window_size + 1`` non-zero entries, the centre word is repeated
    ``ngram_size`` times and paired with ``ngram_size`` context words sampled
    without replacement from the rest of the window.

    Args:
        data: Iterable of integer word indices.
        batch_size: Number of (input, label) pairs per batch; must be a
            multiple of ``ngram_size``.
        ngram_size: Context words sampled per centre word; at most
            ``2 * window_size``.
        window_size: Number of context words on each side of the centre word.

    Yields:
        tuple: ``(batch, labels)``, two int32 arrays of shape ``(batch_size,)``
        with the centre words and their sampled context words. Pairs left over
        at the end of ``data`` that do not fill a whole batch are discarded.
    """
    assert batch_size % ngram_size == 0
    assert ngram_size <= 2 * window_size

    span = 2 * window_size + 1  # [window_size words, centre word, window_size words]
    centres, contexts = [], []
    window = collections.deque(maxlen=span)

    for index in data:
        if index == 0:
            continue
        window.append(index)
        # Nothing to emit until the window has filled up for the first time.
        if len(window) != span:
            continue
        snapshot = list(window)
        centres += [snapshot[window_size]] * ngram_size
        neighbours = snapshot[:window_size] + snapshot[window_size + 1:]
        contexts += random.sample(neighbours, ngram_size)
        if len(centres) == batch_size:
            yield np.array(centres, dtype=np.int32), np.array(contexts, dtype=np.int32)
            centres, contexts = [], []

# Show the first 50 corpus positions, leaving out those encoded as UNK (index 0).
print('data:',[reverse_dictionary[w] for w in words_idx[:50] if w != 0] )

# Preview one small skip-gram batch for a few (ngram_size, window_size) pairs.
# The demo_* names are deliberate: the graph cell defines a `labels`
# placeholder, and binding `labels` to a NumPy array here would break the
# training feed_dict whenever this cell is re-run after the graph is built.
for ngram_size, window_size in [(2,1),(1,2), (2, 4)]:
    demo_generator = skipgram_generator(words_idx, batch_size=8, ngram_size=ngram_size, window_size=window_size)
    demo_batch, demo_labels = next(demo_generator)
    print('\nwith ngram_size = %d and window_size = %d:' % (ngram_size, window_size))
    print('    batch:', [reverse_dictionary[bi] for bi in demo_batch])
    print('    labels:', [reverse_dictionary[li] for li in demo_labels])

## Network Design

In [None]:
class Model(Enum):
    """Training architectures this notebook can build: CBOW or skip-gram."""
    # Inputs are the 2 * WINDOW_SIZE context words of each window (cbow_generator).
    CBOW = 1
    # Inputs are single centre words (skipgram_generator).
    SKIPGRAM = 0

# Context words sampled per centre word; passed as ngram_size to skipgram_generator.
NGRAM_SIZE = 2
# Context words taken on each side of the centre word by both generators.
WINDOW_SIZE = 1
# Examples per training batch; also fixes the first dimension of the placeholders.
BATCH_SIZE = 64
# Number of columns of the embedding and softmax weight matrices.
EMBEDDINGS_SIZE =  100
# Passed as num_sampled to tf.nn.sampled_softmax_loss.
NEGATIVE_SAMPLE = 50
# Number of validation word indices drawn for the nearest-neighbour report.
NUM_VALIDATION_WORDS = 15
# k passed to tf.nn.top_k over the cosine similarities.
TOP_K_RELATED_WORDS = 8
# Architecture built by the graph cell and trained by the training cell.
MODEL = Model.CBOW


graph = tf.Graph()
# Build the graph under graph.as_default(). A Session is only needed to run
# ops, so opening (and immediately discarding) one just to select the default
# graph was unnecessary; the training cell opens its own session on `graph`.
with graph.as_default():
    # Input data: CBOW receives the 2 * WINDOW_SIZE context words of each
    # example, skip-gram a single centre word per example.
    if MODEL == Model.CBOW:
        inputs = tf.placeholder(shape=(BATCH_SIZE, WINDOW_SIZE * 2), dtype=tf.int32, name='inputs')
    else:
        inputs = tf.placeholder(shape=(BATCH_SIZE,), dtype=tf.int32, name='inputs')

    labels = tf.placeholder(shape=(BATCH_SIZE, 1), dtype=tf.int32, name='labels')

    # Embedding matrix: one row per vocabulary index, initialised uniformly in [-1, 1).
    embedding = tf.Variable(tf.random_uniform((VOCABULARY_SIZE, EMBEDDINGS_SIZE), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

    # Output-layer parameters used by the sampled softmax.
    softmax_weights = tf.Variable(tf.truncated_normal([VOCABULARY_SIZE, EMBEDDINGS_SIZE],
                                                      stddev=1.0 / math.sqrt(EMBEDDINGS_SIZE)))
    softmax_biases = tf.Variable(tf.zeros([VOCABULARY_SIZE]))

    # Loss: CBOW sums the context embeddings over axis 1 so that both
    # architectures feed one vector per example into the sampled softmax.
    if MODEL == Model.CBOW:
        hidden = tf.reduce_sum(embed, 1)
    else:
        hidden = embed
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=hidden,
                                   labels=labels, num_sampled=NEGATIVE_SAMPLE, num_classes=VOCABULARY_SIZE))

    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Evaluation: validation words are drawn from indices 1..99, i.e. among the
    # most frequent kept words (build_dataset assigns indices in most_common
    # order and reserves index 0 for 'UNK').
    valid_examples = np.random.choice(range(1,100), NUM_VALIDATION_WORDS, replace=False)

    # Row-normalise the embeddings so a dot product equals cosine similarity.
    norm_embeddings = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    # NOTE(review): eval_words is not used by any cell visible here; kept in
    # case another cell reads it.
    eval_words = np.random.randint(VOCABULARY_SIZE, size = 10)
    eval_dataset = tf.constant(valid_examples, dtype=tf.int32)
    eval_embedding = tf.nn.embedding_lookup(norm_embeddings, eval_dataset)
    # Cosine similarity between every validation word and every vocabulary word.
    cosine = tf.matmul(eval_embedding, tf.transpose(norm_embeddings))
    top_k = tf.nn.top_k(cosine, TOP_K_RELATED_WORDS)
    

In [None]:
with tf.Session(graph=graph) as session:
    # Entering the session also makes `graph` the default graph, so the
    # initializer op is created in (and run against) the right graph.
    tf.global_variables_initializer().run()
    print("Initialized using '{}'".format(MODEL))
    step = 0
    epochs = 10

    for epoch in range(epochs):
        # A generator is exhausted after one pass, so build a fresh one per epoch.
        if MODEL == Model.CBOW:
            generator = cbow_generator(words_idx, batch_size=BATCH_SIZE, window_size=WINDOW_SIZE)
        else:
            generator = skipgram_generator(words_idx, batch_size=BATCH_SIZE,
                                           ngram_size=NGRAM_SIZE, window_size=WINDOW_SIZE)

        for b, l in generator:
            step +=1
            # The generators yield 1-D labels; the placeholder expects (BATCH_SIZE, 1).
            _, ls = session.run([optimizer, loss], feed_dict={inputs:b, labels: l.reshape(-1,1)})
            if step % 10000 == 0:
                print('   Epoch:{}. Step:{}. Loss:{}'.format(epoch, step, ls))

        print('Final Step:',step)
        _, top_idx = session.run(top_k)
        # Row i of top_idx belongs to valid_examples[i]. Label each row from
        # valid_examples instead of assuming the query word is ranked first,
        # and drop the query itself from its own neighbour list.
        for query, top in zip(valid_examples, top_idx):
            neighbours = [w for w in top if w != query][:TOP_K_RELATED_WORDS - 1]
            print("   Nearest to '{}': {}".format(reverse_dictionary[query],
                                                      [reverse_dictionary[w] for w in neighbours]))
        # The step counter is per epoch.
        step = 0