# Doc2Vec demo

### Contents:
- Format of data
- Training graph
- Inference graph (Need to make the inference results deterministic)

### Reference links:
- https://amsterdam.luminis.eu/2017/02/21/coding-doc2vec/
- https://amsterdam.luminis.eu/2017/01/30/implementing-doc2vec/
- https://github.com/luminis-ams/blog-doc2vec/blob/master/pvdbow.ipynb

In [3]:
import collections
from collections import Counter

from itertools import chain
import itertools
import re

import numpy as np
import pandas as pd
import tensorflow as tf

import nltk
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
%matplotlib inline

import util
import importlib
importlib.reload(util)

<module 'util' from '/usr/local/google/home/haojing/workspaces/git/tutorials/util.py'>

## Download and Format Data

In [4]:
nltk.download('reuters')
nltk.download('punkt')

[nltk_data] Downloading package reuters to
[nltk_data]     /usr/local/google/home/haojing/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /usr/local/google/home/haojing/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
# tokenization

def accept(word):
    # Accept if not only Unicode non-word characters are present
    return re.sub(r'\W', '', word) != ''

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

def normalize(word):
    return '<NUM>' if is_number(word) else word.lower()

In [6]:
MIN_WORD_COUNT = 10
TOKEN_UNKNOWN = '__UNK__'
TOKEN_NULL = '__NULL__' # to fill in for docs shorter than text window size
TEXT_WINDOW_SIZE = 8
BATCH_SIZE = 10 * TEXT_WINDOW_SIZE

In [7]:
# document dictionary
document_dict = {
        fileid: [normalize(word) for word in word_tokenize(reuters.raw(fileid)) if accept(word)]
        for fileid in reuters.fileids()}

# count tokens
word_count = collections.Counter([word for words in document_dict.values() for word in words])
# remove words with count less than 10
unknown_count = sum([wc[1] for wc in word_count.most_common() if wc[1] < MIN_WORD_COUNT])
word_count = [wc for wc in word_count.most_common() if wc[1] >= MIN_WORD_COUNT]

# add token unknown
word_count.append((TOKEN_UNKNOWN, unknown_count))
doc_lens = list(map(lambda tokens: len(tokens), document_dict.values()))

null_count = sum([dl for dl in doc_lens if dl < TEXT_WINDOW_SIZE])
word_count.append((TOKEN_NULL, null_count))

In [8]:
# create dictionary
dictionary = {word : index for index, (word, _) in enumerate(word_count)}
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

# convert token docs to token-id docs
document_token_ids = dict(
    map(lambda item:(item[0], [dictionary.get(t, dictionary['__UNK__']) for t in item[1]]),
        document_dict.items()))

In [9]:
# generate all doc-id to token-id pairs
data = []
for doc_id, (fileid, tk_ids) in enumerate(document_token_ids.items()):
    data_tmp = [(doc_id, tk_id) for tk_id in tk_ids]
    if len(data_tmp) < TEXT_WINDOW_SIZE:
        len_pad = TEXT_WINDOW_SIZE-len(data_tmp)
        data_tmp.extend([(doc_id, dictionary[TOKEN_NULL])] * len_pad)
    data.extend(data_tmp)

In [10]:
# get document starting indexes, and center indexes of text windows inside documents
doc_start_indexes = [ind for (ind, (fileid, tkid)) in enumerate(data) if ind==0 or fileid != data[ind-1][0]]
text_window_center_index_for_docs = [
    list(range(doc_start_indexes[i] + (TEXT_WINDOW_SIZE - 1) // 2, 
               doc_start_indexes[i + 1] - TEXT_WINDOW_SIZE // 2)) 
    for i in range(len(doc_start_indexes)-1)]
text_window_center_index = list(itertools.chain.from_iterable(text_window_center_index_for_docs))

In [11]:
print('Number of documents:', len(document_token_ids))
print('Number of tokens:', len(data))
print('Number of unique tokens:', len(word_count))
print('Most common words:', word_count[:5])
print('Least common words:', word_count[-5:])
print('Sample data:', data[:5])
vocab_size =  len(word_count)
print('Effective vocab size:', vocab_size)

Number of documents: 10788
Number of tokens: 1395343
Number of unique tokens: 7164
Most common words: [('the', 69245), ('<NUM>', 65145), ('of', 36749), ('to', 36275), ('in', 29217)]
Least common words: [('burger', 10), ('tandy', 10), ('dominate', 10), ('__UNK__', 78486), ('__NULL__', 170)]
Sample data: [(0, 2880), (0, 15), (0, 16), (0, 7162), (0, 7162)]
Effective vocab size: 7164


## Generate Batch Data

In [12]:
"""
Returns an eternal generator, periodically shuffling the order
l_ is a list of integers; an internal copy of it is maintained.
"""
def repeater_shuffler(l_, shuffle_every_x_epoch):
    l = np.array(l_, dtype=np.int32)
    epoch = 0
    while epoch >= 0:
        if epoch % shuffle_every_x_epoch == 0:
            np.random.shuffle(l)
        for i in l:
            yield i
        epoch += 1

def generate_batch_single_twcp(twcp, i, batch, labels, text_window_size):
    tw_start = twcp - (text_window_size - 1) // 2
    tw_end = twcp + text_window_size // 2 + 1
    docids, wordids = zip(*data[tw_start:tw_end])
    batch_slice = slice(i * text_window_size,
                        (i+1) * text_window_size)
    batch[batch_slice] = docids
    labels[batch_slice, 0] = wordids
    
def generate_batch(twcp_gen, batch_size, text_window_size):
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    for i in range(batch_size // text_window_size):
        generate_batch_single_twcp(next(twcp_gen), i, batch, labels, text_window_size)
    return batch, labels

def train(session, loss, optimizer, batch_generator, num_steps):
    avg_training_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_generator, BATCH_SIZE, TEXT_WINDOW_SIZE)
        _, l = session.run(
                [optimizer, loss],
                feed_dict={dataset: batch_data, labels: batch_labels})
        avg_training_loss += l
        if step > 0 and step % REPORT_EVERY_X_STEPS == 0:
            avg_training_loss = \
                    avg_training_loss / REPORT_EVERY_X_STEPS
            # The average loss is an estimate of the loss over the
            # last REPORT_EVERY_X_STEPS batches
            print('Average loss at step {:d}: {:.1f}'.format(
                    step, avg_training_loss))

In [13]:
shuffle_every_x_epoch = 2
batch_generator = repeater_shuffler(text_window_center_index, shuffle_every_x_epoch)

## Create Training Graph

In [16]:
# model parameters
EMBEDDING_SIZE = 300
LEARNING_RATE = 0.1
NUM_SAMPLED = 64
REPORT_EVERY_X_STEPS = 10000

# input data
dataset = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, 1])

# weights
embeddings = tf.Variable(
    tf.random_uniform([len(doc_lens), EMBEDDING_SIZE], -1.0, 1.0), 
    name='embeddings')
softmax_weights = tf.Variable(
    tf.truncated_normal([vocab_size, EMBEDDING_SIZE],stddev=1.0 / np.sqrt(EMBEDDING_SIZE)),
    name='weights')
softmax_biases = tf.Variable(tf.zeros([vocab_size]), name='biases')

# model
embed = tf.nn.embedding_lookup(embeddings, dataset)
loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(
        softmax_weights, softmax_biases, labels, embed, NUM_SAMPLED, vocab_size))

# optimizer
optimizer = tf.train.AdagradOptimizer(LEARNING_RATE).minimize(loss)

util.show_graph(tf.get_default_graph())

## Train Model

In [17]:
session = tf.Session()
session.run(tf.global_variables_initializer())

train(session, loss, optimizer, batch_generator, 1000000)

## Inference

In [18]:
# Create inference graph
trained_embedding = session.run(embeddings)
trained_softmax_weights = session.run(softmax_weights)
trained_softmax_biases = session.run(softmax_biases)

tf.reset_default_graph()

In [28]:
tf.reset_default_graph()

tf.set_random_seed(1)

inference_emb = tf.Variable(
    tf.random_uniform([1, EMBEDDING_SIZE], -1.0, 1.0), 
    name='inference_emb')

inference_labels = tf.placeholder(tf.int32, shape=[None, 1], name='inference_label')
inference_loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(
        trained_softmax_weights, trained_softmax_biases, inference_labels, inference_emb, NUM_SAMPLED, vocab_size),
    name='inference_loss')

inference_optimizer = tf.train.AdagradOptimizer(LEARNING_RATE).minimize(inference_loss)

In [29]:
session = tf.Session()
session.run(tf.global_variables_initializer())

for i in range(3):
    loss_tmp, optimizer_tmp = session.run([inference_loss, inference_optimizer], feed_dict={inference_labels: np.array([1,2,3,4,5]).reshape(5, 1)})
    print(loss_tmp)
    
    emb_tmp = session.run(inference_emb)
    print(emb_tmp[0, :3])

3.10389
[ 0.5604108   0.60230744  0.44916135]
3.35557
[ 0.55855113  0.60058695  0.44389474]
3.39116
[ 0.56201524  0.59949732  0.44399595]
