# Candidate-Sampling with Tensorflow

We investigate some candidate sampling methods such as:
* noise contrastive estimation
* negative sampling
* sampled softmax

In addition, we implement hierarchical softmax.

## 1. Classification

* minimized VGG model
* cifar-100 dataset

To download the CIFAR-100 dataset, run:
```sh
curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz
tar -xvf cifar-100-binary.tar.gz
```

### 1.1. Model

In [None]:
import numpy as np
import tensorflow as tf

from cifar_input import build_input

In [None]:
# Training input pipeline. The positional arguments are handed straight to
# cifar_input.build_input (presumably dataset name, data file, batch size and
# mode -- confirm against cifar_input).
images, labels = build_input('cifar100', './cifar-100-binary/train.bin', 100, 'train')
# Scalar dropout rate supplied through feed_dict on every sess.run call. The
# values fed are floats (0.1 while training, 0.0 while evaluating), so the
# placeholder is float32 rather than bool.
drop_rate = tf.placeholder(tf.float32, ())

In [None]:
def simpleCNN(X, rate, reuse=False):
    """Build a small VGG-style classifier graph and return its 100-way logits.

    Convolution and dense layers carry explicit names; pooling, flatten and
    dropout layers rely on auto-generated names. Other cells fetch tensors
    such as 'dropout_2/Identity' by name, so the order in which layers are
    created here must not change.

    Args:
        X: input tensor handed to the first 3x3 convolution.
        rate: value forwarded as ``rate`` to both ``tf.layers.dropout`` calls.
        reuse: forwarded to every named conv/dense layer so that a second call
            can share the variables created by the first one.

    Returns:
        Output of the final 100-unit dense layer ``d3`` (no activation).
    """
    # One entry per convolutional stage: (number of filters, layer names).
    conv_stages = (
        (64, ('c11', 'c12')),
        (128, ('c21', 'c22')),
        (256, ('c31', 'c32', 'c33')),
        (512, ('c41', 'c42', 'c43')),
    )

    net = X
    for filters, layer_names in conv_stages:
        for layer_name in layer_names:
            net = tf.layers.conv2d(net, filters, [3, 3], padding='same',
                                   activation=tf.nn.relu, name=layer_name,
                                   reuse=reuse)
        # Every stage ends with a 2x2 max-pool with stride 2.
        net = tf.layers.max_pooling2d(net, [2, 2], [2, 2])

    net = tf.contrib.layers.flatten(net)

    # Two 1048-unit ReLU layers, each followed by a dropout layer.
    for layer_name in ('d1', 'd2'):
        net = tf.layers.dense(net, 1048, activation=tf.nn.relu,
                              name=layer_name, reuse=reuse)
        # Only `rate` is passed; `training` is left at its default.
        # NOTE(review): with the default the layer presumably acts as a
        # pass-through (hence the '/Identity' op names used elsewhere) --
        # confirm against the TensorFlow version in use.
        net = tf.layers.dropout(net, rate)

    return tf.layers.dense(net, 100, name='d3', reuse=reuse)

In [None]:
# Training tower: this first call creates the variables of every named layer.
logit = simpleCNN(images, drop_rate)

# Test tower: a second input pipeline over test.bin, pushed through the same
# network with reuse=True so it shares the variables created by the call above.
# The two calls must stay in this order (create first, then reuse).
images_tst, labels_tst = build_input('cifar100', './cifar-100-binary/test.bin', 100, 'test')
logit_tst = simpleCNN(images_tst, drop_rate, reuse=True)

### 1.2. Training Type

* softmax
* sampled-softmax
* hierarchical-softmax

#### 1.2.1. Softmax

In [None]:
# Full-softmax baseline: cross-entropy between the network logits and
# `labels`, averaged into a single scalar training cost.
xent = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logit)
cost = tf.reduce_mean(xent)

#### 1.2.2. Sampled-softmax

In [None]:
# Sampled-softmax head: a separate [num_classes, 1048] weight matrix and bias
# are trained with a sampled loss and then used as an ordinary dense output
# layer to produce full logits for prediction.
num_true = 1       # one target class per example
num_sampled = 30   # negative classes drawn per batch
num_classes = 100

l_weights = tf.Variable(tf.truncated_normal([num_classes, 1048],
                                            stddev=1.0 / np.sqrt(1048)))
l_biases = tf.Variable(tf.zeros([num_classes]))

# The 1048-d activations are fetched from the graph by auto-generated op name.
# NOTE(review): 'dropout_2' / 'dropout_4' are presumably the last dropout of
# the training and test towers respectively -- this depends on how the
# installed TensorFlow numbers unnamed layers; confirm before relying on it.
graph = logit.graph
hidden = graph.get_operation_by_name('dropout_2/Identity').outputs[0]
hidden_tst = graph.get_operation_by_name('dropout_4/Identity').outputs[0]

# Class ids as a [batch, num_true] column; built once and shared by the
# sampler and the loss (`labels` is reduced with argmax over axis 1).
true_classes = tf.reshape(tf.argmax(labels, 1), [-1, 1])

# The sampler op draws a new set of negative classes every time the loss is
# evaluated.
sampled_values = tf.nn.uniform_candidate_sampler(
    true_classes=true_classes,
    num_true=num_true,
    num_sampled=num_sampled,
    unique=True,
    range_max=num_classes)

cost = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=l_weights,
                               biases=l_biases,
                               labels=true_classes,
                               inputs=hidden,
                               num_sampled=num_sampled,
                               num_classes=num_classes,
                               num_true=num_true,
                               sampled_values=sampled_values))

# Full logits over all classes for prediction, using the same weights/biases.
logit = tf.matmul(hidden, tf.transpose(l_weights)) + l_biases
logit_tst = tf.matmul(hidden_tst, tf.transpose(l_weights)) + l_biases

#### 1.2.3. Hierarchical-softmax

For convenience, we use a two-layer hierarchical softmax.

A two-layer approach is in fact the best fit for CIFAR-100, because the training and test labels are uniformly distributed across the classes.

---
**Note**

* This hierarchical-softmax implementation is neither memory-efficient nor GPU-friendly.
* That is the reason why a TensorFlow version of hierarchical softmax is not implemented publicly.

In [None]:
# Split every class id into a two-level code: the first-level id is the
# quotient by 10 and the second-level id is the remainder, both cast to int32.
ids = tf.arg_max(labels, 1)
first_level, second_level = ids // 10, ids % 10
ids_1st = tf.cast(first_level, dtype=tf.int32)
ids_2nd = tf.cast(second_level, dtype=tf.int32)

In [None]:
# Hidden activations of the training tower, fetched by auto-generated op name.
# NOTE(review): which layer 'dropout_2' refers to depends on how TensorFlow
# numbers unnamed layers -- confirm it is the intended one.
graph = logit.graph
inputs = graph.get_operation_by_name('dropout_2/Identity').outputs[0]

# First level of the hierarchy: a 10-way softmax over the dense layer 'hs_1st'.
hs_1st_scores = tf.layers.dense(inputs, 10, name='hs_1st')
hs_1st = tf.nn.softmax(hs_1st_scores)

In [None]:
def _pick_true_prob(probs, true_ids):
    """Return probs[r, true_ids[r]] for every row r as an [n_rows, 1] tensor.

    Args:
        probs: 2-D tensor of per-row probabilities.
        true_ids: int32 vector with one column index per row of `probs`.
    """
    dims = tf.shape(probs)
    # Index into the row-major flattened matrix: row * n_cols + column.
    flat_index = tf.range(dims[0]) * dims[1] + true_ids
    return tf.gather(tf.reshape(probs, [-1, 1]), flat_index)


# First level: probability given to the true first-level id of each example.
# The row count is read from the tensor instead of assuming a batch of 100.
p1 = _pick_true_prob(hs_1st, ids_1st)

# Second level: each first-level group i has its own 10-way softmax
# ('hs_2nd_<i>'), evaluated only on the examples that belong to that group.
group_rows, group_ps = [], []
for i in range(10):
    # Batch positions whose first-level id is i (int32, as dynamic_stitch
    # requires int32 indices).
    matched_nums = tf.cast(tf.reshape(tf.where(tf.equal(ids_1st, i)), [-1]), tf.int32)
    matched_ids_2nd = tf.gather(ids_2nd, matched_nums)
    matched_inputs = tf.gather(inputs, matched_nums)

    matched_outputs = tf.nn.softmax(tf.layers.dense(matched_inputs, 10, name='hs_2nd_%i' % i))
    group_rows.append(matched_nums)
    group_ps.append(_pick_true_prob(matched_outputs, matched_ids_2nd))

# Put the per-group results back at their original batch positions so that
# row r of p2 belongs to the same example as row r of p1. (A plain concat would
# leave p2 sorted by group; a mean of log(p1 * p2) is insensitive to that, but
# the per-example products would pair different examples.)
p2 = tf.dynamic_stitch(group_rows, group_ps)

In [None]:
# Probability of the true class = first-level probability times second-level
# probability; the cost is the negated mean of its log.
p_t = tf.multiply(p1, p2)
mean_log_p = tf.reduce_mean(tf.log(p_t))
cost = tf.negative(mean_log_p)

In [None]:
# Full class distribution for the training tower, reusing the hierarchy
# weights: every second-level softmax is evaluated on the whole batch.
logit_1 = tf.nn.softmax(tf.layers.dense(inputs, 10, name='hs_1st', reuse=True))

logit_2 = []
for i in range(10):
    branch_scores = tf.layers.dense(inputs, 10, name='hs_2nd_%i' % i, reuse=True)
    logit_2.append(tf.nn.softmax(branch_scores))
# Branch outputs laid side by side along axis 1.
logit_2_concat = tf.concat(logit_2, 1)

# Target shape for the product: shape(logit_1) followed by the width of one
# second-level branch. tf.shape is applied to the Python list of branches
# (converted to a single tensor) and its last entry is taken.
branch_width = tf.shape(logit_2)[-1]
logit_mul_shape = tf.concat([tf.shape(logit_1), [branch_width]], 0)

# Broadcast each first-level probability over its own branch, then flatten
# back to the shape of the concatenated branches.
joint = tf.expand_dims(logit_1, 2) * tf.reshape(logit_2_concat, logit_mul_shape)
logit = tf.reshape(joint, tf.shape(logit_2_concat))

In [None]:
# Same construction as for the training tower, applied to the test tower's
# hidden activations (fetched by auto-generated op name).
# NOTE(review): 'dropout_4' depends on TensorFlow's numbering of unnamed
# layers -- confirm it is the intended test-tower tensor.
test_hidden_op = logit.graph.get_operation_by_name('dropout_4/Identity')
inputs_tst = test_hidden_op.outputs[0]

logit_tst_1 = tf.nn.softmax(tf.layers.dense(inputs_tst, 10, name='hs_1st', reuse=True))

logit_tst_2 = []
for i in range(10):
    branch_scores_tst = tf.layers.dense(inputs_tst, 10, name='hs_2nd_%i' % i, reuse=True)
    logit_tst_2.append(tf.nn.softmax(branch_scores_tst))
logit_tst_2_concat = tf.concat(logit_tst_2, 1)

# shape(logit_tst_1) followed by the width of one second-level branch.
branch_width_tst = tf.shape(logit_tst_2)[-1]
logit_tst_mul_shape = tf.concat([tf.shape(logit_tst_1), [branch_width_tst]], 0)

# First-level probability times the matching branch, flattened back to the
# shape of the concatenated branches.
joint_tst = (tf.expand_dims(logit_tst_1, 2) *
             tf.reshape(logit_tst_2_concat, logit_tst_mul_shape))
logit_tst = tf.reshape(joint_tst, tf.shape(logit_tst_2_concat))

### 1.3. Training & Test & Logging

In [None]:
# Scalar learning rate, fed on every step so the schedule can live in Python.
a = tf.placeholder(dtype=tf.float32, shape=())
optimizer = tf.train.GradientDescentOptimizer(learning_rate=a)
train = optimizer.minimize(loss=cost)

# Initializer op is created after the optimizer so it covers everything that
# exists in the graph at this point.
init = tf.global_variables_initializer()

# Start the session and the queue-runner threads, then initialize variables.
sess = tf.Session()
tf.train.start_queue_runners(sess=sess)
sess.run(init)

# logging: scalar summaries for the learning rate and the cost, merged with
# any other summaries already in the graph, written to the current directory.
tf.summary.scalar(name='learning_rate', tensor=a)
tf.summary.scalar(name='cost', tensor=cost)
summaries = tf.summary.merge_all()

summary_writer = tf.summary.FileWriter(logdir='./')

In [None]:
training_epochs = 50000
display_step = 500


def _learning_rate_at(step):
    """Return the learning rate for `step`: 0.1, cut tenfold at 20k, 30k and 40k."""
    for boundary, rate in ((20000, 0.1), (30000, 0.01), (40000, 0.001)):
        if step < boundary:
            return rate
    return 0.0001


# NOTE: an "epoch" here is a single sess.run of `train` (presumably one
# mini-batch), not a full pass over the data set.
for epoch in range(training_epochs+1):
    lrn_rate = _learning_rate_at(epoch)

    _, s, c = sess.run([train, summaries, cost], feed_dict={a: lrn_rate, drop_rate: 0.1})
    summary_writer.add_summary(s, epoch)

    # '\r' keeps the progress line in place instead of scrolling.
    print ("Epoch:", '%04d' %(epoch), "cost:", "{:0.9f}".format(c), end='\r')

    if epoch % display_step == 0:
        print()
        # Evaluate on 100 test batches with the dropout rate fed as 0.0.
        rets, anss = [], []
        for _ in range(100):
            ret, ans = sess.run([logit_tst, labels_tst], feed_dict={drop_rate: 0.0})
            rets.extend(ret)
            anss.extend(ans)
        # Fraction of examples whose arg-max prediction matches the label.
        precision = np.mean(np.argmax(rets, 1) == np.argmax(anss, 1))
        print(precision)

        precision_summ = tf.Summary()
        precision_summ.value.add(tag='precision', simple_value=precision)
        summary_writer.add_summary(precision_summ, epoch)
        # Push buffered events to disk so the curves are visible right away.
        summary_writer.flush()

# Make sure the events of the final steps are written out as well.
summary_writer.flush()

### 1.4. Results

![title](logs/candidate-sampling/precision.png)

## 2. Word2Vec

* minimized VGG model
* cifar-100 dataset

To download cifar-100 dataset, use this command:
```sh
curl -o cifar-100-binary.tar.gz https://www.cs.toronto.edu/~kriz/cifar-100-binary.tar.gz
tar -xvf cifar-100-binary.tar.gz
```