# Distilling the Knowledge in a Neural Network

* Paper
    * Hinton, Geoffrey, Oriol Vinyals, and Jeff Dean. "Distilling the knowledge in a neural network." arXiv preprint arXiv:1503.02531 (2015).
* Problem
    * 앙상블 모델은 강력하지만 무거움. 학습할 때 빡센거야 그렇다 치더라도 사용할 때 (인퍼런스 할 때) 는 가벼워야 실제로 쓸 수 있다. 하지만 앙상블 모델은 네트워크 자체를 여러개를 불러와야 하고, 다 돌려봐야 하므로 N개의 네트워크를 사용해서 앙상블을 하면 그대로 N배 무거워진다.
* Method
    * 따라서 이 논문에서는 앙상블 모델을 학습한 후 그 'knowledge' 를 single model 로 'distilling' 한다. 즉 모델 하나로 앙상블 모델의 knowledge 를 학습하는 것.
    * 이를 위해 싱글 모델에서 앙상블 모델의 knowledge 에 해당하는 probability (softmax) 를 학습한다. 
    * 다만, softmax 결과는 너무 confident 한 경향이 있으므로 이를 해결하기 위해 soft target 을 사용한다. 
        * soft target: $q_i={exp(z_i/T) \over \sum_j exp(z_j/T)}$
    * 이렇게 하면 probability 가 너무 confident 해지는걸 막고 soft 하게 만들어줄 수 있다.
    * single model 은 이 soft target 학습. 학습할때도 마찬가지로 soft target 을 사용하고, inference 때에는 T=1 로 추정한다.
        * inference 할때 T=1 로 하는게 의미가 있나? 어차피 classification 결과는 T와 상관없이 동일한데.
        * 내가 이해를 잘못한건가?.?
    * 이러한 soft target 만 가지고 학습을 하는 것은 아니고, original true target (original label) 도 같이 사용함.
    * alpha 로 가중치를 두고 weighted sum 을 하는데, 논문에서는 0.5로 동등하게 사용하였음.
* Details
    * T=2~5

# Build model

In [1]:
# This codes are based on DeepLearningZeroToAll 11-5.

# Lab 11 MNIST and Deep learning CNN
# https://www.tensorflow.org/tutorials/layers
import tensorflow as tf
import numpy as np

from tensorflow.examples.tutorials.mnist import input_data

tf.set_random_seed(777)  # reproducibility

mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
# Check out https://www.tensorflow.org/get_started/mnist/beginners for
# more information about the mnist dataset

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [2]:
class Model:

    def __init__(self, sess, name, lr=0.001, drop_rate=0.7, init_fn=None):
        self.sess = sess
        self.name = name
        self.lr = lr
        self.drop_rate = drop_rate
        self.init_fn = init_fn
        self._build_net()

    def _build_net(self):
        with tf.variable_scope(self.name):
            # dropout (keep_prob) rate  0.7~0.5 on training, but should be 1
            # for testing
            self.training = tf.placeholder(tf.bool)

            # input place holders
            self.X = tf.placeholder(tf.float32, [None, 784])

            # img 28x28x1 (black/white), Input Layer
            X_img = tf.reshape(self.X, [-1, 28, 28, 1])
            self.Y = tf.placeholder(tf.float32, [None, 10])

            # Convolutional Layer #1
            conv1 = tf.layers.conv2d(inputs=X_img, filters=32, kernel_size=[3, 3],
                                     kernel_initializer=self.init_fn,
                                     padding="SAME", activation=tf.nn.relu)
            # Pooling Layer #1
            pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2],
                                            padding="SAME", strides=2)
            dropout1 = tf.layers.dropout(inputs=pool1,
                                         rate=self.drop_rate, training=self.training)

            # Convolutional Layer #2 and Pooling Layer #2
            conv2 = tf.layers.conv2d(inputs=dropout1, filters=64, kernel_size=[3, 3],
                                     kernel_initializer=self.init_fn,
                                     padding="SAME", activation=tf.nn.relu)
            pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2],
                                            padding="SAME", strides=2)
            dropout2 = tf.layers.dropout(inputs=pool2,
                                         rate=self.drop_rate, training=self.training)

            # Convolutional Layer #3 and Pooling Layer #3
            conv3 = tf.layers.conv2d(inputs=dropout2, filters=128, kernel_size=[3, 3],
                                     kernel_initializer=self.init_fn,
                                     padding="SAME", activation=tf.nn.relu)
            pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2],
                                            padding="SAME", strides=2)
            dropout3 = tf.layers.dropout(inputs=pool3,
                                         rate=self.drop_rate, training=self.training)

            # Dense Layer with Relu
            flat = tf.reshape(dropout3, [-1, 128 * 4 * 4])
            dense4 = tf.layers.dense(inputs=flat, kernel_initializer=self.init_fn,
                                     units=625, activation=tf.nn.relu)
            dropout4 = tf.layers.dropout(inputs=dense4,
                                         rate=self.drop_rate, training=self.training)

            # Logits (no activation) Layer: L5 Final FC 625 inputs -> 10 outputs
            self.logits = tf.layers.dense(inputs=dropout4, units=10)

        # define cost/loss & optimizer
        self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=self.logits, labels=self.Y))
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(self.cost)

        correct_prediction = tf.equal(
            tf.argmax(self.logits, 1), tf.argmax(self.Y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    def predict(self, x_test, training=False):
        return self.sess.run(self.logits,
                             feed_dict={self.X: x_test, self.training: training})

    def get_accuracy(self, x_test, y_test, training=False):
        return self.sess.run(self.accuracy,
                             feed_dict={self.X: x_test,
                                        self.Y: y_test, self.training: training})

    def train(self, x_data, y_data, training=True):
        return self.sess.run([self.cost, self.optimizer], feed_dict={
            self.X: x_data, self.Y: y_data, self.training: training})

In [3]:
# hyper parameters
learning_rate = 0.001
training_epochs = 50
batch_size = 100

## Ensemble

In [4]:
# initialize
tf.reset_default_graph()

sess = tf.Session()

models = []
# num_models = 5
models.append(Model(sess, "model1", lr=0.001, drop_rate=0.5, init_fn=None))
models.append(Model(sess, "model2", lr=0.001, drop_rate=0.5, init_fn=tf.contrib.layers.xavier_initializer()))
models.append(Model(sess, "model3", lr=0.001, drop_rate=0.3, init_fn=tf.contrib.layers.xavier_initializer()))
models.append(Model(sess, "model4", lr=0.0007, drop_rate=0.3, init_fn=tf.contrib.layers.xavier_initializer()))
models.append(Model(sess, "model5", lr=0.0007, drop_rate=0.3, init_fn=tf.contrib.layers.variance_scaling_initializer()))
# for m in range(num_models):
#     models.append(Model(sess, "model" + str(m)))

sess.run(tf.global_variables_initializer())

In [5]:
print('Learning Started!')

# train my model
for epoch in range(training_epochs):
    avg_cost_list = np.zeros(len(models))
    total_batch = int(mnist.train.num_examples / batch_size)
    for i in range(total_batch):
        batch_xs, batch_ys = mnist.train.next_batch(batch_size)

        # train each model
        for m_idx, m in enumerate(models):
            c, _ = m.train(batch_xs, batch_ys)
            avg_cost_list[m_idx] += c / total_batch

    print('Epoch:', '%04d' % (epoch + 1), 'cost =', avg_cost_list)
    
    if epoch%10 == 9:
        test_accs = []
        for m_idx, m in enumerate(models):
            test_accs.append(m.get_accuracy(mnist.test.images, mnist.test.labels))
        print('test acc: {}'.format(test_accs))

print('Learning Finished!')

Learning Started!
('Epoch:', '0001', 'cost =', array([ 0.42743961,  0.409541  ,  0.25891181,  0.25897675,  0.47106397]))
('Epoch:', '0002', 'cost =', array([ 0.14201703,  0.13348314,  0.08126428,  0.07938832,  0.11699478]))
('Epoch:', '0003', 'cost =', array([ 0.10888712,  0.10356844,  0.05945898,  0.06044351,  0.08238382]))
('Epoch:', '0004', 'cost =', array([ 0.09428023,  0.09103809,  0.05287405,  0.05060412,  0.07044538]))
('Epoch:', '0005', 'cost =', array([ 0.08647937,  0.08050445,  0.04370158,  0.04372451,  0.0595039 ]))
('Epoch:', '0006', 'cost =', array([ 0.0781533 ,  0.07459385,  0.04158848,  0.0391683 ,  0.05317622]))
('Epoch:', '0007', 'cost =', array([ 0.07741929,  0.06749518,  0.03583228,  0.03475736,  0.04706765]))
('Epoch:', '0008', 'cost =', array([ 0.06977531,  0.06747254,  0.0329352 ,  0.03268618,  0.04120996]))
('Epoch:', '0009', 'cost =', array([ 0.06703566,  0.06423782,  0.0320037 ,  0.03037934,  0.03977831]))
('Epoch:', '0010', 'cost =', array([ 0.06361486,  0.060

In [6]:
# Test model and check accuracy
test_size = len(mnist.test.labels)
predictions = np.zeros(test_size * 10).reshape(test_size, 10)
for m_idx, m in enumerate(models):
    print(m_idx, 'Accuracy:', m.get_accuracy(
        mnist.test.images, mnist.test.labels))
    p = m.predict(mnist.test.images)
    predictions += p

ensemble_correct_prediction = tf.equal(tf.argmax(predictions, 1), tf.argmax(mnist.test.labels, 1))
ensemble_accuracy = tf.reduce_mean(tf.cast(ensemble_correct_prediction, tf.float32))
print('Ensemble accuracy:', sess.run(ensemble_accuracy))

(0, 'Accuracy:', 0.99380016)
(1, 'Accuracy:', 0.9951002)
(2, 'Accuracy:', 0.99410015)
(3, 'Accuracy:', 0.99410015)
(4, 'Accuracy:', 0.99430025)
('Ensemble accuracy:', 0.9957)


# Build Distilling Model

In [7]:
# same network architecture with Model
class DistillModel:
    def __init__(self, sess, name, lr=0.001, drop_rate=0.7, init_fn=None, alpha=0.5):
        self.sess = sess
        self.name = name
        self.lr = lr
        self.drop_rate = drop_rate
        self.init_fn = init_fn
#         self.T = T # temperature
        self.alpha = alpha
        self._build_net()

    def _build_net(self):
        with tf.variable_scope(self.name):
            # dropout (keep_prob) rate  0.7~0.5 on training, but should be 1
            # for testing
            self.training = tf.placeholder(tf.bool)
            self.T = tf.placeholder(tf.float32) # temperature

            # input place holders
            self.X = tf.placeholder(tf.float32, [None, 784])

            # img 28x28x1 (black/white), Input Layer
            X_img = tf.reshape(self.X, [-1, 28, 28, 1])
            self.soft_target = tf.placeholder(tf.float32, [None, 10]) # soft target
            self.true_target = tf.placeholder(tf.float32, [None, 10]) # true target

            # Convolutional Layer #1
            conv1 = tf.layers.conv2d(inputs=X_img, filters=32, kernel_size=[3, 3],
                                     kernel_initializer=self.init_fn,
                                     padding="SAME", activation=tf.nn.relu)
            # Pooling Layer #1
            pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2],
                                            padding="SAME", strides=2)
            dropout1 = tf.layers.dropout(inputs=pool1,
                                         rate=self.drop_rate, training=self.training)

            # Convolutional Layer #2 and Pooling Layer #2
            conv2 = tf.layers.conv2d(inputs=dropout1, filters=64, kernel_size=[3, 3],
                                     kernel_initializer=self.init_fn,
                                     padding="SAME", activation=tf.nn.relu)
            pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2],
                                            padding="SAME", strides=2)
            dropout2 = tf.layers.dropout(inputs=pool2,
                                         rate=self.drop_rate, training=self.training)

            # Convolutional Layer #3 and Pooling Layer #3
            conv3 = tf.layers.conv2d(inputs=dropout2, filters=128, kernel_size=[3, 3],
                                     kernel_initializer=self.init_fn,
                                     padding="SAME", activation=tf.nn.relu)
            pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2],
                                            padding="SAME", strides=2)
            dropout3 = tf.layers.dropout(inputs=pool3,
                                         rate=self.drop_rate, training=self.training)

            # Dense Layer with Relu
            flat = tf.reshape(dropout3, [-1, 128 * 4 * 4])
            dense4 = tf.layers.dense(inputs=flat, kernel_initializer=self.init_fn,
                                     units=625, activation=tf.nn.relu)
            dropout4 = tf.layers.dropout(inputs=dense4,
                                         rate=self.drop_rate, training=self.training)

            # Logits (no activation) Layer: L5 Final FC 625 inputs -> 10 outputs
            self.logits = tf.layers.dense(inputs=dropout4, units=10)
            
        # define cost/loss & optimizer
        self.soft_logits = self.logits / self.T
        soft_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=self.soft_logits, labels=self.soft_target))
        true_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
            logits=self.soft_logits, labels=self.true_target))
        self.cost = self.alpha*soft_cost + (1-self.alpha)*true_cost
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(self.cost)

        correct_prediction = tf.equal(
            tf.argmax(self.soft_logits, 1), tf.argmax(self.true_target, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        
    def predict(self, x_test, training=False, T=1):
        return self.sess.run(self.soft_logits,
                             feed_dict={self.X: x_test, self.training: training, self.T: T})

    def get_accuracy(self, x_test, y_test, training=False, T=1):
        return self.sess.run(self.accuracy,
                             feed_dict={self.X: x_test,
                                        self.true_target: y_test, 
                                        self.training: training, 
                                        self.T: T})

    def train(self, x_data, soft_target, true_target, training=True, T=3):
        return self.sess.run([self.cost, self.optimizer], feed_dict={
            self.X: x_data, 
            self.soft_target: soft_target, 
            self.true_target: true_target,
            self.training: training, 
            self.T: T
        })

In [8]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return e_x / np.sum(e_x, axis=1, keepdims=True)

In [9]:
def get_soft_targets(logits, T=3):
    return softmax(logits/T)

## Distilling

* 먼저 train dataset 에 대해 ensemble model 의 평균 logits 을 구한다.
* 그리고 그걸 soft_target 으로 변환. 
* 그러면 train_y 대신 soft_target 을 target 으로 하여 distilled model 을 학습!

In [10]:
print('Get train data')
# mnist.train 을 사용할 수 없으므로, train dataset 을 먼저 확보한다

train_images = mnist.train.images
train_labels = mnist.train.labels
N = len(train_images)

Get train data


In [11]:
print('Make soft target')

# avg_logits = np.array()
batch_size = 1000
logits = np.zeros([N, 10])

for i, m in enumerate(models):
    for j in range(0, N, batch_size):
        batch_x = train_images[j:j+batch_size]
        cur_logits = m.predict(batch_x)
        logits[j:j+batch_size] += cur_logits

# logit averaging is right?
logits /= len(models)
soft_targets = get_soft_targets(logits, T=3)

Make soft target


In [12]:
# test soft_targets
# shape
assert soft_targets.shape == train_labels.shape
# sum of softmax = 1
v = np.sum(soft_targets, axis=1) - 1
assert (v < 1e-6).all()

In [13]:
# distilling model
# 0.9934 params
distill_model = DistillModel(sess, "distill-model", lr=0.001, drop_rate=0.3, 
                             init_fn=tf.contrib.layers.xavier_initializer(), alpha=0.5)
# 이름이 distill-model2 인 이유는... distill-model 을 만들었는데 잘못만듦. 그래서 지우고 싶었는데 
# tf.Graph() 는 append-only structure 래. -.-

# 그럼 만약 이걸 수정하고 싶다면 어떻게 해야 할까?
# 천천히 알아보자.

In [17]:
sess.run(tf.global_variables_initializer())
# 이런식으로도 초기화가 가능한가보다! => 안되는가보다!
# var_distill = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="distill-model")
# sess.run(tf.variables_initializer(var_list=var_distill))

batch_size = 100
training_epochs = 100
# 앙상블 모델은 각각 20번씩 학습하였으나 얘는 좀더 해도 되겠지.
# 더 하면 좀 이상한것도 같다. 애초에 앙상블 모델을 최대한 많이 해서 수렴할때까지 해줘야 제대로 된 검증인 듯! 그리고 얘도 수렴할때까지 하고.
# 더 하면 왜 이상하냐면 애초에 앙상블에서 사용한 모델들이 수렴을 안했으니 (실제로 20번만 하면 그러함) 더 학습시키면 당연히 어큐러시가 올라감

print('Learning Started!')

# train_data for data shuffling
train_data = np.concatenate([train_images, train_labels, soft_targets], axis=1)
assert train_data.shape == (N, 804) # (55000, 784+10+10)

# train my model
for epoch in range(training_epochs):
    np.random.shuffle(train_data)
    
    avg_cost = 0
    for i in range(0, N, batch_size):
        batch_xs = train_data[i:i+batch_size, :784]
        batch_true_targets = train_data[i:i+batch_size, 784:794]
        batch_soft_targets = train_data[i:i+batch_size, 794:804]
        c, _ = distill_model.train(batch_xs, soft_target=batch_soft_targets, true_target=batch_true_targets, T=3)
        avg_cost += c / (N // batch_size)
        
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', avg_cost)
    
    if epoch % 10 == 9:
        # check test accuracy
        print('Accuracy: {}'.format(distill_model.get_accuracy(mnist.test.images, mnist.test.labels, T=1)))

print('Learning Finished!')

Learning Started!
('Epoch:', '0001', 'cost =', 0.30836233073337527)
('Epoch:', '0002', 'cost =', 0.1139974741231312)
('Epoch:', '0003', 'cost =', 0.093903585540300052)
('Epoch:', '0004', 'cost =', 0.082794651741331299)
('Epoch:', '0005', 'cost =', 0.077219824012030189)
('Epoch:', '0006', 'cost =', 0.073890537683936705)
('Epoch:', '0007', 'cost =', 0.070381064838306473)
('Epoch:', '0008', 'cost =', 0.068570296476510528)
('Epoch:', '0009', 'cost =', 0.065557966642081772)
('Epoch:', '0010', 'cost =', 0.063093038777058769)
Accuracy: 0.994400143623
('Epoch:', '0011', 'cost =', 0.062747025828469943)
('Epoch:', '0012', 'cost =', 0.061259193037721253)
('Epoch:', '0013', 'cost =', 0.060055479197339535)
('Epoch:', '0014', 'cost =', 0.059618993035771638)
('Epoch:', '0015', 'cost =', 0.058081561845134605)
('Epoch:', '0016', 'cost =', 0.058570654551413852)
('Epoch:', '0017', 'cost =', 0.0565283328497952)
('Epoch:', '0018', 'cost =', 0.056902495267039041)
('Epoch:', '0019', 'cost =', 0.0558207766406