## unpack the data

In [1]:
# %load unpack_data.py

import numpy as np
import struct

train_images_idx3_ubyte_file = './data/train-images-idx3-ubyte'
train_labels_idx1_ubyte_file = './data/train-labels-idx1-ubyte'

test_images_idx3_ubyte_file = './data/t10k-images-idx3-ubyte'
test_labels_idx1_ubyte_file = './data/t10k-labels-idx1-ubyte'


def decode_idx3_ubyte(idx3_ubyte_file):
    with open(idx3_ubyte_file, 'rb') as f:
        bin_data = f.read()

    # parse header
    offset = 0
    fmt_header = '>iiii'
    magic_number, num_images, num_rows, num_cols = struct.unpack_from(
        fmt_header, bin_data, offset)
    print('total images: %d, image size: %d*%d' % (
        num_images, num_rows, num_cols))

    # parse data
    image_size = num_rows * num_cols
    offset += struct.calcsize(fmt_header)
    fmt_image = '>' + str(image_size) + 'B'
    images = np.empty((num_images, num_rows, num_cols))
    for i in range(num_images):
        if (i + 1) % 10000 == 0:
            print('parsed %d' % (i + 1))
        images[i] = np.array(
            struct.unpack_from(fmt_image, bin_data, offset)).reshape(
            (num_rows, num_cols))
        offset += struct.calcsize(fmt_image)
    return images


def decode_idx1_ubyte(idx1_ubyte_file):
    with open(idx1_ubyte_file, 'rb') as f:
        bin_data = f.read()

    # parse header
    offset = 0
    fmt_header = '>ii'
    magic_number, num_images = struct.unpack_from(fmt_header, bin_data, offset)
    print('labels number: %d' % (num_images))

    # parse data
    offset += struct.calcsize(fmt_header)
    fmt_image = '>B'
    labels = np.empty(num_images)
    for i in range(num_images):
        if (i + 1) % 10000 == 0:
            print(
                'parsed %d' % (i + 1))
        labels[i] = struct.unpack_from(fmt_image, bin_data, offset)[0]
        offset += struct.calcsize(fmt_image)
    return labels.astype(np.uint8)


def unpack_data():
    train_images = decode_idx3_ubyte(train_images_idx3_ubyte_file)
    train_labels = decode_idx1_ubyte(train_labels_idx1_ubyte_file)
    test_images = decode_idx3_ubyte(test_images_idx3_ubyte_file)
    test_labels = decode_idx1_ubyte(test_labels_idx1_ubyte_file)

    perm = np.arange(0, len(train_images))
    np.random.shuffle(perm)
    train_images = train_images[perm]
    train_labels = train_labels[perm]

    np.save('./data/train_data.npy', train_images)
    np.save('./data/train_label.npy', train_labels)
    np.save('./data/test_data.npy', test_images)
    np.save('./data/test_label.npy', test_labels)


if __name__ == '__main__':
    unpack_data()

total images: 60000, image size: 28*28
parsed 10000
parsed 20000
parsed 30000
parsed 40000
parsed 50000
parsed 60000
labels number: 60000
parsed 10000
parsed 20000
parsed 30000
parsed 40000
parsed 50000
parsed 60000
total images: 10000, image size: 28*28
parsed 10000
labels number: 10000
parsed 10000


## run a random forest for a quick baseline

In [2]:
# %load random_forest.py

import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

train_data = np.load('./data/train_data.npy')
train_data = np.reshape(train_data, [train_data.shape[0], -1])
train_label = np.load('./data/train_label.npy')
test_data = np.load('./data/test_data.npy')
test_data = np.reshape(test_data, [test_data.shape[0], -1])
test_label = np.load('./data/test_label.npy')

print('random forest baseline')
rf = RandomForestClassifier(n_estimators=70, n_jobs=-1, bootstrap=True)
rf.fit(train_data, train_label)
print('training finished')
accuracy = rf.score(test_data, test_label)
print('test accuracy: %f' % accuracy)

random forest baseline
training finished
test accuracy: 0.873500


## try a basic LeNet5

It's a basic LeNet5 model with slightly modified. It can reach a pretty good accuracy on original MNIST, so let's see how well it performs on fashion-MNIST

In [None]:
# %load models/LeNet5.py

'''

@author: ZiqiLiu


@file: LeNet5.py

@time: 2017/11/3 下午10:39

@desc:
'''
import tensorflow as tf


class LeNet5(object):
    def __init__(self, config):
        self.config = config
        # collect layers to calculate MI
        self.layers_collector = []

        if config.initializer == 'xavier':
            self.initializer = tf.contrib.layers.xavier_initializer_conv2d()
        else:
            self.initializer = tf.truncated_normal_initializer(stddev=0.1)

        if config.activate_func == 'sigmoid':
            self.activate_func = tf.nn.sigmoid
        elif config.activate_func == 'relu':
            self.activate_func = tf.nn.relu
        elif config.activate_func == 'tanh':
            self.activate_func = tf.nn.tanh
        else:
            raise Exception('activation function not defined!')

        self.input = tf.placeholder(tf.float32, [None, 28, 28], name='input')
        self._input = tf.expand_dims(self.input, 3)
        self.label = tf.placeholder(tf.float32, [None, 10], name='label')

        # first conv+pooling
        self.h1_conv = self.conv2d(self._input, 32, [5, 5], name='hidden1')
        self.h1 = tf.layers.max_pooling2d(self.h1_conv, [2, 2], [2, 2],
                                          name='pooling1')

        # second conv+pooling
        self.h2_conv = self.conv2d(self.h1, 64, [5, 5], name='hidden2')
        self.h2 = tf.layers.max_pooling2d(self.h2_conv, [2, 2], [2, 2],
                                          name='pooling2')

        # flatten
        self.flatten = tf.reshape(self.h2, [-1, 7 * 7 * 64], 'flatten')

        # fc1
        self.fc1 = tf.layers.dense(self.flatten, 1024, self.activate_func,
                                   kernel_initializer=self.initializer,
                                   name='fc1')

        # dropout
        if config.dropout:
            self.dropout = tf.nn.dropout(self.fc1, config.keep_prob)
        else:
            self.dropout = self.fc1

        self.fc2 = tf.layers.dense(self.fc1, 10,
                                   kernel_initializer=self.initializer,
                                   name='fc2')
        self.softmax = tf.nn.softmax(logits=self.fc2, name='softmax')

        self.accuracy = tf.reduce_mean(tf.cast(
            tf.equal(tf.argmax(self.softmax, 1), tf.argmax(self.label, 1)),
            tf.float32), name='accuracy')

        # loss and gradient
        self.global_step = tf.Variable(0, trainable=False)
        initial_learning_rate = tf.Variable(
            config.learning_rate, trainable=False)
        self.learning_rate = tf.train.exponential_decay(
            initial_learning_rate, self.global_step, self.config.decay_step,
            self.config.lr_decay,
            name='lr') if config.use_lr_decay else initial_learning_rate
        if self.config.optimizer == 'adam':
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        else:
            self.optimizer = tf.train.GradientDescentOptimizer(
                self.learning_rate)

        self.loss = -tf.reduce_sum(self.label * tf.log(self.softmax))
        self.train_op = self.optimizer.minimize(self.loss,
                                                global_step=self.global_step)

        self.layers_collector.append(self.input)
        self.layers_collector.append(self.transpose(self.h1))
        self.layers_collector.append(self.transpose(self.h2))
        self.layers_collector.append(tf.expand_dims(self.fc1, 1))
        self.layers_collector.append(self.softmax)

    def transpose(self, layer):
        return tf.transpose(layer, [0, 3, 1, 2])

    def conv2d(self, input, channel, kernel, name=None):
        l2_regularizer = tf.contrib.layers.l2_regularizer(
            scale=self.config.l2_beta) if self.config.l2_norm else None

        conv = tf.layers.conv2d(input, channel, kernel,
                                strides=(1, 1), padding='SAME',
                                use_bias=True,
                                kernel_initializer=self.initializer,
                                kernel_regularizer=l2_regularizer)
        if self.config.batch_norm:
            conv = tf.layers.batch_normalization(conv)
        activate = self.activate_func(conv, name)
        return activate


## the runner

In [5]:
# %load runner.py

'''

@author: ZiqiLiu


@file: runner.py

@time: 2017/11/3 下午11:34

@desc:
'''
from reader import read_dataset
from models.LeNet5 import LeNet5
from models.ResNet import ResNet
import tensorflow as tf
from glob import glob
import os
import sys
import signal
from config import get_config
from entropy import entropy
from plot import plot_info_plain
import pickle


class Runner(object):
    def __init__(self, config, model):
        self.config = config
        self.dataset = read_dataset(config.batch_size, config.valid_size,
                                    config.sample_size)
        self.graph = tf.Graph()
        self.model = None
        self.restore = False
        if not os.path.exists(self.config.model_path):
            os.mkdir(self.config.model_path)
        for key in config.__dict__:
            print(key, config.__dict__[key])
        with self.graph.as_default():
            self.model = model(self.config)

        self.IXT = []
        self.ITY = []

    def run(self):
        with self.graph.as_default(), tf.Session() as sess:
            self.restore = True
            model_path = os.path.join(self.config.model_path,
                                      self.config.model_name)
            saver = tf.train.Saver()
            files = glob(os.path.join(self.config.model_path, '*.ckpt.*'))

            if len(files) > 0:
                saver.restore(sess, model_path)
                print(('Model restored from:' + model_path))
            else:
                print("Model doesn't exist.\nInitializing........")
                sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            def handler_stop_signals(signum, frame):

                print(
                    'training shut down,  the model will be save in %s' % (
                        model_path))
                saver.save(sess, save_path=model_path)
                sys.exit(0)

            signal.signal(signal.SIGINT, handler_stop_signals)
            signal.signal(signal.SIGTERM, handler_stop_signals)

            total_loss = 0
            while self.dataset.epoch < self.config.max_epoch:
                data, label = self.dataset.next_training_batch()
                _, step, loss, layers = sess.run(
                    [self.model.train_op, self.model.global_step,
                     self.model.loss, self.model.layers_collector],
                    feed_dict={self.model.input: data,
                               self.model.label: label})
                total_loss += loss
                # ixt, ity = entropy(layers)
                # self.IXT.append(ixt)
                # self.ITY.append(ity)
                if step % self.config.valid_step == 0:
                    valid_data, valid_label = self.dataset.valid_batch()
                    accu = sess.run(self.model.accuracy,
                                    feed_dict={self.model.input: valid_data,
                                               self.model.label: valid_label})
                    print('step %d, epoch %d, loss %f, valid accuracy: %f' % (
                        step, self.dataset.epoch,
                        total_loss / self.config.valid_step, accu))
                    total_loss = 0
                if step % self.config.info_plane_interval == 0:
                    sample_data = self.dataset.sample_batch()
                    layers = sess.run(self.model.layers_collector,
                                      feed_dict={self.model.input: sample_data
                                                 })
                    ixt, ity = entropy(layers)
                    # print(ixt)
                    # print(ity)
                    with open('layers.pkl', 'wb') as f:
                        pickle.dump(layers, f)

                    self.IXT.append(ixt)
                    self.ITY.append(ity)

            self._test(sess)
            saver.save(sess, save_path=model_path)
            print(
                'training finished,  the model will be save in %s' % (
                    self.config.model_path))

    def test(self):
        with self.graph.as_default(), tf.Session() as sess:
            files = glob(os.path.join(self.config.model_path, '*.ckpt.*'))
            assert len(files) > 0
            saver = tf.train.Saver()
            saver.restore(sess, os.path.join(self.config.model_path,
                                             self.config.model_name))
            print(('Model restored from:' + self.config.model_path))
            self._test(sess)
            self.plot_info_plane()

    def plot_info_plane(self):
        with open('ixt', 'wb') as f:
            pickle.dump(self.IXT, f)
        with open('ity', 'wb') as f:
            pickle.dump(self.ITY, f)
        plot_info_plain(self.IXT, self.ITY)

    def _test(self, sess):
        test_data, test_label = self.dataset.test_batch()
        accu = sess.run(self.model.accuracy,
                        feed_dict={self.model.input: test_data,
                                   self.model.label: test_label})
        print('test accuracy:%f' % accu)


if __name__ == '__main__':

    model = LeNet5
    if len(sys.argv) > 1:
        if sys.argv[1] == 'LeNet5':
            print("Using LetNet5")
            model = LeNet5
        elif sys.argv[1] == 'miniResNet':
            print("Using miniResNet")
            model = ResNet
    if model:
        runner = Runner(get_config(), LeNet5)
        runner.run()
    else:
        print('model not defined!')
        sys.exit(1)


use_lr_decay True
batch_norm False
l2_norm True
model_path ./trained_model/
decay_step 1200
learning_rate 0.0005
optimizer adam
initializer xavier
valid_step 1200
valid_size 1000
info_plane_interval 1200
dropout True
keep_prob 0.6
max_epoch 20
sample_size 2000
model_name latest.ckpt
lr_decay 0.85
l2_beta 0.01
batch_size 50
activate_func tanh
Model doesn't exist.
Initializing........
step 1200, epoch 1, loss 19.575270, valid accuracy: 0.889000
step 2400, epoch 2, loss 12.733609, valid accuracy: 0.892000
step 3600, epoch 3, loss 10.252765, valid accuracy: 0.919000
step 4800, epoch 4, loss 8.187800, valid accuracy: 0.924000
step 6000, epoch 5, loss 6.482375, valid accuracy: 0.913000
step 7200, epoch 6, loss 4.781872, valid accuracy: 0.928000
step 8400, epoch 7, loss 3.553936, valid accuracy: 0.931000
step 9600, epoch 8, loss 2.397165, valid accuracy: 0.933000
step 10800, epoch 9, loss 1.493087, valid accuracy: 0.935000
step 12000, epoch 10, loss 0.973497, valid accuracy: 0.936000
step 132

## try ResNet
based on the idea of resnet, I build a mini resnet, with 6 CNN layers and 2 fc layers. See if can achieve better accuracy

In [None]:
# %load models/ResNet.py

'''

@author: ZiqiLiu


@file: LeNet5.py

@time: 2017/11/3 下午10:39

@desc:
'''
import tensorflow as tf


class ResNet(object):
    def __init__(self, config):
        self.config = config
        # collect layers to calculate MI
        self.layers_collector = []

        if config.initializer == 'xavier':
            self.initializer = tf.contrib.layers.xavier_initializer_conv2d()
        else:
            self.initializer = tf.truncated_normal_initializer(stddev=0.1)

        if config.activate_func == 'sigmoid':
            self.activate_func = tf.nn.sigmoid
        elif config.activate_func == 'relu':
            self.activate_func = tf.nn.relu
        elif config.activate_func == 'tanh':
            self.activate_func = tf.nn.tanh
        else:
            raise Exception('activation function not defined!')

        self.input = tf.placeholder(tf.float32, [None, 28, 28], name='input')
        self._input = tf.expand_dims(self.input, 3)
        self.label = tf.placeholder(tf.float32, [None, 10], name='label')
        self.layers_collector.append(self.input)

        # first res block
        self.block1 = self.res_block(self._input, 32)
        self.pooling1 = tf.layers.max_pooling2d(self.block1, [2, 2], [2, 2],
                                                name='pooling1')

        # second res block
        self.block2 = self.res_block(self.pooling1, 64)
        self.pooling2 = tf.layers.max_pooling2d(self.block2, [2, 2], [2, 2],
                                                name='pooling2')

        # flatten
        self.flatten = tf.reshape(self.pooling2, [-1, 7 * 7 * 64], 'flatten')

        # fc1
        self.fc1 = tf.layers.dense(self.flatten, 1024, self.activate_func,
                                   kernel_initializer=self.initializer,
                                   name='fc1')

        # dropout
        if config.dropout:
            self.dropout = tf.nn.dropout(self.fc1, config.keep_prob)
        else:
            self.dropout = self.fc1

        self.fc2 = tf.layers.dense(self.fc1, 10,
                                   kernel_initializer=self.initializer,
                                   name='fc2')
        self.softmax = tf.nn.softmax(logits=self.fc2, name='softmax')

        self.accuracy = tf.reduce_mean(tf.cast(
            tf.equal(tf.argmax(self.softmax, 1), tf.argmax(self.label, 1)),
            tf.float32), name='accuracy')

        # loss and gradient
        self.global_step = tf.Variable(0, trainable=False)
        initial_learning_rate = tf.Variable(
            config.learning_rate, trainable=False)
        self.learning_rate = tf.train.exponential_decay(
            initial_learning_rate, self.global_step, self.config.decay_step,
            self.config.lr_decay,
            name='lr') if config.use_lr_decay else initial_learning_rate
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)

        self.loss = -tf.reduce_sum(self.label * tf.log(self.softmax))
        self.train_op = self.optimizer.minimize(self.loss,
                                                global_step=self.global_step)

        self.layers_collector.append(tf.expand_dims(self.fc1, 1))
        self.layers_collector.append(self.softmax)

    def res_block(self, x, channel):
        for i in range(3):
            shortcut = x
            if i == 0:
                shortcut = self.conv2d(shortcut, channel, [1, 1])

            x = self.conv2d(x, channel, [3, 3])
            x += shortcut
            self.layers_collector.append(self.transpose(x))
        return x

    def transpose(self, layer):
        return tf.transpose(layer, [0, 3, 1, 2])

    def conv2d(self, input, channel, kernel, name=None):
        l2_regularizer = tf.contrib.layers.l2_regularizer(
            scale=self.config.l2_beta) if self.config.l2_norm else None

        conv = tf.layers.conv2d(input, channel, kernel,
                                strides=(1, 1), padding='SAME',
                                use_bias=True,
                                kernel_initializer=self.initializer,
                                kernel_regularizer=l2_regularizer)
        if self.config.batch_norm:
            conv = tf.layers.batch_normalization(conv)
        activate = self.activate_func(conv, name)
        return activate


In [2]:
%run runner.py miniResNet

Using miniResNet
use_lr_decay True
batch_norm False
l2_norm True
model_path ./trained_model/
decay_step 1200
learning_rate 0.0005
optimizer adam
initializer xavier
valid_step 1200
valid_size 1000
info_plane_interval 1200
dropout True
keep_prob 0.6
max_epoch 20
sample_size 2000
model_name latest.ckpt
lr_decay 0.85
l2_beta 0.01
batch_size 50
activate_func tanh
Model doesn't exist.
Initializing........
step 1200, epoch 1, loss 19.688631, valid accuracy: 0.880000
step 2400, epoch 2, loss 12.870801, valid accuracy: 0.901000
step 3600, epoch 3, loss 10.321647, valid accuracy: 0.912000
step 4800, epoch 4, loss 8.512961, valid accuracy: 0.918000
step 6000, epoch 5, loss 6.699117, valid accuracy: 0.915000
step 7200, epoch 6, loss 4.828868, valid accuracy: 0.929000
step 8400, epoch 7, loss 3.615759, valid accuracy: 0.927000
step 9600, epoch 8, loss 2.540831, valid accuracy: 0.918000
step 10800, epoch 9, loss 1.671445, valid accuracy: 0.926000
step 12000, epoch 10, loss 0.997087, valid accuracy: 