# 用theano实现softmax分类器

上一篇笔记[softmax分类器](./softmax-crossentropy-derivative.ipynb)介绍了softmax分类器的基本原理，涉及交叉熵误差及其对应的梯度更新式的推导过程。这一篇笔记着重于softmax分类器的实现，所用的工具为theano，测试的数据集为MNIST手写数字数据集，该数据集包含60000张训练图片，10000张测试图片。

载入依赖包

In [1]:
from __future__ import print_function, division
from abc import ABCMeta, abstractmethod
from six import add_metaclass
import numpy as np
import theano
import theano.tensor as T
from keras.datasets import mnist
from collections import OrderedDict

DEBUG: nvcc STDOUT mod.cu
   ���ڴ����� C:/Users/hschen/AppData/Local/Theano/compiledir_Windows-10-10.0.14393-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.12-64/tmpgcrtfx/265abc51f7c376c224983485238ff1a5.lib �Ͷ��� C:/Users/hschen/AppData/Local/Theano/compiledir_Windows-10-10.0.14393-Intel64_Family_6_Model_60_Stepping_3_GenuineIntel-2.7.12-64/tmpgcrtfx/265abc51f7c376c224983485238ff1a5.exp

Using gpu device 0: GeForce GTX 960M (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


随机数生成器和其他的工具函数

In [2]:
class NumpyRNG(object):
    _rng = None

    @classmethod
    def get_rng(cls):
        if cls._rng == None:
            cls._rng = np.random
        return cls._rng

    @classmethod
    def set_rng(cls, seed):
        cls._rng = np.random.RandomState(seed)


def floatX(arr):
    return np.asarray(arr, dtype=theano.config.floatX)

初始化模块，用于初始化权重

In [3]:
@add_metaclass(ABCMeta)
class Initializer(object):
    @abstractmethod
    def create_param(self, size):
        """"""

    def __call__(self, size, shared = True):
        param = self.create_param(size)
        if shared:
            return theano.shared(param)
        else:
            return param

class Constant(Initializer):
    def __init__(self, scale = 0.):
        self.scale = scale

    def create_param(self, size):
        param = floatX(np.ones(size) * self.scale)
        return param

class Normal(Initializer):
    def __init__(self, mean = 0., std = .01):
        self.mean = mean
        self.std = std

    def create_param(self, size):
        rng = NumpyRNG.get_rng()
        param = floatX(rng.normal(loc=self.mean, scale=self.std, size=size))
        return param

softmax层的实现

In [4]:
class Layer(object):
    def __init__(self, **kwargs):
        """"""


    @abstractmethod
    def apply(self, x, mask=None):
        """"""

    def __call__(self, x, mask=None):
        return self.apply(x, mask)
class Logistic(Layer):
    def __init__(self, n_in, n_out, **kwargs):
        self.n_in = n_in
        self.n_out = n_out
        self.W = Normal(0., .01)((n_in, n_out))
        self.b = Constant()(n_out)
        self.params = [self.W, self.b]

    def apply(self, x, mask=None):
        p_y_given_x = T.nnet.softmax(T.dot(x, self.W) + self.b)
        return p_y_given_x

    def cost(self, x, label):
        if label.ndim != 1 or x.ndim != 2:
            raise NotImplementedError
        prediction = self.apply(x)
        cost = T.mean(-T.log(T.clip(prediction[T.arange(x.shape[0]), label], 1e-3, 1.)))
        return cost

优化算法为adadelta，借用了lasagne的代码。

In [5]:
def adadelta(params, grads, learning_rate=1.0, rho=0.95, epsilon=1e-6):
    updates = OrderedDict()

    # Using theano constant to prevent upcasting of float32
    one = T.constant(1)

    for param, grad in zip(params, grads):
        value = param.get_value(borrow=True)
        # accu: accumulate gradient magnitudes
        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                             broadcastable=param.broadcastable)
        # delta_accu: accumulate update magnitudes (recursively!)
        delta_accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
                                   broadcastable=param.broadcastable)

        # update accu (as in rmsprop)
        accu_new = rho * accu + (one - rho) * grad ** 2
        updates[accu] = accu_new

        # compute parameter update, using the 'old' delta_accu
        update = (grad * T.sqrt(delta_accu + epsilon) /
                  T.sqrt(accu_new + epsilon))
        updates[param] = param - learning_rate * update

        # update delta_accu (as accu, but accumulating updates)
        delta_accu_new = rho * delta_accu + (one - rho) * update ** 2
        updates[delta_accu] = delta_accu_new

    return updates

生成batch的index

In [6]:
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """
    Used to shuffle the dataset at each iteration.
    """

    idx_list = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(idx_list)

    minibatches = []
    minibatch_start = 0
    for i in range(n // minibatch_size):
        minibatches.append(idx_list[minibatch_start:
                                    minibatch_start + minibatch_size])
        minibatch_start += minibatch_size
    if (minibatch_start != n):
        # Make a minibatch out of what is left
        minibatches.append(idx_list[minibatch_start:])
    return zip(range(len(minibatches)), minibatches)

训练代码。读取数据的部分借用了keras的模块。

In [7]:
batch_size = 32
n_epoch = 30
(X_train, y_train), (X_test, y_test) = mnist.load_data(path=r"G:\data\mnist.pkl.gz")
X_train = X_train.reshape(len(X_train), 28 * 28) / np.float32(255.)
X_test = X_test.reshape(len(X_test), 28*28) / np.float32(255.)
X_tr = T.matrix(dtype=theano.config.floatX)
y_tr = T.vector(dtype='int32')
lr = Logistic(784, 10)
lr_cost = lr.cost(X_tr, y_tr)
grads = T.grad(lr_cost, lr.params)
updates = adadelta(lr.params, grads)
acc = T.mean(T.eq(T.argmax(lr(X_tr), axis=1), y_tr))
fn_train = theano.function([X_tr, y_tr], [lr_cost, acc], updates=updates)

mini_batches = get_minibatches_idx(len(X_train), batch_size, shuffle=True)
for e in range(n_epoch):
    for _, train_idx in mini_batches:
        cost = fn_train(X_train[train_idx], y_train[train_idx])
    trn_loss,trn_acc = fn_train(X_train, y_train)
    val_loss,val_acc = fn_train(X_test, y_test)
    print("epoch {}, trn_loss={}, trn_acc={}, val_loss={}, val_acc={}"\
          .format(e, trn_loss, trn_acc, val_loss, val_acc))

epoch 0, trn_loss=0.286183655262, trn_acc=0.918216666667, val_loss=0.280671298504, val_acc=0.9188
epoch 1, trn_loss=0.267140179873, trn_acc=0.9241, val_loss=0.26878619194, val_acc=0.923
epoch 2, trn_loss=0.258325606585, trn_acc=0.926983333333, val_loss=0.264801651239, val_acc=0.925
epoch 3, trn_loss=0.252924561501, trn_acc=0.928833333333, val_loss=0.263516664505, val_acc=0.9257
epoch 4, trn_loss=0.249107733369, trn_acc=0.93035, val_loss=0.263270288706, val_acc=0.9264
epoch 5, trn_loss=0.246360614896, trn_acc=0.931616666667, val_loss=0.263392955065, val_acc=0.9267
epoch 6, trn_loss=0.244001120329, trn_acc=0.9324, val_loss=0.263515710831, val_acc=0.9266
epoch 7, trn_loss=0.242098540068, trn_acc=0.932816666667, val_loss=0.263630479574, val_acc=0.9274
epoch 8, trn_loss=0.240467473865, trn_acc=0.93315, val_loss=0.263916522264, val_acc=0.927
epoch 9, trn_loss=0.239036202431, trn_acc=0.93375, val_loss=0.264052331448, val_acc=0.9272
epoch 10, trn_loss=0.237787619233, trn_acc=0.93405, val_loss=

## 一些总结
1. 必须对数据进行预处理，可以简单地将像素值除以255，放缩为[0,1]区间内的值。  
```py
X_train = X_train.reshape(len(X_train), 28 * 28) / np.float32(255.)
X_test = X_test.reshape(len(X_test), 28*28) / np.float32(255.)
```
这一步很重要，如果不进行预处理，结果准确度大约是0.4~0.6  
2. 进行预处理后的结果大约为0.9266，这与UFLDL中给出的0.926一致，说明实现是正确的

## 参考
1. [UFLDL exercise-softmax-regression](http://deeplearning.stanford.edu/wiki/index.php/Exercise:Softmax_Regression)