# Batch Normalization

[Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift](http://arxiv.org/abs/1502.03167 "[1502.03167] Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift")

In [1]:
# Select the CUDA device Theano should run on. "gpu2" is specific to the
# machine this notebook was executed on; adjust it for other setups.
import theano.sandbox.cuda
theano.sandbox.cuda.use("gpu2")

Using gpu device 2: GeForce GTX TITAN X


In [2]:
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score

# Random Seed
# rng seeds the NumPy-side randomness; trng is the seeded Theano random stream.
rng = numpy.random.RandomState(1234)
trng = RandomStreams(42)

# Load MNIST, divide the pixel values by 255 and shuffle samples and labels
# together. The shuffle draws from rng so that the data order is repeatable
# from run to run instead of depending on NumPy's global random state.
mnist = fetch_mldata('MNIST original')
mnist_x, mnist_y = shuffle(
    mnist.data.astype("float32") / 255.0,
    mnist.target.astype("int32"),
    random_state=rng,
)

## Layer

In [3]:
class Layer:
    '''
    Fully connected layer that batch-normalizes its input before the linear map.

    The layer output is ``function(dot(bn(x), W))``; no additive bias is applied
    after the dot product. All state is held in float32 Theano shared variables.
    '''

    def __init__(self, in_dim, out_dim, function, beta=0.0, gamma=1.0, eps=1e-5):
        '''
        Create the trainable parameters, the running statistics and the momentum buffers.

        :param in_dim: number of input units
        :param out_dim: number of output units
        :param function: activation applied to the result of the dot product
        :param beta: initial value of every element of the normalization shift
        :param gamma: initial value of every element of the normalization scale
        :param eps: constant added to the variance before taking the square root
        '''
        def shared_f32(array, name):
            # Cast to float32 and wrap in a named Theano shared variable.
            return theano.shared(array.astype('float32'), name=name)

        self.func = function
        self.eps = eps

        # Weights are drawn uniformly from [-bound, bound], bound = sqrt(6 / (in_dim + out_dim)).
        bound = numpy.sqrt(6. / (in_dim + out_dim))
        self.W = shared_f32(
            rng.uniform(low=-bound, high=bound, size=(in_dim, out_dim)), 'W'
        )
        # Shift and scale of the normalization: one element per input dimension.
        self.beta = shared_f32(numpy.zeros(in_dim) + beta, 'beta')
        self.gamma = shared_f32(numpy.zeros(in_dim) + gamma, 'gamma')
        self.params = [self.W, self.beta, self.gamma]

        # Average over mini-batches of the per-dimension input mean.
        self.avg_mean = shared_f32(numpy.zeros(in_dim), 'avg_mean')
        # Average over mini-batches of the per-dimension input variance.
        self.avg_var = shared_f32(numpy.zeros(in_dim), 'avg_var')
        # Number of training steps performed so far.
        self.N = theano.shared(numpy.float32(0), name='N')

        # Previous parameter updates, used for momentum (P.52).
        self.prev_dW = shared_f32(numpy.zeros((in_dim, out_dim)), 'prev_dW')
        self.prev_dbeta = shared_f32(numpy.zeros(in_dim), 'prev_dbeta')
        self.prev_dgamma = shared_f32(numpy.zeros(in_dim), 'prev_dgamma')
        self.prev_dparams = [self.prev_dW, self.prev_dbeta, self.prev_dgamma]

    # We will be able to use theano.tensor.nnet.bn.batch_normalization since Theano v0.7.1
    # http://deeplearning.net/software/theano/library/tensor/nnet/bn.html
    def bn(self, x, mean, var):
        '''
        Batch normalization.

        :param x: input
        :param mean: mean used to center x
        :param var: variance used to scale x (eps is added before the square root)
        :return: gamma * (x - mean) / sqrt(var + eps) + beta
        '''
        centered = x - mean
        std = T.sqrt(var + self.eps)
        return self.gamma * (centered / std) + self.beta

    def fprop(self, x):
        '''
        Forward propagation using the statistics of the given mini-batch.

        Stores the batch mean and variance in self.mean / self.var and the layer
        output in self.z.

        :param x: input
        :return: output of the layer
        '''
        batch_mean = T.mean(x, axis=0)
        batch_var = T.var(x, axis=0)
        # Kept on the instance; get_train reads them to update the running averages.
        self.mean = batch_mean
        self.var = batch_var

        normalized = self.bn(x, batch_mean, batch_var)
        self.z = self.func(T.dot(normalized, self.W))
        return self.z

    def predict(self, x, m):
        '''
        Forward propagation with fixed network parameters, using the averaged statistics.

        :param x: input
        :param m: batch size (m > 1)
        :return: prediction (output of the layer)
        '''
        # The averaged batch variance is rescaled by m / (m - 1) before use.
        rescaled_var = self.avg_var * m / (m - 1)
        normalized = self.bn(x, self.avg_mean, rescaled_var)
        return self.func(T.dot(normalized, self.W))

## Theano functionをコンパイル

In [4]:
def fprops(layers, x):
    '''
    Forward propagation through the whole network.

    The input is handed to the first layer's fprop, and every result is handed
    to the fprop of the layer that follows.

    :param layers: the network, as a sequence of layers in application order
    :param x: input
    :return: output of the last layer (x itself when layers is empty)
    '''
    activation = x
    for current in layers:
        activation = current.fprop(activation)
    return activation

In [5]:
def predict(layers, x, m):
    '''
    Run inference with a trained network (forward propagation through every layer).

    Each layer's predict method is called in turn with the previous result and m.

    :param layers: the network, as a sequence of layers in application order
    :param x: input
    :param m: batch size, passed unchanged to every layer
    :return: prediction (output of the last layer; x itself when layers is empty)
    '''
    activation = x
    for current in layers:
        activation = current.predict(activation, m)
    return activation

In [6]:
# Cost Function (Negative Log Likelihood)
def cross_entropy(y, d):
    '''
    Compute the cross entropy (mean negative log likelihood of the target classes).
    See: (2.11) and #3.3

    :param y: output of the output layer; indexed as y[sample, class]
    :param d: target output; used as the class index for each sample
    :return: cross entropy
    '''
    # cf. http://deeplearning.net/tutorial/logreg.html#defining-a-loss-function
    # cf. http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing
    log_y = T.log(y)
    rows = T.arange(d.shape[0])
    # Advanced indexing: for every row i take the entry in column d[i].
    target_log_y = log_y[rows, d]
    return -T.mean(target_log_y)

In [7]:
def sgd_with_momentum(params, gparams, prev_dparams, eps=0.01, mu=0.9):
    '''
    Stochastic gradient descent with momentum.
    See: #3.2, #3.6.4 and P.52

    For every (parameter, gradient, previous update) triple the new update is
    ``-eps * gradient + mu * previous update``; it is added to the parameter and
    stored as the next "previous update".

    :param params: parameters to update
    :param gparams: gradients of the parameters, in the same order as params
    :param prev_dparams: previous update of each parameter, in the same order as params
    :param eps: learning rate
    :param mu: momentum
    :return: OrderedDict mapping each parameter and each previous-update entry to its new value
    '''
    updates = OrderedDict()
    for weight, grad, last_step in zip(params, gparams, prev_dparams):
        new_weight = weight - eps * grad + mu * last_step
        new_step = - eps * grad + mu * last_step
        updates[weight] = new_weight
        updates[last_step] = new_step
    return updates

In [8]:
def get_train(layers, lr=0.01):
    '''
    Build the theano.function that performs one training step.

    The compiled function takes a float matrix x and an int vector t, returns the
    cross-entropy cost, and as side effects applies the momentum-SGD updates and
    updates every layer's step counter and averaged batch statistics.

    :param layers: network to train
    :param lr: learning rate
    :return: theano.function
    '''
    x = T.fmatrix("x")
    t = T.ivector("t")

    ## Collect Parameters and Symbolic output
    # Both lists are filled layer by layer so that they stay aligned element-wise.
    params, prev_dparams = [], []
    for layer in layers:
        params.extend(layer.params)
        prev_dparams.extend(layer.prev_dparams)

    cost = cross_entropy(fprops(layers, x), t)

    ## Get Gradient
    updates = sgd_with_momentum(params, T.grad(cost, params), prev_dparams, eps=lr)

    # Update stats for batch normalization
    for layer in layers:
        count = layer.N
        running_mean = layer.avg_mean
        running_var = layer.avg_var
        # Cumulative averages: (old average * count + new batch value) / (count + 1).
        # layer.mean / layer.var were set by fprop when the cost graph was built.
        updates[count] = count + 1
        updates[running_mean] = (running_mean*count + layer.mean) / (count + 1)
        updates[running_var] = (running_var*count + layer.var) / (count + 1)

    ## Compile
    return theano.function([x,t], cost, updates=updates)

In [9]:
def get_test(layers, m):
    '''
    Build the theano.function that evaluates the network in inference mode.

    The compiled function takes a float matrix x and an int vector t and returns
    the cross-entropy cost together with the argmax of the network output along
    axis 1.

    :param layers: network to evaluate
    :param m: batch size, passed on to predict
    :return: theano.function
    '''
    x = T.fmatrix("x")
    t = T.ivector("t")

    y = predict(layers, x, m)
    outputs = [cross_entropy(y, t), T.argmax(y, axis=1)]

    ## Compile
    return theano.function([x,t], outputs)

## 実験

In [10]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split itself deterministic for a given input order.
train_x, valid_x, train_y, valid_y = train_test_split(
    mnist_x,
    mnist_y,
    test_size=0.2,
    random_state=42,
)

### 3層

In [11]:
# Three sigmoid layers of 1000 units followed by a 10-way softmax output layer.
layers = [
    Layer(in_dim=784,  out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=10,   function=T.nnet.softmax),
]

batch_size = 100
# Integer division: samples that do not fill a whole mini-batch are skipped.
nbatches = train_x.shape[0] // batch_size

train = get_train(layers)
# batch_size is forwarded as m, which Layer.predict uses to rescale the averaged variance.
test = get_test(layers, batch_size)

for epoch in range(100):
    # Reshuffle every epoch; drawing from rng makes the batch order repeatable
    # instead of depending on NumPy's global random state.
    train_x, train_y = shuffle(train_x, train_y, random_state=rng)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        
        train(train_x[start:end], train_y[start:end])
    
    # Report on the validation split after the first epoch and after every 10th epoch.
    if ((epoch+1) % 10 == 0) or (epoch == 0):
        valid_cost, pred = test(valid_x, valid_y)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(valid_cost),
                                                                                     f1_score(valid_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 0.222, Validation F1:: 0.935
EPOCH::  10, Validatioon Cost:: 0.086, Validation F1:: 0.976
EPOCH::  20, Validatioon Cost:: 0.087, Validation F1:: 0.979
EPOCH::  30, Validatioon Cost:: 0.087, Validation F1:: 0.980
EPOCH::  40, Validatioon Cost:: 0.090, Validation F1:: 0.980
EPOCH::  50, Validatioon Cost:: 0.089, Validation F1:: 0.981
EPOCH::  60, Validatioon Cost:: 0.091, Validation F1:: 0.981
EPOCH::  70, Validatioon Cost:: 0.093, Validation F1:: 0.981
EPOCH::  80, Validatioon Cost:: 0.096, Validation F1:: 0.981
EPOCH::  90, Validatioon Cost:: 0.094, Validation F1:: 0.982
EPOCH:: 100, Validatioon Cost:: 0.095, Validation F1:: 0.981


### 5層

In [12]:
# Five sigmoid layers of 1000 units followed by a 10-way softmax output layer.
layers = [
    Layer(in_dim=784,  out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=10,   function=T.nnet.softmax),
]

batch_size = 100
# Integer division: samples that do not fill a whole mini-batch are skipped.
nbatches = train_x.shape[0] // batch_size

train = get_train(layers)
# batch_size is forwarded as m, which Layer.predict uses to rescale the averaged variance.
test = get_test(layers, batch_size)

for epoch in range(200):
    # Reshuffle every epoch; drawing from rng makes the batch order repeatable
    # instead of depending on NumPy's global random state.
    train_x, train_y = shuffle(train_x, train_y, random_state=rng)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        
        train(train_x[start:end], train_y[start:end])
    
    # Report on the validation split after the first epoch and after every 10th epoch.
    if ((epoch+1) % 10 == 0) or (epoch == 0):
        valid_cost, pred = test(valid_x, valid_y)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(valid_cost),
                                                                                     f1_score(valid_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 0.166, Validation F1:: 0.951
EPOCH::  10, Validatioon Cost:: 0.100, Validation F1:: 0.974
EPOCH::  20, Validatioon Cost:: 0.099, Validation F1:: 0.978
EPOCH::  30, Validatioon Cost:: 0.105, Validation F1:: 0.979
EPOCH::  40, Validatioon Cost:: 0.107, Validation F1:: 0.978
EPOCH::  50, Validatioon Cost:: 0.090, Validation F1:: 0.981
EPOCH::  60, Validatioon Cost:: 0.095, Validation F1:: 0.982
EPOCH::  70, Validatioon Cost:: 0.094, Validation F1:: 0.983
EPOCH::  80, Validatioon Cost:: 0.096, Validation F1:: 0.982
EPOCH::  90, Validatioon Cost:: 0.095, Validation F1:: 0.983
EPOCH:: 100, Validatioon Cost:: 0.100, Validation F1:: 0.982
EPOCH:: 110, Validatioon Cost:: 0.104, Validation F1:: 0.981
EPOCH:: 120, Validatioon Cost:: 0.103, Validation F1:: 0.982
EPOCH:: 130, Validatioon Cost:: 0.099, Validation F1:: 0.982
EPOCH:: 140, Validatioon Cost:: 0.102, Validation F1:: 0.983
EPOCH:: 150, Validatioon Cost:: 0.105, Validation F1:: 0.983
EPOCH:: 160, Validatioon

### 7層

In [11]:
# Seven sigmoid layers of 1000 units followed by a 10-way softmax output layer.
layers = [
    Layer(in_dim=784,  out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid),
    Layer(in_dim=1000, out_dim=10,   function=T.nnet.softmax),
]

batch_size = 100
# Integer division: samples that do not fill a whole mini-batch are skipped.
nbatches = train_x.shape[0] // batch_size

train = get_train(layers)
# batch_size is forwarded as m, which Layer.predict uses to rescale the averaged variance.
test = get_test(layers, batch_size)

for epoch in range(300):
    # Reshuffle every epoch; drawing from rng makes the batch order repeatable
    # instead of depending on NumPy's global random state.
    train_x, train_y = shuffle(train_x, train_y, random_state=rng)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        
        train(train_x[start:end], train_y[start:end])
    
    # Report on the validation split after the first epoch and after every 10th epoch.
    if ((epoch+1) % 10 == 0) or (epoch == 0):
        valid_cost, pred = test(valid_x, valid_y)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(valid_cost),
                                                                                     f1_score(valid_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 0.209, Validation F1:: 0.938
EPOCH::  10, Validatioon Cost:: 0.101, Validation F1:: 0.976
EPOCH::  20, Validatioon Cost:: 0.100, Validation F1:: 0.978
EPOCH::  30, Validatioon Cost:: 0.103, Validation F1:: 0.979
EPOCH::  40, Validatioon Cost:: 0.102, Validation F1:: 0.981
EPOCH::  50, Validatioon Cost:: 0.112, Validation F1:: 0.979
EPOCH::  60, Validatioon Cost:: 0.112, Validation F1:: 0.981
EPOCH::  70, Validatioon Cost:: 0.114, Validation F1:: 0.980
EPOCH::  80, Validatioon Cost:: 0.100, Validation F1:: 0.981
EPOCH::  90, Validatioon Cost:: 0.107, Validation F1:: 0.981
EPOCH:: 100, Validatioon Cost:: 0.110, Validation F1:: 0.981
EPOCH:: 110, Validatioon Cost:: 0.113, Validation F1:: 0.981
EPOCH:: 120, Validatioon Cost:: 0.109, Validation F1:: 0.981
EPOCH:: 130, Validatioon Cost:: 0.120, Validation F1:: 0.980
EPOCH:: 140, Validatioon Cost:: 0.118, Validation F1:: 0.980
EPOCH:: 150, Validatioon Cost:: 0.112, Validation F1:: 0.981
EPOCH:: 160, Validatioon