# Optimization

参考

* [『深層学習』（岡谷貴之）：機械学習プロフェッショナルシリーズ｜講談社BOOK倶楽部](http://bookclub.kodansha.co.jp/product?isbn=9784061529021 "『深層学習』（岡谷貴之）：機械学習プロフェッショナルシリーズ｜講談社BOOK倶楽部")
* [深層学習 | 近代科学社](http://www.kindaikagaku.co.jp/information/kd0487.htm "深層学習")
* [機械学習 - ニューラルネットで最適化アルゴリズムを色々試してみる - Qiita](http://qiita.com/hogefugabar/items/1d4f6c905d0edbc71af2 "機械学習 - ニューラルネットで最適化アルゴリズムを色々試してみる - Qiita")

In [1]:
import theano.sandbox.cuda
theano.sandbox.cuda.use("gpu3")

Using gpu device 3: GeForce GTX TITAN X


In [2]:
from collections import OrderedDict

import numpy
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score

# Random Seed
rng = numpy.random.RandomState(1234)
trng = RandomStreams(42)

mnist = fetch_mldata('MNIST original')
mnist_x, mnist_y = shuffle(mnist.data.astype("float32")/255.0, mnist.target.astype("int32"))

## Layer

In [3]:
class Layer(object):
    def __init__(self, in_dim, out_dim, function, Optimizer, **args):
        '''
        :param in_dim: 入力次元
        :param out_dim: 出力次元
        :param function: 活性化関数
        :param Optimizer: 最適化手法（クラスを与える）
        :param args: Optimizerに渡すパラメータ
        '''
        self.func = function
        self.W = theano.shared(
            rng.uniform(
                low=-numpy.sqrt(6./(in_dim+out_dim)),
                high=numpy.sqrt(6./(in_dim+out_dim)),
                size=(in_dim, out_dim)
            ).astype('float32'), name='W'
        )
        self.b = theano.shared(numpy.zeros(out_dim).astype('float32'), name='bias')
        self.params = [self.W, self.b]
        self.optimizer = Optimizer(self.params, **args)

    def fprop(self, x):
        '''
        順伝播

        :param x: 入力
        :return: レイヤーの出力
        '''
        z = self.func(T.dot(x, self.W) + self.b)
        self.z = z
        return z
    
    def optimize(self, cost):
        '''
        パラメータの最適化
        See:
          http://deeplearning.net/software/theano/library/compile/function.html

        :param cost: コスト
        :return: Tuple(パラメータ，新しい値)のリスト
        '''
        updates = self.optimizer.update(cost)
        return list(updates.items())

## Optimizer

In [4]:
class Optimizer(object):
    def __init__(self, params):
        '''
        :param params: 更新するパラメータ
        '''
        self.params = params
    
    def update(self, cost):
        pass

## SGD

In [5]:
class SGD(Optimizer):
    def __init__(self, params, eps=0.01):
        '''
        :param params: 更新するパラメータ
        :param eps: 学習率
        '''
        super().__init__(params)
        self.eps = eps
    
    def update(self, cost):
        '''
        :param cost: コスト
        :return: 更新後のパラメータが格納されたOrderedDict
        '''
        updates = OrderedDict()

        gparams = T.grad(cost, self.params)
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - self.eps * gparam

        return updates

## SGD with Momentum

In [6]:
class Momentum(SGD):
    def __init__(self, params, eps=0.01, mu=0.9):
        '''
        :param params: 更新するパラメータ
        :param eps: 学習率
        :param mu: モメンタム
        '''
        SGD.__init__(self, params, eps)
        #super().__init__(eps) # http://stackoverflow.com/questions/222877/how-to-use-super-in-python
        self.mu = mu
        self.prev_gparams = [
            theano.shared(numpy.zeros(param.shape.eval()).astype('float32')) for param in params
        ]
    
    def update(self, cost):
        '''
        :param cost: コスト
        :return: 更新後のパラメータが格納されたOrderedDict
        '''
        updates = OrderedDict()

        gparams = T.grad(cost, self.params)
        for param, gparam, prev_gparam in zip(self.params, gparams, self.prev_gparams):
            updates[param] = param - self.eps * gparam + self.mu * prev_gparam
            updates[prev_gparam] = - self.eps * gparam + self.mu * prev_gparam

        return updates

## AdaGrad

* [Adaptive Subgradient Methods for Online Learning and Stochastic Optimization](http://jmlr.org/papers/v12/duchi11a.html "Adaptive Subgradient Methods for Online Learning and Stochastic Optimization")

In [7]:
class AdaGrad(Optimizer):
    def __init__(self, params, gamma=1.0, eps=1e-6):
        '''
        :param params: 更新するパラメータ
        :param gamma: 学習率の計算で利用する定数
        :param eps: パラメータが発散するのを防ぐために使う微少な値
        '''
        super().__init__(params)
        self.gamma = numpy.float32(gamma)
        self.eps = numpy.float32(eps)
        self.grads = [
            theano.shared(numpy.zeros(param.shape.eval()).astype('float32')) for param in self.params
        ]
    
    def update(self, cost):
        '''
        :param cost: コスト
        :return: 更新後のパラメータが格納されたOrderedDict
        '''
        updates = OrderedDict()

        gparams = T.grad(cost, self.params)
        for param, gparam, grad in zip(self.params, gparams, self.grads):
            tau = self.gamma / (T.sqrt(grad + gparam * gparam) + self.eps)
            updates[param] = param - tau * gparam
            updates[grad] = grad + gparam * gparam

        return updates

## Adam

* [Adam: A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980 "Adam: A Method for Stochastic Optimization")

In [8]:
class Adam(Optimizer):
    def __init__(self, params, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8, tau=1.0-1e-8):
        '''
        :param params: 更新するパラメータ
        :param alpha:
        :param beta1:
        :param beta2:
        :param eps: パラメータが発散するのを防ぐために使う微少な値
        :param tau:
        '''
        self.params = params
        self.m = [
            theano.shared(numpy.zeros(param.shape.eval()).astype('float32'), name="mean") for param in self.params
        ]
        self.v = [
            theano.shared(numpy.zeros(param.shape.eval()).astype('float32'), name="variance") for param in self.params
        ]
        self.alpha = numpy.float32(alpha)
        self.beta1 = theano.shared(numpy.float32(beta1), name="beta1")
        self.beta2 = numpy.float32(beta2)
        self.eps = numpy.float32(eps)
        self.tau = numpy.float32(tau)
    
    def update(self, cost):
        '''
        :param cost: コスト
        :return: 更新後のパラメータが格納されたOrderedDict
        '''
        updates = OrderedDict()

        gparams = T.grad(cost, self.params)
        beta1 = self.beta1 * self.tau
        for param, gparam, m, v in zip(self.params, gparams, self.m, self.v):
            new_m = beta1 * m + (numpy.float32(1.0) - beta1) * gparam
            new_v = self.beta2 * v + (numpy.float32(1.0) - self.beta2) * gparam * gparam
            m_hat = new_m / (numpy.float32(1.0) - beta1)
            v_hat = new_v / (numpy.float32(1.0) - self.beta2)
            updates[param] = param - self.alpha * m_hat / (T.sqrt(v_hat) + self.eps)
            updates[m] = new_m
            updates[v] = new_v
        updates[self.beta1] = beta1
        
        return updates

## Theano functionをコンパイル

In [9]:
def fprops(layers, x):
    '''
    ネットワーク全体の順伝播

    :param layers: ネットワーク 
    :param x: 入力
    :return: 出力層の出力
    '''
    z = x
    for layer in layers:
        z = layer.fprop(z)    
    return z

In [10]:
# Cost Function (Negative Log Likelihood)
def cross_entropy(y, d):
    '''
    交差エントロピーを計算する

    :param y: 出力層の出力
    :param d: 目標出力
    :return: 交差エントロピー
    '''
    # cf. http://deeplearning.net/tutorial/logreg.html#defining-a-loss-function
    # cf. http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing
    return -T.mean(T.log(y)[T.arange(d.shape[0]), d])

In [11]:
def compile_functions(layers):
    '''
    訓練とテストに利用するtheano.functionをコンパイルする

    :param layers: 学習対象のネットワーク
    :return: 訓練とテストに利用するtheano.function
    '''
    x, t = T.fmatrix("x"), T.ivector("t")
    
    y = fprops(layers, x)
    cost = cross_entropy(y, t)
    
    updates = []
    for layer in layers:
        updates += layer.optimize(cost)
    
    ## Compile
    train = theano.function([x,t], cost, updates=updates)
    test = theano.function([x,t],[cost, T.argmax(y, axis=1)])

    return train, test

## 実験

In [12]:
train_x, valid_x, train_y, valid_y = train_test_split(mnist_x, mnist_y, test_size=0.2, random_state=42)

### SGD

In [13]:
layers = [
    Layer(in_dim=784,  out_dim=1000, function=T.nnet.sigmoid, Optimizer=SGD),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=SGD),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=SGD),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=SGD),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=SGD),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=SGD),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=SGD),
    Layer(in_dim=1000, out_dim=10,   function=T.nnet.softmax, Optimizer=SGD),
]

train, test = compile_functions(layers)

batch_size = 100
nbatches = train_x.shape[0] // batch_size

for epoch in range(1000):
    train_x, train_y = shuffle(train_x, train_y)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        
        train(train_x[start:end], train_y[start:end])
    
    if ((epoch+1) % 10 == 0) or (epoch == 0):
        valid_cost, pred = test(valid_x, valid_y)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(valid_cost),
                                                                                     f1_score(valid_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 2.306, Validation F1:: 0.098
EPOCH::  10, Validatioon Cost:: 2.306, Validation F1:: 0.113
EPOCH::  20, Validatioon Cost:: 2.310, Validation F1:: 0.104
EPOCH::  30, Validatioon Cost:: 2.305, Validation F1:: 0.113
EPOCH::  40, Validatioon Cost:: 2.304, Validation F1:: 0.094
EPOCH::  50, Validatioon Cost:: 2.303, Validation F1:: 0.094
EPOCH::  60, Validatioon Cost:: 2.306, Validation F1:: 0.113
EPOCH::  70, Validatioon Cost:: 2.306, Validation F1:: 0.113
EPOCH::  80, Validatioon Cost:: 2.304, Validation F1:: 0.104
EPOCH::  90, Validatioon Cost:: 2.304, Validation F1:: 0.100
EPOCH:: 100, Validatioon Cost:: 2.303, Validation F1:: 0.104
EPOCH:: 110, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 120, Validatioon Cost:: 2.303, Validation F1:: 0.103
EPOCH:: 130, Validatioon Cost:: 2.303, Validation F1:: 0.098
EPOCH:: 140, Validatioon Cost:: 2.304, Validation F1:: 0.113
EPOCH:: 150, Validatioon Cost:: 2.302, Validation F1:: 0.104
EPOCH:: 160, Validatioon

### SGD with Momentum

In [14]:
layers = [
    Layer(in_dim=784,  out_dim=1000, function=T.nnet.sigmoid, Optimizer=Momentum),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Momentum),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Momentum),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Momentum),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Momentum),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Momentum),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Momentum),
    Layer(in_dim=1000, out_dim=10,   function=T.nnet.softmax, Optimizer=Momentum),
]

train, test = compile_functions(layers)

batch_size = 100
nbatches = train_x.shape[0] // batch_size

for epoch in range(1000):
    train_x, train_y = shuffle(train_x, train_y)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        
        train(train_x[start:end], train_y[start:end])
    
    if ((epoch+1) % 10 == 0) or (epoch == 0):
        valid_cost, pred = test(valid_x, valid_y)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(valid_cost),
                                                                                     f1_score(valid_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 2.330, Validation F1:: 0.104
EPOCH::  10, Validatioon Cost:: 2.303, Validation F1:: 0.104
EPOCH::  20, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH::  30, Validatioon Cost:: 2.301, Validation F1:: 0.113
EPOCH::  40, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH::  50, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH::  60, Validatioon Cost:: 2.301, Validation F1:: 0.113
EPOCH::  70, Validatioon Cost:: 2.301, Validation F1:: 0.113
EPOCH::  80, Validatioon Cost:: 2.301, Validation F1:: 0.113
EPOCH::  90, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 100, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 110, Validatioon Cost:: 2.301, Validation F1:: 0.113
EPOCH:: 120, Validatioon Cost:: 2.301, Validation F1:: 0.113
EPOCH:: 130, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 140, Validatioon Cost:: 2.302, Validation F1:: 0.104
EPOCH:: 150, Validatioon Cost:: 2.301, Validation F1:: 0.113
EPOCH:: 160, Validatioon

### AdaGrad

In [15]:
gamma = 0.01
layers = [
    Layer(in_dim=784,  out_dim=1000, function=T.nnet.sigmoid, Optimizer=AdaGrad, gamma=gamma),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=AdaGrad, gamma=gamma),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=AdaGrad, gamma=gamma),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=AdaGrad, gamma=gamma),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=AdaGrad, gamma=gamma),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=AdaGrad, gamma=gamma),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=AdaGrad, gamma=gamma),
    Layer(in_dim=1000, out_dim=10,   function=T.nnet.softmax, Optimizer=AdaGrad, gamma=gamma),
]

train, test = compile_functions(layers)

batch_size = 100
nbatches = train_x.shape[0] // batch_size

for epoch in range(500):
    train_x, train_y = shuffle(train_x, train_y)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        
        train(train_x[start:end], train_y[start:end])

    if ((epoch+1) % 10 == 0) or (epoch == 0):
        valid_cost, pred = test(valid_x, valid_y)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(valid_cost),
                                                                                     f1_score(valid_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 2.308, Validation F1:: 0.113
EPOCH::  10, Validatioon Cost:: 2.305, Validation F1:: 0.113
EPOCH::  20, Validatioon Cost:: 2.304, Validation F1:: 0.113
EPOCH::  30, Validatioon Cost:: 2.303, Validation F1:: 0.104
EPOCH::  40, Validatioon Cost:: 2.303, Validation F1:: 0.113
EPOCH::  50, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH::  60, Validatioon Cost:: 2.302, Validation F1:: 0.104
EPOCH::  70, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH::  80, Validatioon Cost:: 2.303, Validation F1:: 0.113
EPOCH::  90, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 100, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 110, Validatioon Cost:: 2.303, Validation F1:: 0.113
EPOCH:: 120, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 130, Validatioon Cost:: 2.302, Validation F1:: 0.104
EPOCH:: 140, Validatioon Cost:: 2.302, Validation F1:: 0.113
EPOCH:: 150, Validatioon Cost:: 2.302, Validation F1:: 0.104
EPOCH:: 160, Validatioon

### Adam

In [16]:
layers = [
    Layer(in_dim=784,  out_dim=1000, function=T.nnet.sigmoid, Optimizer=Adam),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Adam),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Adam),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Adam),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Adam),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Adam),
    Layer(in_dim=1000, out_dim=1000, function=T.nnet.sigmoid, Optimizer=Adam),
    Layer(in_dim=1000, out_dim=10,   function=T.nnet.softmax, Optimizer=Adam),
]

train, test = compile_functions(layers)

batch_size = 100
nbatches = train_x.shape[0] // batch_size

for epoch in range(300):
    train_x, train_y = shuffle(train_x, train_y)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        
        train(train_x[start:end], train_y[start:end])

    if ((epoch+1) % 10 == 0) or (epoch == 0):
        valid_cost, pred = test(valid_x, valid_y)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(valid_cost),
                                                                                     f1_score(valid_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 2.327, Validation F1:: 0.100
EPOCH::  10, Validatioon Cost:: 0.136, Validation F1:: 0.960
EPOCH::  20, Validatioon Cost:: 0.099, Validation F1:: 0.975
EPOCH::  30, Validatioon Cost:: 0.127, Validation F1:: 0.971
EPOCH::  40, Validatioon Cost:: 0.116, Validation F1:: 0.978
EPOCH::  50, Validatioon Cost:: 0.117, Validation F1:: 0.980
EPOCH::  60, Validatioon Cost:: 0.128, Validation F1:: 0.980
EPOCH::  70, Validatioon Cost:: 0.117, Validation F1:: 0.983
EPOCH::  80, Validatioon Cost:: 0.109, Validation F1:: 0.983
EPOCH::  90, Validatioon Cost:: 0.118, Validation F1:: 0.982
EPOCH:: 100, Validatioon Cost:: 0.127, Validation F1:: 0.981
EPOCH:: 110, Validatioon Cost:: 0.145, Validation F1:: 0.978
EPOCH:: 120, Validatioon Cost:: 0.112, Validation F1:: 0.982
EPOCH:: 130, Validatioon Cost:: 0.180, Validation F1:: 0.976
EPOCH:: 140, Validatioon Cost:: 0.141, Validation F1:: 0.984
EPOCH:: 150, Validatioon Cost:: 0.145, Validation F1:: 0.981
EPOCH:: 160, Validatioon