# Foward/Backward Computation

In [1]:
import numpy as np
import chainer
from chainer import Function, gradient_check, report, training, utils, Variable
from chainer import datasets, iterators, optimizers, serializers
from chainer import link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from chainer.training import extensions

In [2]:
x_data = np.array([5], dtype=np.float32)
x = Variable(x_data)  # trainableな変数となる

In [3]:
x, x.data  # can get value from data attribute then it's array

(variable([ 5.]), array([ 5.], dtype=float32))

In [4]:
y = x**2 -2 * x +1

In [5]:
y.data

array([ 16.], dtype=float32)

In [6]:
y.backward()  # backwordを行うと
x.grad  # それぞれの変数においての勾配が計算される．

array([ 8.], dtype=float32)

In [7]:
z = 2*x
y = x**2 - z +1
y.backward(retain_grad=True)
z.grad  # 中間の変数の勾配はメモリ効率のため，retain_grad=Trueとしないと解放される．

array([-1.], dtype=float32)

In [8]:
y.backward()  # そうしないと勾配を保持しない

In [9]:
z.grad is None

True

In [10]:
x.grad  # 最初の変数は勾配が常にある．

array([ 24.], dtype=float32)

In [11]:
x = Variable(np.array([[1, 2 ,3], [4, 5, 6]], dtype=np.float32))
y = x**2 -2*x +1
# 多次元配列を入力とするときは初期誤差を明示的に書かなければならない
y.grad = np.ones((2, 3), dtype=np.float32)
y.backward()
x.grad

array([[  0.,   2.,   4.],
       [  6.,   8.,  10.]], dtype=float32)

In [12]:
x = Variable(np.array([[1, 2 ,3], [4, 5, 6]], dtype=np.float32))
y = x**2 -2*x +1
# y.grad = np.ones((2, 3), dtype=np.float32)
y.backward()
x.grad is None

True

# Links

In [13]:
# accept input value whose shape is (N, 3) and this functions output shape is (N,2)
f = L.Linear(3, 2)

In [14]:
f.W  # instance of Variable

variable W([[ 1.36686826, -0.64760172, -0.41609076],
            [ 0.47256982, -1.13561976,  0.17554796]])

In [15]:
f.W.data, f.b.data  # W,b はインスタンス化した際に初期化される．よって何度このセルを実行しても値は同じ

(array([[ 1.36686826, -0.64760172, -0.41609076],
        [ 0.47256982, -1.13561976,  0.17554796]], dtype=float32),
 array([ 0.,  0.], dtype=float32))

In [16]:
x = Variable(np.array([[1, 2, 3], [4, 5, 6]],
                      dtype=np.float32))  # input
y = f(x)  # fully-connected
y.data

array([[-1.17660749, -1.27202582],
       [-0.26708007, -2.73453164]], dtype=float32)

In [17]:
f = L.Linear(2)

In [18]:
f.W, f.b  # 入力次元を指定してないので, W はデータが流れるまでは初期化されない

(variable W(None), variable b([ 0.,  0.]))

In [19]:
x = Variable(np.array([[1, 2, 3], [4, 5, 6]],
                      dtype=np.float32))  # input
y = f(x)  # fully-connected
print(f.W, f.b)  # 推論したら値が初期化されている．
y.data

variable W([[-0.24030645 -0.07181747  0.50280213]
            [-0.87310749  0.12547415  0.24238083]]) variable b([ 0.  0.])


array([[ 1.12446499,  0.10498327],
       [ 1.69649959, -1.41077423]], dtype=float32)

In [20]:
f.cleargrads()

In [21]:
y.grad = np.ones((2, 2), dtype=np.float32)
y.backward()
f.W.grad, f.b.grad

(array([[ 5.,  7.,  9.],
        [ 5.,  7.,  9.]], dtype=float32), array([ 2.,  2.], dtype=float32))

In [22]:
y.backward()

In [23]:
f.W.grad, f.b.grad  # cleargradsをしないと蓄積される

(array([[ 10.,  14.,  18.],
        [ 10.,  14.,  18.]], dtype=float32), array([ 4.,  4.], dtype=float32))

In [24]:
f.cleargrads()
y.grad = np.ones((2, 2), dtype=np.float32)
y.backward()
f.W.grad, f.b.grad

(array([[ 5.,  7.,  9.],
        [ 5.,  7.,  9.]], dtype=float32), array([ 2.,  2.], dtype=float32))

# Write a model as a chain

In [25]:
l1 = L.Linear(4, 3)
l2 = L.Linear(3, 2)
def my_forward(x):
    h = l1(x)
    return l2(h)

In [26]:
class MyProc(object):
    def __init__(self):
        self.l1 = L.Linear(4, 3)
        self.l2 = L.Linear(3, 2)
        
    def forward(self, x):
        h = self.l1(x)
        return self.l2(h)

In [27]:
class MyChain(Chain):
    def __init__(self):
        # MyChainクラスのスーパークラスの初期化メソッドを呼ぶ．単にsuper().__init__でもOK
        super(MyChain, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(4, 3)
            self.l2 = L.Linear(3, 2)
        
    def __call__(self, x):
        h = self.l1(x)
        return self.l2(h)

In [28]:
class MyChain2(ChainList):
    def __init__(self):
        # MyChain2クラスのスーパークラスの初期化メソッドを呼ぶ．単にsuper().__init__でもOK
        super(MyChain2, self).__init__(
            L.Linear(4, 3),
            L.Linear(3, 2),
        )

        
    def __call__(self, x):
        h = self[0](x)
        return self[1](h)

# Optimizer

In [29]:
model = MyChain()
optimizer = optimizers.SGD(lr=0.01)
optimizer.setup(model)

In [30]:
model, optimizer

(<__main__.MyChain at 0x7f7710f1e8d0>,
 <chainer.optimizers.sgd.SGD at 0x7f7710f1edd8>)

In [31]:
# あらかじめadd_hookメソッドを呼ぶことで正則化などができる
optimizer.add_hook(chainer.optimizer.WeightDecay(0.0005))

In [32]:
x = np.random.uniform(-1, 1, (2, 4)).astype('f')
x

array([[-0.0366318 , -0.43847084, -0.35300794, -0.40803644],
       [-0.91343999, -0.55565655, -0.35723013, -0.72801453]], dtype=float32)

In [33]:
model.cleargrads()
# chainerを使って定義した機械学習ネットワークにデータを投げる時には型をVariableにしなければいけません。
# その時、引数に与えるデータは常にnp.arrayを用いる必要があります。

loss = model(chainer.Variable(x))  # define loss function
loss

variable([[-0.34955841, -0.09633632],
          [-0.70786154, -0.01984116]])

In [34]:
loss = F.sum(loss)
loss

variable(-1.1735974550247192)

In [35]:
loss.backward()  # compute gradients here
optimizer.update()

In [36]:
model.l1.W.grad

array([[-0.26027855, -0.27260467, -0.19440885, -0.31117183],
       [ 0.42476881,  0.44423321,  0.31731641,  0.50752056],
       [-0.85289735, -0.89215851, -0.63798934, -1.01940405]], dtype=float32)

In [37]:
model.cleargrads()  # 勾配消去
loss.backward()  # compute gradient
optimizer.update()  # 最適化
print(model.l1.W.data)  # 勾配を消して,backwordしてupdateすれば，変数が最適化されていく
model.l1.W.grad

[[-0.05605068 -0.57227468  0.28936765  0.04687699]
 [ 0.484687    0.04224001 -0.08731898 -0.21627571]
 [ 0.14677453  0.73036766 -0.68068922  1.07621634]]


array([[-0.2529926 , -0.26498091, -0.18896216, -0.30245966],
       [ 0.42044103,  0.43970475,  0.31408113,  0.50234562],
       [-0.88210183, -0.92271727, -0.65982145, -1.05432534]], dtype=float32)

In [38]:
model2 = MyChain()
optimizer = optimizers.SGD(lr=0.01)
optimizer.setup(model2)

In [39]:
np.random.seed(1)
def lossfun(arg1, arg2):
    # calculate loss
    loss = F.sum(model2(arg1 - arg2))
    return loss
arg1 = np.random.uniform(-1, 1, (100, 4)).astype('f')  # 入力データ1
arg2 = np.random.uniform(-1, 1, (100, 4)).astype('f')  # 入力データ2
# lossfunc以外の引数はlossfuncの引数である必要がある. この方法での最適化だとcleargradsは必要ない
optimizer.update(lossfun, chainer.Variable(arg1), chainer.Variable(arg2)) 
print(model2.l1.W.data)
model.l1.W.grad

[[-0.9480201   0.47994247  0.80694389  0.3817637 ]
 [-0.15900344 -0.57771677  0.54771912  0.95340836]
 [ 0.46522558 -1.3306762   0.19725969 -0.0401984 ]]


array([[-0.2529926 , -0.26498091, -0.18896216, -0.30245966],
       [ 0.42044103,  0.43970475,  0.31408113,  0.50234562],
       [-0.88210183, -0.92271727, -0.65982145, -1.05432534]], dtype=float32)

[[-0.91590959  0.44996142  0.85731137  0.30127001]
 [-0.21076667 -0.52938628  0.46652496  1.08316696]
 [ 0.5580737  -1.41736698  0.34289834 -0.27294749]]
[[ -3.2110486    2.99810553  -5.03674984   8.0493679 ]
 [  5.17632389  -4.83305168   8.11941719 -12.97586441]
 [ -9.28481388   8.66908264 -14.56386471  23.27490997]]
[[-0.79964656  0.34140849  1.03967798  0.00982505]
 [-0.36427641 -0.38605666  0.22573443  1.46798086]
 [ 0.83279109 -1.67386627  0.77381134 -0.96160132]]
[[-11.62630558  10.85529327 -18.23665619  29.14449692]
 [ 15.35097504 -14.33296204  24.07905388 -38.48139191]
 [-27.47174263  25.64992523 -43.09130478  68.86538696]]
[[-0.53176022  0.09128734  1.45987594 -0.66170478]
 [-0.72829759 -0.04617581 -0.3452577   2.38049889]
 [ 1.48447061 -2.28232908  1.79601538 -2.59521341]]
[[ -26.78863144   25.01211548  -42.01979446   67.15298462]
 [  36.4021225   -33.9880867    57.09921265  -91.2518158 ]
 [ -65.16796112   60.84628677 -102.22040558  163.36120605]]


[[ 0.40302911  0.13216969  0.33647144  0.04901356]
 [ 0.43127936 -0.81043494 -0.44686693  0.21333061]
 [-0.01339246 -0.12016282 -0.08008625  0.23532893]]
[[  3.98837996  -3.72388697   6.25604582  -9.99795818]
 [ -5.96011114   5.5648613   -9.34884071  14.94063854]
 [ -0.61966693   0.57857305  -0.97198975   1.55336356]]
[[ 0.37370729  0.159547    0.29047817  0.12251665]
 [ 0.48439357 -0.86002684 -0.36355367  0.08018543]
 [-0.01370221 -0.11987362 -0.08057211  0.2361054 ]]
[[  2.93218112  -2.73773098   4.59932613  -7.35030842]
 [ -5.31141949   4.95918941  -8.33132553  13.31451797]
 [  0.03097464  -0.02892053   0.04858584  -0.07764636]]
[[ 0.27114359  0.2553091   0.12959997  0.37962049]
 [ 0.65625489 -1.020491   -0.09397742 -0.35063177]
 [-0.00749792 -0.12566645 -0.07084027  0.22055268]]
[[ 10.2563715   -9.57621098  16.08782005 -25.71038437]
 [-17.18613434  16.04641342 -26.95762444  43.08171844]
 [ -0.62042809   0.57928395  -0.97318405   1.55527186]]
