# 第3回 演習課題

## 課題1．単純パーセプトロンの実装と学習

In [1]:
import numpy as np
from sklearn.utils import shuffle
np.random.seed(1234)

1.層をLayerクラスとして定義

In [2]:
class Layer:
    def __init__(self,in_dim,out_dim,function):
        self.W = np.zeros((in_dim,out_dim))
        self.b = np.zeros(out_dim)
        self.function = function

    #forward propagation
    def fprop(self,x):
        u = x.dot(self.W) + self.b
        z = self.function(u)
        return z

2.ステップ関数

ヒント：ステップ関数

* $u\geq0$のとき，$f(u)=+1$
* $u<0$のとき，$f(u)=-1$

In [3]:
def step(x):
    y = np.sign(x)
    y[y==0] = 1
    return y

4.データセットの設定とレイヤーインスタンス

In [4]:
#OR
train_X = np.array([[0,1],[1,0],[0,0],[1,1]])
train_y = np.array([[1],[1],[-1],[1]])
test_X,test_y = train_X,train_y

layer = Layer(2,1,step)

5.train関数とtest関数

ヒント：パーセプトロン学習則

$y_n\neq d_n$のとき
* $w^{(t+1)}=w^{(t)}+\epsilon x_nd_n$　
* $b^{(t+1)}=b^{(t)}+\epsilon d_n$

In [5]:
def train(x,d,eps=1):
    #forward propagation
    y = layer.fprop(x)

    #update parameters
    if y * d != 1:
        layer.W = layer.W + eps*d*x.T
        layer.b = layer.b + eps*d

def test(x):
    y = layer.fprop(x)
    return y

5.パラメータの更新

In [6]:
#epoch
for epoch in range(10):
    #online learning
    for x,y in zip(train_X,train_y):
        train(x[np.newaxis,:],y[np.newaxis,:],eps=1)
pred_y = test(test_X)
print(pred_y)

[[ 1.]
 [ 1.]
 [-1.]
 [ 1.]]


## 課題2．活性化関数とその微分の実装

1.シグモイド関数とその微分

In [7]:
def sigmoid(x):
    return 1.0/(1.0 + np.exp(-x))
def diff_sigmoid(x):
    v = sigmoid(x)
    return v * (1.0 - v)

２.ソフトマックス関数とその微分

In [8]:
def softmax(x):
    v = np.exp(x)
    return v / np.sum(v, axis=1)[:, np.newaxis]
def diff_softmax(x):
    v = softmax(x)
    return v * (np.ones(x.shape) - v)

2.tanh関数とその微分

In [9]:
def tanh(x):
    return np.tanh(x)
def diff_tanh(x):
    v = tahnh(x)
    return 1 - v*v

## 課題3．多層パーセプトロンの実装と学習

1.Layerクラス

In [10]:
class Layer:
    def __init__(self,in_dim,out_dim,function,diff_function):
        #Xavier
        self.W = np.random.uniform(
                                    low=-np.sqrt(6./(in_dim+out_dim)), 
                                    high=np.sqrt(6./(in_dim+out_dim)), 
                                    size=(in_dim, out_dim))
        self.b = np.zeros(out_dim)
        self.function = function
        
        self.diff_function = diff_function
        self.u     = None
        self.delta = None

    #forward propagation
    def fprop(self,x):
        self.u = np.dot(x, self.W) + self.b
        z = self.function(self.u)
        return z

    #back propagation
    def bprop(self,delta,W):
        self.delta = self.diff_function(self.u) * delta.dot(W.T)
        return self.delta

2.ネットワーク全体の順伝播

In [11]:
def fprops(layers, x):
    z = x
    for layer in layers:
        z = layer.fprop(z)    
    return z

3.ネットワーク全体の誤差逆伝播

In [12]:
def bprops(layers, delta):
    for i,layer in enumerate(layers[::-1]):
        if i == 0:
            layer.delta = delta
        else:
            delta = layer.bprop(delta, _W)
        _W = layer.W

4.データセットの設定とネットワークの定義

In [13]:
#XOR
train_X = np.array([[0,1],[1,0],[0,0],[1,1]])
train_y = np.array([[1],[1],[0],[0]])
test_X,test_y = train_X,train_y

layers = [Layer(2,3,sigmoid,diff_sigmoid),
          Layer(3,1,sigmoid,diff_sigmoid)]

5.train関数とtest関数

In [14]:
def train(X,d,eps=1):
    #forward propagation
    y = fprops(layers,X)
        
    #cost function & delta
    cost = np.sum(-d * np.log(y) - (1 - d) * np.log(1 - y))
    delta = y - d
    
    #back propagation
    bprops(layers,delta)

    #update parameters
    z = X
    for layer in layers:
        dW = np.dot(z.T, layer.delta)  # P.52
        db = np.dot(np.ones(len(z)),layer.delta)  # P.52

        layer.W = layer.W - eps*dW
        layer.b = layer.b - eps*db

        z = layer.fprop(z)
        
    #train cost
    y = fprops(layers,X)
    cost = np.sum(-d * np.log(y) - (1 - d) * np.log(1 - y)) # (2.8)
    
    return cost

def test(X,d):
    #test cost
    y = fprops(layers,X)
    cost = np.sum(-d * np.log(y) - (1 - d) * np.log(1 - y))
    return cost,y

6.パラメータの更新

In [15]:
#epoch
for epoch in range(100):
    #online learning
    train_X, train_y = shuffle(train_X, train_y)
    train(train_X, train_y)
    #for x,y in zip(train_X,train_y):
    #    train(x[np.newaxis,:],y[np.newaxis,:])
    n,pred_y = test(test_X,test_y)
print(pred_y)

[[ 0.9040565 ]
 [ 0.9355088 ]
 [ 0.03297601]
 [ 0.12862548]]


## 宿題．MNISTデータセットを多層パーセプトロンで学習

- データはmnist_x,mnist_yで与えられます
    - mnsit_xとmnist_yをtrain_X,train_yとtest_X,test_yに分けて，モデルを学習してください

ヒント
* 出力yはone-of-k表現
* 最終層の活性化関数はsoftmax関数，誤差関数は多クラス交差エントロピー
* 最終層のデルタは教科書参照

In [None]:
import matplotlib.pyplot as plt
import numpy

from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score

from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original', data_home='.')

X, y = shuffle(mnist.data, mnist.target)
X = X / 255.0
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

次のセルに解答を書いて提出してください
- レイヤークラスなど，必要なものは全て書いてください.
- システム側では，pred_yの結果から評価します.
- test関数の戻り値（pred_y）は，one-of-k表現（one-hot）のままで大丈夫です.

In [16]:
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score

In [17]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

In [18]:
mnist_x, mnist_y = shuffle(mnist.data.astype("float32")/255.0, mnist.target.astype("int32"))
train_x, test_x, train_y, test_y = train_test_split(mnist_x, mnist_y, test_size=0.2, random_state=42)

In [19]:
# one-of-k表現に変換
from sklearn.preprocessing import LabelBinarizer
train_label_y = LabelBinarizer().fit_transform(train_y)
test_label_y = LabelBinarizer().fit_transform(test_y)

### Layerクラス

In [20]:
class Layer:
    def __init__(self, in_dim, out_dim, function, diff_function):
        #Xavier
        self.W = np.random.uniform(low=-np.sqrt(6./(in_dim+out_dim)), 
                                   high=np.sqrt(6./(in_dim+out_dim)), 
                                   size=(in_dim, out_dim)).astype('float32')
        self.b = np.zeros(out_dim)
        self.function = function
        
        self.diff_function = diff_function
        self.u     = None
        self.delta = None

    #forward propagation
    def fprop(self,x):
        self.u = np.dot(x, self.W) + self.b
        z = self.function(self.u)
        return z

    #back propagation
    def bprop(self,delta,W):
        self.delta = self.diff_function(self.u) * np.dot(delta, W.T)
        return self.delta

### ネットワーク全体の順伝播

In [21]:
def fprops(layers, x):
    z = x
    for layer in layers:
        z = layer.fprop(z)    
    return z

### ネットワーク全体の誤差逆伝播

In [22]:
def bprops(layers, delta):
    for i,layer in enumerate(layers[::-1]):
        if i == 0:
            layer.delta = delta
        else:
            delta = layer.bprop(delta, _W)
        _W = layer.W

### ネットワークの定義

In [29]:
layers = [
    Layer(784, 1000, sigmoid, diff_sigmoid),
    Layer(1000, 1000, sigmoid, diff_sigmoid),
    Layer(1000, 1000, sigmoid, diff_sigmoid),
    Layer(1000, 10, softmax, diff_softmax),
]

### train関数とtest関数

In [24]:
def train(X, d, eps=0.1):
    # forward propagation
    y = fprops(layers, X)

    # 出力層のdelta
    delta = y - d
    
    # back propagation
    bprops(layers, delta)

    # update parameters
    z = X
    for layer in layers:
        dW = np.dot(z.T, layer.delta) / X.shape[0] # P.52
        db = np.dot(np.ones(len(z)), layer.delta) / X.shape[0] # P.52

        layer.W = layer.W - eps*dW
        layer.b = layer.b - eps*db

        z = layer.fprop(z)
        
    # train cost
    #y = fprops(layers, X)
    #cost = -np.sum(d * np.log(y)) # (2.11)
    #return cost

def test(X, d):
    # test cost
    y = fprops(layers, X)
    cost = -np.mean(d * np.log(y)) # (2.11)
    return cost, y

### パラメータの更新

#### オンライン学習

`train_x`の行毎に学習する

In [25]:
# epoch
for epoch in range(10):
    X, Y = shuffle(train_x, train_label_y)
    for x, y in zip(X, Y):
        train(x[np.newaxis,:], y[np.newaxis,:], 0.01)

    if True:
    #if ((epoch+1) % 10 == 0) or (epoch == 0):
        cost, pred_y = test(test_x, test_label_y)
        pred = np.argmax(pred_y, axis=1)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(cost),
                                                                                     f1_score(test_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 0.037, Validation F1:: 0.888
EPOCH::   2, Validatioon Cost:: 0.032, Validation F1:: 0.904
EPOCH::   3, Validatioon Cost:: 0.024, Validation F1:: 0.928
EPOCH::   4, Validatioon Cost:: 0.023, Validation F1:: 0.933
EPOCH::   5, Validatioon Cost:: 0.020, Validation F1:: 0.941
EPOCH::   6, Validatioon Cost:: 0.018, Validation F1:: 0.949
EPOCH::   7, Validatioon Cost:: 0.017, Validation F1:: 0.953
EPOCH::   8, Validatioon Cost:: 0.017, Validation F1:: 0.957
EPOCH::   9, Validatioon Cost:: 0.015, Validation F1:: 0.962
EPOCH::  10, Validatioon Cost:: 0.015, Validation F1:: 0.960


#### ミニバッチ学習

教科書 3.3節

In [30]:
## Iterate
batch_size = 100
nbatches = train_x.shape[0] // batch_size

for epoch in range(500):
    x, y = shuffle(train_x, train_label_y)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        train(x[start:end], y[start:end], 0.01)

    #if True:
    if ((epoch+1) % 10 == 0) or (epoch == 0):
        cost, pred_y = test(test_x, test_label_y)
        pred = np.argmax(pred_y, axis=1)
        print("EPOCH:: {:3d}, Validatioon Cost:: {:.3f}, Validation F1:: {:.3f}".format(epoch+1,
                                                                                     float(cost),
                                                                                     f1_score(test_y, pred, average="micro")))

EPOCH::   1, Validatioon Cost:: 0.230, Validation F1:: 0.135
EPOCH::  10, Validatioon Cost:: 0.176, Validation F1:: 0.515
EPOCH::  20, Validatioon Cost:: 0.071, Validation F1:: 0.785
EPOCH::  30, Validatioon Cost:: 0.048, Validation F1:: 0.864
EPOCH::  40, Validatioon Cost:: 0.041, Validation F1:: 0.883
EPOCH::  50, Validatioon Cost:: 0.038, Validation F1:: 0.892
EPOCH::  60, Validatioon Cost:: 0.036, Validation F1:: 0.898
EPOCH::  70, Validatioon Cost:: 0.034, Validation F1:: 0.902
EPOCH::  80, Validatioon Cost:: 0.033, Validation F1:: 0.905
EPOCH::  90, Validatioon Cost:: 0.032, Validation F1:: 0.908
EPOCH:: 100, Validatioon Cost:: 0.031, Validation F1:: 0.910
EPOCH:: 110, Validatioon Cost:: 0.031, Validation F1:: 0.911
EPOCH:: 120, Validatioon Cost:: 0.030, Validation F1:: 0.915
EPOCH:: 130, Validatioon Cost:: 0.029, Validation F1:: 0.915
EPOCH:: 140, Validatioon Cost:: 0.029, Validation F1:: 0.917
EPOCH:: 150, Validatioon Cost:: 0.028, Validation F1:: 0.919
EPOCH:: 160, Validatioon

In [31]:
# テストデータを用いて予測精度を計算
from sklearn.metrics import confusion_matrix, classification_report
pred = [np.argmax(v) for v in pred_y]
print(confusion_matrix(test_y, pred))
print(classification_report(test_y, pred))

[[1315    1    3    4    1    7    8    1    6    2]
 [   0 1517    8   10    3    1    2    5    2    2]
 [   2    7 1379   13    4    1    5    8    7    2]
 [   2    5   14 1308    0   21    1    8   11    6]
 [   3    6    3    2 1344    2   16    3    4   45]
 [   8    3    0   11    4 1214   10    0    5    7]
 [   7    3    3    0    2    8 1344    0    8    0]
 [   0    8    7    6   13    2    0 1399    2   14]
 [   5    9    7   17    2   10    4    3 1310   11]
 [   0    6    0   15   20    7    0    8    7 1341]]
             precision    recall  f1-score   support

          0       0.98      0.98      0.98      1348
          1       0.97      0.98      0.97      1550
          2       0.97      0.97      0.97      1428
          3       0.94      0.95      0.95      1376
          4       0.96      0.94      0.95      1428
          5       0.95      0.96      0.96      1262
          6       0.97      0.98      0.97      1375
          7       0.97      0.96      0.97  