# 第4回講義 宿題

## 課題. MNISTデータセットを多層パーセプトロン(MLP)で学習せよ

### 注意
- homework関数を完成させて提出してください
    - 訓練データはtrain_X, train_y, テストデータはtest_Xで与えられます
    - train_Xとtrain_yをtrain_X, train_yとvalid_X, valid_yに分けるなどしてモデルを学習させてください
    - test_Xに対して予想ラベルpred_yを作り, homework関数の戻り値としてください
- pred_yのtest_yに対する精度(F値)で評価します
- 全体の実行時間がiLect上で60分を超えないようにしてください
- homework関数の外には何も書かないでください (必要なものは全てhomework関数に入れてください)
- 解答提出時には Answer Cell の内容のみを提出してください

- MLPの実装にTensorflowなどのライブラリを使わないでください

### ヒント
- 出力yはone-of-k表現
- 最終層の活性化関数はソフトマックス関数, 誤差関数は多クラス交差エントロピー
- 最終層のデルタは教科書参照

次のセルのhomework関数を完成させて提出してください

# Answer Cell

In [1]:
def homework(_train_X, _train_Y, test_X):
    """Train a three-layer MLP with plain NumPy and predict labels for test_X.

    The network is input -> 100 tanh units -> 50 tanh units -> softmax and is
    trained with mini-batch gradient descent. Every five epochs a fresh 10%
    hold-out is drawn from the training data and its cost drives early
    stopping.

    Args:
        _train_X: 2-D array of training samples, one row per sample.
        _train_Y: 1-D array of training labels (any sortable values).
        test_X: 2-D array of samples to classify, with the same number of
            columns as _train_X.

    Returns:
        1-D array with one predicted label per row of test_X. The values are
        taken from the distinct labels found in _train_Y.
    """
    # Imported locally: the assignment requires everything the solution needs
    # to live inside this function.
    import numpy as np

    # hyperparameters
    batch = 20
    eps = 0.01

    def softmax(x):
        """Row-wise softmax; the row maximum is subtracted so exp cannot overflow."""
        shifted = x - np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(shifted)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(x, params):
        """Run the forward pass and return (z1, z2, y) for the three layers."""
        W1, b1, W2, b2, W3, b3 = params
        z1 = np.tanh(np.matmul(x, W1) + b1)
        z2 = np.tanh(np.matmul(z1, W2) + b2)
        y = softmax(np.matmul(z2, W3) + b3)
        return z1, z2, y

    def cost_of(y, t):
        """Summed per-output negative log-likelihood used for early stopping."""
        # Keep probabilities strictly inside (0, 1): log(0) would yield
        # inf/NaN, and a NaN cost makes every later comparison False, which
        # silently disables early stopping.
        tiny = np.finfo(y.dtype).eps
        y = np.clip(y, tiny, 1 - tiny)
        return np.sum(-t * np.log(y) - (1 - t) * np.log(1 - y))

    def train_step(x, t, params):
        """Apply one gradient-descent update on a mini-batch; return new params."""
        W1, b1, W2, b2, W3, b3 = params
        z1, z2, y = forward(x, params)

        # NOTE(review): the textbook delta for softmax + cross-entropy is
        # (y - t). The original author reported much better accuracy when the
        # element-wise factor y * (1 - y) is also applied, so it is kept.
        delta_3 = y * (1 - y) * (y - t)                       # Layer3 delta
        # tanh'(u) = 1 - tanh(u)**2, and tanh(u) is already available as z.
        delta_2 = (1 - z2 ** 2) * np.matmul(delta_3, W3.T)    # Layer2 delta
        delta_1 = (1 - z1 ** 2) * np.matmul(delta_2, W2.T)    # Layer1 delta

        return (
            W1 - eps * np.matmul(x.T, delta_1),
            b1 - eps * delta_1.sum(axis=0),
            W2 - eps * np.matmul(z1.T, delta_2),
            b2 - eps * delta_2.sum(axis=0),
            W3 - eps * np.matmul(z2.T, delta_3),
            b3 - eps * delta_3.sum(axis=0),
        )

    # main
    data = np.asarray(_train_X)
    test_X = np.asarray(test_X)

    # One-hot encode the labels while remembering which label each column
    # stands for, so predictions can be mapped back to real labels.
    classes, class_index = np.unique(np.asarray(_train_Y), return_inverse=True)
    num = data.shape[0]
    targets = np.zeros((num, len(classes)))
    targets[np.arange(num), class_index.reshape(-1)] = 1
    validation_num = num // 10

    # layer setting
    len_columns = test_X.shape[1]
    len_categories = len(classes)
    params = (
        # Layer1 weights
        np.random.uniform(low=-0.08, high=0.08, size=(len_columns, 100)).astype('float32'),
        np.zeros(100).astype('float32'),
        # Layer2 weights
        np.random.uniform(low=-0.08, high=0.08, size=(100, 50)).astype('float32'),
        np.zeros(50).astype('float32'),
        # Layer3 weights
        np.random.uniform(low=-0.08, high=0.08, size=(50, len_categories)).astype('float32'),
        np.zeros(len_categories).astype('float32'),
    )

    # train
    validation_cost_min = float("inf")
    early_stopping_count = 0
    for epoch in range(1000):
        is_check_epoch = epoch % 5 == 0
        if is_check_epoch:
            # Draw the hold-out without replacement so it has exactly
            # validation_num distinct rows. Fancy/boolean indexing already
            # returns copies, so no explicit deep copy is needed.
            # NOTE(review): because the hold-out is redrawn every five
            # epochs, earlier hold-out rows do get trained on later.
            validation_index = np.random.choice(num, validation_num, replace=False)
            in_train = np.ones(num, dtype=bool)
            in_train[validation_index] = False
            train_X, train_y = data[in_train], targets[in_train]
            validation_X = data[validation_index]
            validation_y = targets[validation_index]

        # mini-batch learning
        for n in range(0, train_X.shape[0], batch):
            params = train_step(train_X[n:n + batch], train_y[n:n + batch], params)

        if is_check_epoch:
            validation_cost = cost_of(forward(validation_X, params)[-1], validation_y)
            if validation_cost_min < validation_cost:
                early_stopping_count += 1
            else:
                early_stopping_count = 0
                validation_cost_min = validation_cost

        # Stop after more than five consecutive checks without improvement.
        if early_stopping_count > 5:
            break

    probabilities = forward(test_X, params)[-1]
    # Translate the winning column index back into the original label value.
    return classes[probabilities.argmax(axis=1)]

- 以下のvalidate_homework関数を用いてエラーが起きないか動作確認をして下さい。
- 提出に際して、score_homework関数で60分で実行が終わることを確認して下さい。
- 評価は以下のscore_homework関数で行われますが、random_stateの値は変更されます。

# Checker Cell (for student)

In [2]:
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
import time 

import numpy as np

def load_mnist():
    """Fetch MNIST, shuffle it, divide the features by 255 and split 80/20.

    Returns:
        The result of train_test_split, unpacked by callers as
        (train_X, test_X, train_y, test_y).
    """
    # NOTE(review): fetch_mldata was deprecated in scikit-learn 0.20 and
    # removed in later releases; confirm the installed version still has it.
    dataset = fetch_mldata('MNIST original')

    features = dataset.data.astype('float32')
    labels = dataset.target.astype('int32')
    features, labels = shuffle(features, labels, random_state=42)

    features = features / 255.0

    return train_test_split(features, labels, test_size=0.2, random_state=42)

def validate_homework():
    """Run homework on a small MNIST slice and print the macro-averaged F1.

    Uses the first 1000 training rows and the first 100 test rows so that the
    check finishes quickly.
    """
    n_train, n_test = 1000, 100
    train_X, test_X, train_y, test_y = load_mnist()

    predicted = homework(train_X[:n_train], train_y[:n_train], test_X[:n_test])
    print(f1_score(test_y[:n_test], predicted, average='macro'))

def score_homework():
    """Run homework on the full MNIST split and print the macro-averaged F1."""
    train_X, test_X, train_y, test_y = load_mnist()
    predicted = homework(train_X, train_y, test_X)
    score = f1_score(test_y, predicted, average='macro')
    print(score)

In [7]:
# Time the quick smoke test; the value printed last is the elapsed time in
# seconds as measured with time.time().
start = time.time()
validate_homework()
end = time.time()
print(end-start)
# score_homework()

25
57.6591455428
40
23.8518824097
50
37.5896594737
65
12.0812161727
70
10.5694509892
80
26.4484350524
90
6.33613062895
100
4.94313083635
105
6.15635383415
110
5.31009110126
125
5.14861063166
130
4.91121832753
135
4.56079583582
140
4.82649019391
145
4.40783669766
150
4.47760207595
0.841292202208
7.034273862838745


In [3]:
# Time the full scoring run; the assignment asks that the whole run stays
# under 60 minutes. The value printed last is the elapsed time in seconds.
start = time.time()
score_homework()
end = time.time()
print(end-start)

45
223.982163285
55
116.743696131
60
169.917430143
65
96.3490784865
70
216.002913126
75
113.408338462
80
148.752564938
0.976366185955
206.42156267166138


1.20079128646
0.646603966863
0.509422359096
0.41167448721
0.345116980671
0.303201268896
0.273704189566
0.248743505878
0.305688774066
0.869893914823

In [None]:
# Keep the four arrays returned by load_mnist as notebook globals for the
# exploratory cells.
train_x, test_x, train_y, test_y = load_mnist()

In [None]:
# self validation

In [None]:
import matplotlib.pyplot as plt

# グラフをipython notebook内に表示
%matplotlib inline

In [None]:
def self_validate_homework():
    """Run homework on 100 training and 100 test rows and print the macro F1.

    If homework returns a ``(pred_y, costs)`` tuple, the costs are also
    plotted. The homework function defined in this notebook returns only the
    predicted labels, so a bare return value is accepted as well instead of
    failing on tuple unpacking.
    """
    train_X, test_X, train_y, test_y = load_mnist()

    # validate for small dataset
    train_X_mini = train_X[:100]
    train_y_mini = train_y[:100]
    test_X_mini = test_X[:100]
    test_y_mini = test_y[:100]

    result = homework(train_X_mini, train_y_mini, test_X_mini)
    if isinstance(result, tuple):
        pred_y, costs = result
    else:
        pred_y, costs = result, None

    print(f1_score(test_y_mini, pred_y, average='macro'))
    if costs is not None:
        plt.plot(np.arange(len(costs)), costs)
    

In [None]:
# Run the small self-validation defined in the previous cell.
self_validate_homework()

In [None]:
def one_to_hot(x):
    """Return a one-hot matrix for the 1-D label array x.

    Column i corresponds to the i-th smallest distinct value in x; each row
    holds 1 in the column of that row's label and 0 elsewhere.
    """
    labels = np.unique(x)
    encoded = np.zeros([x.shape[0], len(labels)])
    for col, label in enumerate(labels):
        # A boolean mask selects every row whose label equals this column's.
        encoded[x == label, col] = 1
    return encoded

In [None]:
# Show the one-hot row produced for the first test label.
one_to_hot(test_y)[0]

In [None]:
# Count how many test labels are equal to 0.
len(np.where(test_y==0)[0])

In [None]:
# Smallest value in the first test sample.
test_x[0].min()

In [None]:
# Display the shape of the training array.
train_x.shape

In [None]:
# Scratch array 0..6 for trying out boolean-mask indexing.
a = np.arange(7)

In [None]:
# Boolean mask that is True everywhere except position 3.
ind = np.ones(7, dtype=bool)
ind[3] = False

In [None]:
# Apply the mask: keeps every element of a whose mask entry is True.
a[ind]