# 第6回講義 宿題

## 課題

CNNを用いて、FashionMNISTの高精度な分類器を実装してみましょう。

モデルのレイヤーを変更してみるなどして精度の向上にチャンレンジして下さい。 精度上位者はリーダーボードに載ります。

### 目標値

Accuracy 93%

### ルール

- 訓練データはx_train、 t_train、テストデータはx_testで与えられます。
- 予測ラベルは one_hot表現ではなく0~9のクラスラベル で表してください。
- **下のセルで指定されているx_train、t_train以外の学習データは使わないでください。**
- Tensorflowを利用して構いません。
- ただし、**tf.layersのような高レベルのAPIは使用しないで下さい。**具体的に以下のモジュールは使用しないでください。

```
tf.app,
tf.compat,
tf.contrib,
tf.estimator,
tf.gfile,
tf.graph_util,
tf.image,
tf.initializers,
tf.keras,
tf.layers,
tf.logging,
tf.losses,
tf.metrics,
tf.python_io,
tf.resource_loader,
tf.saved_model,
tf.sets,
tf.summary,
tf.sysconfig,
tf.test
```

### 提出方法

- 2つのファイルを提出していただきます。
  - テストデータ (x_test) に対する予測ラベルをcsvファイル (ファイル名: submission_pred.csv) で提出してください。
  - それに対応するpythonのコードをsubmission_code.pyとして提出してください (%%writefileコマンドなどを利用してください)。

### 評価方法

- 予測ラベルのt_testに対する精度 (Accuracy) で評価します。
- 毎日夜24時にテストデータの一部に対する精度でLeader Boardを更新します。
- 締切日の夜24時にテストデータ全体に対する精度でLeader Boardを更新します。これを最終的な評価とします。

### データの読み込み

- この部分は修正しないでください

In [2]:
!pip3 install tensorflow-gpu==1.8

Collecting tensorflow-gpu==1.8
  Downloading https://files.pythonhosted.org/packages/f2/fa/01883fee1cdb4682bbd188edc26da5982c459e681543bb7f99299fca8800/tensorflow_gpu-1.8.0-cp35-cp35m-manylinux1_x86_64.whl (216.3MB)
[K    100% |################################| 216.3MB 6.9kB/s 
Collecting tensorboard<1.9.0,>=1.8.0 (from tensorflow-gpu==1.8)
  Downloading https://files.pythonhosted.org/packages/59/a6/0ae6092b7542cfedba6b2a1c9b8dceaf278238c39484f3ba03b03f07803c/tensorboard-1.8.0-py3-none-any.whl (3.1MB)
[K    100% |################################| 3.1MB 519kB/s 
Collecting bleach==1.5.0 (from tensorboard<1.9.0,>=1.8.0->tensorflow-gpu==1.8)
  Downloading https://files.pythonhosted.org/packages/33/70/86c5fec937ea4964184d4d6c4f0b9551564f821e1c3575907639036d9b90/bleach-1.5.0-py2.py3-none-any.whl
Collecting html5lib==0.9999999 (from tensorboard<1.9.0,>=1.8.0->tensorflow-gpu==1.8)
  Downloading https://files.pythonhosted.org/packages/ae/ae/bcb60402c60932b32dfaf19bb53870b29eda2cd17551ba56392

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

try:
    del [
        tf.app,
        tf.compat,
        tf.contrib,
        tf.estimator,
        tf.gfile,
        tf.graph_util,
        tf.image,
        tf.initializers,
        tf.keras,
        tf.layers,
        tf.logging,
        tf.losses,
        tf.metrics,
        tf.python_io,
        tf.resource_loader,
        tf.saved_model,
        tf.sets,
        tf.summary,
        tf.sysconfig,
        tf.test
    ]
    
except AttributeError:
    print('Unrequired modules are already deleted (Skipped).')

def load_mnist():

    # 学習データ
    x_train = np.load('/root/userspace/public/chap06/data/x_train.npy')
    t_train = np.load('/root/userspace/public/chap06/data/t_train.npy')
    
    # テストデータ
    x_test = np.load('/root/userspace/public/chap06/data/x_test.npy')

    x_train = x_train.reshape(-1, 28, 28, 1).astype('float32') / 255
    x_test = x_test.reshape(-1, 28, 28, 1).astype('float32') / 255
    t_train = np.eye(10)[t_train.astype('int32').flatten()]

    return (x_train, x_test, t_train)

### 畳み込みニューラルネットワーク(CNN)の実装

In [6]:
# %%writefile /root/userspace/chap06/materials/submission_code.py

# import tensorboard as tb
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(1234)
random_state = 42

### レイヤー定義 ###

class Conv:
    def __init__(self, filter_shape, function = lambda x: x, strides = [1,1,1,1], padding = 'VALID'):
        # He initializationを使う
        # filter_shape = Height * Width * Num of input_channels * Num of output_channels
        fun_in = np.prod(filter_shape[:3])
        fun_out = np.prod(filter_shape[:2]) * filter_shape[3]
        self.W = tf.Variable(rng.uniform(
                low = -np.sqrt(6/ fun_in),
                high = np.sqrt(6/ fun_out),
                size = filter_shape
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((filter_shape[3]), dtype = 'float32'), name = 'b')
        self.function = function
        self.strides = strides
        self.padding = padding
    # WRITE ME
    
    def __call__(self, x):
        u = tf.nn.conv2d(x, self.W, strides = self.strides, padding = self.padding) + self.b
        return self.function(u)
    
    
class Pooling:
    def __init__(self, ksize = [1, 2, 2, 1] , strides = [1, 2, 2, 1], padding = 'VALID'):
        self.ksize = ksize
        self.strides = strides
        self.padding = padding
    
    def __call__(self, x):
        return tf.nn.max_pool(x, ksize = self.ksize, strides = self.strides, padding = self.padding)
    # WRITE ME
    
    
class Flatten:
    def __call__(self, x):
        return tf.reshape(x, (-1, np.prod(x.get_shape().as_list()[1:])))
    # WRITE ME
    
    
class Dense:
    def __init__(self, in_dim, out_dim, function = lambda x: x):
        # ここでも, He Initialization
        self.W = tf.Variable(rng.uniform(
                low = - np.sqrt(6/ in_dim),
                high = np.sqrt(6/ in_dim),
                size = [in_dim, out_dim]
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((out_dim), dtype = 'float32'), name = 'b')
        self.function = function
        
    def __call__(self, x):
        u = tf.matmul(x, self.W) + self.b
        return self.function(u)
    
def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))
    # WRITE ME
    


x_train, x_test, t_train = load_mnist()
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train, test_size=0.1, random_state=random_state)

### ネットワーク ###
tf.reset_default_graph()

x = tf.placeholder(tf.float32, [None, 28, 28, 1])
t = tf.placeholder(tf.float32, [None, 10])
                                                                          
h = Conv((5, 5, 1, 20), tf.nn.relu)(x)
h = Pooling((1, 2, 2, 1))(h)
h = Conv((5, 5, 20, 50), tf.nn.relu)(h)
h = Pooling((1, 2, 2, 1))(h)
h = Flatten()(h)
y = Dense(800, 10, tf.nn.softmax)(h)

cost = - tf.reduce_mean(tf.reduce_sum(t * tf_log(y), axis=1))
train = tf.train.GradientDescentOptimizer(0.01).minimize(cost)

### 学習 ###

n_epochs = 10
batch_size = 100
n_batches = x_train.shape[0]//batch_size #Floor division(打ち切り除算)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        x_train_fmnist, t_train_fmnist = shuffle(x_train, t_train, random_state = random_state)
        for batch in range(n_batches):
            start = batch * batch_size
            finish = start + batch_size
            sess.run(train, feed_dict = {x: x_train_fmnist[start:finish], t:t_train_fmnist[start:finish]})
        y_pred_, valid_cost_ = sess.run([y, cost], 
                            feed_dict = {x: x_valid, t:t_valid}
                            )
        print("EPOCH: {}, Validation_Cost: {:.3f}, Accuracy_Score: {:.3f}".format(
            epoch+1,
            valid_cost_,
            accuracy_score(t_valid.argmax(axis = 1), y_pred_.argmax(axis = 1))
        ))
    
    y_pred = sess.run(y, feed_dict = {x: x_test})
    submission = pd.Series(y_pred.argmax(axis = 1), name='label')
    submission.to_csv('/root/userspace/chap06/submission/submission_pred_trial.csv', header=True, index_label='id')

# tb.show_graph(tf.get_default_graph().as_graph_def())

# WRITE ME
# submission = pd.Series(y_pred, name='label')
# submission.to_csv('/root/userspace/chap06/materials/submission_pred.csv', header=True, index_label='id')

EPOCH: 1, Validation_Cost: 2.224, Accuracy_Score: 0.257
EPOCH: 2, Validation_Cost: 1.482, Accuracy_Score: 0.531
EPOCH: 3, Validation_Cost: 0.931, Accuracy_Score: 0.646
EPOCH: 4, Validation_Cost: 0.804, Accuracy_Score: 0.703
EPOCH: 5, Validation_Cost: 0.691, Accuracy_Score: 0.738
EPOCH: 6, Validation_Cost: 0.627, Accuracy_Score: 0.761
EPOCH: 7, Validation_Cost: 0.582, Accuracy_Score: 0.780
EPOCH: 8, Validation_Cost: 0.549, Accuracy_Score: 0.795
EPOCH: 9, Validation_Cost: 0.523, Accuracy_Score: 0.807
EPOCH: 10, Validation_Cost: 0.502, Accuracy_Score: 0.817


## LeNet(1998)の実装
具体的には以下のようになる。
1. Convolution
2. Max pooling
3. Convolution
4. Max pooling  
*以下、MLP*
5. Full connection layer(120 neurons)
6. Full connection layer(84 neurons)
7. Output(10 neurons)

```python
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
    
    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out
```

### 入力次元数
INPUT: $28 \times 28 \times 1$  
C1: $24 \times 24 \times 6$  
S2: $12 \times 12 \times 6$  
C3: $8 \times 8 \times 16$  
S4: $4 \times 4 \times 16 \rightarrow 256$(after "Flatten" is applied)  
C5: $120$  
C6: $84$  
OUTPUT: $10$  

In [None]:
# %%writefile /root/userspace/chap06/materials/submission_code.py

# import tensorboard as tb
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(1234)
random_state = 42

### レイヤー定義 ###

class Conv:
    def __init__(self, filter_shape, function = lambda x: x, strides = [1,1,1,1], padding = 'VALID'):
        # He initializationを使う
        # filter_shape = Height * Width * Num of input_channels * Num of output_channels
        fun_in = np.prod(filter_shape[:3])
        fun_out = np.prod(filter_shape[:2]) * filter_shape[3]
        self.W = tf.Variable(rng.uniform(
                low = -np.sqrt(6/ fun_in),
                high = np.sqrt(6/ fun_out),
                size = filter_shape
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((filter_shape[3]), dtype = 'float32'), name = 'b')
        self.function = function
        self.strides = strides
        self.padding = padding
    # WRITE ME
    
    def __call__(self, x):
        u = tf.nn.conv2d(x, self.W, strides = self.strides, padding = self.padding) + self.b
        return self.function(u)
    
    
class Pooling:
    def __init__(self, ksize = [1, 2, 2, 1] , strides = [1, 2, 2, 1], padding = 'VALID'):
        self.ksize = ksize
        self.strides = strides
        self.padding = padding
    
    def __call__(self, x):
        return tf.nn.max_pool(x, ksize = self.ksize, strides = self.strides, padding = self.padding)
    # WRITE ME
    
    
class Flatten:
    def __call__(self, x):
        return tf.reshape(x, (-1, np.prod(x.get_shape().as_list()[1:])))
    # WRITE ME
    
    
class Dense:
    def __init__(self, in_dim, out_dim, function = lambda x: x):
        # ここでも, He Initialization
        self.W = tf.Variable(rng.uniform(
                low = - np.sqrt(6/ in_dim),
                high = np.sqrt(6/ in_dim),
                size = [in_dim, out_dim]
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((out_dim), dtype = 'float32'), name = 'b')
        self.function = function
        self.params = [self.W, self.b]
        
    def __call__(self, x):
        u = tf.matmul(x, self.W) + self.b
        return self.function(u)
    
def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))
    # WRITE ME

def get_params(layers):
    params_all = []
    for layer in layers:
        params = layer.params
        params_all.extend(params)
    return params_all
    
def compute_l1_reg(params):
    l1_reg = 0
    for param in params:
        l1_reg += tf.reduce_sum(tf.abs(param))
    return l1_reg

def compute_l2_reg(params):
    l2_reg = 0
    for param in params:
        l2_reg += tf.reduce_sum(tf.square(param)) # 2 * tf.nn.l2_lossを使っても良い
    return l2_reg

x_train, x_test, t_train = load_mnist()
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train, test_size=0.1, random_state=random_state)

### ネットワーク ###
tf.reset_default_graph()

x = tf.placeholder(tf.float32, [None, 28, 28, 1])
t = tf.placeholder(tf.float32, [None, 10])
lmd = 0.0001

# LeNetっぽくやってみる

# L2正規化のために、全結合層だけ名前をつけておく。
C5 = Dense(256, 120, tf.nn.relu)
C6 = Dense(120, 84, tf.nn.relu)

h = Conv((5, 5, 1, 6), tf.nn.relu)(x) 
h = Pooling((1, 2, 2, 1))(h)
h = Conv((5, 5, 6, 16), tf.nn.relu)(h)
h = Pooling((1, 2, 2, 1))(h)
print("Before Flatten, the shape of h is:{}".format(h.shape))
h = Flatten()(h)
print("After Flatten, the shape of h is:{}".format(h.shape))
h = C5(h)
h = C6(h)
y = Dense(84, 10, tf.nn.softmax)(h)


layers = [C5, C6]
params_all = get_params(layers)
l2reg = compute_l2_reg(params_all)


cost = - tf.reduce_mean(tf.reduce_sum(t * tf_log(y), axis=1)) + lmd * l2reg
train = tf.train.AdamOptimizer(0.01).minimize(cost)

### 学習 ###

n_epochs = 50
batch_size = 100
n_batches = x_train.shape[0]//batch_size #Floor division(打ち切り除算)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        x_train_fmnist, t_train_fmnist = shuffle(x_train, t_train, random_state = random_state)
        for batch in range(n_batches):
            start = batch * batch_size
            finish = start + batch_size
            sess.run(train, feed_dict = {x: x_train_fmnist[start:finish], t:t_train_fmnist[start:finish]})
        y_pred_, valid_cost_ = sess.run([y, cost], 
                            feed_dict = {x: x_valid, t:t_valid}
                            )
        print("EPOCH: {}, Validation_Cost: {:.3f}, Accuracy_Score: {:.3f}".format(
            epoch+1,
            valid_cost_,
            accuracy_score(t_valid.argmax(axis = 1), y_pred_.argmax(axis = 1))
        ))
    
    y_pred = sess.run(y, feed_dict = {x: x_test})
    submission = pd.Series(y_pred.argmax(axis = 1), name='label')
    submission.to_csv('/root/userspace/chap06/submission/submission_pred_LeNet_L2reg.csv', header=True, index_label='id')

# tb.show_graph(tf.get_default_graph().as_graph_def())

# WRITE ME
# submission = pd.Series(y_pred, name='label')
# submission.to_csv('/root/userspace/chap06/materials/submission_pred.csv', header=True, index_label='id')

Before Flatten, the shape of h is:(?, 4, 4, 16)
After Flatten, the shape of h is:(?, 256)
EPOCH: 1, Validation_Cost: 0.460, Accuracy_Score: 0.840
EPOCH: 2, Validation_Cost: 0.437, Accuracy_Score: 0.851
EPOCH: 3, Validation_Cost: 0.421, Accuracy_Score: 0.858
EPOCH: 4, Validation_Cost: 0.400, Accuracy_Score: 0.866
EPOCH: 5, Validation_Cost: 0.396, Accuracy_Score: 0.868
EPOCH: 6, Validation_Cost: 0.398, Accuracy_Score: 0.871
EPOCH: 7, Validation_Cost: 0.386, Accuracy_Score: 0.876
EPOCH: 8, Validation_Cost: 0.401, Accuracy_Score: 0.873
EPOCH: 9, Validation_Cost: 0.399, Accuracy_Score: 0.873
EPOCH: 10, Validation_Cost: 0.398, Accuracy_Score: 0.873
EPOCH: 11, Validation_Cost: 0.390, Accuracy_Score: 0.872
EPOCH: 12, Validation_Cost: 0.400, Accuracy_Score: 0.872
EPOCH: 13, Validation_Cost: 0.399, Accuracy_Score: 0.874
EPOCH: 14, Validation_Cost: 0.408, Accuracy_Score: 0.868
EPOCH: 15, Validation_Cost: 0.416, Accuracy_Score: 0.869
EPOCH: 16, Validation_Cost: 0.407, Accuracy_Score: 0.872
EPOCH: 

### MiniVGG netを実装する
[このサイト](http://cedro3.com/ai/mini-vgg-net/)などを参考にMiniVGGを実装してみることにする。  
MiniVGGのArchitectureは以下の表のようになる。

![MiniVGG](http://cedro3.com/wp-content/uploads/2017/12/berore2.png)

#### バッチ正規化(Batch Normalization)
しかしながら、実用上このような深いニューラルネットワークに対してはバッチ正規化を行うことが推奨されている。  
そこで、次のようにBatch Normalizationの処理を挿入していく。  
Batch Normalizationの処理の意味はいまいちよく理解出来ていないのだが、DL本や[このページ](https://qiita.com/cfiken/items/b477c7878828ebdb0387)で頑張って解説されている。

![Batch Normalization](http://cedro3.com/wp-content/uploads/2017/12/mark2.png)

In [6]:
# %%writefile /root/userspace/chap06/materials/submission_code.py

# import tensorboard as tb
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(1234)
random_state = 42

### レイヤー定義 ###

class Conv:
    def __init__(self, filter_shape, function = lambda x: x, strides = [1,1,1,1], padding = 'VALID'):
        # He initializationを使う
        # filter_shape = Height * Width * Num of input_channels * Num of output_channels
        fun_in = np.prod(filter_shape[:3])
        fun_out = np.prod(filter_shape[:2]) * filter_shape[3]
        self.W = tf.Variable(rng.uniform(
                low = -np.sqrt(6/ fun_in),
                high = np.sqrt(6/ fun_out),
                size = filter_shape
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((filter_shape[3]), dtype = 'float32'), name = 'b')
        self.function = function
        self.strides = strides
        self.padding = padding
    # WRITE ME
    
    def __call__(self, x, config):
        u = tf.nn.conv2d(x, self.W, strides = self.strides, padding = self.padding) + self.b
        self.moments = tf.nn.moments(u, axes = [0], name = 'mean', keep_dims = True)
        u = tf.cond(
        pred = config,
        true_fn = lambda: tf.nn.batch_normalization(u, 
                                         mean = self.moments[0], 
                                         variance = self.moments[1],
                                        offset = None,
                                        scale = None,
                                        variance_epsilon = 1e-8),
        false_fn = lambda: u)
        return self.function(u)
    
class Pooling:
    def __init__(self, ksize = [1, 2, 2, 1] , strides = [1, 2, 2, 1], padding = 'VALID'):
        self.ksize = ksize
        self.strides = strides
        self.padding = padding
    
    def __call__(self, x):
        return tf.nn.max_pool(x, ksize = self.ksize, strides = self.strides, padding = self.padding)
    # WRITE ME
    
    
class Flatten:
    def __call__(self, x):
        return tf.reshape(x, (-1, np.prod(x.get_shape().as_list()[1:])))
    # WRITE ME
    
    
class Dense:
    def __init__(self, in_dim, out_dim, function = lambda x: x):
        # ここでも, He Initialization
        self.W = tf.Variable(rng.uniform(
                low = - np.sqrt(6/ in_dim),
                high = np.sqrt(6/ in_dim),
                size = [in_dim, out_dim]
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((out_dim), dtype = 'float32'), name = 'b')
        self.function = function
        self.params = [self.W, self.b]
        
    def __call__(self, x):
        u = tf.matmul(x, self.W) + self.b
        return self.function(u)

class Dropout:
    def __init__(self, dropout_keep_prob=1.0):
        self.dropout_keep_prob = dropout_keep_prob
        self.params = []
    
    def __call__(self, x):
        # 訓練時のみdropoutを適用
        return tf.cond(
            pred=is_training,
            true_fn=lambda: tf.nn.dropout(x, keep_prob=self.dropout_keep_prob),
            false_fn=lambda: x
        )
    
def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))
    # WRITE ME

def get_params(layers):
    params_all = []
    for layer in layers:
        params = layer.params
        params_all.extend(params)
    return params_all
    
def compute_l1_reg(params):
    l1_reg = 0
    for param in params:
        l1_reg += tf.reduce_sum(tf.abs(param))
    return l1_reg

def compute_l2_reg(params):
    l2_reg = 0
    for param in params:
        l2_reg += tf.reduce_sum(tf.square(param)) # 2 * tf.nn.l2_lossを使っても良い
    return l2_reg

x_train, x_test, t_train = load_mnist()
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train, test_size=0.1, random_state=random_state)

### ネットワーク ###
tf.reset_default_graph()

x = tf.placeholder(tf.float32, [None, 28, 28, 1])
t = tf.placeholder(tf.float32, [None, 10])
is_training = tf.placeholder(tf.bool)
lmd = 0.0001
dropout_keep_prob = 0.75



# # miniVGG network
# config = tf.Variable(True)

# h = Conv(filter_shape = (5, 5, 1, 6), function = tf.nn.relu)(x, config) # [None, 28, 28, 1] -> [None, 24, 24, 6]
# h = Conv(filter_shape = (3, 3, 6, 6), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 24, 24, 6] -> [None, 24, 24, 6]
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 24, 24, 6] -> [None, 12, 12, 6]

# h = Conv(filter_shape = (5, 5, 6, 16), function = tf.nn.relu)(h, config) # [None, 12, 12, 6] -> [None, 8, 8, 16]
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 8, 8, 16] -> [None, 4, 4, 16]

# print("Before Flatten, the shape of h is:{}".format(h.shape))
# h = Flatten()(h)
# print("After Flatten, the shape of h is:{}".format(h.shape))

# h = Dense(256, 120, tf.nn.relu)(h)
# h = Dense(120, 84, tf.nn.relu)(h)
# y = Dense(84, 10, tf.nn.softmax)(h)


# miniVGG network
config = tf.Variable(True)

h = Conv(filter_shape = (3, 3, 1, 32), function = tf.nn.relu, padding = 'SAME')(x, config) # [None, 28, 28, 1] -> [None, 28, 28, 32]
h = Conv(filter_shape = (3, 3, 32, 32), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 28, 28, 32] -> [None, 28, 28, 32]
h = Conv(filter_shape = (3, 3, 32, 32), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 28, 28, 32] -> [None, 28, 28, 32]
h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 28, 28, 32] -> [None, 14, 14, 32]
h = Dropout(dropout_keep_prob)(h) # 1 of 4 inputs is randomly excluded

h = Conv(filter_shape = (3, 3, 32, 64), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 14, 14, 32] -> [None, 14, 14, 64]
h = Conv(filter_shape = (3, 3, 64, 64), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 14, 14, 64] -> [None, 14, 14, 64]
h = Conv(filter_shape = (3, 3, 64, 64), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 14, 14, 64] -> [None, 14, 14, 64]
h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 14, 14, 64] -> [None, 7, 7, 64]
h = Dropout(dropout_keep_prob)(h) # 1 of 4 inputs is randomly excluded

h = Conv(filter_shape = (3, 3, 64, 64), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 7, 7, 64] -> [None, 7, 7, 64]
h = Conv(filter_shape = (3, 3, 64, 64), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 7, 7, 64] -> [None, 7, 7, 64]
h = Conv(filter_shape = (3, 3, 64, 64), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 7, 7, 64] -> [None, 7, 7, 64]
h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1], padding = 'SAME')(h) # [None, 7, 7, 64] -> [None, 4, 4, 64]
h = Dropout(dropout_keep_prob)(h) # 1 of 4 inputs is randomly excluded

print("Before Flatten, the shape of h is:{}".format(h.shape))
h = Flatten()(h)
print("After Flatten, the shape of h is:{}".format(h.shape))

h = Dense(1024, 512, tf.nn.relu)(h)
h = Dropout(0.5)(h)
# h = Dense(512, 84, tf.nn.relu)(h)
# h = Dropout(0.5)(h)
y = Dense(512, 10, tf.nn.softmax)(h)


cost = - tf.reduce_mean(tf.reduce_sum(t * tf_log(y), axis=1))
optimizer = tf.train.AdamOptimizer(0.01)
# optimizer = tf.train.AdadeltaOptimizer()
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train = optimizer.minimize(cost)

    
# 単純に移動平均・移動分散を計算するだけではパラメータが更新されないので、以下のように書き換える必要あり。
# crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.cnn.y, logits=self.cnn.logits)
# loss_op = tf.reduce_mean(crossent)
# optimizer = tf.train.AdamOptimizer(config['learning_rate'])
# extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # <- ここ
# with tf.control_dependencies(extra_update_ops):  # <- ここ
#     train_op = optimizer.minimize(loss_op)


### 学習 ###

n_epochs = 100
batch_size = 150
n_batches = x_train.shape[0]//batch_size #Floor division(打ち切り除算)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        x_train_fmnist, t_train_fmnist = shuffle(x_train, t_train, random_state = random_state)
        for batch in range(n_batches):
            start = batch * batch_size
            finish = start + batch_size
            sess.run(train, feed_dict = {x: x_train_fmnist[start:finish], t:t_train_fmnist[start:finish], is_training:True})
        y_pred_, valid_cost_ = sess.run([y, cost], 
                            feed_dict = {x: x_valid, t:t_valid, is_training:False}
                            )
        if ((epoch+1) % 5 == 0):
            print("EPOCH: {}, Validation_Cost: {:.3f}, Accuracy_Score: {:.3f}".format(
                epoch+1,
                valid_cost_,
                accuracy_score(t_valid.argmax(axis = 1), y_pred_.argmax(axis = 1))
            ))
    y_pred = sess.run(y, feed_dict = {x: x_test, is_training: False})
    submission = pd.Series(y_pred.argmax(axis = 1), name='label')
    submission.to_csv('/root/userspace/chap06/submission/submission_pred_miniVGG_bn_adadelta.csv', header=True, index_label='id')

# tb.show_graph(tf.get_default_graph().as_graph_def())

# WRITE ME
# submission = pd.Series(y_pred, name='label')
# submission.to_csv('/root/userspace/chap06/materials/submission_pred.csv', header=True, index_label='id')

Before Flatten, the shape of h is:(?, 4, 4, 64)
After Flatten, the shape of h is:(?, 1024)
EPOCH: 5, Validation_Cost: 0.440, Accuracy_Score: 0.833
EPOCH: 10, Validation_Cost: 0.358, Accuracy_Score: 0.869
EPOCH: 15, Validation_Cost: 0.334, Accuracy_Score: 0.871
EPOCH: 20, Validation_Cost: 0.304, Accuracy_Score: 0.888
EPOCH: 25, Validation_Cost: 0.321, Accuracy_Score: 0.884
EPOCH: 30, Validation_Cost: 0.300, Accuracy_Score: 0.890
EPOCH: 35, Validation_Cost: 0.292, Accuracy_Score: 0.893
EPOCH: 40, Validation_Cost: 0.285, Accuracy_Score: 0.894
EPOCH: 45, Validation_Cost: 0.278, Accuracy_Score: 0.898
EPOCH: 50, Validation_Cost: 0.289, Accuracy_Score: 0.896
EPOCH: 55, Validation_Cost: 0.282, Accuracy_Score: 0.902
EPOCH: 60, Validation_Cost: 0.268, Accuracy_Score: 0.901
EPOCH: 65, Validation_Cost: 0.272, Accuracy_Score: 0.905
EPOCH: 70, Validation_Cost: 0.274, Accuracy_Score: 0.904
EPOCH: 75, Validation_Cost: 0.285, Accuracy_Score: 0.902
EPOCH: 80, Validation_Cost: 0.278, Accuracy_Score: 0.90

### 結果
`submission_pred_miniVGG_bn`は全てのbatch normalizationとDropoutを実行したminiVGGNetでの結果。
100Epochほど回し、最終的なAccuracy_Scoreは0.918であった。  
`submission_pred_miniVGG_bn_keep`はConvolution層のDropoutを無くした場合。
100Epoch回したが、最終的なAccuracy_Scoreは0.877で悪化した。  
`submission_pred_miniVGG_bn_alldropout.csv`は全てにDropoutを実行したもの
最終的なAccuracy_scoreは0.910であった。

#### Optimizerの変更
上記は全てAdamで実行したが、他のOptimizerも試してみる。
- Adadeltaの結果

### VGG16の実装
[このサイト](https://zhuanlan.zhihu.com/p/28968219)を参考にして、VGG16を実装した。  
ファイル名は`submission_pred_VGG16_bn_adam.csv`で保存した。

In [None]:
# %%writefile /root/userspace/chap06/materials/submission_code.py

# import tensorboard as tb
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

rng = np.random.RandomState(1234)
random_state = 42

### レイヤー定義 ###

class Conv:
    def __init__(self, filter_shape, function = lambda x: x, strides = [1,1,1,1], padding = 'VALID'):
        # He initializationを使う
        # filter_shape = Height * Width * Num of input_channels * Num of output_channels
        fun_in = np.prod(filter_shape[:3])
        fun_out = np.prod(filter_shape[:2]) * filter_shape[3]
        self.W = tf.Variable(rng.uniform(
                low = -np.sqrt(6/ fun_in),
                high = np.sqrt(6/ fun_out),
                size = filter_shape
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((filter_shape[3]), dtype = 'float32'), name = 'b')
        self.function = function
        self.strides = strides
        self.padding = padding
    # WRITE ME
    
    def __call__(self, x, config):
        u = tf.nn.conv2d(x, self.W, strides = self.strides, padding = self.padding) + self.b
        self.moments = tf.nn.moments(u, axes = [0], name = 'mean', keep_dims = True)
        u = tf.cond(
        pred = config,
        true_fn = lambda: tf.nn.batch_normalization(u, 
                                         mean = self.moments[0], 
                                         variance = self.moments[1],
                                        offset = None,
                                        scale = None,
                                        variance_epsilon = 1e-8),
        false_fn = lambda: u)
        return self.function(u)
    
class Pooling:
    def __init__(self, ksize = [1, 2, 2, 1] , strides = [1, 2, 2, 1], padding = 'VALID'):
        self.ksize = ksize
        self.strides = strides
        self.padding = padding
    
    def __call__(self, x):
        return tf.nn.max_pool(x, ksize = self.ksize, strides = self.strides, padding = self.padding)
    # WRITE ME
    
    
class Flatten:
    def __call__(self, x):
        return tf.reshape(x, (-1, np.prod(x.get_shape().as_list()[1:])))
    # WRITE ME
    
    
class Dense:
    def __init__(self, in_dim, out_dim, function = lambda x: x):
        # ここでも, He Initialization
        self.W = tf.Variable(rng.uniform(
                low = - np.sqrt(6/ in_dim),
                high = np.sqrt(6/ in_dim),
                size = [in_dim, out_dim]
                ).astype('float32'), name = 'W')
        self.b = tf.Variable(np.zeros((out_dim), dtype = 'float32'), name = 'b')
        self.function = function
        self.params = [self.W, self.b]
        
    def __call__(self, x):
        u = tf.matmul(x, self.W) + self.b
        return self.function(u)

class Dropout:
    def __init__(self, dropout_keep_prob=1.0):
        self.dropout_keep_prob = dropout_keep_prob
        self.params = []
    
    def __call__(self, x):
        # 訓練時のみdropoutを適用
        return tf.cond(
            pred=is_training,
            true_fn=lambda: tf.nn.dropout(x, keep_prob=self.dropout_keep_prob),
            false_fn=lambda: x
        )
    
def tf_log(x):
    return tf.log(tf.clip_by_value(x, 1e-10, x))
    # WRITE ME

def get_params(layers):
    params_all = []
    for layer in layers:
        params = layer.params
        params_all.extend(params)
    return params_all
    
def compute_l1_reg(params):
    l1_reg = 0
    for param in params:
        l1_reg += tf.reduce_sum(tf.abs(param))
    return l1_reg

def compute_l2_reg(params):
    l2_reg = 0
    for param in params:
        l2_reg += tf.reduce_sum(tf.square(param)) # 2 * tf.nn.l2_lossを使っても良い
    return l2_reg

x_train, x_test, t_train = load_mnist()
x_train, x_valid, t_train, t_valid = train_test_split(x_train, t_train, test_size=0.1, random_state=random_state)

### ネットワーク ###
tf.reset_default_graph()

x = tf.placeholder(tf.float32, [None, 28, 28, 1])
t = tf.placeholder(tf.float32, [None, 10])
is_training = tf.placeholder(tf.bool)
lmd = 0.0001
dropout_keep_prob = 0.75



# # miniVGG network
# config = tf.Variable(True)

# h = Conv(filter_shape = (5, 5, 1, 6), function = tf.nn.relu)(x, config) # [None, 28, 28, 1] -> [None, 24, 24, 6]
# h = Conv(filter_shape = (3, 3, 6, 6), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 24, 24, 6] -> [None, 24, 24, 6]
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 24, 24, 6] -> [None, 12, 12, 6]

# h = Conv(filter_shape = (5, 5, 6, 16), function = tf.nn.relu)(h, config) # [None, 12, 12, 6] -> [None, 8, 8, 16]
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 8, 8, 16] -> [None, 4, 4, 16]

# print("Before Flatten, the shape of h is:{}".format(h.shape))
# h = Flatten()(h)
# print("After Flatten, the shape of h is:{}".format(h.shape))

# h = Dense(256, 120, tf.nn.relu)(h)
# h = Dense(120, 84, tf.nn.relu)(h)
# y = Dense(84, 10, tf.nn.softmax)(h)


# miniVGG network
config = tf.Variable(True)

# Block1
h = Conv(filter_shape = (3, 3, 1, 64), function = tf.nn.relu, padding = 'SAME')(x, config) # [None, 28, 28, 1] -> [None, 28, 28, 64]
h = Conv(filter_shape = (3, 3, 64, 64), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 28, 28, 64] -> [None, 28, 28, 64]
h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 28, 28, 64] -> [None, 14, 14, 64]
h = Dropout(dropout_keep_prob)(h) # 1 of 4 inputs is randomly excluded

# Block2
h = Conv(filter_shape = (3, 3, 64, 128), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 14, 14, 64] -> [None, 14, 14, 128]
h = Conv(filter_shape = (3, 3, 128, 128), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 14, 14, 128] -> [None, 14, 14, 128]
h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 14, 14, 128] -> [None, 7, 7, 128]
h = Dropout(dropout_keep_prob)(h) # 1 of 4 inputs is randomly excluded

# Block3
h = Conv(filter_shape = (3, 3, 128, 256), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 7, 7, 128] -> [None, 7, 7, 256]
h = Conv(filter_shape = (3, 3, 256, 256), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 7, 7, 256] -> [None, 7, 7, 256]
h = Conv(filter_shape = (3, 3, 256, 256), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 7, 7, 256] -> [None, 7, 7, 256]
h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1], padding = 'SAME')(h) # [None, 7, 7, 256] -> [None, 4, 4, 256]
h = Dropout(dropout_keep_prob)(h) # 1 of 4 inputs is randomly excluded

# Block4
h = Conv(filter_shape = (3, 3, 256, 512), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 4, 4, 256] -> [None, 4, 4, 512]
h = Conv(filter_shape = (3, 3, 512, 512), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 4, 4, 512] -> [None, 4, 4, 512]
h = Conv(filter_shape = (3, 3, 512, 512), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 4, 4, 512] -> [None, 4, 4, 512]
h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1], padding = 'SAME')(h) # [None, 4, 4, 512] -> [None, 2, 2, 512]
h = Dropout(dropout_keep_prob)(h) # 1 of 4 inputs is randomly excluded

print("Before Flatten, the shape of h is:{}".format(h.shape))
h = Flatten()(h)
print("After Flatten, the shape of h is:{}".format(h.shape))

h = Dense(2048, 200, tf.nn.relu)(h)
# h = Dense(200, 200, tf.nn.relu)(h)
h = Dropout(0.5)(h)
# h = Dense(512, 84, tf.nn.relu)(h)
# h = Dropout(0.5)(h)
y = Dense(200, 10, tf.nn.softmax)(h)


cost = - tf.reduce_mean(tf.reduce_sum(t * tf_log(y), axis=1))
optimizer = tf.train.AdamOptimizer(0.01)
# optimizer = tf.train.AdadeltaOptimizer()
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(extra_update_ops):
    train = optimizer.minimize(cost)

    
# 単純に移動平均・移動分散を計算するだけではパラメータが更新されないので、以下のように書き換える必要あり。
# crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.cnn.y, logits=self.cnn.logits)
# loss_op = tf.reduce_mean(crossent)
# optimizer = tf.train.AdamOptimizer(config['learning_rate'])
# extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)  # <- ここ
# with tf.control_dependencies(extra_update_ops):  # <- ここ
#     train_op = optimizer.minimize(loss_op)


### 学習 ###

n_epochs = 100
batch_size = 100
n_batches = x_train.shape[0]//batch_size #Floor division(打ち切り除算)

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        x_train_fmnist, t_train_fmnist = shuffle(x_train, t_train, random_state = random_state)
        for batch in range(n_batches):
            start = batch * batch_size
            finish = start + batch_size
            sess.run(train, feed_dict = {x: x_train_fmnist[start:finish], t:t_train_fmnist[start:finish], is_training:True})
        y_pred_, valid_cost_ = sess.run([y, cost], 
                            feed_dict = {x: x_valid, t:t_valid, is_training:False}
                            )
#         if ((epoch+1) % 5 == 0):
        print("EPOCH: {}, Validation_Cost: {:.3f}, Accuracy_Score: {:.3f}".format(
            epoch+1,
            valid_cost_,
            accuracy_score(t_valid.argmax(axis = 1), y_pred_.argmax(axis = 1))
        ))
    y_pred = sess.run(y, feed_dict = {x: x_test, is_training: False})
    submission = pd.Series(y_pred.argmax(axis = 1), name='label')
    submission.to_csv('/root/userspace/chap06/submission/submission_pred_VGG16_bn_adam.csv', header=True, index_label='id')

# tb.show_graph(tf.get_default_graph().as_graph_def())

# WRITE ME
# submission = pd.Series(y_pred, name='label')
# submission.to_csv('/root/userspace/chap06/materials/submission_pred.csv', header=True, index_label='id')

Before Flatten, the shape of h is:(?, 2, 2, 512)
After Flatten, the shape of h is:(?, 2048)
EPOCH: 1, Validation_Cost: 1.078, Accuracy_Score: 0.627
EPOCH: 2, Validation_Cost: 1.112, Accuracy_Score: 0.623
EPOCH: 3, Validation_Cost: 0.975, Accuracy_Score: 0.679
EPOCH: 4, Validation_Cost: 1.017, Accuracy_Score: 0.653
EPOCH: 5, Validation_Cost: 1.047, Accuracy_Score: 0.691
EPOCH: 6, Validation_Cost: 0.781, Accuracy_Score: 0.723
EPOCH: 7, Validation_Cost: 0.630, Accuracy_Score: 0.780
EPOCH: 8, Validation_Cost: 0.587, Accuracy_Score: 0.755
EPOCH: 9, Validation_Cost: 0.539, Accuracy_Score: 0.766
EPOCH: 10, Validation_Cost: 0.553, Accuracy_Score: 0.771
EPOCH: 11, Validation_Cost: 0.553, Accuracy_Score: 0.769
EPOCH: 12, Validation_Cost: 0.492, Accuracy_Score: 0.791
EPOCH: 13, Validation_Cost: 0.480, Accuracy_Score: 0.797
EPOCH: 14, Validation_Cost: 0.485, Accuracy_Score: 0.792
EPOCH: 15, Validation_Cost: 0.554, Accuracy_Score: 0.779
EPOCH: 16, Validation_Cost: 0.462, Accuracy_Score: 0.803
EPOCH

In [None]:

# # Batch Normalization sample
# config = tf.Variable(True)

# h = Conv(filter_shape = (5, 5, 1, 6), function = tf.nn.relu)(x, config) # [None, 28, 28, 1] -> [None, 24, 24, 6]
# h = Conv(filter_shape = (3, 3, 6, 6), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 24, 24, 6] -> [None, 24, 24, 6]
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 24, 24, 6] -> [None, 12, 12, 6]

# h = Conv(filter_shape = (5, 5, 6, 16), function = tf.nn.relu)(h, config) # [None, 12, 12, 6] -> [None, 8, 8, 16]
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 8, 8, 16] -> [None, 4, 4, 16]

# print("Before Flatten, the shape of h is:{}".format(h.shape))
# h = Flatten()(h)
# print("After Flatten, the shape of h is:{}".format(h.shape))


# # miniVGG network
# config = tf.Variable(True)

# h = Conv(filter_shape = (3, 3, 1, 24), function = tf.nn.relu, padding = 'SAME')(x, config) # [None, 28, 28, 1] -> [None, 28, 28, 24]
# h = Conv(filter_shape = (3, 3, 24, 24), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 28, 28, 24] -> [None, 28, 28, 24]
# h = Conv(filter_shape = (3, 3, 24, 24), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 28, 28, 24] -> [None, 28, 28, 24]
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 28, 28, 24] -> [None, 14, 14, 24]

# h = Conv(filter_shape = (3, 3, 24, 48), function = tf.nn.relu, padding = 'SAME')(h, config) 
# h = Conv(filter_shape = (3, 3, 48, 48), function = tf.nn.relu, padding = 'SAME')(h, config) 
# h = Conv(filter_shape = (3, 3, 48, 48), function = tf.nn.relu, padding = 'SAME')(h, config) 
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1])(h) # [None, 14, 14, 48] -> [None, 7, 7, 48]

# h = Conv(filter_shape = (3, 3, 48, 48), function = tf.nn.relu, padding = 'SAME')(h, config) # [None, 7, 7, 128] -> [None, 7, 7, 128]
# h = Conv(filter_shape = (3, 3, 48, 48), function = tf.nn.relu, padding = 'SAME')(h, config) 
# h = Conv(filter_shape = (3, 3, 48, 48), function = tf.nn.relu, padding = 'SAME')(h, config) 
# h = Pooling(ksize = (1, 2, 2, 1), strides = [1, 2, 2, 1], padding = 'SAME')(h) # [None, 7, 7, 48] -> [None, 4, 4, 48]

# print("Before Flatten, the shape of h is:{}".format(h.shape))
# h = Flatten()(h)
# print("After Flatten, the shape of h is:{}".format(h.shape))

# h = Dense(768, 200, tf.nn.relu)(h)
# h = Dropout(dropout_keep_prob)(h)
# h = Dense(200, 84, tf.nn.relu)(h)
# h = Dropout(dropout_keep_prob)(h)
# y = Dense(84, 10, tf.nn.softmax)(h)
