## 37.  データ拡張（Data Augmentation）

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_datasets as tfds

In [None]:
# Load the Stanford Dogs train/test splits as (image, label) pairs, together
# with the dataset metadata object.
(ds_train, ds_test), ds_info = tfds.load(
    "stanford_dogs",
    split=["train", "test"],
    with_info=True,
    as_supervised=True,
)
# Number of classes, taken from the label feature's metadata.
NUM_CLASSES = ds_info.features["label"].num_classes

In [None]:
# Target resolution: every image is resized to IMG_SIZE x IMG_SIZE.
IMG_SIZE = 224
size = (IMG_SIZE, IMG_SIZE)
# Apply the same resize mapping to both splits; labels pass through untouched.
ds_train, ds_test = (
    ds.map(lambda image, label: (tf.image.resize(image, size), label))
    for ds in (ds_train, ds_test)
)

In [None]:
import matplotlib.pyplot as plt

def format_label(label, label_info=None):
  """Return the human-readable class name for an integer class label.

  Args:
    label: Integer class id (or scalar integer tensor) as yielded by the dataset.
    label_info: Optional object exposing ``int2str``. Defaults to
      ``ds_info.features["label"]`` so the function does not depend on a
      global that is only assigned in a later cell.

  Returns:
    Everything after the first "-" of the label string, e.g. a label string
    of the form "n0001-foo-bar" yields "foo-bar".

  Raises:
    IndexError: If the label string contains no "-".
  """
  if label_info is None:
    label_info = ds_info.features["label"]
  string_label = label_info.int2str(label)
  # Split on the first hyphen only: the name part may itself contain "-".
  return string_label.split("-", 1)[1]

In [None]:
# Preview the first nine training images in a 3x3 grid, titled by class name.
plt.figure(figsize=(9, 9))
label_info = ds_info.features["label"]  # global read by format_label()
for i, (image, label) in enumerate(ds_train.take(9)):
  ax = plt.subplot(3, 3, i + 1)
  # Images are float after resizing; cast back to uint8 for display.
  ax.imshow(image.numpy().astype("uint8"))
  ax.set_title(format_label(label))
  ax.axis("off")

In [None]:
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.models import Sequential

In [None]:
# Augmentation pipeline; the layers run in the order listed.
img_augmentation = Sequential(
    layers=[
        preprocessing.RandomRotation(factor=0.15),
        preprocessing.RandomTranslation(height_factor=0.1, width_factor=0.1),
        preprocessing.RandomFlip(),
        preprocessing.RandomZoom(height_factor=(-0.8, 0.6)),
        preprocessing.RandomContrast(factor=0.1),
    ],
    name="img_augmentation",
)

image_index = 0  # position of the single image inside the batch of one

# Pass the first training image through the pipeline nine times and show
# each result in a 3x3 grid.
plt.figure(figsize=(9, 9))
for image, label in ds_train.take(1):
  batch = tf.expand_dims(image, axis=0)  # the pipeline is called on a batch
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    aug_img = img_augmentation(batch)
    ax.imshow(aug_img[image_index].numpy().astype("uint8"))
    ax.set_title(format_label(label))
    ax.axis("off")

In [None]:
# Augmentation pipeline; the layers run in the order listed.
img_augmentation = Sequential(
    layers=[
        preprocessing.RandomRotation(factor=0.15),
        preprocessing.RandomTranslation(height_factor=0.1, width_factor=0.1),
        preprocessing.RandomFlip(),
        preprocessing.RandomZoom(height_factor=(-0.8, 0.6)),
        preprocessing.RandomContrast(factor=0.1),
    ],
    name="img_augmentation",
)

image_index = 0  # position of the single image inside the batch of one

# Pass the first training image through the pipeline nine times and show
# each result in a 3x3 grid.
plt.figure(figsize=(9, 9))
for image, label in ds_train.take(1):
  batch = tf.expand_dims(image, axis=0)  # the pipeline is called on a batch
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    aug_img = img_augmentation(batch)
    ax.imshow(aug_img[image_index].numpy().astype("uint8"))
    ax.set_title(format_label(label))
    ax.axis("off")

In [None]:
# Augmentation pipeline; the layers run in the order listed.
img_augmentation = Sequential(
    layers=[
        preprocessing.RandomRotation(factor=0.15),
        preprocessing.RandomTranslation(height_factor=0.1, width_factor=0.1),
        preprocessing.RandomFlip(),
        preprocessing.RandomZoom(height_factor=(-0.8, 0.6)),
        preprocessing.RandomContrast(factor=0.1),
    ],
    name="img_augmentation",
)

image_index = 0  # position of the single image inside the batch of one

# Pass the first training image through the pipeline nine times and show
# each result in a 3x3 grid.
plt.figure(figsize=(9, 9))
for image, label in ds_train.take(1):
  batch = tf.expand_dims(image, axis=0)  # the pipeline is called on a batch
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    aug_img = img_augmentation(batch)
    ax.imshow(aug_img[image_index].numpy().astype("uint8"))
    ax.set_title(format_label(label))
    ax.axis("off")

In [None]:
# Augmentation pipeline; the layers run in the order listed.
img_augmentation = Sequential(
    layers=[
        preprocessing.RandomRotation(factor=0.15),
        preprocessing.RandomTranslation(height_factor=0.1, width_factor=0.1),
        preprocessing.RandomFlip(),
        preprocessing.RandomZoom(height_factor=(-0.8, 0.6)),
        preprocessing.RandomContrast(factor=0.1),
    ],
    name="img_augmentation",
)

image_index = 0  # position of the single image inside the batch of one

# Pass the first training image through the pipeline nine times and show
# each result in a 3x3 grid.
plt.figure(figsize=(9, 9))
for image, label in ds_train.take(1):
  batch = tf.expand_dims(image, axis=0)  # the pipeline is called on a batch
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    aug_img = img_augmentation(batch)
    ax.imshow(aug_img[image_index].numpy().astype("uint8"))
    ax.set_title(format_label(label))
    ax.axis("off")

## 38. 転移学習（transfer learning）と fine-tuning

<font color=red size=5>**ノートブックの設定からGPUを選択すること**</font>

[転移学習のサーベイ](https://www.kamishima.net/archive/2009-tr-jsai_dmsm1-PR.pdf)

[転移学習を用いたデータ解析](https://datachemeng.com/transfer_learning/)

### <font color = blue>**1.** </font> 公式チュートリアル

https://www.tensorflow.org/tutorials/images/transfer_learning

- 事前学習済みネットワークを転移学習し、犬と猫の画像の分類を行う

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import tensorflow as tf

from tensorflow.keras.preprocessing import image_dataset_from_directory

#### <font color = green> **1.1.** </font> Data preprocessing

In [None]:
#### Data download

# Fetch and extract the archive; the extracted folder is looked up next to
# the downloaded zip file.
_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
path_to_zip = tf.keras.utils.get_file(fname='cats_and_dogs.zip', origin=_URL, extract=True)
PATH = os.path.join(os.path.dirname(path_to_zip), 'cats_and_dogs_filtered')

# Sub-folders holding the training and validation images.
train_dir, validation_dir = (
    os.path.join(PATH, split_name) for split_name in ('train', 'validation')
)

BATCH_SIZE = 32
IMG_SIZE = (160, 160)

# Shuffled, batched training images resized to IMG_SIZE.
train_dataset = image_dataset_from_directory(
    train_dir,
    shuffle=True,
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
)

In [None]:
# Validation images, loaded with the same batch size and image size as the
# training set.
validation_dataset = image_dataset_from_directory(
    validation_dir,
    shuffle=True,
    batch_size=BATCH_SIZE,
    image_size=IMG_SIZE,
)

In [None]:
### Show the first nine images and labels from the training set:

class_names = train_dataset.class_names

plt.figure(figsize=(10, 10))
# Take a single batch and draw its first nine images in a 3x3 grid.
for images, labels in train_dataset.take(1):
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    ax.imshow(images[i].numpy().astype("uint8"))
    ax.set_title(class_names[labels[i]])
    ax.axis("off")

In [None]:
###
# As the original dataset doesn't contain a test set, you will create one.
# To do so, determine how many batches of data are available in the
# validation set using `tf.data.experimental.cardinality`, then move
# 20% of them to a test set.

val_batches = tf.data.experimental.cardinality(validation_dataset)
test_batch_count = val_batches // 5  # one fifth of the validation batches
test_dataset = validation_dataset.take(test_batch_count)
validation_dataset = validation_dataset.skip(test_batch_count)

In [None]:
# Report how many batches ended up in the validation and test splits.
for split_name, split_ds in (('validation', validation_dataset), ('test', test_dataset)):
  print('Number of %s batches: %d' % (split_name, tf.data.experimental.cardinality(split_ds)))

In [None]:
#### Configure the dataset for performance

AUTOTUNE = tf.data.AUTOTUNE

# Add prefetching with an auto-tuned buffer size to all three splits.
train_dataset, validation_dataset, test_dataset = (
    ds.prefetch(buffer_size=AUTOTUNE)
    for ds in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
#### Use data augmentation

# Horizontal flip followed by a random rotation (factor 0.2).
_preprocessing = tf.keras.layers.experimental.preprocessing
data_augmentation = tf.keras.Sequential([
  _preprocessing.RandomFlip('horizontal'),
  _preprocessing.RandomRotation(0.2),
])

Note : 
- These layers are active only during training, when you call `model.fit`.
- They are inactive when the model is used in inference mode in `model.evaluate` or `model.predict`.

In [None]:
### Let's repeatedly apply these layers to the same image and see the result.

for image, _ in train_dataset.take(1):
  plt.figure(figsize=(10, 10))
  first_image = image[0]
  first_image_batch = tf.expand_dims(first_image, 0)  # batch of one image
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    augmented_image = data_augmentation(first_image_batch)
    # Scale from [0, 255] to [0, 1] so imshow accepts the float image.
    ax.imshow(augmented_image[0] / 255)
    ax.axis('off')

In [None]:
#### Rescale pixel values
# In a moment, you will download `tf.keras.applications.MobileNetV2` for use as your base model.
# This model expects pixel values in `[-1, 1]`, but at this point, the pixel values in your images are in `[0, 255]`.
# To rescale them, use the preprocessing method included with the model.

preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

Note : Alternatively, you could rescale pixel values from `[0,255]` to `[-1, 1]` using a [Rescaling](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing/Rescaling) layer.

In [None]:
# Linear map x -> x / 127.5 - 1, i.e. [0, 255] -> [-1, 1].
rescale = tf.keras.layers.experimental.preprocessing.Rescaling(scale=1. / 127.5, offset=-1)

Note : If using other `tf.keras.applications`, be sure to check the API doc to determine if they expect pixels in `[-1,1]` or `[0,1]`, or use the included `preprocess_input` function.

#### <font color = green> **1.2.** </font> Create the base model from the pre-trained convnets

In [None]:
# Create the base model from the pre-trained model MobileNet V2.
# This is pre-trained on the ImageNet dataset, a large dataset consisting of 1.4M images and 1000 classes.

IMG_SHAPE = IMG_SIZE + (3,)  # append the channel dimension
# include_top=False loads the network without the classification layers at
# the top, which is ideal for feature extraction.
base_model = tf.keras.applications.MobileNetV2(
    input_shape=IMG_SHAPE,
    include_top=False,
    weights='imagenet',
)

In [None]:
###
# This feature extractor converts each `160x160x3` image
# into a `5x5x1280` block of features. Let's see what it
# does to an example batch of images:

train_batches = iter(train_dataset)
image_batch, label_batch = next(train_batches)  # first batch only
feature_batch = base_model(image_batch)
print(feature_batch.shape)

#### <font color = green> **1.3.** </font> Feature extraction

In [None]:
#### Freeze the convolutional base

# Mark the whole base model as non-trainable before it is used in the new model.
base_model.trainable = False

<font color=red> Important note about BatchNormalization layers </font>

Many models contain `tf.keras.layers.BatchNormalization` layers. \
This layer is a special case and precautions should be taken in the context of fine-tuning, as shown later in this tutorial. 

When you set `layer.trainable = False`, the `BatchNormalization` layer will run in inference mode, and will not update its mean and variance statistics. 

When you unfreeze a model that contains BatchNormalization layers in order to do fine-tuning, you should keep the BatchNormalization layers in inference mode by passing `training = False` when calling the base model. \
Otherwise, the updates applied to the non-trainable weights will destroy what the model has learned.

For details, see the [Transfer learning guide](https://www.tensorflow.org/guide/keras/transfer_learning).

In [None]:
# Let's take a look at the base model architecture.
base_model.summary()

In [None]:
#### Add a classification head
# To generate predictions from the block of features,
# average over the `5x5` spatial locations,
# using a `tf.keras.layers.GlobalAveragePooling2D` layer
# to convert the features to a single 1280-element vector per image.

global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
feature_batch_average = global_average_layer(feature_batch)
print(feature_batch_average.shape)

In [None]:
###
# Apply a `tf.keras.layers.Dense` layer to convert these features into a
# single prediction per image. You don't need an activation function here
# because this prediction will be treated as a `logit`, or a raw prediction value.
# Positive numbers predict class 1, negative numbers predict class 0.

prediction_layer = tf.keras.layers.Dense(units=1)
prediction_batch = prediction_layer(feature_batch_average)
print(prediction_batch.shape)

In [None]:
###
# Build a model by chaining together the data augmentation, rescaling,
# base_model and feature extractor layers using the Keras Functional API
# (https://www.tensorflow.org/guide/keras/functional).
# As previously mentioned, use training=False as our model contains a BatchNormalization layer.

inputs = tf.keras.Input(shape=(160, 160, 3))
# Augment, then rescale with MobileNetV2's own preprocessing.
x = preprocess_input(data_augmentation(inputs))
# Frozen feature extractor followed by spatial averaging.
x = global_average_layer(base_model(x, training=False))
x = tf.keras.layers.Dropout(0.2)(x)
outputs = prediction_layer(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
###
# Compile the model before training it.
# Since there are two classes, use a binary cross-entropy loss
# with `from_logits=True` since the model provides a linear output.

base_learning_rate = 0.0001
# Use `learning_rate=`; `lr=` is a deprecated alias, and the other compile
# calls in this notebook already use `learning_rate=`.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
###
# The 2.5M parameters in MobileNet are frozen,
# but there are 1.2K _trainable_ parameters in the Dense layer.
# These are divided between two `tf.Variable` objects, the weights and biases.

# Number of trainable variable objects (not the number of scalar parameters).
len(model.trainable_variables)

In [None]:
#### Train the model
### After training for 10 epochs, you should see ~94% accuracy on the validation set.

initial_epochs = 10
# Loss and accuracy on the validation set before any training has happened.
loss0, accuracy0 = model.evaluate(validation_dataset)

In [None]:
# Show the pre-training metrics rounded to two decimals.
print(f"initial loss: {loss0:.2f}")
print(f"initial accuracy: {accuracy0:.2f}")

In [None]:
# Train only the classification head (the base model is frozen) and keep the
# per-epoch metrics in `history`.
history = model.fit(train_dataset,
                    epochs=initial_epochs,
                    validation_data=validation_dataset)

# Roughly 60 seconds per epoch

In [None]:
###
# Let's take a look at the learning curves of the training and validation accuracy/loss
# when using the MobileNet V2 base model as a fixed feature extractor.

# Per-epoch metric lists recorded by `model.fit`.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

In [None]:
plt.figure(figsize=(8, 8))

# Top panel: accuracy curves.
acc_ax = plt.subplot(2, 1, 1)
acc_ax.plot(acc, label='Training Accuracy')
acc_ax.plot(val_acc, label='Validation Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_ylabel('Accuracy')
# Keep the automatic lower limit but pin the upper limit to 1.
acc_ax.set_ylim([min(acc_ax.get_ylim()), 1])
acc_ax.set_title('Training and Validation Accuracy')

# Bottom panel: loss curves.
loss_ax = plt.subplot(2, 1, 2)
loss_ax.plot(loss, label='Training Loss')
loss_ax.plot(val_loss, label='Validation Loss')
loss_ax.legend(loc='upper right')
loss_ax.set_ylabel('Cross Entropy')
loss_ax.set_ylim([0, 1.0])
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('epoch')
plt.show()

Note :
- If you are wondering why the validation metrics are clearly better than the training metrics, the main reason is that layers like `tf.keras.layers.BatchNormalization` and `tf.keras.layers.Dropout` affect accuracy during training.
  - They are turned off when calculating validation loss.

- To a lesser extent, it is also because training metrics report the average for an epoch, while validation metrics are evaluated after the epoch, so validation metrics see a model that has trained slightly longer.

#### <font color = green> **1.4.** </font> Fine tuning

Note :
- This should only be attempted after you have trained the top-level classifier with the pre-trained model set to non-trainable.
- If you add a randomly initialized classifier on top of a pre-trained model and attempt to train all layers jointly, the magnitude of the gradient updates will be too large (due to the random weights from the classifier) and your pre-trained model will forget what it has learned.

In [None]:
#### Un-freeze the top layers of the model
# All you need to do is unfreeze the `base_model` and set the bottom layers to be un-trainable.
# Then, you should recompile the model (necessary for these changes to take effect), and resume training.

# Unfreeze the whole base model; the bottom layers are re-frozen in the next cell.
base_model.trainable = True

In [None]:
# Let's take a look to see how many layers are in the base model
print("Number of layers in the base model: ", len(base_model.layers))

# Fine-tune from this layer onwards
fine_tune_at = 100

# Freeze all the layers before the `fine_tune_at` layer
bottom_layers = base_model.layers[:fine_tune_at]
for layer in bottom_layers:
  layer.trainable = False

In [None]:
#### Compile the model
# As you are training a much larger model and want to readapt the pretrained weights,
# it is important to use a lower learning rate at this stage.
# Otherwise, your model could overfit very quickly.

# Use `learning_rate=`; `lr=` is a deprecated alias, and the other compile
# calls in this notebook already use `learning_rate=`.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.RMSprop(learning_rate=base_learning_rate / 10),
              metrics=['accuracy'])

In [None]:
# Summary after unfreezing the top of the base model and recompiling.
model.summary()

In [None]:
# Number of trainable variable objects after partially unfreezing the base model.
len(model.trainable_variables)

In [None]:
#### Continue training the model
### If you trained to convergence earlier, this step will improve your accuracy by a few percentage points.

fine_tune_epochs = 10
total_epochs = initial_epochs + fine_tune_epochs

# Resume counting at the number of epochs already completed. The previous
# `history.epoch[-1]` is the 0-based index of the last finished epoch, so it
# repeated that epoch number and ran fine_tune_epochs + 1 epochs.
history_fine = model.fit(train_dataset,
                         epochs=total_epochs,
                         initial_epoch=len(history.epoch),
                         validation_data=validation_dataset)

# Roughly 90 seconds per epoch

In [None]:
### After fine tuning the model nearly reaches 98% accuracy on the validation set.

# Build the combined curves as new lists from both History objects. The
# previous in-place `+=` extended the lists stored in `history.history` and
# appended the fine-tuning values again each time the cell was re-run.
acc = history.history['accuracy'] + history_fine.history['accuracy']
val_acc = history.history['val_accuracy'] + history_fine.history['val_accuracy']

loss = history.history['loss'] + history_fine.history['loss']
val_loss = history.history['val_loss'] + history_fine.history['val_loss']

In [None]:
plt.figure(figsize=(8, 8))
# x position of the vertical "Start Fine Tuning" marker.
fine_tune_start = initial_epochs - 1

# Top panel: accuracy curves.
acc_ax = plt.subplot(2, 1, 1)
acc_ax.plot(acc, label='Training Accuracy')
acc_ax.plot(val_acc, label='Validation Accuracy')
acc_ax.set_ylim([0.8, 1])
acc_ax.plot([fine_tune_start, fine_tune_start],
            acc_ax.get_ylim(), label='Start Fine Tuning')
acc_ax.legend(loc='lower right')
acc_ax.set_title('Training and Validation Accuracy')

# Bottom panel: loss curves.
loss_ax = plt.subplot(2, 1, 2)
loss_ax.plot(loss, label='Training Loss')
loss_ax.plot(val_loss, label='Validation Loss')
loss_ax.set_ylim([0, 1.0])
loss_ax.plot([fine_tune_start, fine_tune_start],
             loss_ax.get_ylim(), label='Start Fine Tuning')
loss_ax.legend(loc='upper right')
loss_ax.set_title('Training and Validation Loss')
loss_ax.set_xlabel('epoch')
plt.show()

In [None]:
#### Evaluation and prediction
### Finally you can verify the performance of the model on new data using the test set.

# NOTE: this rebinds `loss` (previously the list of per-epoch losses used by
# the plotting cells) to a single scalar test loss.
loss, accuracy = model.evaluate(test_dataset)
print('Test accuracy :', accuracy)

In [None]:
### And now you are all set to use this model to predict if your pet is a cat or dog.

# Retrieve a batch of images from the test set
image_batch, label_batch = next(test_dataset.as_numpy_iterator())

# Apply a sigmoid since our model returns logits, then threshold at 0.5
predictions = tf.nn.sigmoid(model.predict_on_batch(image_batch).flatten())
predictions = tf.where(predictions < 0.5, 0, 1)

print('Predictions:\n', predictions.numpy())
print('Labels:\n', label_batch)

# Show the first nine test images titled with the predicted class name.
plt.figure(figsize=(10, 10))
for i in range(9):
  ax = plt.subplot(3, 3, i + 1)
  ax.imshow(image_batch[i].astype("uint8"))
  ax.set_title(class_names[predictions[i]])
  ax.axis("off")

### <font color = blue>**2.** </font> 公式チュートリアル　簡易版＆日本語解説

https://note.nkmk.me/python-tensorflow-keras-transfer-learning-fine-tuning/

#### <font color = green> **2.1.** </font> cifar10（Canadian Institute For Advanced Research）

10種類の「物体カラー写真」（乗り物や動物など）の画像データセット
- ラベル「0」： airplane（飛行機）
- ラベル「1」： automobile（自動車）
- ラベル「2」： bird（鳥）
- ラベル「3」： cat（猫）
- ラベル「4」： deer（鹿）
- ラベル「5」： dog（犬）
- ラベル「6」： frog（カエル）
- ラベル「7」： horse（馬）
- ラベル「8」： ship（船）
- ラベル「9」： truck（トラック）

In [None]:
import tensorflow as tf

# Show the installed TensorFlow version.
print(tf.__version__)
# 2.1.0 -> 2.4.1@2021/03/04

In [None]:
# Load CIFAR-10 as NumPy arrays, already split into train and test sets.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

print(type(x_train))
# <class 'numpy.ndarray'>

print(x_train.shape, y_train.shape)
# (50000, 32, 32, 3) (50000, 1)

print(x_test.shape, y_test.shape)
# (10000, 32, 32, 3) (10000, 1)

### 32x32 RGB colour images: 50000 for training and 10000 for testing. The labels are integers from 0 to 9.

In [None]:
### Model implementation

# Accept RGB images of any spatial size, resize them to 160x160 and apply
# MobileNetV2's own input preprocessing inside the model.
inputs = tf.keras.Input(shape=(None, None, 3))
x = tf.keras.layers.Lambda(lambda img: tf.image.resize(img, (160, 160)))(inputs)
x = tf.keras.layers.Lambda(tf.keras.applications.mobilenet_v2.preprocess_input)(x)

# ImageNet-pretrained MobileNetV2 without its classifier, fed by the
# preprocessing tensor built above.
base_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
    input_tensor=x,
    input_shape=(160, 160, 3),
    include_top=False,
    pooling='avg',
    weights='imagenet',
)

# New 10-way softmax classifier stacked on top of the base model.
model = tf.keras.Sequential(layers=[
    base_model,
    tf.keras.layers.Dense(units=10, activation='softmax'),
])

model.summary()

### The base model (the pre-trained MobileNetV2) is treated as a single layer

In [None]:
# Inspect the nesting: the outer model's layers versus the layers inside the
# base model. The comments below record the printed values.
print(len(model.layers))
# 2

print(model.layers[0].name)
# mobilenetv2_1.00_160

print(len(model.layers[0].layers))
# 158 -> 157

In [None]:
### Train only the added fully connected layer
# Set the base model's trainable attribute to False to freeze it.
# The weights of every layer in the base model are then no longer updated (i.e. not trained).

base_model.trainable = False

model.summary()

In [None]:
# Integer labels with a softmax output -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4),
    metrics=['accuracy'],
)

In [None]:
# The added fully connected layer has only been initialized with random weights,
# so at this point, before training, the model naturally cannot classify at all.
# For reference, evaluate() reports an accuracy of around 10%,
# which for 10 classes is just the rate of guessing right by chance.

print(model.evaluate(x_test, y_test, verbose=0))
# [loss value, metrics values]

In [None]:
# Train the new head, holding out 20% of the training data for validation.
model.fit(x_train, y_train, epochs=6, validation_split=0.2, batch_size=256)
### About 14.5 minutes per epoch on a Colab CPU; about 20 seconds on a GPU

print(model.evaluate(x_test, y_test, verbose=0))

In [None]:
### Retrain part of the pre-trained model (fine-tuning)
# MobileNetV2 is divided into 16 blocks, block_1_xxx to block_16_xxx; here block_12_xxx onwards is retrained.
# Get the index (position in the layer list) of block_12_expand, the first layer of block 12.

layer_names = [l.name for l in base_model.layers]
idx = layer_names.index('block_12_expand')
print(idx)

In [None]:
# Set the base model's trainable attribute to True to unfreeze everything, then
# set trainable to False for the layers up to block 11 (those before block_12_expand) to freeze them.

base_model.trainable = True

for layer in base_model.layers[:idx]:
  layer.trainable = False

In [None]:
# Note that unless the base model's trainable is True, setting trainable to True
# on its inner layers does not unfreeze them and they are not trained.
# When the base model is treated as a single layer, as in this example,
# inner layers are not trained while the base model's trainable is False,
# even if their own trainable is True.

# As mentioned above, compile() must be called again after changing trainable.

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-5),
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Training in this state confirms that the accuracy improves further.

model.fit(x_train, y_train, epochs=6, validation_split=0.2, batch_size=256)
### Just under 24 minutes per epoch on a Colab CPU; less than 30 seconds on a GPU

print(model.evaluate(x_test, y_test, verbose=0))

#### <font color = green> **2.2.** </font> 犬猫画像分類

In [None]:
### Data download

# As an example, use the cat and dog image data introduced in the official tutorial below.
# It is only meant for trying things out, so the number of images is small.
# https://www.tensorflow.org/tutorials/images/classification

In [None]:
import os
import tensorflow as tf

In [None]:
# Show the installed TensorFlow version.
print(tf.__version__)

In [None]:
# Fix TensorFlow's global random seed.
tf.random.set_seed(0)

In [None]:
# Fetch the cats-and-dogs archive and extract it; the returned path is used
# below to locate the extracted folder.
path_to_zip = tf.keras.utils.get_file(
    fname='cats_and_dogs_filtered.zip',
    origin='https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip',
    extract=True
    )

- 以下のようなディレクトリ構造で、犬と猫の画像に分けられ、さらにtrainとvalidationに振り分けられている。

```
cats_and_dogs_filtered
|__ train
    |______ cats: [cat.0.jpg, cat.1.jpg, cat.2.jpg ....]
    |______ dogs: [dog.0.jpg, dog.1.jpg, dog.2.jpg ...]
|__ validation
    |______ cats: [cat.2000.jpg, cat.2001.jpg, cat.2002.jpg ....]
    |______ dogs: [dog.2000.jpg, dog.2001.jpg, dog.2002.jpg ...]
```



- 公式チュートリアルではディレクトリ名の通り、訓練データと検証（validation）データとして使っているが、以下のサンプルコードではtrainを訓練データと検証データ、validationをテストデータとして使う。

In [None]:
# The extracted 'cats_and_dogs_filtered' folder is looked up in the same
# directory as the path returned by tf.keras.utils.get_file().
download_dir = os.path.dirname(path_to_zip)
path_to_dir = os.path.join(download_dir, 'cats_and_dogs_filtered')

# Build the path strings for the train and validation directories; the
# 'validation' folder is used as the test set here.
train_dir, test_dir = (
    os.path.join(path_to_dir, folder) for folder in ('train', 'validation')
)

In [None]:
### Data preparation: ImageDataGenerator

# First create the ImageDataGenerator instances.
# The preprocessing function is passed via `preprocessing_function`; here it is
# MobileNetV2's own `preprocess_input`.
# The training generator also gets `validation_split` so that its data can be
# divided into training and validation subsets.

mobilenet_preprocess = tf.keras.applications.mobilenet_v2.preprocess_input

train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess,
    validation_split=0.2,
)

test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=mobilenet_preprocess,
)

# To perform data augmentation, set the other arguments; it is not done here.
# Note that when validation_split is set, data augmentation is applied to both
# the training and the validation subsets.

In [None]:
# Create the training, validation and test generator iterators with flow_from_directory().
# For training and validation, set the `subset` argument to 'training' and 'validation' respectively.
# Setting the `target_size` argument resizes the images to that size.

In [None]:
# Batch size and the (square) image size used by all three generators.
batch_size = 64
height = width = 160

In [None]:
# 'training' subset of the train directory, resized to (height, width),
# with binary labels.
train_generator = train_datagen.flow_from_directory(
    directory=train_dir,
    target_size=(height, width),
    batch_size=batch_size,
    class_mode='binary',
    subset='training',
)

In [None]:
# 'validation' subset of the same train directory (split by validation_split).
valid_generator = train_datagen.flow_from_directory(
    directory=train_dir,
    target_size=(height, width),
    batch_size=batch_size,
    class_mode='binary',
    subset='validation',
)

In [None]:
# Test data comes from the archive's 'validation' directory.
test_generator = test_datagen.flow_from_directory(
    directory=test_dir,
    target_size=(height, width),
    batch_size=batch_size,
    class_mode='binary',
)

In [None]:
### Model implementation and training
# Use the pre-trained MobileNetV2 as the base model.
# Preprocessing, including resizing, is handled by the ImageDataGenerator
# settings, so only input_shape is set here.

base_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
    input_shape=(height, width, 3),
    include_top=False,
    pooling='avg',
    weights='imagenet',
)

# Single sigmoid unit on top of the base model's output (binary classifier).
x = tf.keras.layers.Dense(units=1, activation='sigmoid')(base_model.output)

model = tf.keras.Model(inputs=base_model.input, outputs=x)

In [None]:
# The model could also be built with the Sequential API, but the Functional API is used here for reference.
# With the Sequential API the base model is treated as a single layer,
# whereas with the Functional API the model is not nested in that way.

print(len(model.layers))

In [None]:
# Print the layer-by-layer summary of the Functional-API model.
model.summary()

In [None]:
# Although the structure is not nested, the layers of base_model and the layers
# of the newly built model refer to the same objects.
# Changing base_model's trainable also changes trainable on each layer inside it,
# so everything can be set at once, just as in the Sequential API example.

print(model.layers[0] is base_model.layers[0])

In [None]:
# Show the trainable flag of the first layer as seen through each model.
for owner in (base_model, model):
  print(owner.layers[0].trainable)

In [None]:
# Freeze the base model, then show the first layer's flag through each model.
base_model.trainable = False

for owner in (base_model, model):
  print(owner.layers[0].trainable)

In [None]:
# Sigmoid output with binary labels -> binary cross-entropy.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-4),
    metrics=['accuracy'],
)

In [None]:
# Evaluating the model before the newly added fully connected layer is trained gives about 50% accuracy.
# With two classes, this confirms that the model cannot classify at all yet.

print(model.evaluate(test_generator, verbose=0))

In [None]:
# ImageDataGenerator iterates indefinitely, so the steps_per_epoch and
# validation_steps arguments are set explicitly.

train_steps = train_generator.n // batch_size
valid_steps = valid_generator.n // batch_size

model.fit(
    train_generator,
    epochs=6,
    steps_per_epoch=train_steps,
    validation_data=valid_generator,
    validation_steps=valid_steps,
)

# About 10 seconds per epoch on a GPU

In [None]:
# Test-set metrics after training only the new head.
print(model.evaluate(test_generator, verbose=0))

In [None]:
# For fine-tuning, set trainable to True on the later layers of the base model.
# Do not forget to compile() again after changing trainable.

base_layer_names = [l.name for l in base_model.layers]
idx = base_layer_names.index('block_12_expand')

# Unfreeze block_12_expand and every layer after it.
for layer in base_model.layers[idx:]:
  layer.trainable = True

model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-5),
    metrics=['accuracy'],
)

In [None]:
# base_model's trainable is still False, but because the base model is not
# treated as a single layer here, the trainable value of each inner layer
# takes effect regardless of the base model's own trainable value.

# Changing the base model's trainable also changes trainable on all layers inside it at once,
# so setting the base model's trainable to True and then setting the earlier layers to False gives the same result.

In [None]:
# Retrain (fine-tune) the model.

train_steps = train_generator.n // batch_size
valid_steps = valid_generator.n // batch_size

model.fit(
    train_generator,
    epochs=6,
    steps_per_epoch=train_steps,
    validation_data=valid_generator,
    validation_steps=valid_steps,
)

# About 10 seconds per epoch on a GPU

In [None]:
# Test-set metrics after fine-tuning.
print(model.evaluate(test_generator, verbose=0))

### <font color = blue>**3.** </font> データ拡張（Data Augmentation）の詳解含む公式サンプルコード

Transfer learning & fine-tuning

**Author:** [fchollet](https://twitter.com/fchollet)<br>
**Date created:** 2020/04/15<br>
**Last modified:** 2020/05/12<br>
**Description:** Complete guide to transfer learning & fine-tuning in Keras.


In [None]:
## Setup

import numpy as np
import tensorflow as tf
from tensorflow import keras

#### <font color = green> **3.1.** </font> Freezing layers: understanding the `trainable` attribute

In [None]:
# Example: the `Dense` layer has 2 trainable weights (kernel & bias)

layer = keras.layers.Dense(3)
layer.build((None, 4))  # Create the weights

# Report the size of each weight collection.
for attr in ("weights", "trainable_weights", "non_trainable_weights"):
  print(attr + ":", len(getattr(layer, attr)))

In [None]:
# Example: the `BatchNormalization` layer has 2 trainable weights and 2 non-trainable weights
# It uses non-trainable weights to keep track of the mean and variance of its inputs during training.

layer = keras.layers.BatchNormalization()
layer.build((None, 4))  # Create the weights

# Report the size of each weight collection.
for attr in ("weights", "trainable_weights", "non_trainable_weights"):
  print(attr + ":", len(getattr(layer, attr)))

In [None]:
# Same Dense layer as before, but frozen after its weights are created.
layer = keras.layers.Dense(3)
layer.build((None, 4))  # Create the weights
layer.trainable = False  # Freeze the layer

# Report the size of each weight collection.
for attr in ("weights", "trainable_weights", "non_trainable_weights"):
  print(attr + ":", len(getattr(layer, attr)))

In [None]:
### When a trainable weight becomes non-trainable, its value is no longer updated during training.

# Make a model with 2 layers
layer1 = keras.layers.Dense(3, activation="relu")
layer2 = keras.layers.Dense(3, activation="sigmoid")
model = keras.Sequential(layers=[keras.Input(shape=(3,)), layer1, layer2])

# Freeze the first layer
layer1.trainable = False

# Keep a copy of the weights of layer1 for later reference
initial_layer1_weights_values = layer1.get_weights()

# Train the model on two random samples
model.compile(optimizer="adam", loss="mse")
random_inputs = np.random.random((2, 3))
random_targets = np.random.random((2, 3))
model.fit(random_inputs, random_targets)

In [None]:
# Check that the weights of layer1 have not changed during training.
# layer1 is a Dense layer, so the pairs compared are its kernel and its bias.
final_layer1_weights_values = layer1.get_weights()
for before, after in zip(initial_layer1_weights_values, final_layer1_weights_values):
  np.testing.assert_allclose(before, after)

In [None]:
# If you set `trainable = False` on a model or on any layer that has sublayers,
# all children layers become non-trainable as well.

inner_model = keras.Sequential(layers=[
    keras.Input(shape=(3,)),
    keras.layers.Dense(3, activation="relu"),
    keras.layers.Dense(3, activation="relu"),
])

model = keras.Sequential(layers=[
    keras.Input(shape=(3,)),
    inner_model,
    keras.layers.Dense(3, activation="sigmoid"),
])

model.trainable = False  # Freeze the outer model

assert not inner_model.trainable  # All layers in `model` are now frozen
assert not inner_model.layers[0].trainable  # `trainable` is propagated recursively

#### <font color = green> **3.2.** </font> The typical transfer-learning workflow

Here's what the first workflow looks like in Keras:

First, instantiate a base model with pre-trained weights.

```python
base_model = keras.applications.Xception(
    weights='imagenet',  # Load weights pre-trained on ImageNet.
    input_shape=(150, 150, 3),
    include_top=False)  # Do not include the ImageNet classifier at the top.
```

Then, freeze the base model.

```python
base_model.trainable = False
```

Create a new model on top.

```python
inputs = keras.Input(shape=(150, 150, 3))
# We make sure that the base_model is running in inference mode here,
# by passing `training=False`. This is important for fine-tuning, as you will
# learn in a few paragraphs.
x = base_model(inputs, training=False)
# Convert features of shape `base_model.output_shape[1:]` to vectors
x = keras.layers.GlobalAveragePooling2D()(x)
# A Dense classifier with a single unit (binary classification)
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs, outputs)
```

Train the model on new data.

```python
model.compile(optimizer=keras.optimizers.Adam(),
              loss=keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[keras.metrics.BinaryAccuracy()])
model.fit(new_dataset, epochs=20, callbacks=..., validation_data=...)
```

#### <font color = green> **3.3.** </font> Fine-tuning

This is how to implement fine-tuning of the whole base model:

```python
# Unfreeze the base model
base_model.trainable = True

# It's important to recompile your model after you make any changes
# to the `trainable` attribute of any inner layer, so that your changes
# are taken into account
model.compile(optimizer=keras.optimizers.Adam(1e-5),  # Very low learning rate
              loss=keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[keras.metrics.BinaryAccuracy()])

# Train end-to-end. Be careful to stop before you overfit!
model.fit(new_dataset, epochs=10, callbacks=..., validation_data=...)
```

**Important note about `compile()` and `trainable`**

Calling `compile()` on a model is meant to "freeze" the behavior of that model. \
This implies that the `trainable` attribute values at the time the model is compiled should be preserved throughout the lifetime of that model, until `compile` is called again. \
Hence, if you change any `trainable` value, make sure to call `compile()` again on your model for your changes to be taken into account.

**Important notes about `BatchNormalization` layer**

Many image models contain `BatchNormalization` layers. That layer is a special case on
 every imaginable count. Here are a few things to keep in mind.

- `BatchNormalization` contains 2 non-trainable weights that get updated during training.
  - These are the variables tracking the mean and variance of the inputs.
- When you set `bn_layer.trainable = False`, the `BatchNormalization` layer will run in inference mode, and will not update its mean & variance statistics.
  - This is not the case for other layers in general, as
[weight trainability & inference/training modes are two orthogonal concepts](
  https://keras.io/getting_started/faq/#whats-the-difference-between-the-training-argument-in-call-and-the-trainable-attribute).
  - But the two are tied in the case of the `BatchNormalization` layer.
- When you unfreeze a model that contains `BatchNormalization` layers in order to do fine-tuning, you should keep the `BatchNormalization` layers in inference mode by passing `training=False` when calling the base model.
  - Otherwise the updates applied to the non-trainable weights will suddenly destroy what the model has learned.

You'll see this pattern in action in the end-to-end example at the end of this guide.

#### <font color = green> **3.4.** </font> Transfer learning & fine-tuning with a custom training loop

If instead of `fit()`, you are using your own low-level training loop, the workflow stays essentially the same. \
You should be careful to only take into account the list `model.trainable_weights` when applying gradient updates:

```python
# Create base model
base_model = keras.applications.Xception(
    weights='imagenet',
    input_shape=(150, 150, 3),
    include_top=False)
# Freeze base model
base_model.trainable = False
```

```python
# Create new model on top.
inputs = keras.Input(shape=(150, 150, 3))
x = base_model(inputs, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

loss_fn = keras.losses.BinaryCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam()
```

```python
# Iterate over the batches of a dataset.
for inputs, targets in new_dataset:
    # Open a GradientTape.
    with tf.GradientTape() as tape:
        # Forward pass.
        predictions = model(inputs)
        # Compute the loss value for this batch.
        loss_value = loss_fn(targets, predictions)

    # Get gradients of loss wrt the *trainable* weights.
    gradients = tape.gradient(loss_value, model.trainable_weights)
    # Update the weights of the model.
    optimizer.apply_gradients(zip(gradients, model.trainable_weights))
```

Likewise for fine-tuning.

#### <font color = green> **3.5.** </font> An end-to-end example: fine-tuning an image classification model on a cats vs. dogs dataset

In [None]:
#### Getting the data
# To keep our dataset small, we will use 40% of the original training data (25,000 images)
#  for training, 10% for validation, and 10% for testing.

import tensorflow_datasets as tfds

tfds.disable_progress_bar()

train_ds, validation_ds, test_ds = tfds.load(
    "cats_vs_dogs",
    # Reserve 10% for validation and 10% for test
    split=["train[:40%]", "train[40%:50%]", "train[50%:60%]"],
    as_supervised=True,  # Include labels
)

print("Number of training samples: %d" % tf.data.experimental.cardinality(train_ds))
print(
    "Number of validation samples: %d" % tf.data.experimental.cardinality(validation_ds)
)
print("Number of test samples: %d" % tf.data.experimental.cardinality(test_ds))

### These are the first 9 images in the training dataset -- as you can see, they're all different sizes.

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 10))
for i, (image, label) in enumerate(train_ds.take(9)):
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(image)
    plt.title(int(label))
    plt.axis("off")

### We can also see that label 1 is "dog" and label 0 is "cat".

In [None]:
#### Standardizing the data
# Each pixel consists of 3 integer values between 0 and 255 (RGB level values).
# - Standardize to a fixed image size. We pick 150x150.
# - Pixel values are NOT rescaled in this cell; it only resizes. Scaling to the
#   [-1, 1] range is done later by the `Normalization` layer inside the model.

size = (150, 150)

# Resize the image of every (image, label) pair; the label passes through unchanged.
train_ds = train_ds.map(lambda x, y: (tf.image.resize(x, size), y))
validation_ds = validation_ds.map(lambda x, y: (tf.image.resize(x, size), y))
test_ds = test_ds.map(lambda x, y: (tf.image.resize(x, size), y))

In [None]:
### Besides, let's batch the data and use caching & prefetching to optimize loading speed.

batch_size = 32

train_ds = train_ds.cache().batch(batch_size).prefetch(buffer_size=10)
validation_ds = validation_ds.cache().batch(batch_size).prefetch(buffer_size=10)
test_ds = test_ds.cache().batch(batch_size).prefetch(buffer_size=10)

In [None]:
#### Using random data augmentation

from tensorflow import keras
from tensorflow.keras import layers

data_augmentation = keras.Sequential(
    [layers.experimental.preprocessing.RandomFlip("horizontal"),
     layers.experimental.preprocessing.RandomRotation(0.1),
     ])

In [None]:
### Let's visualize what the first image of the first batch looks like after various random transformations:

import numpy as np

# Show nine independently augmented versions of the first image of the first batch.
for images, labels in train_ds.take(1):
  plt.figure(figsize=(10, 10))
  first_image = images[0]
  # Every panel shows the same source image, so every panel must carry that
  # image's label (index 0), not the label of the i-th image in the batch.
  first_label = int(labels[0])
  for i in range(9):
    ax = plt.subplot(3, 3, i + 1)
    # The augmentation model works on batches, so add a leading batch axis.
    augmented_image = data_augmentation(
        tf.expand_dims(first_image, 0), training=True
        )
    plt.imshow(augmented_image[0].numpy().astype("int32"))
    plt.title(first_label)
    plt.axis("off")

In [None]:
### Build a model
# Note that:
# - We add a `Normalization` layer to scale input values (initially in the `[0, 255]` range) to the `[-1, 1]` range.
# - We add a `Dropout` layer before the classification layer, for regularization.
# - We make sure to pass `training=False` when calling the base model, so that it runs in inference mode,
#     so that batchnorm statistics don't get updated even after we unfreeze the base model for fine-tuning.

base_model = keras.applications.Xception(
    weights="imagenet",  # Load weights pre-trained on ImageNet.
    input_shape=(150, 150, 3),
    include_top=False,
)  # Do not include the ImageNet classifier at the top.

In [None]:
# Freeze the base_model
base_model.trainable = False

In [None]:
# Create new model on top
inputs = keras.Input(shape=(150, 150, 3))
x = data_augmentation(inputs)  # Apply random data augmentation

In [None]:
# Pre-trained Xception weights require that the input be normalized
# from (0, 255) to a range (-1., +1.). The normalization layer
# computes outputs = (inputs - mean) / sqrt(var).
norm_layer = keras.layers.experimental.preprocessing.Normalization()
# With mean = 127.5 and var = 127.5 ** 2 for each of the 3 channels, the formula
# becomes (inputs - 127.5) / 127.5, which maps [0, 255] onto [-1, +1].
mean = np.array([127.5] * 3)
var = mean ** 2
# Scale inputs to [-1, +1]
x = norm_layer(x)
# The weights are assigned after the layer has been called on `x`
# (presumably so that the layer is built and has weights to overwrite).
norm_layer.set_weights([mean, var])

In [None]:
# The base model contains batchnorm layers. We want to keep them in inference mode
# when we unfreeze the base model for fine-tuning, so we make sure that the
# base_model is running in inference mode here.
x = base_model(x, training=False)
x = keras.layers.GlobalAveragePooling2D()(x)
x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout
outputs = keras.layers.Dense(1)(x)
model = keras.Model(inputs, outputs)

model.summary()

In [None]:
### Train the top layer

# The final `Dense(1)` layer has no activation, so the loss receives raw logits
# (`from_logits=True`).
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
    )

epochs = 20
model.fit(train_ds, epochs=epochs, validation_data=validation_ds)

# Only the first epoch takes about 1 minute; after that each epoch takes a little under 30 seconds on a GPU.

#### <font color = green> **3.6.** </font> Do a round of fine-tuning of the entire model

In [None]:
# Unfreeze the base_model. Note that it keeps running in inference mode
# since we passed `training=False` when calling it. This means that
# the batchnorm layers will not update their batch statistics.
# This prevents the batchnorm layers from undoing all the training
# we've done so far.

base_model.trainable = True
model.summary()

In [None]:
# Recompile so that the change to `base_model.trainable` is taken into account.
model.compile(
    optimizer=keras.optimizers.Adam(1e-5),  # Low learning rate
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy()],
    )

epochs = 3  # NOTE: only 3 epochs are run here, although the accompanying text was written for 10
model.fit(train_ds, epochs=epochs, validation_data=validation_ds)

### The original text reports a nice improvement after 10 epochs of fine-tuning; note that this cell runs only `epochs = 3`.
# Takes a little under 2 minutes per epoch on a GPU.

### <font color = blue>**4.** </font> EfficientNet

[EfficientNetを最速で試す方法](https://qiita.com/wakame1367/items/d90fa56bd9d11c4db50e)

[Qiita - 2019年最強の画像認識モデルEfficientNet解説](https://qiita.com/omiita/items/83643f78baabfa210ab1)

In [None]:
import numpy as np
import matplotlib.pylab as plt
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers

In [None]:
tf.__version__

In [None]:
### Use the TensorFlow flowers dataset
# The simplest way to load this data into the model is tf.keras.preprocessing.image.ImageDataGenerator.
# All TensorFlow Hub image modules expect float inputs in the [0, 1] range.
# Use the `rescale` parameter of ImageDataGenerator to rescale the inputs.
# The image size is handled later.

data_root = tf.keras.utils.get_file(
    'flower_photos','https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

In [None]:
IMAGE_SHAPE = (224, 224)

image_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)
image_data = image_generator.flow_from_directory(str(data_root), target_size=IMAGE_SHAPE)

In [None]:
# クラス数はデータに合わせて適当に変えてください
# num_classes = 10

In [None]:
### The resulting object is an iterator that returns (image_batch, label_batch) pairs

# Only the first batch is inspected; `image_batch` and `label_batch` stay bound
# after the `break` and are reused by later cells.
for image_batch, label_batch in image_data:
  print("Image batch shape: ", image_batch.shape)
  print("Label batch shape: ", label_batch.shape)
  break

In [None]:
# The model URLs are listed at the bottom of https://tfhub.dev/google/collections/efficientnet/1
# Pick whichever model (B0-B7) you want to use from that list.
# Here we use B0.

feature_extractor_url = "https://tfhub.dev/google/efficientnet/b0/feature-vector/1"

In [None]:
# Create the feature extractor.
# (224, 224) is the recommended width/height for B0, so that is what is used here.
# For the recommended width/height of the other models see https://tfhub.dev/google/collections/efficientnet/1

feature_extractor_layer = hub.KerasLayer(feature_extractor_url,
                                         input_shape=(224,224,3))

In [None]:
# This returns a vector of length 1280 for each image.

feature_batch = feature_extractor_layer(image_batch)
print(feature_batch.shape)

In [None]:
# Freeze the pre-trained weights
# so that training only modifies the new classifier layer.

feature_extractor_layer.trainable = False

In [None]:
# I tried building this with the Keras functional API but could not get it to work,
# so this follows the official tutorial instead:
# wrap the hub layer and add a new classification layer on top.

model = tf.keras.Sequential(
    [feature_extractor_layer,
     layers.Dense(image_data.num_classes, activation='softmax')
     ])

model.summary()

In [None]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['acc'])

In [None]:
# To visualize the training process, use a custom callback that records the loss
# and accuracy of each individual batch, not just the average of each epoch.

class CollectBatchStats(tf.keras.callbacks.Callback):
  """Keras callback that records the loss and accuracy of every training batch.

  After training, `batch_losses` and `batch_acc` each hold one value per
  training batch for which Keras supplied logs, in the order the batches ran.
  """

  def __init__(self):
    # Let the base Callback initialize its own attributes first.
    super().__init__()
    self.batch_losses = []
    self.batch_acc = []

  def on_train_batch_end(self, batch, logs=None):
    """Store this batch's `loss` and `acc`, then reset the model's metrics.

    Args:
      batch: index of the batch that just finished (unused).
      logs: dict of metric results for this batch; may be None, in which
        case nothing is recorded.
    """
    if logs:
      self.batch_losses.append(logs['loss'])
      self.batch_acc.append(logs['acc'])
    # Reset so the next batch's logged values are not running averages.
    self.model.reset_metrics()

In [None]:
epochs = 10
# Number of batches needed to see every sample once. Keras expects an integer
# step count and np.ceil returns a float, so convert it explicitly.
steps_per_epoch = int(np.ceil(image_data.samples/image_data.batch_size))

batch_stats_callback = CollectBatchStats()

history = model.fit(image_data, epochs=epochs,
                    steps_per_epoch=steps_per_epoch,
                    callbacks = [batch_stats_callback],
                    )
# About 15 seconds per epoch on a GPU

In [None]:
plt.figure()
plt.ylabel("Loss")
plt.xlabel("Training Steps")
plt.ylim([0,2])
plt.plot(batch_stats_callback.batch_losses)

In [None]:
plt.figure()
plt.ylabel("Accuracy")
plt.xlabel("Training Steps")
plt.ylim([0,1])
plt.plot(batch_stats_callback.batch_acc)

In [None]:
### Check the predictions
# To redo the earlier plot, first get the list of class names.

# `class_indices` yields (name, index) pairs; order the names by their index
# and title-case them so that position i holds the name of class i.
class_names = np.array(
    [name.title()
     for name, _ in sorted(image_data.class_indices.items(),
                           key=lambda pair: pair[1])])
class_names

In [None]:
# Feed the image batch to the model and convert the resulting IDs to class names.

predicted_batch = model.predict(image_batch)
# The highest-scoring class along the last axis is the predicted class ID.
predicted_id = np.argmax(predicted_batch, axis=-1)
predicted_label_batch = class_names[predicted_id]

In [None]:
# Plot the results

# Labels are one-hot style vectors here, so argmax recovers the class ID.
label_id = np.argmax(label_batch, axis=-1)

plt.figure(figsize=(10,9))
plt.subplots_adjust(hspace=0.5)
# Draw the first 30 images of the batch on a 6x5 grid.
for n in range(30):
  plt.subplot(6,5,n+1)
  plt.imshow(image_batch[n])
  color = "green" if predicted_id[n] == label_id[n] else "red"
  plt.title(predicted_label_batch[n].title(), color=color)
  plt.axis('off')
_ = plt.suptitle("Model predictions (green: correct, red: incorrect)")

### <font color = blue>**5.** </font> Fine-tuning a BERT model

- Copyright 2019 The TensorFlow Authors.
- Licensed under the Apache License, Version 2.0 (the "License")
- In this example, we will work through fine-tuning a BERT model using the tensorflow-models PIP package.
- The pretrained BERT model this tutorial is based on is also available on [TensorFlow Hub](https://tensorflow.org/hub); to see how to use it, refer to the [Hub Appendix](#hub_bert).

#### <font color = green> **5.1.** </font> Setup

In [None]:
### Install the TensorFlow Model Garden pip package
# `tf-models-official` is the stable Model Garden package.
# Note that it may not include the latest changes in the `tensorflow_models` github repo.
# To include latest changes, you may install `tf-models-nightly`, which is the nightly Model Garden package created daily automatically.
# pip will install all models and dependencies automatically.

!pip install -q tf-models-official==2.4.0
#!pip install -q tf-models-official
#!pip install tensorflow-text
#!pip install tf-nightly

In [None]:
# Imports

import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

In [None]:
# Resources 
# This directory contains the configuration, vocabulary, and a pre-trained checkpoint used in this tutorial:

gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)

In [None]:
# You can get a pre-trained BERT encoder from TensorFlow Hub

hub_url_bert = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

#### <font color = green> **5.2.** </font> The data


For this example we used the [GLUE MRPC dataset from TFDS](https://www.tensorflow.org/datasets/catalog/glue#gluemrpc).

This dataset is not set up so that it can be directly fed into the BERT model, so this section also handles the necessary preprocessing.

Get the dataset from TensorFlow Datasets

The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.

*   Number of labels: 2.
*   Size of training dataset: 3668.
*   Size of evaluation dataset: 408.
*   Maximum sequence length of training and evaluation dataset: 128.


In [None]:
glue, info = tfds.load('glue/mrpc', with_info=True,
                       # It's small, load the whole dataset
                       batch_size=-1)

In [None]:
list(glue.keys())

In [None]:
# The `info` object describes the dataset and its features:

info.features

In [None]:
# The two classes are:

info.features['label'].names

In [None]:
# Here is one example from the training set:

glue_train = glue['train']

for key, value in glue_train.items():
  print(f"{key:9s}: {value[0].numpy()}")

#### <font color = green> **5.3.** </font> The BERT tokenizer

In [None]:
# Set up tokenizer to generate Tensorflow dataset
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=os.path.join(gs_folder_bert, "vocab.txt"),
    do_lower_case=True)

print("Vocab size:", len(tokenizer.vocab))

In [None]:
# Tokenize a sentence:
tokens = tokenizer.tokenize("Hello TensorFlow!")
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

#### <font color = green> **5.4.** </font> Preprocess the data

In [None]:
# Encode the sentences
# The model expects its two inputs sentences to be concatenated together.
# This input is expected to start with a `[CLS]` "This is a classification problem" token,
# and each sentence should end with a `[SEP]` "Separator" token:

tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])

In [None]:
# Start by encoding all the sentences while appending a `[SEP]` token,
# and packing them into ragged-tensors:

def encode_sentence(s):
  """Tokenize the tensor `s`, terminate it with `[SEP]`, and return token ids.

  Uses the module-level `tokenizer`. Note that a later cell redefines
  `encode_sentence` with an explicit `tokenizer` parameter.
  """
  pieces = [*tokenizer.tokenize(s.numpy()), '[SEP]']
  return tokenizer.convert_tokens_to_ids(pieces)

sentence1 = tf.ragged.constant(
    [encode_sentence(s) for s in glue_train["sentence1"]])
sentence2 = tf.ragged.constant(
    [encode_sentence(s) for s in glue_train["sentence2"]])

In [None]:
print("Sentence1 shape:", sentence1.shape.as_list())
print("Sentence2 shape:", sentence2.shape.as_list())

In [None]:
# Now prepend a `[CLS]` token, and concatenate the ragged tensors to form
# a single `input_word_ids` tensor for each example.
# `RaggedTensor.to_tensor()` zero pads to the longest sequence.

cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
_ = plt.pcolormesh(input_word_ids.to_tensor())

In [None]:
# Mask and input type
# The mask allows the model to cleanly differentiate between the content and the padding.
# The mask has the same shape as the `input_word_ids`,
# and contains a `1` anywhere the `input_word_ids` is not padding.

input_mask = tf.ones_like(input_word_ids).to_tensor()

plt.pcolormesh(input_mask)

In [None]:
# The "input type" also has the same shape, but inside the non-padded region,
# contains a `0` or a `1` indicating which sentence the token is a part of. 

type_cls = tf.zeros_like(cls)
type_s1 = tf.zeros_like(sentence1)
type_s2 = tf.ones_like(sentence2)
input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()

plt.pcolormesh(input_type_ids)

In [None]:
# Put it all together
# Collect the above text parsing code into a single function,
# and apply it to each split of the `glue/mrpc` dataset.

def encode_sentence(s, tokenizer):
  """Tokenize `s` with `tokenizer`, terminate it with `[SEP]`, and return token ids."""
  return tokenizer.convert_tokens_to_ids([*tokenizer.tokenize(s), '[SEP]'])

def bert_encode(glue_dict, tokenizer):
  """Convert sentence pairs into the three dense input tensors built for BERT.

  Args:
    glue_dict: mapping with "sentence1" and "sentence2" entries of equal length.
    tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

  Returns:
    A dict of dense tensors, each zero-padded to the longest sequence:
      'input_word_ids': token ids laid out as [CLS] sentence1 [SEP] sentence2 [SEP].
      'input_mask': 1 at every token position, 0 at padding.
      'input_type_ids': 0 for [CLS] and sentence1 tokens, 1 for sentence2 tokens.
  """
  num_examples = len(glue_dict["sentence1"])
  sentence1 = tf.ragged.constant(
      [encode_sentence(s, tokenizer) for s in np.array(glue_dict["sentence1"])])
  sentence2 = tf.ragged.constant(
      [encode_sentence(s, tokenizer) for s in np.array(glue_dict["sentence2"])])

  # One [CLS] id per example, prepended to every sentence pair.
  cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*num_examples
  input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

  # Ones over the ragged token positions; `to_tensor()` fills the padding with 0.
  input_mask = tf.ones_like(input_word_ids).to_tensor()

  type_cls = tf.zeros_like(cls)
  type_s1 = tf.zeros_like(sentence1)
  type_s2 = tf.ones_like(sentence2)
  input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

  inputs = {'input_word_ids': input_word_ids.to_tensor(),
            'input_mask': input_mask,
            'input_type_ids': input_type_ids}

  return inputs

In [None]:
glue_train = bert_encode(glue['train'], tokenizer)
glue_train_labels = glue['train']['label']

glue_validation = bert_encode(glue['validation'], tokenizer)
glue_validation_labels = glue['validation']['label']

glue_test = bert_encode(glue['test'], tokenizer)
glue_test_labels  = glue['test']['label']

In [None]:
# Each subset of the data has been converted to a dictionary of features,
# and a set of labels.
# Each feature in the input dictionary has the same shape,
# and the number of labels should match:

for key, value in glue_train.items():
  print(f'{key:15s} shape: {value.shape}')

print(f'glue_train_labels shape: {glue_train_labels.shape}')

#### <font color = green> **5.5.** </font> The model

In [None]:
# Build the model
# The first step is to download the configuration  for the pre-trained model.

import json

bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

config_dict

In [None]:
# The `config` defines the core BERT Model, which is a Keras model to predict
# the outputs of `num_classes` from the inputs with maximum sequence length `max_seq_length`.
# This function returns both the encoder and the classifier.

bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=2)

In [None]:
# The classifier has three inputs and one output:

tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)

In [None]:
# Run it on a test batch of 10 examples from the training set.
# The output is the logits for the two classes:

# Slice the first 10 rows of every input feature.
glue_batch = {key: val[:10] for key, val in glue_train.items()}

bert_classifier(glue_batch, training=True).numpy()

In [None]:
# The `TransformerEncoder` in the center of the classifier above **is** the `bert_encoder`.
# Inspecting the encoder, we see its stack of `Transformer` layers connected to those same three inputs:

tf.keras.utils.plot_model(bert_encoder, show_shapes=True, dpi=48)

In [None]:
# When built, the encoder is randomly initialized.
# Restore the encoder's weights from the checkpoint:

checkpoint = tf.train.Checkpoint(encoder=bert_encoder)
# `assert_consumed()` is chained onto the read so the restore status is checked immediately.
checkpoint.read(
    os.path.join(gs_folder_bert, 'bert_model.ckpt')).assert_consumed()

Note: The pretrained `TransformerEncoder` is also available on [TensorFlow Hub](https://tensorflow.org/hub). See the [Hub appendix](#hub_bert) for details. 

In [None]:
# Set up the optimizer
# BERT adopts the Adam optimizer with weight decay (aka "[AdamW](https://arxiv.org/abs/1711.05101)").
# It also employs a learning rate schedule that firstly warms up from 0 and then decays to 0.

# Set up epochs and steps
epochs = 3
batch_size = 32
eval_batch_size = 32

train_data_size = len(glue_train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [None]:
# This returns an `AdamWeightDecay` optimizer with the learning rate schedule set:

type(optimizer)

# To see an example of how to customize the optimizer and its schedule,
# see the [Optimizer schedule appendix](#optiizer_schedule).

#### <font color = green> **5.6.** </font> Train the model

In [None]:
# The metric is accuracy and we use sparse categorical cross-entropy as loss.

metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
# The classifier outputs logits, hence `from_logits=True`.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

bert_classifier.fit(
    glue_train, glue_train_labels,
    validation_data=(glue_validation, glue_validation_labels),
    batch_size=32,
    epochs=epochs)

# Under 1 minute per epoch on a GPU

In [None]:
# Now run the fine-tuned model on a custom example to see that it works.
# Start by encoding some sentence pairs:

my_examples = bert_encode(
    glue_dict = {
        'sentence1':['The rain in Spain falls mainly on the plain.',
                     'Look I fine tuned BERT.'],
        'sentence2':['It mostly rains on the flat lands of Spain.',
                     'Is it working? This does not match.']
                 },
                 tokenizer=tokenizer)

In [None]:
# The model should report class `1` "match" for the first example
# and class `0` "no-match" for the second:

result = bert_classifier(my_examples, training=False)

# The logits have one row per example and one column per class, so the predicted
# class is the argmax over the last axis. The default `axis=0` would instead
# pick, for each class, which example scored highest.
result = tf.argmax(result, axis=-1).numpy()
result

In [None]:
np.array(info.features['label'].names)[result]

#### <font color = green> **5.7.** </font> Save the model

In [None]:
# Often the goal of training a model is to _use_ it for something,
# so export the model and then restore it to be sure that it works.

export_dir='./saved_model'
tf.saved_model.save(bert_classifier, export_dir=export_dir)

In [None]:
reloaded = tf.saved_model.load(export_dir)
reloaded_result = reloaded([my_examples['input_word_ids'],
                            my_examples['input_mask'],
                            my_examples['input_type_ids']], training=False)

original_result = bert_classifier(my_examples, training=False)

# The results are (nearly) identical:
print(original_result.numpy())
print()
print(reloaded_result.numpy())