## BERT Classification (fine tuning)

BERT を使った文書分類のタスクを Amazon SageMaker で動かすためのサンプルです。

### 書き換え
- TPU に依存している部分を少し修正
- `export_savedmodel` で保存
- `eval/predict` は削除。
  - `predict` 部分は TensorFlow Serving で実行。
- 推論用ファイル読み込み
- Managed Spot Training

#### あとで書く
- GitHub から直接トレーニング
- Horovod で複数 GPU 分散学習 [[blog](https://lambdalabs.com/blog/bert-multi-gpu-implementation-using-tensorflow-and-horovod-with-code/)]

In [None]:
import sagemaker
# Print the SDK version for reference when reproducing this notebook.
# NOTE(review): the estimator cell uses train_instance_count /
# train_instance_type / script_mode, which presumably requires SageMaker
# Python SDK v1 — confirm against the installed version.
print('SageMaker Python SDK Version:', sagemaker.__version__)

import os
from sagemaker.utils import sagemaker_timestamp
from sagemaker.tensorflow import TensorFlow
from sagemaker import get_execution_role

# Session object used below to upload the model and the data to S3.
sagemaker_session = sagemaker.Session()

# Default bucket of the session (only referenced by the commented-out
# checkpoint_s3_uri setting of the estimator) and the execution role that is
# passed to the estimator.
default_s3_bucket = sagemaker_session.default_bucket()
role = get_execution_role()

In [None]:
# Download the multi_cased BERT model archive, unpack it, and copy its
# vocabulary file into src/.
# NOTE(review): the estimator and the serving model both use
# source_dir='classifier', not src/ — confirm that src/ is the intended
# destination for vocab.txt.
!wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip
!unzip multi_cased_L-12_H-768_A-12.zip
!cp multi_cased_L-12_H-768_A-12/vocab.txt src/
    
# # download GLUE data
# !wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py
# !python download_glue_data.py

In [None]:
# Upload the unpacked BERT model directory and the training CSV to S3 through
# the SageMaker session. The returned values are later passed to
# estimator.fit as the 'model' and 'train' channels.
# model = sagemaker_session.upload_data('uncased_L-12_H-768_A-12', key_prefix='model/bert')
model_multi_cased = sagemaker_session.upload_data('multi_cased_L-12_H-768_A-12', key_prefix='model/bert/multi_cased_L-12_H-768_A-12')

# data = sagemaker_session.upload_data('glue_data', key_prefix='data')
aae_data = sagemaker_session.upload_data('aae_data/aae_questions.csv', key_prefix='data/aae')

In [None]:
# Instance type for the training job (passed to the TensorFlow estimator
# constructor below). The commented-out lines are alternatives.
train_instance_type='ml.p3.2xlarge'
# train_instance_type='ml.c5.2xlarge'
# train_instance_type='local_gpu'

# model_uncased = '{}/uncased_L-12_H-768_A-12'.format(model_data)
# model_multi_cased = '{}/multi_cased_L-12_H-768_A-12'.format(model_data)

# glue_data = '{}/glue_data'.format(data)
# aae_data = data

# Paths handed to run_classifier.py through the hyperparameters below. The
# last path component of each matches a channel name ('model' / 'train') used
# in the estimator.fit call.
bert_model_dir = '/opt/ml/input/data/model'
# train_dir = '/opt/ml/input/data/train'
train_dir = '/opt/ml/input/data/train'

# Script-mode TensorFlow estimator that runs classifier/run_classifier.py.
estimator = TensorFlow(entry_point="run_classifier.py",
                       source_dir='classifier', 
                       role=role,
                       train_instance_count=1,
                       train_instance_type=train_instance_type,
                       script_mode=True,
                       framework_version='1.12',
                       py_version='py3',
                       base_job_name='BERT-classfication-demo', 
                       # All values are given as strings; how they are parsed is
                       # up to run_classifier.py (not visible here).
                       hyperparameters={
#                            'task_name': 'MRPC', 
                           'task_name': 'AAE', 
                           # NOTE(review): the model uploaded for the 'model' channel is
                           # multi_cased; the BERT README recommends do_lower_case=False
                           # for cased models — confirm this value, and that the
                           # inference code tokenizes with the same setting.
                           'do_lower_case': 'true', 
                           # NOTE(review): do_train is 'false' although this estimator is
                           # used for fine-tuning — confirm how run_classifier.py treats
                           # this flag.
                           'do_train': 'false', 
                           'do_eval': 'false', 
#                            'data_dir': '{}/MRPC'.format(glue_dir), 
                           'data_dir': train_dir, 
                           'vocab_file': '{}/vocab.txt'.format(bert_model_dir), 
                           'bert_config_file': '{}/bert_config.json'.format(bert_model_dir), 
                           'init_checkpoint': '{}/bert_model.ckpt'.format(bert_model_dir), 
                           'max_seq_length': '128', 
                           'train_batch_size': '32', 
                           'learning_rate': '2e-5', 
                           'num_train_epochs': '1.0', 
                           # Same path as the commented-out checkpoint_local_path below.
                           'output_dir': '/opt/ml/checkpoints'
                       }, 
                       
                       # Managed Spot Training settings (currently disabled).
#                        train_use_spot_instances=True, 
#                        train_max_wait = 2*24*60*60, 
#                        checkpoint_s3_uri='s3://{}/checkpoint/BERT/classification/test/'.format(default_s3_bucket), 
#                        checkpoint_local_path='/opt/ml/checkpoints'
                      )

In [None]:
# %%time
# estimator.fit({'glue': glue_data, 'model': model_uncased})
# Start the training job. The channel names 'train' and 'model' correspond to
# the /opt/ml/input/data/train and /opt/ml/input/data/model paths that the
# estimator passes to run_classifier.py as hyperparameters.
estimator.fit({'train': aae_data, 'model': model_multi_cased})

## Inference

In [None]:
# Class names; the inference cell indexes this list with the argmax of the
# returned predictions.
# NOTE(review): the order presumably matches the integer labels in
# aae_data/aae_questions.csv — confirm.
labels = ['AI/ML/Analytics', 'コンテナ', 'DB/Storage', 'モバイル/サーバーレス']

In [None]:
%%time
# Deploy the artifact produced by the training job as an endpoint, using the
# SageMaker TensorFlow Serving Model class with classifier/inference.py as the
# entry point.
from sagemaker import get_execution_role
from sagemaker.tensorflow.serving import Model

role = get_execution_role()

model = Model(
    model_data=estimator.model_data, 
    role=role, 
    entry_point='inference.py', 
    source_dir='classifier', 
    framework_version='1.12'
)

# One ml.m5.xlarge instance backs the endpoint.
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.xlarge')

In [None]:
import json
import numpy as np

# text = 'SageMakerでBERTのモデルを動かしたい'
text = 'Hello World'

# Send the raw text to the endpoint.
# NOTE(review): presumably classifier/inference.py turns the text into model
# features before it reaches TensorFlow Serving — confirm.
result = predictor.predict({"instances": text})
# A bare `result` expression is displayed only when it is the last statement
# of a cell, so print the raw response explicitly.
print(result)
# Map the highest-scoring prediction index to its class name.
print("この質問は {} っぽいです。".format(labels[np.argmax(result['predictions'])]))

In [None]:
# Tear down the endpoint created by model.deploy.
predictor.delete_endpoint()

## TensorFlow Serving のデバッグ

`!saved_model_cli show --dir export/Servo/<timestamp> --all`

In [None]:
import csv

# Parse the training CSV into InputExample objects: column 0 is the integer
# label and column 1 is the text.
# NOTE: InputExample is defined in the next cell of this notebook; run that
# cell first, otherwise this cell raises NameError.
examples = []
# newline='' lets the csv module handle line endings (including newlines
# inside quoted fields) itself.
with open('aae_data/aae_questions.csv', mode='rt', encoding='utf-8', newline='') as f:
    # csv.reader handles quoted fields that contain commas and strips the
    # line terminator, which a plain line.split(',') does not.
    for i, row in enumerate(csv.reader(f)):
        if not row:
            # Skip blank lines (e.g. a trailing empty line at end of file).
            continue
        label = int(row[0])
        text = row[1]
        examples.append(InputExample(guid=i, text_a=text, label=label))

In [None]:
class InputExample:
    """One example (training or test) for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
          guid: Unique id for the example.
          text_a: string. Untokenized text of the first sequence. This is the
            only sequence needed for single-sequence tasks.
          text_b: (Optional) string. Untokenized text of the second sequence.
            Needed only for sequence-pair tasks.
          label: (Optional) The label of the example. Expected for train and
            dev examples, but not for test examples.
        """
        # Assigned in the same order as the parameters.
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

In [None]:
# Display the text of the first parsed example to spot-check the CSV parsing.
examples[0].text_a