<a href="https://colab.research.google.com/github/jangjoongkeon/JK/blob/master/bert_pretraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TPU config

In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# Fail fast when the notebook is not attached to a Colab TPU runtime: the TPU
# worker address is read from the COLAB_TPU_ADDR environment variable.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
# gRPC endpoint of the TPU worker; also passed as --tpu_name to the training script.
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Authenticate the Colab user before the credentials file below is read.
# NOTE(review): /content/adc.json is presumably written by auth.authenticate_user() -- confirm.
from google.colab import auth
auth.authenticate_user()
# Open a session against the TPU worker to list its devices and push GCS credentials.
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.52.142.138:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 11766755298374391227),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 17951037253183963110),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_GPU:0, XLA_GPU, 17179869184, 7633378283634911372),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 9154932188812812092),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 2332987434319191157),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 791603785017173140),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 728349380901738685),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 6981771681442394018),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 4535378

# Bucket config

TPUs are located in Google Cloud; for optimal performance, they read data directly from Google Cloud Storage (GCS).

In [0]:
# Available pretrained model checkpoints:
#   multi_cased_L-12_H-768_A-12 : multilingual cased BERT base
# GCS bucket name (without the gs:// prefix) from which the gs:// paths below are built.
BUCKET_BASE = 'bucket_for_squad_42maru' #@param {type:"string"}
assert BUCKET_BASE, 'Must specify an existing GCS bucket base name'
# Sub-directory of <bucket>/bert_pretrained_model/ that holds the checkpoint to start from.
BUCKET_4_PRETRAINED = 'multi_cased_L-12_H-768_A-12' #@param {type:"string"}
BERT_PRETRAINED_DIR = 'gs://{}/bert_pretrained_model/{}'.format(BUCKET_BASE, BUCKET_4_PRETRAINED)
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
# List the directory contents as a quick check that the path exists and is readable.
!gsutil ls $BERT_PRETRAINED_DIR

***** BERT pretrained directory: gs://bucket_for_squad_42maru/bert_pretrained_model/multi_cased_L-12_H-768_A-12 *****
gs://bucket_for_squad_42maru/bert_pretrained_model/multi_cased_L-12_H-768_A-12/bert_config.json
gs://bucket_for_squad_42maru/bert_pretrained_model/multi_cased_L-12_H-768_A-12/bert_model.ckpt.index
gs://bucket_for_squad_42maru/bert_pretrained_model/multi_cased_L-12_H-768_A-12/bert_model.ckpt.meta
gs://bucket_for_squad_42maru/bert_pretrained_model/multi_cased_L-12_H-768_A-12/vocab.txt


# Retrieve code


In [0]:
from getpass import getpass

#@title Github
# Use your git id & password.
# If you don't want to reveal your password here, use the getpass().
USERNAME = "colanim" #@param {type:"string"}
GITPASS = getpass("[Github] Password for {} :".format(USERNAME))
BRANCH = "master" #@param {type:"string"}
! test -d download_glue_repo || git clone -b $BRANCH https://$USERNAME:$GITPASS@github.com/42maru/demo-site-mrc.git
!git -C /content/demo-site-mrc/ pull
!git -C /content/demo-site-mrc/ status

fatal: destination path 'demo-site-mrc' already exists and is not an empty directory.
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (1/1), done.[K
remote: Total 5 (delta 4), reused 5 (delta 4), pack-reused 0[K
Unpacking objects: 100% (5/5), done.
From https://github.com/42maru/demo-site-mrc
   6f639f6..108ef5f  dev_en     -> origin/dev_en
Already up to date.
On branch master
Your branch is up to date with 'origin/master'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mmrc_model/demo-site-mrc/[m

nothing added to commit but untracked files present (use "git add" to track)


# Run model

In [0]:
%cd /content/demo-site-mrc/mrc_model/

sys.path.append('/content/demo-site-mrc/mrc_model/')

/content/demo-site-mrc/mrc_model


In [0]:
#@title Parameters
INPUT_DIR = 'bert_pretraining/input/pretraining_input_max_seq_128' #@param {type:"string"}
BERT_PRETRAINED_INPUT_DIR = 'gs://{}/{}'.format(BUCKET_BASE, INPUT_DIR)
print('***** BERT pretrained input directory: {} *****'.format(BERT_PRETRAINED_INPUT_DIR))

# For not to overwrite the output file, make your own directory if you want to test this personally.
OUTPUT_DIR = "bert_pretraining/output/colab/128_90000" #@param {type:"string"}
BERT_PRETRAINED_OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET_BASE, OUTPUT_DIR)
tf.gfile.MakeDirs(BERT_PRETRAINED_OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(BERT_PRETRAINED_OUTPUT_DIR))

INPUT_FILES = 'tf_pretraining_data_*' #@param {type:"string"}
BERT_PRETRAINED_INPUT_FILES = BERT_PRETRAINED_INPUT_DIR + '/' + INPUT_FILES
TRAIN_BATCH_SIZE = 32 #@param ["8", "16", "32"] {type:"raw"}
MAX_SEQ_LEN = 128 #@param ["64", "128", "256", "384", "512"] {type:"raw"}
# Roughly MAX_PREDICTION_PER_SEQ = MAX_SEQ_LEN * 0.15
MAX_PREDICTION_PER_SEQ = 20 #@param ["10", "20", "40", "60", "80] {type:"raw"}
TRAIN_STEPS = 90000 #@param {type:"number"}
WARMUP_STEPS = 9000 #@param {type:"number"}
LEARNING_RATE = 2e-5 #@param {type:"number"}

!python3 -m bert.run_pretraining \
  --input_file=$BERT_PRETRAINED_INPUT_FILES \
  --output_dir=$BERT_PRETRAINED_OUTPUT_DIR \
  --do_train=True \
  --do_eval=True \
  --bert_config_file=$BERT_PRETRAINED_DIR/bert_config.json \
  --init_checkpoint=$BERT_PRETRAINED_DIR/bert_model.ckpt \
  --train_batch_size=$TRAIN_BATCH_SIZE \
  --max_seq_length=$MAX_SEQ_LEN \
  --max_predictions_per_seq=$MAX_PREDICTION_PER_SEQ \
  --num_train_steps=$TRAIN_STEPS \
  --num_warmup_steps=$WARMUP_STEPS \
  --learning_rate=$LEARNING_RATE \
  --use_tpu=True \
  --tpu_name=$TPU_ADDRESS

***** BERT pretrained input directory: gs://bucket_for_squad_42maru/bert_pretraining/input/pretraining_input_max_seq_128 *****
***** Model output directory: gs://bucket_for_squad_42maru/bert_pretraining/output/colab/128_90000 *****
INFO:tensorflow:*** Input Files ***
INFO:tensorflow:  gs://bucket_for_squad_42maru/bert_pretraining/input/pretraining_input_max_seq_128/tf_pretraining_data_10_128ver.tfrecord
INFO:tensorflow:  gs://bucket_for_squad_42maru/bert_pretraining/input/pretraining_input_max_seq_128/tf_pretraining_data_1_128ver.tfrecord
INFO:tensorflow:  gs://bucket_for_squad_42maru/bert_pretraining/input/pretraining_input_max_seq_128/tf_pretraining_data_2_128ver.tfrecord
INFO:tensorflow:  gs://bucket_for_squad_42maru/bert_pretraining/input/pretraining_input_max_seq_128/tf_pretraining_data_3_128ver.tfrecord
INFO:tensorflow:  gs://bucket_for_squad_42maru/bert_pretraining/input/pretraining_input_max_seq_128/tf_pretraining_data_4_128ver.tfrecord
INFO:tensorflow:  gs://bucket_for_squad_4