# 구글 드라이브와 코랩 연동 모듈 설치

In [1]:
# Add the PPA, refresh the package index, then install google-drive-ocamlfuse
# and fuse (google-drive-ocamlfuse is used further down to mount Google Drive).
# NOTE: with `2>&1 > /dev/null` only stdout is discarded; stderr is duplicated
# onto the original stdout first, so error messages still reach the cell output.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 155320 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.27-0ubuntu1~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.27-0ubuntu1~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


# Sentencepiece와 nltk 설치

In [2]:
# Install SentencePiece (imported below to train the subword tokenizer) and
# NLTK (imported below for the regex tokenizer used in text normalization).
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install sentencepiece
!pip install nltk

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 21.2 MB/s eta 0:00:01[K     |▌                               | 20 kB 12.5 MB/s eta 0:00:01[K     |▉                               | 30 kB 10.9 MB/s eta 0:00:01[K     |█                               | 40 kB 9.6 MB/s eta 0:00:01[K     |█▍                              | 51 kB 4.4 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.2 MB/s eta 0:00:01[K     |██                              | 71 kB 5.6 MB/s eta 0:00:01[K     |██▏                             | 81 kB 4.4 MB/s eta 0:00:01[K     |██▍                             | 92 kB 4.8 MB/s eta 0:00:01[K     |██▊                             | 102 kB 5.3 MB/s eta 0:00:01[K     |███                             | 112 kB 5.3 MB/s eta 0:00:01[K     |███▎                            | 122 kB 5.3 MB/s eta 0:00:01[K     |███▌       

# Tensorflow 버전 1.x 세팅

- BERT를 사용하기 위해서는 Tensorflow 1.x 버전 필요

In [3]:
# Colab magic that selects TensorFlow 1.x; the tf.Session / tf.contrib APIs
# used by the cells below belong to the 1.x line.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [55]:
# Check which TensorFlow version is active (the value is shown as the cell output)
import tensorflow as tf
tf.__version__

'1.15.2'

# Google Drive와 GCP 연동

In [5]:
# Authenticate this Colab session with the user's Google account.
from google.colab import auth
auth.authenticate_user()

# Load the application-default credentials; their client id and secret are
# passed to google-drive-ocamlfuse in the commands below.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()

import getpass
# First pass: run headless with stdin closed and keep only the output line that
# contains the authorization URL.
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Prompt for the verification code without echoing it in the notebook.
vcode = getpass.getpass()

# Second pass: pipe the verification code into google-drive-ocamlfuse.
# NOTE(review): the client secret and the verification code are interpolated
# into a shell command line here — confirm this exposure is acceptable.
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the 

In [6]:
# Create the Gdrive folder (mount point)
!mkdir -p Gdrive

# Mount Google Drive onto the Gdrive folder via google-drive-ocamlfuse
!google-drive-ocamlfuse Gdrive -o nonempty

# Check the file list of the working directory
!ls

# NOTE: the bare string below is the last expression of the cell, so the
# notebook echoes it as the cell's output; it is an unfinished note about
# adc.json, not executable logic.
'''
adc.json 이란 파일에는
어떤 정보가 들어 있다.
구글 코랩과 드라이브 연동 관련된 내요!
정확히 파악해서 채워 넣기
'''

adc.json  Gdrive  sample_data


'\nadc.json 이란 파일에는\n어떤 정보가 들어 있다.\n구글 코랩과 드라이브 연동 관련된 내요!\n정확히 파악해서 채워 넣기\n'

# TPU 정보를 파악하고 bucket과 연결하기 위해 세팅

In [7]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

# Verify that the current Colab runtime is configured with a TPU
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'

# Build the TPU gRPC address from the COLAB_TPU_ADDR environment variable
TPU_ADDRESS = 'grpc://'+os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Authenticate through Colab's auth module before talking to the TPU
from google.colab import auth
auth.authenticate_user()

# Open a session against the TPU and print the devices it exposes
with tf.Session(TPU_ADDRESS) as session:
    print('TPU devices:')
    pprint.pprint(session.list_devices())

    # Load the credentials (from /content/adc.json) to hand to the TPU
    with open('/content/adc.json', 'r') as f :
        auth_info = json.load(f)
    
    # Configure Google Cloud Storage access for this TPU session with the
    # loaded credentials; this is needed to work with the GCS bucket.
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)

TPU address is grpc://10.114.205.90:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 9451266091524022638),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 8665704821523635106),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 1959677236699311888),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 4117399547168169297),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 453460110833258068),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 5233076345863697036),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 2175825685436840196),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 12590735828732294829),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 184193830827238

- 아래는 위 코드랑 조금 다르게 보이는데 결국은 TPU 정보를 얻어오는 코드다.
- 아래 과정은 /content/adc.json 파일에 저장된 인증 정보를 읽어 사용하고, 이 파일의 설정으로 tpu를 사용하게 된다.

In [56]:
import datetime
import json
import os
import pprint
import random
import string
import logging

# Configure the 'tensorflow' logger to emit INFO and above.
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# Create a timestamped formatter and make a single stream handler the logger's
# only handler (any previously attached handlers are replaced).
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
log.handlers = [sh]

# Detect the TPU runtime. USE_TPU is set on both branches, but TPU_ADDRESS is
# only assigned when a TPU is present.
# NOTE: `tf` is not imported in this cell; an earlier cell must have imported it.
if 'COLAB_TPU_ADDR' in os.environ:
  log.info("Using TPU runtime")
  USE_TPU = True
  TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']

  with tf.Session(TPU_ADDRESS) as session:
    log.info('TPU address is ' + TPU_ADDRESS)
    # Upload credentials to TPU.
    with open('/content/adc.json', 'r') as f:
      auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
    
else:
  # NOTE(review): TPU_ADDRESS stays undefined on this path; any later code that
  # reads it unconditionally will raise NameError.
  log.warning('Not connected to TPU runtime')
  USE_TPU = False

2022-03-07 04:48:03,007 :  Using TPU runtime
2022-03-07 04:48:03,014 :  TPU address is grpc://10.114.205.90:8470


# bert 모델 사용 위한 변수 등록

In [None]:
# Fetch the reference BERT implementation into ./bert
!git clone https://github.com/google-research/bert

import sys
# Add the cloned directory to the module search path
sys.path.append('bert')

# Use the BERT package: model/optimizer/tokenizer modules plus the input_fn and
# model_fn builders from the pretraining script
from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder

# OpenSubtitles 데이터셋 다운

In [8]:
# Language codes accepted by the LANG_CODE check below
AVAILABLE = {'af','ar','bg','bn','br','bs','ca','cs',
              'da','de','el','en','eo','es','et','eu',
              'fa','fi','fr','gl','he','hi','hr','hu',
              'hy','id','is','it','ja','ka','kk','ko',
              'lt','lv','mk','ml','ms','nl','no','pl',
              'pt','pt_br','ro','ru','si','sk','sl','sq',
              'sr','sv','ta','te','th','tl','tr','uk',
              'ur','vi','ze_en','ze_zh','zh','zh_cn',
              'zh_en','zh_tw','zh_zh'}

# Language of the corpus to download
LANG_CODE = 'ko'

assert LANG_CODE in AVAILABLE, "Invalid language code selected"

# Download the OpenSubtitles dataset for LANG_CODE, decompress it to
# dataset.txt, and print its last lines as a sanity check.
# The Korean dataset is about 8MB while compressed.
!wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.'$LANG_CODE'.gz -O dataset.txt.gz
!gzip -d dataset.txt.gz
!tail dataset.txt

--2022-03-07 02:49:40--  http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.ko.gz
Resolving opus.nlpl.eu (opus.nlpl.eu)... 193.166.25.9
Connecting to opus.nlpl.eu (opus.nlpl.eu)|193.166.25.9|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.ko.gz [following]
--2022-03-07 02:49:41--  https://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.ko.gz
Connecting to opus.nlpl.eu (opus.nlpl.eu)|193.166.25.9|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/ko.txt.gz [following]
--2022-03-07 02:49:42--  https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/ko.txt.gz
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP req

# 데이터셋을 일부만 활용해서 학습

In [17]:
# In demo mode only a small prefix of the corpus is used.
DEMO_MODE = True

# Number of leading lines of dataset.txt to keep
if DEMO_MODE:
    CORPUS_SIZE = 100000
else:
    CORPUS_SIZE = 100000000

# Keep the first CORPUS_SIZE lines and overwrite dataset.txt with that subset
# (the full download is no longer available afterwards).
!(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt
!mv subdataset.txt dataset.txt

# 데이터 전처리
- 이모티콘, 대문자 제거

In [57]:
import nltk
# Tokenizer that extracts runs of word characters; punctuation and other
# non-word symbols act as separators. The pattern is a raw string because
# "\w" in a plain string literal is an invalid escape sequence.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
    """Return a lowercased, punctuation-free version of ``text``.

    Args:
        text: Input value; it is converted with ``str()`` before processing.

    Returns:
        A string made of the word tokens found in ``text``, lowercased and
        joined by single spaces. An input with no word characters yields ''.
    """
    # Lowercase everything
    text = str(text).lower()
    # Drop characters that cannot be encoded as UTF-8
    text = text.encode('utf-8', 'ignore').decode()
    # Remove punctuation symbols by keeping only the word tokens
    text = ' '.join(regex_tokenizer.tokenize(text))

    return text


def count_lines(filename):
    """Return the number of lines in the text file at ``filename``.

    The file is decoded as UTF-8 explicitly, matching how the corpus is opened
    for processing elsewhere in this notebook, so the count does not depend on
    the platform's default encoding.

    Args:
        filename: Path of the text file to scan.

    Returns:
        The line count as an int (0 for an empty file). A final line without a
        trailing newline is still counted.
    """
    with open(filename, encoding='utf-8') as fi:
        # Stream the file so large corpora are never held in memory at once.
        return sum(1 for _ in fi)

In [18]:
# 데이터셋 경로설정
RAW_DATA_FPATH = 'dataset.txt'
PRC_DATA_FPATH = 'proc_dataset.txt'

# 위에서 만든 함수 normalize_text 활용해서 전처리

from tensorflow.keras.utils import Progbar
# 우선 전체 데이터 개수 확인
total_lines =count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

with open(RAW_DATA_FPATH, encoding='utf-8') as fi:
    with open(PRC_DATA_FPATH, 'w', encoding='utf-8') as fo:
        for l in fi:
            fo.write(normalize_text(l) + '\n')
            bar.add(1)



# 데이터셋 텍스트 토크나이징
- BERT와 같은 NLP 모델 학습시킬 때, 토크나이징한 Vocab의 크기 적절히 제한하는 것이 모델 성능 높이는 데 기여한다.

- 큰 모델일수록 Vocab의 크기도 커지지만 보통의 경우, 3만개 내외의 Vocab을 만든다

- 아래 코드를 실행하면 SentencePiece가 텍스트를 잘라가며 Vocab을 생성하고, 이후 텍스트를 자르기 위한 Tokenizer를 학습한다.

In [59]:
MODEL_PREFIX = 'tokenizer'
VOC_SIZE = 32000
SUBSAMPLE_SIZE = 12800000   # passed as --input_sentence_size
NUM_PLACEHOLDERS = 256      # subtracted from VOC_SIZE to get the trained vocab size

# SentencePiece trainer flags, joined into one space-separated argument string
SPM_COMMAND = ' '.join([
    '--input={}'.format(PRC_DATA_FPATH),
    '--model_prefix={}'.format(MODEL_PREFIX),
    '--vocab_size={}'.format(VOC_SIZE - NUM_PLACEHOLDERS),
    '--input_sentence_size={}'.format(SUBSAMPLE_SIZE),
    '--shuffle_input_sentence=true',
    '--bos_id=-1',
    '--eos_id=-1',
])

import sentencepiece as spm
# Train the tokenizer; its output files are named after MODEL_PREFIX
spm.SentencePieceTrainer.Train(SPM_COMMAND)

- 학습된 Sentencepiece Vocab을 로딩

In [60]:
def read_sentencepiece_vocab(filepath):
    """Load the token column of a SentencePiece ``.vocab`` file.

    Every line of the UTF-8 file is split on tab characters and only the first
    field is kept.

    Args:
        filepath: Path to the vocabulary file.

    Returns:
        The list of tokens in file order, without the first entry.
    """
    with open(filepath, encoding='utf-8') as fi:
        tokens = [row.split('\t')[0] for row in fi]

    # The <unk> token sitting in the first position is excluded.
    return tokens[1:]

# Load the '<MODEL_PREFIX>.vocab' file, then print the vocabulary size and a
# random sample of 10 tokens.
# NOTE: `random` is not imported in this cell; an earlier cell must have imported it.
snt_vocab = read_sentencepiece_vocab('{}.vocab'.format(MODEL_PREFIX))
print('Learned vocab size :', len(snt_vocab))
print('Sample tokes :', random.sample(snt_vocab, 10))

Learned vocab size : 31743
Sample tokes : ['▁볼라구', '▁살았다고', '▁열렸으니', '▁좋거든', '▁하겠어', '▁냅킨', '▁완벽한밤', '▁들면', '▁인턴이야', '미처']


- 위 SentencePiece로 학습한 vocab을 BERT가 이해하는 형태로 바꿔주기 위해서, `▁`로 시작하는 토큰은 맨 앞의 `▁`를 제거하고, 그렇지 않은 토큰은 앞에 `##`을 붙여준다

- ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]의 경우는 BERT에서 사용하는 특수 토큰이기 때문에 해당 토큰에 대한 정보들을 추가해 최종적인 bert_vocab을 만들어준다.

In [61]:
def parse_sentencepiece_token(token):
    """Convert one SentencePiece token to the notation used for the BERT vocab.

    Args:
        token: A SentencePiece token string.

    Returns:
        ``token`` without its first character when it starts with '▁';
        otherwise ``token`` prefixed with '##'.
    """
    return token[1:] if token.startswith('▁') else '##' + token

# BERT's control symbols come first, followed by the converted SentencePiece tokens
ctrl_symbols = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
bert_vocab = ctrl_symbols + [parse_sentencepiece_token(tok) for tok in snt_vocab]

- 사용하지 않은 vocab_range에 있는 것들을 넣어서 bert_vocab의 크기를 앞서 지정한 VOC_SIZE에 맞춰준다.

In [62]:
# Pad the vocabulary with placeholder tokens until it holds VOC_SIZE entries.
# Each placeholder is a complete bracketed token such as "[UNUSED_0]" (the
# closing bracket was previously missing from the format string).
bert_vocab += ['[UNUSED_{}]'.format(i) for i in range(VOC_SIZE - len(bert_vocab))]
print(len(bert_vocab))

32000


- vocab.txt 텍스트 파일에 위 토큰들을 모두 한 줄 한 줄 저장한다.

In [63]:
VOC_FNAME = 'vocab.txt'

# Write the vocabulary one token per line. The encoding is explicit because the
# tokens are Korean text and the platform default encoding is not guaranteed to
# be UTF-8; this also matches the other text-file opens in this notebook.
with open(VOC_FNAME, 'w', encoding='utf-8') as fo:
    for token in bert_vocab:
        fo.write(token + '\n')

# 학습 데이터 분할
- 학습 데이터의 크기가 굉장히 클 수 있기 때문에 학습 원천 데이터를 적당한 사이즈로 잘라준다.

In [64]:
!mkdir ./shards/
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_
!ls ./shards/

mkdir: cannot create directory ‘./shards/’: File exists
shard_0000


# BERT Pretraining을 위한 변수 세팅
- MAX_SEQ_LENGTH : BERT의 모델 입력값의 최장 토큰 길이 (이 길이 이상으로 BERT 모델은 이해하지 못한다)
- MASKED_LM_PROB : BERT의 학습 중 Masked LM의 비율 조정
- MAX_PREDICTIONS : Sequence별 예측할 최대 길이
- DO_LOWER_CASE : 영문자를 lower 할지 결정(한글에는 무의미)
- PROCESSES : 전처리할 때 CPU 몇 개 쓸지
- PRETRAINING_DIR : pre_trained 데이터 폴더 이름

In [65]:
# Settings consumed when building the create_pretraining_data.py command line.
# Passed as --max_seq_length: longest input sequence, in tokens
MAX_SEQ_LENGTH = 128
# Passed as --masked_lm_prob: masking ratio for the masked-LM objective
MASKED_LM_PROB = 0.15
# Passed as --max_predictions_per_seq: max masked predictions per sequence
MAX_PREDICTIONS = 20
# Passed as --do_lower_case
DO_LOWER_CASE = False
# Number of parallel processes given to xargs -P
PROCESSES = 4
# Directory that receives the generated .tfrecord files
PRETRAINING_DIR = 'pretraining_data'

- 위 세팅값을 기본으로 아래 코드 실행하면 pretraining data 만들어진다.

In [74]:
XARGS_CMD = ("ls ./shards/ |"
             "xargs -n 1 -P {} -I{}"
             "python3 bert/create_pretraining_data.py"
             "--input_file=./shards/{}"
             "--output_file={}/{}.tfrecord"
             "--vocab_file={}"
             "--do_lower_case={}"
             "--max_predictions_per_seq={}"
             "--max_seq_length={}"
             "masked_lm_prob={}"
             "--random_seed=34"
             "--dupe_factor=5")

XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}',
                             PRETRAINING_DIR, '{}',
                             VOC_FNAME, DO_LOWER_CASE,
                             MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)

tf.gfile.MkDir(PRETRAINING_DIR)
!$XARGS_CMD

xargs: bert/create_pretraining_data.py--input_file=./shards/{}--output_file=pretraining_data/{}.tfrecord--vocab_file=vocab.txt--do_lower_case=True--max_predictions_per_seq=20--max_seq_length=128masked_lm_prob=0.15--random_seed=34--dupe_factor=5: No such file or directory


In [75]:
XARGS_CMD = ("ls ./shards/ | " "xargs -n 1 -P {} -I{} " "python3 bert/create_pretraining_data.py " "--input_file=./shards/{} " "--output_file={}/{}.tfrecord " "--vocab_file={} " "--do_lower_case={} " "--max_predictions_per_seq={} " "--max_seq_length={} " "--masked_lm_prob={} " "--random_seed=34 " "--dupe_factor=5")

XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', VOC_FNAME, DO_LOWER_CASE, MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)

tf.gfile.MkDir(PRETRAINING_DIR) 
!$XARGS_CMD



W0307 05:11:55.974474 140540749305728 module_wrapper.py:139] From bert/create_pretraining_data.py:437: The name tf.logging.set_verbosity is deprecated. Please use tf.compat.v1.logging.set_verbosity instead.


W0307 05:11:55.974711 140540749305728 module_wrapper.py:139] From bert/create_pretraining_data.py:437: The name tf.logging.INFO is deprecated. Please use tf.compat.v1.logging.INFO instead.


W0307 05:11:55.974928 140540749305728 module_wrapper.py:139] From /content/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.


W0307 05:11:56.087627 140540749305728 module_wrapper.py:139] From bert/create_pretraining_data.py:444: The name tf.gfile.Glob is deprecated. Please use tf.io.gfile.glob instead.


W0307 05:11:56.088304 140540749305728 module_wrapper.py:139] From bert/create_pretraining_data.py:446: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

INFO:tensorflow:*** Reading from input files ***

# GCP 버킷에 모델 & 학습 데이터 올리기
- tensorflow를 통해 TPU 사용하려면 GCP bucket에 데이터와 모델이 업로드 돼 있어야 한다.
- `BUCKET_NAME`을 개인의 GCS 버킷 이름으로 수정

In [76]:
# Name of the GCS bucket to upload to; change this to your own bucket.
BUCKET_NAME = 'nlp_pretrain_model'
# Local directory that holds the model config and vocabulary before upload
MODEL_DIR = 'bert_model'
tf.gfile.MkDir(MODEL_DIR)

# BERT Model 하이퍼 파라미터 설정
- 아래 설정은 BERT Base 모델의 기본값이다. 아래 값을 수정하면 좀 더 다르게 학습된 BERT 모델을 만들 수 있다.
- 얼마나 Dropout 해줄지, Bidirectional하게 할지, Activation Function은 뭘로 해줄지, Hidden Size는 얼마나 할지, Attention Head는 몇개로 해줄지, 레이어는 얼마나 쌓을지 등

In [77]:
# BERT model hyperparameters; vocab_size is tied to the vocabulary built above.
bert_base_config = {
    'attention_probs_dropout_prob':0.1,
    'directionality':'bidi',
    'hidden_act':'gelu',
    'hidden_dropout_prob':0.1,
    'hidden_size':768,
    'initializer_range':0.02,
    'intermediate_size':3072,
    'max_position_embeddings':512,
    'num_attention_heads':12,
    'num_hidden_layers':12,
    'pooler_fc_size':768,
    'pooler_num_attention_heads':12,
    'pooler_num_fc_layers':3,
    'pooler_size_per_head':128,
    'pooler_type': 'first_token_transform',
    'type_vocab_size':2,
    'vocab_size': VOC_SIZE
}

# Save the config next to the model so training can load it from MODEL_DIR
with open('{}/bert_config.json'.format(MODEL_DIR), 'w') as fo:
    json.dump(bert_base_config, fo, indent=2)

# Save the vocabulary in MODEL_DIR as well, one token per line. The encoding is
# explicit because the tokens are Korean text and the platform default encoding
# is not guaranteed to be UTF-8.
with open('{}/{}'.format(MODEL_DIR, VOC_FNAME), 'w', encoding='utf-8') as fo:
    for token in bert_vocab:
        fo.write(token + '\n')

- 앞에 만들어준 모델, 프리트레이닝 데이터를 GCS 버킷에 업로드

In [78]:
# Upload the model directory and the pretraining data directory to the bucket
# (recursive copy, -m for parallel transfers); skipped when BUCKET_NAME is empty.
if BUCKET_NAME:
    !gsutil -m cp -r $MODEL_DIR $PRETRAINING_DIR gs://$BUCKET_NAME

Copying file://bert_model/bert_config.json [Content-Type=application/json]...
/ [0/3 files][    0.0 B/ 18.0 MiB]   0% Done                                    Copying file://bert_model/vocab.txt [Content-Type=text/plain]...
Copying file://pretraining_data/shard_0000.tfrecord [Content-Type=application/octet-stream]...
/ [3/3 files][ 18.0 MiB/ 18.0 MiB] 100% Done                                    
Operation completed over 3 objects/18.0 MiB.                                     


# 모델 학습 하이퍼 파라미터 설정
- GCS 버킷에 데이터와 모델을 모두 업로드 해준 뒤, 실제 TPU에서 학습 진행하도록 명령 넘겨줘야 한다.
- BUCKET_NAME만 위와 동일하게 수정해주자
- 중간의 BATCH_SIZE, LEARNING_RATE, TRAIN_STEPS, NUM_TPU_CORES 등의 변수를 조절해 모델의 학습 속도 및 성능 조정할 수 있다.
- Colab의 TPU는 v3-8이므로 NUM_TPU_CORES는 8Core가 최대

In [79]:
# Input data pipeline config
# NOTE(review): MAX_PREDICTIONS, MAX_SEQ_LENGTH and MASKED_LM_PROB repeat the
# values used when the pretraining data was generated; keep them in sync.
TRAIN_BATCH_SIZE = 128
MAX_PREDICTIONS = 20
MAX_SEQ_LENGTH =128
MASKED_LM_PROB = 0.15

# Training procedure config
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 2e-5
TRAIN_STEPS = 10000
SAVE_CHECKPOINTS_STEPS = 2500
NUM_TPU_CORES=8

# Root location for model and data: the GCS bucket when one is configured,
# otherwise the current directory.
if  BUCKET_NAME:
    BUCKET_PATH = 'gs://{}'.format(BUCKET_NAME)
else:
    BUCKET_PATH = '.'

# Model directory and pretraining-data directory under that root
BERT_GCS_DIR = '{}/{}'.format(BUCKET_PATH, MODEL_DIR)
DATA_GCS_DIR = '{}/{}'.format(BUCKET_PATH, PRETRAINING_DIR)

VOCAB_FILE = os.path.join(BERT_GCS_DIR, VOC_FNAME)
CONFIG_FILE = os.path.join(BERT_GCS_DIR, 'bert_config.json')

# Latest checkpoint found in the model directory, if any (logged below)
INIT_CHECKPOINT = tf.train.latest_checkpoint(BERT_GCS_DIR)

# Load the model config and collect every *tfrecord file in the data directory
bert_config = modeling.BertConfig.from_json_file(CONFIG_FILE)
input_files =tf.gfile.Glob(os.path.join(DATA_GCS_DIR, '*tfrecord'))

log.info('Using checkpoint : {}'.format(INIT_CHECKPOINT))
log.info('Using {} data shards'.format(len(input_files)))

2022-03-07 05:15:07,812 :  Using checkpoint : None
2022-03-07 05:15:07,816 :  Using 1 data shards


# 모델을 TPU로 올리고 학습
- model_fn 이란 이름의 딥러닝 모델 설정 객체 생성하고 TPU에 연결 후
- 어떻게 학습 하고 어디에 checkpoint를 정리할 지 등 지정해줘야 한다.
- 이후 TPUEstimator를 통해 모델 객체, 설정 객체를 전달해주고, 해당 estimator를 `estimator.train()`하면 TPU 위에서 BERT 모델 학습 진행된다.

In [80]:
# Model function for BERT pretraining, built from the loaded config and the
# training hyperparameters defined above.
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=TRAIN_STEPS,
    num_warmup_steps=10,
    use_tpu=USE_TPU,
    use_one_hot_embeddings=True
)

# Resolve the TPU cluster only when a TPU runtime was detected. TPU_ADDRESS is
# not defined on the non-TPU path, so evaluating it unconditionally would raise
# NameError; the conditional expression never touches it when USE_TPU is False.
tpu_cluster_resolver = (
    tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
    if USE_TPU else None
)

# Run configuration: checkpoints go to the model directory every
# SAVE_CHECKPOINTS_STEPS steps, and work is sharded over NUM_TPU_CORES cores.
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=BERT_GCS_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=SAVE_CHECKPOINTS_STEPS,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)
    )

# Estimator that receives the model function and the run configuration
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=USE_TPU,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE
)

# Input function that reads the pretraining tfrecord files for training
train_input_fn = input_fn_builder(
    input_files=input_files,
    max_seq_length=MAX_SEQ_LENGTH,
    max_predictions_per_seq=MAX_PREDICTIONS,
    is_training=True
)

2022-03-07 05:15:32,347 :  Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x7f93704f6d40>) includes params argument, but params are not passed to Estimator.
2022-03-07 05:15:32,349 :  Using config: {'_model_dir': 'gs://nlp_pretrain_model/bert_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.114.205.90:8470"
    }
  }
}
isolate_session_state: true
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f93704ec150

In [81]:
# Train: run the estimator on the training input function, up to TRAIN_STEPS steps
estimator.train(input_fn=train_input_fn, max_steps=TRAIN_STEPS)

2022-03-07 05:15:44,680 :  Querying Tensorflow master (grpc://10.114.205.90:8470) for TPU system metadata.
2022-03-07 05:15:44,699 :  Found TPU system:
2022-03-07 05:15:44,701 :  *** Num TPU Cores: 8
2022-03-07 05:15:44,703 :  *** Num TPU Workers: 1
2022-03-07 05:15:44,706 :  *** Num TPU Cores Per Worker: 8
2022-03-07 05:15:44,708 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 9451266091524022638)
2022-03-07 05:15:44,710 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 1959677236699311888)
2022-03-07 05:15:44,711 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 4117399547168169297)
2022-03-07 05:15:44,713 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 453460110833258068)
2022-03-07 05:15:44,714 :  *** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TP



2022-03-07 05:15:45,076 :  From bert/modeling.py:490: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.

2022-03-07 05:15:45,130 :  From bert/modeling.py:358: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
2022-03-07 05:15:45,159 :  From bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.Dense instead.
2022-03-07 05:15:45,162 :  From /tensorflow-1.15.2/python3.7/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.__call__` method instead.
2022-03-07 05:15:48,440 :  From /content/bert/run_pretra

<tensorflow_estimator.python.estimator.tpu.tpu_estimator.TPUEstimator at 0x7f9370ccc390>