# pre-training BERT
# 아래 링크의 코드를 그대로 가져옴
- https://colab.research.google.com/drive/1nVn6AFpQSzXBt8_ywfx6XR8ZfQXlKGAz#scrollTo=myjxQe5awo1v

# Install packages

In [None]:
!pip install tensorflow==1.15
!pip install nltk

In [None]:
!pip install sentencepiece
!git clone https://github.com/google-research/bert

# import & set logging

In [5]:
import os
import sys
import json
import nltk
import random
import logging
import tensorflow as tf
import sentencepiece as spm

from glob import glob
from tensorflow.keras.utils import Progbar

sys.path.append("bert")

from bert import modeling, optimization, tokenization
from bert.run_pretraining import input_fn_builder, model_fn_builder




In [6]:
# configure logging
log = logging.getLogger('tensorflow')
log.setLevel(logging.INFO)

# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s :  %(message)s')
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
log.handlers = [sh]

In [7]:
USE_TPU = False

# Download dataset

In [None]:
!wget http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2016/mono/OpenSubtitles.raw.en.gz -O dataset.txt.gz
!gzip -d dataset.txt.gz
!tail dataset.txt

DEMO_MODE = True #@param {type:"boolean"}

if DEMO_MODE:
  CORPUS_SIZE = 10000
else:
  CORPUS_SIZE = 100000000 #@param {type: "integer"}
  
!(head -n $CORPUS_SIZE dataset.txt) > subdataset.txt
!mv subdataset.txt dataset.txt

# preprocess text
Lowercase the text, then remove punctuation and non-UTF symbols

In [8]:
# Tokenizer that keeps only runs of word characters. The pattern is a raw
# string so that the backslash in \w reaches the regex engine untouched instead
# of being parsed as an (invalid) string escape sequence.
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

def normalize_text(text):
  """Return `text` lowercased and reduced to space-separated word tokens.

  The input is coerced with `str()`, lowercased, round-tripped through UTF-8
  (characters that cannot be encoded are dropped), split with the module-level
  `regex_tokenizer`, and the resulting tokens are joined with single spaces.
  """
  lowered = str(text).lower()
  # Anything that cannot be encoded as UTF-8 is silently discarded here.
  utf8_safe = lowered.encode("utf-8", "ignore").decode()
  # Joining only the matched tokens is what removes the punctuation.
  words = regex_tokenizer.tokenize(utf8_safe)
  return " ".join(words)

def count_lines(filename):
  """Return the number of lines in the text file at `filename`.

  The file is decoded as UTF-8 rather than with the platform default encoding,
  so the count does not depend on the locale of the machine running it.

  Args:
    filename: Path of the text file to scan.

  Returns:
    The number of lines as an int (0 for an empty file).
  """
  with open(filename, encoding="utf-8") as fi:
    # Iterate lazily so the whole file is never held in memory.
    return sum(1 for _ in fi)

In [9]:
normalize_text('Thanks to the advance, they have succeeded in getting over their adversaries.')

'thanks to the advance they have succeeded in getting over their adversaries'

In [10]:
RAW_DATA_FPATH = "dataset.txt" #@param {type: "string"}
PRC_DATA_FPATH = "proc_dataset.txt" #@param {type: "string"}

# apply normalization to the dataset
# this will take a minute or two

# Count the lines up front so the progress bar knows its total.
total_lines = count_lines(RAW_DATA_FPATH)
bar = Progbar(total_lines)

# Stream the raw corpus through normalize_text, writing one output line per
# input line and advancing the progress bar as we go.
with open(RAW_DATA_FPATH, encoding="utf-8") as fi, \
     open(PRC_DATA_FPATH, "w", encoding="utf-8") as fo:
  for raw_line in fi:
    fo.write(normalize_text(raw_line) + "\n")
    bar.add(1)



In [11]:
!tail proc_dataset.txt

that s nice
sounds like the left bank s running lean
the service department s over there
can i talk to you
talk to me
yeah sure talk
well thing of it is i came back because
do you know what time it is
it s 5 30
i came back because


# building the vocabulary
BERT는 WordPiece tokenizer를 사용했으나, open source가 아님. 그래서 대신 SentencePiece tokenizer를 unigram mode로 사용하려 한다. 이건 BERT에 바로 적용이 안되고 몇가지 트릭을 사용해야 한다.
SentencePiece는 RAM을 엄청 많이 사용하므로, 바로 돌리면 crash 된다. 따라서 randomly subsample을 돌리기로 한다.
그리고 SentencePiece는 BOS, EOS symbol을 자동으로 더해주기 때문에 이를 막기 위해 저 symbol들의 index를 -1로 둔다.
NUM_PLACEHOLDERS는 fine-tune을 위해 예비로 남겨두는 자리이다.

In [12]:
MODEL_PREFIX = "tokenizer" #@param {type: "string"}
VOC_SIZE = 32000 #@param {type:"integer"}
SUBSAMPLE_SIZE = 12800000 #@param {type:"integer"}
NUM_PLACEHOLDERS = 256 #@param {type:"integer"}

SPM_COMMAND = ('--input={} --model_prefix={} '
               '--vocab_size={} --input_sentence_size={} '
               '--shuffle_input_sentence=true ' 
               '--bos_id=-1 --eos_id=-1').format(
               PRC_DATA_FPATH, MODEL_PREFIX, 
               VOC_SIZE - NUM_PLACEHOLDERS, SUBSAMPLE_SIZE)

spm.SentencePieceTrainer.Train(SPM_COMMAND)

True

In [13]:
testcase = "Colorless geothermal substations are generating furiously"

```
>>> wordpiece.tokenize("Colorless geothermal substations are generating furiously")
 
['color',
 '##less',
 'geo',
 '##thermal',
 'sub',
 '##station',
 '##s',
 'are',
 'generating',
 'furiously']
 ```

위에서 보듯이, wordpiece tokenizer는 한 단어가 여러 subword로 쪼개질 때 첫 번째 조각을 제외한 나머지 조각들 앞에 '##'을 붙여준다. 

In [14]:
!ls

README.md             proc_dataset.txt      tokenizer.vocab
[1m[36mbert[m[m                  requirements.txt      [1m[36mvenv[m[m
dataset.txt           [1m[36mshards[m[m                vocab.txt
preprocess_bert.ipynb tokenizer.model


SentencePiece는 두개의 파일을 남긴다.
- tokenizer.model 
- tokenizer.vocab

In [15]:
!head -n 10 tokenizer.vocab

<unk>	0
▁you	-3.2342
▁i	-3.2821
▁the	-3.56375
▁s	-3.84955
▁to	-3.87601
▁a	-3.9102
▁it	-3.97593
▁t	-4.25729
▁and	-4.32686


In [16]:
def read_sentencepiece_vocab(filepath):
  """Load the token column of a SentencePiece `.vocab` file.

  Every line contributes the text before its first tab character. The first
  entry of the file (the <unk> token) is left out of the result.

  Args:
    filepath: Path of the UTF-8 encoded vocab file.

  Returns:
    A list of token strings in file order, without the first entry.
  """
  with open(filepath, encoding='utf-8') as vocab_file:
    pieces = [row.split("\t")[0] for row in vocab_file]
  # Drop the leading <unk> entry.
  return pieces[1:]

snt_vocab = read_sentencepiece_vocab("{}.vocab".format(MODEL_PREFIX))
print("Learnt vocab size: {}".format(len(snt_vocab)))
print("Sample tokens: {}".format(random.sample(snt_vocab, 10)))

Learnt vocab size: 31743
Sample tokens: ['▁alana', 'potatoes', '▁defend', 'sailing', 'artman', '▁head', 'pb', 'sation', 'nette', '▁serpent']


SentencePiece는 WordPiece와 반대로 동작한다는 것을 알 수 있다. SentencePiece는 whitespace를 아래와 같이 "▁" (U+2581)로 변경한다.
```
Hello▁World.
```

그리고 문장을 쪼갠다.
```
[Hello] [▁Wor] [ld] [.]
```

따라서 token이 "▁"로 시작하면 "▁"를 떼어내고, 그렇지 않으면 앞에 "##"을 붙여 주어야 한다.

In [17]:
def parse_sentencepiece_token(token):
    """Rewrite a SentencePiece token using the WordPiece '##' convention.

    A token that starts with the "▁" marker has that one character removed;
    any other token is returned with "##" prepended.
    """
    is_word_start = token.startswith("▁")
    return token[1:] if is_word_start else "##" + token

In [18]:
bert_vocab = list(map(parse_sentencepiece_token, snt_vocab))

In [19]:
ctrl_symbols = ["[PAD]","[UNK]","[CLS]","[SEP]","[MASK]"]
bert_vocab = ctrl_symbols + bert_vocab

BERT에 사용되는 문자들과 placeholder token들을 vocab에 더해준다.

In [20]:
bert_vocab += ["[UNUSED_{}]".format(i) for i in range(VOC_SIZE - len(bert_vocab))]
print(len(bert_vocab))

32000


In [21]:
VOC_FNAME = "vocab.txt" #@param {type:"string"}

# Write the vocabulary one token per line. The encoding is pinned to UTF-8
# (the SentencePiece vocab these tokens come from is read as UTF-8) so that
# non-ASCII tokens are written identically regardless of the platform default.
with open(VOC_FNAME, "w", encoding="utf-8") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [22]:
bert_tokenizer = tokenization.FullTokenizer(VOC_FNAME)
bert_tokenizer.tokenize(testcase)

2019-12-28 18:19:05,423 :  From /Users/kmryu/code/deep/bert_code_review/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



['color',
 '##less',
 'geo',
 '##ther',
 '##mal',
 'subs',
 '##tation',
 '##s',
 'are',
 'generat',
 '##ing',
 'furious',
 '##ly']

# generating pre-training data

In [28]:
!mkdir ./shards
!split -a 4 -l 256000 -d $PRC_DATA_FPATH ./shards/shard_
!ls ./shards/

split: illegal option -- d
usage: split [-a sufflen] [-b byte_count] [-l line_count] [-p pattern]
             [file [prefix]]


In [32]:
!ls ./shards/

shard_0000 shard_0001 shard_0002 shard_0003


In [2]:
MAX_SEQ_LENGTH = 128 #@param {type:"integer"}
MASKED_LM_PROB = 0.15 #@param
MAX_PREDICTIONS = 20 #@param {type:"integer"}
DO_LOWER_CASE = True #@param {type:"boolean"}
PROCESSES = 2 #@param {type:"integer"}
PRETRAINING_DIR = "pretraining_data" #@param {type:"string"}

In [23]:
XARGS_CMD = ("ls ./shards/ | "
             "xargs -n 1 -P {} -I{} "
             "python3 bert/create_pretraining_data.py "
             "--input_file=./shards/{} "
             "--output_file={}/{}.tfrecord "
             "--vocab_file={} "
             "--do_lower_case={} "
             "--max_predictions_per_seq={} "
             "--max_seq_length={} "
             "--masked_lm_prob={} "
             "--random_seed=34 "
             "--dupe_factor=5")

XARGS_CMD = XARGS_CMD.format(PROCESSES, '{}', '{}', PRETRAINING_DIR, '{}', 
                             VOC_FNAME, DO_LOWER_CASE, 
                             MAX_PREDICTIONS, MAX_SEQ_LENGTH, MASKED_LM_PROB)

In [None]:
tf.gfile.MkDir(PRETRAINING_DIR)
!$XARGS_CMD

---
# create_pretraining_data 코드 리뷰

In [25]:
XARGS_CMD

'ls ./shards/ | xargs -n 1 -P 2 -I{} python3 bert/create_pretraining_data.py --input_file=./shards/{} --output_file=pretraining_data/{}.tfrecord --vocab_file=vocab.txt --do_lower_case=True --max_predictions_per_seq=20 --max_seq_length=128 --masked_lm_prob=0.15 --random_seed=34 --dupe_factor=5'

In [None]:
flags = tf.flags

FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None,
                    "Input raw text file (or comma-separated list of files).")

flags.DEFINE_string(
    "output_file", None,
    "Output TF example file (or comma-separated list of files).")

flags.DEFINE_string("vocab_file", None,
                    "The vocabulary file that the BERT model was trained on.")

flags.DEFINE_bool(
    "do_lower_case", True,
    "Whether to lower case the input text. Should be True for uncased "
    "models and False for cased models.")

flags.DEFINE_bool(
    "do_whole_word_mask", False,
    "Whether to use whole word masking rather than per-WordPiece masking.")

flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")

flags.DEFINE_integer("max_predictions_per_seq", 20,
                     "Maximum number of masked LM predictions per sequence.")

flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")

flags.DEFINE_integer(
    "dupe_factor", 10,
    "Number of times to duplicate the input data (with different masks).")

flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")

flags.DEFINE_float(
    "short_seq_prob", 0.1,
    "Probability of creating sequences which are shorter than the "
    "maximum length.")

FLAGS는 tf의 command line options를 담아놓는 객체로 보인다.
debugging으로 확인해보면 위의 옵션들은 다음과 같다.
```
input_file = {str} './shards/shard_0000'
output_file = {str} 'pretraining_data/shard_0000.tfrecord'
vocab_file = {str} 'vocab.txt'

do_lower_case = {bool} True
do_whole_word_mask = {bool} False
max_seq_length = {int} 128
max_predictions_per_seq = {int} 20

dupe_factor = {int} 5
masked_lm_prob = {float} 0.15
short_seq_prob = {float} 0.1
```

main 함수는 로깅을 제외하면 4가지로 볼 수 있다.
- tokenizer 선언
- input_files 선언
- create_training_instances
- write_instances_to_example_files

In [27]:
def main(_):
  """Create pre-training instances from the input files and write them out.

  Steps: build the tokenizer, expand the input file patterns, create the
  training instances, and write them to the output files.

  Args:
    _: Unused positional argument.
  """
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file,
      do_lower_case=FLAGS.do_lower_case)

  # --input_file may hold several comma-separated glob patterns.
  input_files = []
  for input_pattern in FLAGS.input_file.split(","):
    input_files.extend(tf.gfile.Glob(input_pattern))

  # Seed a dedicated RNG from --random_seed; it was previously referenced
  # without ever being defined.
  rng = random.Random(FLAGS.random_seed)
  instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
      rng)

  # --output_file is a comma-separated list as well; it was previously
  # referenced without ever being defined.
  output_files = FLAGS.output_file.split(",")
  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files)

여기서 tokenization은 bert의 tokenization.py 이다.
위에서 이 tokenization을 사용해서 test를 했었다.

In [31]:
tokenizer = tokenization.FullTokenizer(VOC_FNAME)
tokenizer.tokenize(testcase)

['color',
 '##less',
 'geo',
 '##ther',
 '##mal',
 'subs',
 '##tation',
 '##s',
 'are',
 'generat',
 '##ing',
 'furious',
 '##ly']

input_files는 command line option 에서 받은 그 파일들을 말한다.
아래 코드의 결과물은 다음과 같다.
```
input_files = <class 'list'>: ['../shards/shard_0000']
```

In [None]:
# Expand every comma-separated pattern in --input_file into concrete paths.
# (The loop is dedented to the same level as the assignment so that the cell
# is valid Python on its own.)
input_files = []
for input_pattern in FLAGS.input_file.split(","):
  input_files.extend(tf.gfile.Glob(input_pattern))

create_training_instances 에서는 input_files 에서 한줄씩 읽어서 training instance로 변환한다.

In [None]:
instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
      rng)

먼저 문서들을 token들로 변환해 list of lists로 변환한다.
여기서 빈 line이 나오면 문서가 끝이 났다고 생각한다.

In [None]:
# Each entry of all_documents is one document: a list of tokenized lines.
# Start with a single empty document. (Everything below is dedented to the
# level of this assignment so that the cell is valid Python on its own.)
all_documents = [[]]

# Input file format:
# (1) One sentence per line. These should ideally be actual sentences, not
# entire paragraphs or arbitrary spans of text. (Because we use the
# sentence boundaries for the "next sentence prediction" task).
# (2) Blank lines between documents. Document boundaries are needed so
# that the "next sentence prediction" task doesn't span between documents.
for input_file in input_files:
  with tf.gfile.GFile(input_file, "r") as reader:
    while True:
      line = tokenization.convert_to_unicode(reader.readline())
      # An empty read (before stripping) ends the loop for this file.
      if not line:
        break
      line = line.strip()

      # Empty lines are used as document delimiters
      if not line:
        all_documents.append([])
      tokens = tokenizer.tokenize(line)
      # Lines that produce no tokens are not stored.
      if tokens:
        all_documents[-1].append(tokens)

그 후 문서들을 돌면서 instance로 변환한다. dupe_factor만큼 반복한다.

In [None]:
vocab_words = list(tokenizer.vocab.keys())
instances = []
for _ in range(dupe_factor):
  for document_index in range(len(all_documents)):
    instances.extend(
        create_instances_from_document(
            all_documents, document_index, max_seq_length, short_seq_prob,
            masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

create_instances_from_document 에서 본격적으로 document (tokens)를 instance로 만든다.

In [38]:
def create_instances_from_document(
    all_documents, document_index, max_seq_length, short_seq_prob,
    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
  """Creates `TrainingInstance`s for a single document."""
  document = all_documents[document_index]

여기서 document는 list of lists 이다.
```
000 = {list} <class 'list'>: ['and', 'stimulate', 'ying']
001 = {list} <class 'list'>: ['your', 'ying']
002 = {list} <class 'list'>: ['you', 're', 'crazy']
...
```

In [None]:
먼저 max_num_tokens를 정하고,

In [None]:
  # Account for [CLS], [SEP], [SEP]
  max_num_tokens = max_seq_length - 3

target_seq_length를 정하는데, <br/>
fine-tuning에서 짧은 문장도 학습 가능하도록 일정 확률 (short_seq_prob) 만큼은 target_seq_length를 랜덤하게 짧게 만든다.

In [None]:
# We *usually* want to fill up the entire sequence since we are padding
  # to `max_seq_length` anyways, so short sequences are generally wasted
  # computation. However, we *sometimes*
  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
  # sequences to minimize the mismatch between pre-training and fine-tuning.
  # The `target_seq_length` is just a rough target however, whereas
  # `max_seq_length` is a hard limit.
  target_seq_length = max_num_tokens
  if rng.random() < short_seq_prob:
    target_seq_length = rng.randint(2, max_num_tokens)

그 다음에 문장 두개를 이어 붙이는데, <br/>
여기서 문서의 token들을 다 이어 붙인 후 아무 곳이나 정해서 두개의 segment A,B로 나누는 방식을 사용하지는 않는다. <br/>
왜냐하면 그렇게 하면 다음 문장 예측 (next sentence prediction) 이 너무 쉬워지기 때문이다. <br/>
따라서 사용자가 입력으로 넣어준 '진짜' 문장을 segment로 사용해 segment A,B로 나눈다.

In [None]:
instances = []
current_chunk = []
current_length = 0
i = 0

current_length에는 current_chunk에 쌓인 segment들의 토큰 수가 누적된다. 이 값이 target_seq_length 이상이 되거나 문서의 마지막 segment에 도달하면, 지금까지 모은 chunk로 instance를 만든다.

In [None]:
while i < len(document):
  segment = document[i]
  current_chunk.append(segment)
  current_length += len(segment)
  
  if i == len(document) - 1 or current_length >= target_seq_length:
    if current_chunk:

segment A에 들어갈 문장 수(a_end)를 랜덤으로 정한 다음, current_chunk의 앞에서부터 그만큼의 문장을 tokens_a에 이어 붙인다

In [None]:
      a_end = 1
      if len(current_chunk) >= 2:
        a_end = rng.randint(1, len(current_chunk) - 1)

      tokens_a = []
      for j in range(a_end):
        tokens_a.extend(current_chunk[j])

tokens_b를 구할 때는 50%는 다음 문장, 50%는 랜덤한 문장을 가져온다.<br/>
이렇게 함으로써 Next Sentence Prediction을 binary classification 방식으로 하게 된다.

In [None]:
      if len(current_chunk) == 1 or rng.random() < 0.5:
        is_random_next = True
        target_b_length = target_seq_length - len(tokens_a)

        # This should rarely go for more than one iteration for large
        # corpora. However, just to be careful, we try to make sure that
        # the random document is not the same as the document
        # we're processing.
        for _ in range(10):
          random_document_index = rng.randint(0, len(all_documents) - 1)
          if random_document_index != document_index:
            break

        random_document = all_documents[random_document_index]
        random_start = rng.randint(0, len(random_document) - 1)
        for j in range(random_start, len(random_document)):
          tokens_b.extend(random_document[j])
          if len(tokens_b) >= target_b_length:
            break
        # We didn't actually use these segments so we "put them back" so
        # they don't go to waste.
        num_unused_segments = len(current_chunk) - a_end
        i -= num_unused_segments
      # Actual next
      else:
        is_random_next = False
        for j in range(a_end, len(current_chunk)):
          tokens_b.extend(current_chunk[j])

그 후 tokens_a, tokens_b의 길이의 합이 max_num_tokens를 넘지 않게 잘라낸다.<br/>
둘 중 더 긴 쪽에서 토큰을 하나씩 잘라내며, 잘라낼 때는 50% 확률로 앞에서, 50% 확률로 뒤에서 잘라낸다.

In [None]:
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)

def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
  """Shrink two token lists in place until they fit in `max_num_tokens`.

  One token at a time is removed from whichever list is currently longer
  (from `tokens_b` when the lengths are equal) until the combined length is
  at most `max_num_tokens`. Nothing is returned; the lists are mutated.

  Args:
    tokens_a: First token list, modified in place.
    tokens_b: Second token list, modified in place.
    max_num_tokens: Upper bound on len(tokens_a) + len(tokens_b).
    rng: Object whose `random()` result decides which end a token is
      removed from.
  """
  while len(tokens_a) + len(tokens_b) > max_num_tokens:
    longer = tokens_b if len(tokens_a) <= len(tokens_b) else tokens_a
    assert len(longer) >= 1

    # Removing from the front half of the time and from the back otherwise
    # adds randomness and avoids a positional bias.
    drop_front = rng.random() < 0.5
    if drop_front:
      longer.pop(0)
    else:
      del longer[-1]

그 후 문장 맨 앞에는 '[CLS]', a 문장 뒤와 b문장 뒤에는 각각 '[SEP]'를 붙이고,<br/>
tokens_a에는 segment id 0을, tokens_b에는 segment id 1을 붙인 후 <br/>
전체를 이어 붙인다.

In [None]:
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
  tokens.append(token)
  segment_ids.append(0)

tokens.append("[SEP]")
segment_ids.append(0)

for token in tokens_b:
  tokens.append(token)
  segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)

그리고 랜덤하게 마스크를 씌운다. <br/>

In [None]:
(tokens, masked_lm_positions,
         masked_lm_labels) = create_masked_lm_predictions(
             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)

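여기서 논문의 3.1 Pre-train BERT Task #1: Masked LM의 내용과 같지만 차이가 있는데,<br/>
바로 Whole Word Masking을 선택적으로 지원한다는 것이다 (`do_whole_word_mask` 옵션, 기본값은 False). <br/>
<br/>
wordpiece 방식으로 tokenizing을 하면 단어가 쪼개지는데, 이 옵션을 켜면 쪼개진 단어 조각이 아니라 단어 전체를 한 단위로 보고 masking을 수행한다. 옵션이 꺼져 있으면 (위에서 실행한 명령처럼) 조각 단위로 masking을 수행한다.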

In [None]:
def create_masked_lm_predictions(tokens, masked_lm_prob,
                                 max_predictions_per_seq, vocab_words, rng):
  """Creates the predictions for the masked LM objective."""

  cand_indexes = []
  for (i, token) in enumerate(tokens):
    if token == "[CLS]" or token == "[SEP]":
      continue
    # Whole Word Masking means that if we mask all of the wordpieces
    # corresponding to an original word. When a word has been split into
    # WordPieces, the first token does not have any marker and any subsequence
    # tokens are prefixed with ##. So whenever we see the ## token, we
    # append it to the previous set of word indexes.
    #
    # Note that Whole Word Masking does *not* change the training code
    # at all -- we still predict each WordPiece independently, softmaxed
    # over the entire vocabulary.
    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
        token.startswith("##")):
      cand_indexes[-1].append(i)
    else:
      cand_indexes.append([i])
      
  rng.shuffle(cand_indexes)

tokens는 단어의 list 이다.
```
000 = {str} '[CLS]'
001 = {str} 'and'
002 = {str} 'stimulate'
003 = {str} 'ying'
004 = {str} 'your'
005 = {str} 'ying'
...
```

sequence 마다 몇개를 masking 할지 max_predictions_per_seq, masked_lm_prob를 보고 결정한다

In [None]:
  output_tokens = list(tokens)

  num_to_predict = min(max_predictions_per_seq,
                       max(1, int(round(len(tokens) * masked_lm_prob))))

covered_indexes는 같은 단어가 두번 masking 대상이 되지 않도록 하기 위한 cache 이다.<br/>
masking 결과를 masked_lms 리스트에 담는다.

In [None]:
  masked_lms = []
  covered_indexes = set()
  for index_set in cand_indexes:
    if len(masked_lms) >= num_to_predict:
      break
    # If adding a whole-word mask would exceed the maximum number of
    # predictions, then just skip this candidate.
    if len(masked_lms) + len(index_set) > num_to_predict:
      continue
    is_any_index_covered = False
    for index in index_set:
      if index in covered_indexes:
        is_any_index_covered = True
        break
    if is_any_index_covered:
      continue

논문 3.1 #1 Masked LM 에서 기술한대로, 80%는 '[MASK]'로, 10%는 원래 그대로, 10%는 랜덤한 단어로 masking을 수행한다.

In [None]:
    for index in index_set:
      covered_indexes.add(index)

      masked_token = None
      # 80% of the time, replace with [MASK]
      if rng.random() < 0.8:
        masked_token = "[MASK]"
      else:
        # 10% of the time, keep original
        if rng.random() < 0.5:
          masked_token = tokens[index]
        # 10% of the time, replace with random word
        else:
          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]

      output_tokens[index] = masked_token

      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))

In [None]:
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])

masking을 index로 정렬한 후, masking position과 label들을 리턴해준다.

In [None]:
  masked_lms = sorted(masked_lms, key=lambda x: x.index)

  masked_lm_positions = []
  masked_lm_labels = []
  for p in masked_lms:
    masked_lm_positions.append(p.index)
    masked_lm_labels.append(p.label)

  return (output_tokens, masked_lm_positions, masked_lm_labels)

create_instances_from_document로 다시 돌아오면,
```
(tokens, masked_lm_positions,
         masked_lm_labels) = create_masked_lm_predictions(
             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
```

위 함수의 실행 결과는 다음과 같다.

```
tokens = {list} <class 'list'>: ['[CLS]', 'understood', 'stimulate', 'ying', 'your', 'ying', 'you', 're', 'crazy', 'jallel', 'calm', 'down', 'everything', 'will', 'be', 'okay', 'fight', 'jallel', '[SEP]', 'jallel', 'can', 'you', 'hear', 'me', 'arise', 'easy', '[MASK]', 'it', 'up', 'can', 'articulate', 'stand', 'up', 'it', 's', 'okay', '[MASK]', 'take', 'a', 'deep', '[MASK]', 'jallel', 'go', 'to', 'the', 'hospital', 'but', ...
```


```
masked_lm_positions = {list} <class 'list'>: [1, 26, 30, 36, 40, 47, 51, 56, 61, 77, 87, 90, 97, 100, 101, 109, 112, 115, 120]
masked_lm_labels = {list} <class 'list'>: ['and', 'pick', 'you', 'easy', 'breath', 'no', 'it', 'going', 'put', 'not', '##s', 'that', 'that', 'come', 'mr', 'hello', 'is', 'you', 'bucks']
```




위의 정보들로 TrainingInstance 객체를 만들어준다.

In [None]:
        instance = TrainingInstance(
            tokens=tokens,
            segment_ids=segment_ids,
            is_random_next=is_random_next,
            masked_lm_positions=masked_lm_positions,
            masked_lm_labels=masked_lm_labels)
        instances.append(instance)
    ...
    return instances

In [None]:
class TrainingInstance(object):
  """Plain container for one sentence-pair training instance.

  The constructor stores each argument, as given, on the attribute of the
  same name.
  """

  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
               is_random_next):
    # Token sequence and its per-token segment ids.
    self.tokens, self.segment_ids = tokens, segment_ids
    # Positions that were masked and the original tokens at those positions.
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_labels = masked_lm_labels
    # Whether the second segment was drawn at random.
    self.is_random_next = is_random_next

create_training_instances 로 돌아가면,
```
  for _ in range(dupe_factor):
    for document_index in range(len(all_documents)):
      instances.extend(
          create_instances_from_document(
              all_documents, document_index, max_seq_length, short_seq_prob,
              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

  rng.shuffle(instances)
  return instances
```

이와 같이 training_instances를 리턴해준다.

이제 main(_)으로 돌아가면, instance를 전부 만들었다.
```
instances = create_training_instances(
      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
      FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq,
      rng)
```

option에서 받은 대로 output_files를 정의한다.

In [None]:
  output_files = FLAGS.output_file.split(",")
  tf.logging.info("*** Writing to output files ***")
  for output_file in output_files:
    tf.logging.info("  %s", output_file)

그리고 파일에 쓴다.

In [None]:
write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
                                  FLAGS.max_predictions_per_seq, output_files)

먼저 output_files 경로들에 TFRecordWriter를 정의하고

In [None]:
def write_instance_to_example_files(instances, tokenizer, max_seq_length,
                                    max_predictions_per_seq, output_files):
  """Create TF example files from `TrainingInstance`s."""
  writers = []
  for output_file in output_files:
    writers.append(tf.python_io.TFRecordWriter(output_file))

token을 id로 변환하고, id, mask, segment_id들의 리스트를 만든다.<br/>
이 때 길이가 max_seq_length를 넘지 않는지 assert로 확인하고, 그보다 짧으면 max_seq_length가 될 때까지 0으로 패딩한다.

In [None]:
  for (inst_index, instance) in enumerate(instances):
    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = list(instance.segment_ids)
    assert len(input_ids) <= max_seq_length

    while len(input_ids) < max_seq_length:
      input_ids.append(0)
      input_mask.append(0)
      segment_ids.append(0)

masked positions, ids, weights도 위와 비슷하게 변환한다. <br/>
max_predictions_per_seq 보다 짧으면 0으로 패딩한다.

In [None]:
    masked_lm_positions = list(instance.masked_lm_positions)
    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
    masked_lm_weights = [1.0] * len(masked_lm_ids)

    while len(masked_lm_positions) < max_predictions_per_seq:
      masked_lm_positions.append(0)
      masked_lm_ids.append(0)
      masked_lm_weights.append(0.0)

next sentence가 random인지 아닌지를 label로 표시한다.

In [None]:
    next_sentence_label = 1 if instance.is_random_next else 0

위에서 구한 parameter들을 dict에 담는다. <br/>
create_int_feature는 int list를 Int64List로 감싼 tf.train.Feature로 변환한다.

In [None]:
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
    features["next_sentence_labels"] = create_int_feature([next_sentence_label])

In [None]:
def create_int_feature(values):
  """Wrap `values` in a tf.train.Feature backed by an Int64List.

  `values` is copied into a list before being handed to TensorFlow.
  """
  int64_values = tf.train.Int64List(value=list(values))
  return tf.train.Feature(int64_list=int64_values)

features (dict)를 tf.train.Example로 변경한 뒤, serialize 해서 파일에 쓴다.

In [None]:
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))

    writers[writer_index].write(tf_example.SerializeToString())