In [1]:
import tensorflow as tf
import tensorflow_hub as hub

BERT_MODEL = 'uncased_L-12_H-768_A-12' #@param {type:"string"}
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_{}/1'.format(BERT_MODEL)

print('BERT_MODEL_HUB\t', BERT_MODEL_HUB)

# Load the hub module in a throwaway graph and evaluate its
# 'tokenization_info' signature to obtain the vocab file path (returned as a
# bytes value) together with the module's lowercasing flag.
with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature='tokenization_info', as_dict=True)
    fetches = [tokenization_info['vocab_file'], tokenization_info['do_lower_case']]
    with tf.Session() as sess:
        vocab_file, do_lower_case = sess.run(fetches)

BERT_MODEL_HUB	 https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [2]:
# Inspect the vocab file path fetched from the hub module (a bytes value).
vocab_file

b'C:\\Users\\jinma\\AppData\\Local\\Temp\\tfhub_modules\\5a395eafef2a37bd9fc55d7f6ae676d2a134a838\\assets\\vocab.txt'

In [3]:
import collections

# Create the OrderedDict that will hold the vocabulary (token -> index).
vocab = collections.OrderedDict()

In [4]:
def convert_to_unicode(text):
    """Return `text` as a unicode `str`.

    Args:
        text: A `str` or `bytes` value.

    Returns:
        `text` itself when it is already a `str`; otherwise the UTF-8 decoding
        of the bytes, with undecodable bytes silently dropped.

    Raises:
        ValueError: If `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, bytes):
        return text.decode('utf-8', 'ignore')
    if isinstance(text, str):
        return text
    raise ValueError('Unsupported string type: %s' % type(text))

In [5]:
# Read the vocab file one line at a time, convert each line to unicode and
# record the stripped token in `vocab` with its 0-based line number.
index = 0
with tf.gfile.GFile(vocab_file, 'r') as reader:
    token = convert_to_unicode(reader.readline())
    # readline() yields an empty value at end of file, which ends the loop.
    while token:
        vocab[token.strip()] = index
        index += 1
        token = convert_to_unicode(reader.readline())

In [6]:
# Look up the vocabulary indices of two sample tokens (None if a token is absent).
vocab.get('this'), vocab.get('token')

(2023, 19204)

In [7]:
# Number of distinct tokens loaded into the vocabulary.
len(vocab)

30522

In [8]:
# Show every 1000th vocabulary entry, in file order, as a quick sample.
list(vocab.keys())[::1000]

['[PAD]',
 '"',
 'to',
 'paris',
 'tears',
 'knight',
 'peninsula',
 'licensed',
 'mouse',
 'screenplay',
 'raven',
 'tonnes',
 'princes',
 'osaka',
 'liability',
 '##lip',
 'kappa',
 'hasan',
 'belts',
 '##leader',
 'chunk',
 'colton',
 'artworks',
 'radiated',
 'plank',
 'fielder',
 'fide',
 'selector',
 'statehood',
 'gunners',
 '##ᄌ']

In [9]:
# Build the reverse lookup (index -> token) by swapping vocab's keys and values.
inv_vocab = {
    token_index: token_text
    for token_text, token_index in vocab.items()
}

In [10]:
# NOTE: this hard-codes the lowercasing flag and overrides the `do_lower_case`
# value that the first cell fetched from the hub module's tokenization_info.
do_lower_case = True

In [11]:
# Char 단위 함수 작성
import unicodedata

def _is_whitespace(char):
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        # 공백 혹은 개행문자일 경우 True 반환
        return True
    cat = unicodedata.category(char)
    if cat == 'Zs':
        # unicode category가 "Space Separator"일 경우 True 반환
        return True
    return False
    
def _is_control(char):
    if char == "\t" or char == "\n" or char == "\r":
        # 개행문자일 경우 False 반환
        return False
    cat = unicodedata.category(char)
    if cat in ('Cc', 'Cf'):
        # unicode category가 "Control", 혹은 "Format"일 경우 True 반환
        return True
    return False

def _is_punctuation(char):
    cp = ord(char)
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
       (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    if cat.startswith("P"):
        # unicode category가 P로 시작할 경우 True 반환
        # Pc (Connector Punctuatoin)
        # Pd (Dash Punctuation)
        # Pe (Close Punctuation)
        # Pf (Final Punctuatoin)
        # Pi (Initial Punctuation)
        # Po (Other Punctuation)
        # Ps (Open Punctuation)
        return True
    return False

In [12]:
# Display the two characters (code points 0 and 0xFFFD) that the text-cleaning
# loop further down drops from the input.
chr(0), chr(0xfffd)

('\x00', '�')

In [26]:
# Assign the example text.
text = "\n This \t here's \t an example of using the BERT tokenizer"
text

"\n This \t here's \t an example of using the BERT tokenizer"

In [32]:
print('1. BasicTokenizer로 tokenize\n')
# Step-by-step walk-through of basic tokenization: clean the text, split on
# whitespace, optionally lowercase and strip accents, then split on punctuation.
#
# The example `text` is left untouched: every intermediate result gets its own
# name, so re-running this cell always starts from the same input and the
# 'Origin Text' line really shows the original string.
origin_text = convert_to_unicode(text)

## _clean_text(self, text):
# Drop NUL, U+FFFD and control characters, and replace every whitespace
# character (including tabs and newlines) with a plain space.
cleaned_chars = []
for char in origin_text:
    cp = ord(char)
    if cp == 0 or cp == 0xfffd or _is_control(char):
        continue
    cleaned_chars.append(" " if _is_whitespace(char) else char)
print('Origin Text  :', origin_text)
cleaned_text = "".join(cleaned_chars)
print('Cleaned Text :', cleaned_text)

## whitespace_tokenize(text)
orig_tokens = cleaned_text.strip().split()
split_tokens = []
for token in orig_tokens:
    if do_lower_case:
        print('  Do Lower Case... run strip accents.')
        token = token.lower()
        ## _run_strip_accents(self, text)
        print('\t   origin Token :', token)
        # NFD normalization decomposes accented characters into a base
        # character followed by combining marks.
        token = unicodedata.normalize("NFD", token)
        print('\tnormalize Token :', token)
        # Skip characters whose Unicode category is "Mn" (Nonspacing Mark).
        token = "".join(
            char for char in token if unicodedata.category(char) != 'Mn')
        print('\t   output Token :', token)
    ## _run_split_on_punc(self, text)
    print('  \t\trun split on Punctuation.')
    chars = list(token)
    print('\tStart on list(token) :', chars)
    # Each punctuation character becomes its own piece; runs of other
    # characters between punctuation are grouped into one piece.
    pieces = []
    start_new_word = True
    for char in chars:
        if _is_punctuation(char):
            pieces.append([char])
            start_new_word = True
        else:
            if start_new_word:
                pieces.append([])
            start_new_word = False
            pieces[-1].append(char)
    print('\tEnd output :', pieces)
    split_tokens.extend("".join(piece) for piece in pieces)
print('split_tokens :', split_tokens)
# Final whitespace tokenization over the joined pieces.
output_tokens = " ".join(split_tokens).split()
print('-' * 100 + '\n' + 'Final Result :', output_tokens)

1. BasicTokenizer로 tokenize

Origin Text  : This   here's   an example of using the BERT tokenizer
Cleaned Text : This   here's   an example of using the BERT tokenizer
  Do Lower Case... run strip accents.
	   origin Token : this
	normalize Token : this
	   output Token : this
  		run split on Punctuation.
	Start on list(token) : ['t', 'h', 'i', 's']
	End output : [['t', 'h', 'i', 's']]
  Do Lower Case... run strip accents.
	   origin Token : here's
	normalize Token : here's
	   output Token : here's
  		run split on Punctuation.
	Start on list(token) : ['h', 'e', 'r', 'e', "'", 's']
	End output : [['h', 'e', 'r', 'e'], ["'"], ['s']]
  Do Lower Case... run strip accents.
	   origin Token : an
	normalize Token : an
	   output Token : an
  		run split on Punctuation.
	Start on list(token) : ['a', 'n']
	End output : [['a', 'n']]
  Do Lower Case... run strip accents.
	   origin Token : example
	normalize Token : example
	   output Token : example
  		run split on Punctuation.
	Start on li

In [51]:
print('2. WordpieceTokenizer로 tokenize\n')

# Step-by-step walk-through of WordPiece tokenization using a greedy
# longest-match-first search against `vocab`.
# Settings that do not change per token are defined once, outside the loop.
unk_token = "[UNK]"
max_input_chars_per_word = 200

split_tokens = []
for token in output_tokens:
    print('token :', token)
    ## wordpiece tokenizing (greedy longest-match-first algorithm)
    # Start
    token = convert_to_unicode(token)
    # Word pieces produced for this token (or unk_token for unmatched words).
    output_tokens_ = []
    ## whitespace tokenizing (an empty or blank token yields no words)
    for word in token.strip().split():
        chars = list(word)
        if len(chars) > max_input_chars_per_word:
            # Words longer than the limit are mapped to the unknown token.
            output_tokens_.append(unk_token)
            continue

        is_bad = False
        start = 0
        sub_tokens = []
        while start < len(chars):
            end = len(chars)
            print(start, end, end='')
            cur_substr = None
            # Try the longest remaining substring first and shrink it from the
            # right until a vocabulary entry is found.
            while start < end:
                substr = "".join(chars[start:end])
                print('\t', substr)
                if start > 0:
                    # Not at the start of the word: this is a continuation
                    # piece, so look it up with the "##" prefix.
                    substr = "##" + substr
                if substr in vocab:
                    cur_substr = substr
                    break
                end -= 1
            # No vocabulary entry matched at this position: the whole word
            # becomes the unknown token.
            if cur_substr is None:
                is_bad = True
                break
            sub_tokens.append(cur_substr)
            # Continue the search right after the piece that was just matched.
            start = end
        if is_bad:
            output_tokens_.append(unk_token)
        else:
            output_tokens_.extend(sub_tokens)
    # Collect the per-token result, not `sub_tokens`: the latter holds partial
    # matches for unknown words and stale pieces when a word was skipped.
    split_tokens.extend(output_tokens_)

print('-' * 100 + '\n' + 'Final Result :', split_tokens)

2. WordpieceTokenizer로 tokenize

token : this
0 4	 this
token : here
0 4	 here
token : '
0 1	 '
token : s
0 1	 s
token : an
0 2	 an
token : example
0 7	 example
token : of
0 2	 of
token : using
0 5	 using
token : the
0 3	 the
token : bert
0 4	 bert
token : tokenizer
0 9	 tokenizer
	 tokenize
	 tokeniz
	 tokeni
	 token
5 9	 izer
----------------------------------------------------------------------------------------------------
Final Result : ['this', 'here', "'", 's', 'an', 'example', 'of', 'using', 'the', 'bert', 'token', '##izer']
