In [1]:
import torch
import gluonnlp as nlp     # GluonNLP는 버트를 간단하게 로딩하는 인터페이스를 제공하는 API 임
import numpy as np
from transformers import BertTokenizer, BertTokenizerFast, BertModel

import sys
sys.path.append("..")
from myutils import seed_everything, GPU_info, pytorch_cos_sim

In [2]:
# Disabled alternative configuration, kept as a bare triple-quoted string so
# none of it is executed (presumably a further-pretrained checkpoint — confirm).
'''
#config.json, pytorch_model.bin 같이 있는 폴더 지정
model_path = 'model/bert-multilingual-cased_furter_pt_model_0216' 
# voab.txt, special_tokens_map.json, tokenizer_config.json,added_tokens.json 폴더 경로 지정
vocab_path = "model/bert-multilingual-cased_furter_pt_model_0216/vocab"
'''

# Active configuration: directory handed to BertModel.from_pretrained ...
model_path = 'model/bert-multilingual-cased' 
# ... and directory handed to BertTokenizerFast.from_pretrained.
vocab_path = "model/bert-multilingual-cased/vocab"

# Must be True for the model to return hidden_states
output_hidden_states = True
# False: the model output is indexed positionally (outputs[0], outputs[1], outputs[2]) later on.
return_dict = False

# Seed value passed to seed_everything().
seed = 111

In [3]:
# GPU_info() is a helper imported from the local myutils module; its return
# value is printed here and later passed to torch.device().
cuda = GPU_info()
print(cuda)

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0


In [4]:
# Set the seed (seed_everything is a myutils helper; presumably it seeds every RNG in use — confirm).
seed_everything(seed)

In [5]:
# Load the model from the configured checkpoint directory; both keyword
# arguments take their values from the variables set in the configuration cell.
model = BertModel.from_pretrained(model_path, 
                                  output_hidden_states=output_hidden_states,
                                  return_dict=return_dict)

Some weights of the model checkpoint at model/bert-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Build a torch.device from the value returned by GPU_info().
device = torch.device(cuda)
# NOTE(review): the move below is commented out, so `device` is not applied to
# the model in this cell; the inputs are not moved anywhere in the visible cells either.
#model.to(device)
# Switch the model to evaluation mode; the value returned by eval() is the
# cell's last expression and is what gets displayed as the cell output.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [7]:
# Display the parameter count reported by the loaded model.
model.num_parameters()

177853440

In [8]:
# Set up the tokenizer from the configured vocab directory.
tokenizer = BertTokenizerFast.from_pretrained(vocab_path)

In [9]:
# Display the tokenizer's vocabulary size (compare with the model's word_embeddings row count).
len(tokenizer)

119547

In [10]:
#list(tokenizer.vocab.keys())[8000:8010]

In [11]:
# Encode one Korean sentence as PyTorch tensors (return_tensors="pt"). The
# sentence contains "배" three times; the similarity cell later compares the
# embeddings of those three occurrences.
tokenized_input = tokenizer("식당에 가서 밥을 배 부르게 먹고 낙시배를 타고 고기 잡고 요트배를 타고 관광을 해야 겠다", return_tensors="pt")

In [12]:

# Turn every field of the encoding into plain nested lists, one inner list per
# sequence in the batch. Calling .tolist() on the whole tensor keeps all batch
# rows instead of hard-coding row 0 and re-wrapping it in a list; for the
# single-sentence batch used here the printed result is unchanged.

# Token indices (each token mapped to its vocabulary index).
token_ids = tokenized_input['input_ids'].tolist()
# attention_mask (important token: 1)
token_attention_mask = tokenized_input['attention_mask'].tolist()
# segment ids (first sentence: 0, next sentence: 1)
token_type_ids = tokenized_input['token_type_ids'].tolist()

# Token strings: convert_ids_to_tokens accepts a whole list of ids, so each
# sequence is converted with a single call rather than one call per id.
token_str = [tokenizer.convert_ids_to_tokens(ids) for ids in token_ids]

print(token_str)
print(token_ids)
print(token_attention_mask)
print(token_type_ids)


[['[CLS]', '식', '##당', '##에', '가', '##서', '밥', '##을', '배', '부', '##르게', '먹', '##고', '낙', '##시', '##배', '##를', '타', '##고', '고', '##기', '잡', '##고', '요', '##트', '##배', '##를', '타', '##고', '관', '##광', '##을', '해', '##야', '겠', '##다', '[SEP]']]
[[101, 9486, 21928, 10530, 8843, 12424, 9327, 10622, 9330, 9365, 78131, 9266, 11664, 8983, 14040, 76036, 11513, 9845, 11664, 8888, 12310, 9656, 11664, 9599, 15184, 76036, 11513, 9845, 11664, 8900, 118649, 10622, 9960, 21711, 8876, 11903, 102]]
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [13]:
# Spot-check: look up the token string stored under a single vocabulary id.
tokenizer.convert_ids_to_tokens(6313)

'纯'

In [14]:
# Print the raw encoding returned by the tokenizer (input_ids, token_type_ids and attention_mask tensors).
print(tokenized_input)

{'input_ids': tensor([[   101,   9486,  21928,  10530,   8843,  12424,   9327,  10622,   9330,
           9365,  78131,   9266,  11664,   8983,  14040,  76036,  11513,   9845,
          11664,   8888,  12310,   9656,  11664,   9599,  15184,  76036,  11513,
           9845,  11664,   8900, 118649,  10622,   9960,  21711,   8876,  11903,
            102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [15]:
# Inference only: run the forward pass under torch.no_grad() so autograd does
# not record a graph or keep intermediate activations alive. The returned
# values are numerically the same; they simply carry no gradient history.
with torch.no_grad():
    outputs = model(**tokenized_input)

In [16]:
# Number of elements in the model output (indexed positionally in the next cell).
print(len(outputs))

3


In [17]:
# Pick the three entries out of the positional model output.
sequence_output = outputs[0]
pooled_output = outputs[1]
hidden_states = outputs[2]

print('sequence 길이: {}'.format(sequence_output.shape))
print('pooled 길이:{}'.format(pooled_output.shape))

# Indices used to drill down into hidden_states: layer -> batch -> token.
layer_idx, batch_idx, token_idx = 0, 0, 0

# Each row pairs a report line with the container whose length it reports,
# going one nesting level deeper per row.
size_report = (
    ("-레이어 수:{}", hidden_states),
    ("-배치 수: {}", hidden_states[layer_idx]),
    ("-토큰 수 : {}", hidden_states[layer_idx][batch_idx]),
    ("-hidden 유닛 수 : {}", hidden_states[layer_idx][batch_idx][token_idx]),
)

print('hidden_states')
for line_template, container in size_report:
    print(line_template.format(len(container)))

sequence 길이: torch.Size([1, 37, 768])
pooled 길이:torch.Size([1, 768])
hidden_states
-레이어 수:13
-배치 수: 1
-토큰 수 : 37
-hidden 유닛 수 : 768


In [20]:
##### Per-token similarity measurement

# permute(1, 0, 2) swaps the first two dimensions of sequence_output, so the
# token position becomes the leading index (the printed size confirms this).
sequence_output_embedding = sequence_output.permute(1, 0, 2)
print('sequence_output size: {}'.format(sequence_output_embedding.size()))

# tensor -> nested Python list, indexed as output_list[token][batch]
output_list = sequence_output_embedding.tolist()

# Token positions of the three "배" occurrences being compared (per the
# printed labels: belly, fishing boat, yacht).
word_1, word_2, word_3 = 8, 15, 25

# Embedding vector of each chosen token, taken from batch element 0.
vec_1, vec_2, vec_3 = (output_list[pos][0] for pos in (word_1, word_2, word_3))

simul_score1 = pytorch_cos_sim(vec_1, vec_2)
simul_score2 = pytorch_cos_sim(vec_1, vec_3)
simul_score3 = pytorch_cos_sim(vec_2, vec_3)

for line_template, score in (
    ("몸에 있는배 vs 낙시배 유사도:{}", simul_score1),
    ("몸에 있는배 vs 요트배 유사도:{}", simul_score2),
    ("낙시배 vs 요트배 유사도:{}", simul_score3),
):
    print(line_template.format(score))

sequence_output size: torch.Size([37, 1, 768])
몸에 있는배 vs 낙시배 유사도:tensor([[0.6440]])
몸에 있는배 vs 요트배 유사도:tensor([[0.6280]])
낙시배 vs 요트배 유사도:tensor([[0.9716]])


In [22]:
# Print the full embedding vector stored at token position 6, batch element 0.
print(output_list[6][0])

[-1.0133668184280396, -0.43167948722839355, 0.18542343378067017, 0.12088800966739655, 0.41148102283477783, 0.1533239185810089, -0.43989312648773193, 0.3770343065261841, 0.46202901005744934, 0.21814458072185516, -0.14310239255428314, -0.08492572605609894, 0.5519950985908508, 0.22635769844055176, -0.47427546977996826, 0.37364429235458374, 1.0226417779922485, -0.9009159207344055, 0.002453470602631569, 0.28234314918518066, 0.39326807856559753, -0.16260898113250732, 0.6133151054382324, 0.20685473084449768, -0.019095974043011665, 0.739651620388031, -1.3276050090789795, 0.4493487775325775, 0.6708970069885254, 0.11150505393743515, 0.2853521704673767, 0.3729919195175171, -0.06044900417327881, -0.4527873694896698, -0.4696868062019348, 0.7036134600639343, 0.3248489797115326, 0.41218745708465576, 0.0751100406050682, 0.3187400698661804, -0.4837758541107178, 0.6226616501808167, 0.31029072403907776, -0.058546096086502075, 0.2470712512731552, -0.43169334530830383, 0.11027448624372482, -0.5035675764083