In [1]:
import torch
import gluonnlp as nlp     # GluonNLP는 버트를 간단하게 로딩하는 인터페이스를 제공하는 API 임
import numpy as np
from transformers import BertTokenizer, BertTokenizerFast, BertModel

import utils
from utils import seed_everything, GPU_info, pytorch_cos_sim

In [8]:
#config.json, pytorch_model.bin 같이 있는 폴더 지정
model_path = 'model/mymodel_0207' 
# True로 해야, hidden_states 가 출력됨
output_hidden_states = True
return_dict = False

# voab.txt, special_tokens_map.json, tokenizer_config.json,added_tokens.json 폴더 경로 지정
vocab_path = "Tokenizer/kowikitext_20200920_speical"

seed = 111

In [3]:
cuda = GPU_info()
print(cuda)

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0


In [4]:
#seed 설정
seed_everything(seed)

In [9]:
# model 불러옴
model = BertModel.from_pretrained(model_path, 
                                  output_hidden_states=output_hidden_states,
                                  return_dict=return_dict)

Some weights of the model checkpoint at model/mymodel_0207 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
device = torch.device(cuda)
#model.to(device)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32022, 768, padding_idx=0)
    (position_embeddings): Embedding(256, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [11]:
# tokenize 설정
tokenizer = BertTokenizerFast.from_pretrained(vocab_path)

In [12]:
len(tokenizer)

32022

In [9]:
#list(tokenizer.vocab.keys())[8000:8010]

In [13]:
tokenized_input = tokenizer("식당에 가서 밥을 배 부르게 먹고 낙시배를 타고 고기 잡고 요트배를 타고 관광을 해야 겠다", return_tensors="pt")

In [14]:

token_str = [[tokenizer.convert_ids_to_tokens(s) for s in tokenized_input['input_ids'].tolist()[0]]]

# token indexs ( 토큰을 index로 변한한 값)
token_ids = [tokenized_input['input_ids'].tolist()[0]]
# attention_mask (중요토큰 : 1)
token_attention_mask = [tokenized_input['attention_mask'].tolist()[0]]
# segment_id (첫번째문자:0, 다음 문장:1)
token_type_ids = [tokenized_input['token_type_ids'].tolist()[0]]

print(token_str)
print(token_ids)
print(token_attention_mask)
print(token_type_ids)


[['[CLS]', '식당', '##에', '가', '##서', '밥', '##을', '배', '부르', '##게', '먹', '##고', '낙', '##시', '##배', '##를', '타고', '고기', '잡', '##고', '요트', '##배', '##를', '타고', '관광', '##을', '해야', '겠', '##다', '[SEP]']]
[[2, 7894, 1307, 185, 1129, 502, 1207, 504, 2837, 1503, 459, 1072, 266, 1080, 1290, 1251, 22087, 7221, 733, 1072, 17258, 1290, 1251, 22087, 3448, 1207, 2362, 207, 1151, 3]]
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [12]:
print(tokenized_input)

{'input_ids': tensor([[    2,  8650,  1467,   117,  1023,     1,   481,  2603,  1478,   431,
          1056,   213,  1123,  1189,  1091,   860,  1056,  6331,   721,  1056,
         11013,  1189,  1091,   860,  1056,  3369,  1218,  2321,   140,  1084,
             3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}


In [15]:
outputs = model(**tokenized_input)

In [16]:
print(len(outputs))

3


In [17]:
sequence_output = outputs[0]
print('sequence 길이: {}'.format(sequence_output.size()))

pooled_output = outputs[1]
print('pooled 길이:{}'.format(pooled_output.size()))

hidden_states = outputs[2]
layer_idx = 0
batch_idx = 0
token_idx = 0
print('hidden_states')
print("-레이어 수:{}".format(len(hidden_states)))
print("-배치 수: {}".format(len(hidden_states[layer_idx])))
print("-토큰 수 : {}".format(len(hidden_states[layer_idx][batch_idx])))
print("-hidden 유닛 수 : {}".format(len(hidden_states[layer_idx][batch_idx][token_idx])))

sequence 길이: torch.Size([1, 30, 768])
pooled 길이:torch.Size([1, 768])
hidden_states
-레이어 수:13
-배치 수: 1
-토큰 수 : 30
-hidden 유닛 수 : 768


In [18]:
# 단어별 유사도 측정

# premute를 사용하여 레이어 와 tokens 차원을 바꾼다.
sequence_output_embedding = sequence_output.permute(1,0,2)
print('sequence_output size: {}'.format(sequence_output_embedding.size()))

# tensor -> list 로 변환
output_list = sequence_output_embedding.tolist()

simul_score1 = pytorch_cos_sim(output_list[5][0], output_list[14][0])
simul_score2 = pytorch_cos_sim(output_list[5][0], output_list[21][0])
simul_score3 = pytorch_cos_sim(output_list[14][0], output_list[21][0])

print("몸에 있는배 vs 낙시배 유사도:{}".format(simul_score1))
print("몸에 있는배 vs 요트배 유사도:{}".format(simul_score2))
print("낙시배 vs 요트배 유사도:{}".format(simul_score3))


sequence_output size: torch.Size([30, 1, 768])
몸에 있는배 vs 낙시배 유사도:tensor([[0.3481]])
몸에 있는배 vs 요트배 유사도:tensor([[0.2896]])
낙시배 vs 요트배 유사도:tensor([[0.8939]])
