In [1]:
#===================================================================================================
# 단어 embedding 벡터들을 3D 로 보여주는 예제임
# => 단어들은 meta.tsv 파일로 저장, 임베딩값들은 vecs.tsv 파일로 저장(*이때 임베딩 각 값들은 탭으로 띄어야 함)
# => 이후 https://projector.tensorflow.org 접속하여, [load] 버튼 클릭->[Choose file] 버튼 클릭하여, 
#   vecs.tsv, meta.tsv 파일 선택 하면 완료
#===================================================================================================

import torch
import pandas as pd
import numpy as np

from os import sys
sys.path.append('..')
from myutils import seed_everything, GPU_info

from transformers import DistilBertTokenizer, DistilBertModel

# Seed shared by every RNG that seed_everything() touches.
SEED = 111

# GPU_info() and seed_everything() come from the local `myutils` module.
# NOTE(review): GPU_info() presumably returns the torch device to run on
# (its result is later passed to `.to(...)`) -- confirm against myutils.
device = GPU_info()
seed_everything(SEED)

logfilepath:bwdataset_2022-04-11.log
logfilepath:qnadataset_2022-04-11.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [2]:
# Tokenizer and model setup.
model_path = '../../model/distilbert/distilbert-0331-TS-nli-0.1-10'

# Keep the original casing of the input text. Note the spelling of the option:
# it is `do_lower_case`; a misspelled keyword is not recognised as this setting.
tokenizer = DistilBertTokenizer.from_pretrained(model_path, do_lower_case=False)
model = DistilBertModel.from_pretrained(model_path)
# Last expression of the cell: moves the model to `device` and displays its layout.
model.to(device)

Some weights of the model checkpoint at ../../model/distilbert/distilbert-0331-TS-nli-0.1-10 were not used when initializing DistilBertModel: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(167550, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(

In [3]:
# Pull 10,000 consecutive vocabulary entries out of the tokenizer.
# start_word_len is the first token id; end_word_len is the exclusive end id
# (first id + 10000).
start_word_len = 119547
end_word_len = start_word_len + 10000

token_id_list = []  # stays empty; nothing in this cell fills it
# One id -> token lookup per id, in ascending id order.
word_list = [
    tokenizer.convert_ids_to_tokens(token_id)
    for token_id in range(start_word_len, end_word_len)
]
    

In [4]:
# Sanity check: show the first five extracted tokens.
print(word_list[:5])

['사용', '때문', '시작', '사람', '기록']


In [5]:
# Encode every extracted token string as its own (padded) sequence of PyTorch tensors.
# NOTE(review): the strings are tokenized again here, so an entry may come back as
# several pieces instead of its own id (e.g. entries starting with '##') -- confirm
# that this is the intended input for the embedding export.
tokenizer_input = tokenizer(
    word_list,
    padding=True,
    truncation=True,
    max_length=16,
    return_tensors='pt',
)
# Last expression of the cell: moves the encoded batch to `device` and displays it.
tokenizer_input.to(device)

{'input_ids': tensor([[   101, 119547,    102,  ...,      0,      0,      0],
        [   101, 119548,    102,  ...,      0,      0,      0],
        [   101, 119549,    102,  ...,      0,      0,      0],
        ...,
        [   101, 129544,    102,  ...,      0,      0,      0],
        [   101,    108,    108,  ...,    102,      0,      0],
        [   101,    108,    108,  ...,    102,      0,      0]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0]], device='cuda:0')}

In [6]:
# Inference only: eval() switches the dropout layers off, and no_grad() stops
# autograd from recording the forward pass. Without no_grad() every layer's
# activations are kept alive for a backward pass that never happens, which is a
# lot of device memory for a batch of this size.
model.eval()
with torch.no_grad():
    outputs = model(**tokenizer_input)
print(type(outputs))
# Per-token hidden states of the last transformer layer; the shape is printed below.
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

<class 'transformers.modeling_outputs.BaseModelOutput'>
torch.Size([10000, 7, 768])


In [7]:
# Mean-pool every sequence into a single vector, counting real tokens only.
# The batch was tokenized with padding=True, so a plain mean over the sequence
# axis would also average in the hidden states of the padded positions.
# Positions whose attention mask is 1 (including the special tokens) still count.
pooling_mask = tokenizer_input['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
token_sum = (last_hidden_state * pooling_mask).sum(dim=1)
token_count = pooling_mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for an all-padding row
pooled_embeddings = token_sum / token_count

# One 1-D tensor per input row, in the same order as the tokenized batch.
embedding_list = list(pooled_embeddings.unbind(dim=0))

In [8]:
# Write word_list to meta.tsv and embedding_list to vecs.tsv (one row per word).
# io / os stay imported in case cells outside this view rely on them.
import io
import os

vecs_file = 'vecs.tsv'
meta_file = 'meta.tsv'

# `with` guarantees both files are flushed and closed even if a row fails to write.
with open(vecs_file, 'w', encoding='utf-8') as out_v, \
        open(meta_file, 'w', encoding='utf-8') as out_m:
    for word, embeddings in zip(word_list, embedding_list):
        out_m.write(word + "\n")
        # The embedding may live on the GPU and carry autograd state, so detach it
        # and copy it to the CPU before converting to NumPy. Values are tab-separated.
        values = embeddings.detach().cpu().numpy()
        out_v.write('\t'.join([str(x) for x in values]) + "\n")
    