In [31]:
#====================================================================================================
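# How do we handle input longer than 512 tokens with BERT?
# => BERT accepts at most 512 input tokens.
# => For longer inputs, this notebook splits the token ids into input_id_chunks
#    of 512 tokens each and feeds the chunks to the model as one batch.
#
# Steps
# 1) Tokenize the input text (with add_special_tokens=False, so no special tokens are inserted yet).
# 2) Split the token ids into pieces of 510 tokens (using the tensor split method).
# 3) Wrap every piece as [CLS] tokens [SEP].
# 4) Pad the last piece, which is shorter than 512 tokens, with [PAD].
# 5) Put the resulting input_id_chunks into a dict and feed it to the model.
# 6) Take the mean of the model outputs over the chunks.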

# 참고 : https://towardsdatascience.com/how-to-apply-transformers-to-any-length-of-text-a5601410af7f
# 소스 참고 : https://github.com/jamescalam/transformers/blob/main/course/language_classification/04_window_method_in_pytorch.ipynb
#===============================================================================================

import torch
import numpy as np
from transformers import BertTokenizer, BertTokenizerFast, BertModel

import sys
sys.path.append("..")
from myutils import seed_everything, GPU_info, pytorch_cos_sim

In [2]:
# Seed handed to seed_everything() so repeated runs use the same random state.
seed = 111

# Directory the tokenizer vocabulary is loaded from.
vocab_path = "../model/classification/bmc-ft-nsmc-cfmodel/vocab"

In [3]:
# Query the GPU helper from myutils. Judging by the cell output it prints the
# device details itself and returns the device string (here "cuda:0").
cuda = GPU_info()
print(cuda)

# Set the random seed via the myutils helper.
# NOTE(review): presumably seeds python / numpy / torch — confirm in myutils.
seed_everything(seed)

True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30
cuda:0


In [4]:
# Load the fast BERT tokenizer from the vocabulary directory.
tokenizer = BertTokenizerFast.from_pretrained(vocab_path)

In [32]:

txt = """
I would like to get your all  thoughts on the bond yield increase this week.  I am not worried about the market downturn but the sudden increase in yields. On 2/16 the 10 year bonds yields increased by almost  9 percent and on 2/19 the yield increased by almost 5 percent.

Key Points from the CNBC Article:

* **The “taper tantrum” in 2013 was a sudden spike in Treasury yields due to market panic after the Federal Reserve announced that it would begin tapering its quantitative easing program.**
* **Major central banks around the world have cut interest rates to historic lows and launched unprecedented quantities of asset purchases in a bid to shore up the economy throughout the pandemic.**
* **However, the recent rise in yields suggests that some investors are starting to anticipate a tightening of policy sooner than anticipated to accommodate a potential rise in inflation.**

The recent rise in bond yields and U.S. inflation expectations has some investors wary that a repeat of the 2013 “taper tantrum” could be on the horizon.

The benchmark U.S. 10-year Treasury note climbed above 1.3% for the first time since February 2020 earlier this week, while the 30-year bond also hit its highest level for a year. Yields move inversely to bond prices.

Yields tend to rise in lockstep with inflation expectations, which have reached their highest levels in a decade in the U.S., powered by increased prospects of a large fiscal stimulus package, progress on vaccine rollouts and pent-up consumer demand.

The “taper tantrum” in 2013 was a sudden spike in Treasury yields due to market panic after the Federal Reserve announced that it would begin tapering its quantitative easing program.

Major central banks around the world have cut interest rates to historic lows and launched unprecedented quantities of asset purchases in a bid to shore up the economy throughout the pandemic. The Fed and others have maintained supportive tones in recent policy meetings, vowing to keep financial conditions loose as the global economy looks to emerge from the Covid-19 pandemic.

However, the recent rise in yields suggests that some investors are starting to anticipate a tightening of policy sooner than anticipated to accommodate a potential rise in inflation.

With central bank support removed, bonds usually fall in price which sends yields higher. This can also spill over into stock markets as higher interest rates means more debt servicing for firms, causing traders to reassess the investing environment.

“The supportive stance from policymakers will likely remain in place until the vaccines have paved a way to some return to normality,” said Shane Balkham, chief investment officer at Beaufort Investment, in a research note this week.

“However, there will be a risk of another ‘taper tantrum’ similar to the one we witnessed in 2013, and this is our main focus for 2021,” Balkham projected, should policymakers begin to unwind this stimulus.

Long-term bond yields in Japan and Europe followed U.S. Treasurys higher toward the end of the week as bondholders shifted their portfolios.
"""

# The sample text tokenizes to 708 ids. BERT supports at most 512 positions,
# so feeding it as-is would fail. It is therefore tokenized WITHOUT special
# tokens here and later cut into windows of 510 + 198 ids, each of which gets
# its own [CLS] / [SEP] pair.
tokenized_input = tokenizer(txt, return_tensors="pt", add_special_tokens=False)

# Alternatives kept for reference (truncating / padding to 512 instead of windowing):
#   tokenizer(txt, max_length=512, add_special_tokens=False, truncation=True, padding="max_length", return_tensors="pt")
#   tokenizer.encode_plus(txt, add_special_tokens=False, return_tensors='pt')
#   tokenizer.encode_plus(txt, add_special_tokens=True, max_length=512, truncation=True, padding="max_length", return_tensors='pt')

# Number of token ids in the (single) encoded sequence, then the full encoding.
print(tokenized_input["input_ids"].shape[-1])
print(tokenized_input)

708
{'input_ids': tensor([[   146,  10894,  11850,  10114,  15329,  20442,  10435,  18957,  10107,
          10135,  10105,  55185,  95757,  20299,  10531,  16118,    119,    146,
          10392,  10472,  12796,  24874,  10336,  10978,  10105,  17313,  12935,
          15698,  10115,  10473,  10105,  94994,  20299,  10106,  95757,  10107,
            119,  10576,    123,    120,  10250,  10105,  10150,  10924,  93163,
          95757,  10107,  19299,  10155,  17122,    130,  22362,  10111,  10135,
            123,    120,  10270,  10105,  95757,  19299,  10155,  17122,    126,
          22362,    119,  21663,  49544,  10188,  10105,  73067,  38964,  26295,
            131,    115,    115,    115,  10117,    100,  54260,  10129,  14222,
          12659,    100,  10106,  10207,  10134,    169,  94994,  32650,  21353,
          10106,  77201,  95757,  10107,  10850,  10114,  17313,  97586,  10350,
          10662,  10105,  14492,  23120,  13854,  10189,  10271,  10894,  16135,
          

In [33]:
# Reference: https://github.com/jamescalam/transformers/blob/main/course/language_classification/04_window_method_in_pytorch.ipynb

# Target length of every chunk fed to the model, special tokens included.
chunksize = 512

# Cut the ids and the attention mask into windows of chunksize - 2 entries,
# which leaves room for the [CLS] and [SEP] tokens added to every window.
raw_id_chunks = tokenized_input['input_ids'][0].split(chunksize - 2)
raw_mask_chunks = tokenized_input['attention_mask'][0].split(chunksize - 2)

# Read the special-token ids from the tokenizer instead of hard-coding the
# 101 / 102 / 0 values of the stock BERT vocabulary.
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id

input_id_chunks = []
mask_chunks = []
for ids, mask in zip(raw_id_chunks, raw_mask_chunks):
    # Surround the window with [CLS] ... [SEP]; both get attention-mask value 1.
    # new_tensor / new_ones keep the dtype and device of the source tensor.
    ids = torch.cat([
        ids.new_tensor([cls_token_id]), ids, ids.new_tensor([sep_token_id])
    ])
    mask = torch.cat([mask.new_ones(1), mask, mask.new_ones(1)])

    # Right-pad short windows (normally only the last one) up to chunksize.
    # The padding is created with the SAME integer dtype as the ids / mask:
    # a float padding tensor would make torch.cat promote the whole chunk
    # to float32.
    pad_len = chunksize - ids.shape[0]
    if pad_len > 0:
        ids = torch.cat([ids, ids.new_full((pad_len,), pad_token_id)])
        mask = torch.cat([mask, mask.new_zeros(pad_len)])

    input_id_chunks.append(ids)
    mask_chunks.append(mask)

# Show the length and content of every chunk so the placement of the
# [CLS], [SEP] and [PAD] ids can be checked by eye.
for chunk in input_id_chunks:
    print(len(chunk))
    print(chunk)

512
tensor([   101,    146,  10894,  11850,  10114,  15329,  20442,  10435,  18957,
         10107,  10135,  10105,  55185,  95757,  20299,  10531,  16118,    119,
           146,  10392,  10472,  12796,  24874,  10336,  10978,  10105,  17313,
         12935,  15698,  10115,  10473,  10105,  94994,  20299,  10106,  95757,
         10107,    119,  10576,    123,    120,  10250,  10105,  10150,  10924,
         93163,  95757,  10107,  19299,  10155,  17122,    130,  22362,  10111,
         10135,    123,    120,  10270,  10105,  95757,  19299,  10155,  17122,
           126,  22362,    119,  21663,  49544,  10188,  10105,  73067,  38964,
         26295,    131,    115,    115,    115,  10117,    100,  54260,  10129,
         14222,  12659,    100,  10106,  10207,  10134,    169,  94994,  32650,
         21353,  10106,  77201,  95757,  10107,  10850,  10114,  17313,  97586,
         10350,  10662,  10105,  14492,  23120,  13854,  10189,  10271,  10894,
         16135,  54260,  13135,  104

In [34]:
# Location of the pretrained multilingual BERT checkpoint.
model_path = '../model/bert-multilingual-cased'

# Must be True for the per-layer hidden states to be part of the output.
output_hidden_states = True
# False -> the model returns a plain tuple;
# True  -> it returns a transformers.file_utils.ModelOutput.
return_dict = False

# Load the model with the output options chosen above.
model = BertModel.from_pretrained(
    model_path,
    return_dict=return_dict,
    output_hidden_states=output_hidden_states,
)
# Switch to evaluation mode; as the last expression of the cell this also
# echoes the model structure in the notebook.
model.eval()

Some weights of the model checkpoint at ../model/bert-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [35]:
# Stack the per-chunk tensors into batch tensors (one row per chunk) and
# collect them as the keyword arguments for the model call.
input_ids = torch.stack(input_id_chunks)
attention_mask = torch.stack(mask_chunks)

input_dict = dict(
    input_ids=input_ids.to(torch.long),
    attention_mask=attention_mask.to(torch.int32),
)

print(input_dict)

{'input_ids': tensor([[  101,   146, 10894,  ..., 33687, 60287,   102],
        [  101, 11912, 10114,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], dtype=torch.int32)}


In [36]:
# Feed input_dict to the model; all chunks go through as a single batch.
# This is inference only, so autograd is disabled: no graph is built and the
# intermediate activations are not retained for a backward pass.
with torch.no_grad():
    outputs = model(**input_dict)
print(len(outputs))

3


In [37]:
# The model output is indexed positionally below:
# [0] sequence output, [1] pooled output, [2] hidden states.

# --- sequence output ---
sequence_output = outputs[0]
sequence_type = type(sequence_output)
print('sequence type: {}'.format(sequence_type))
sequence_size = sequence_output.size()
print('sequence 길이: {}'.format(sequence_size))
print('\n')

# --- pooled output ---
pooled_output = outputs[1]
pooled_type = type(pooled_output)
print('pooled type: {}'.format(pooled_type))
pooled_size = pooled_output.size()
print('pooled 길이:{}'.format(pooled_size))
print('\n')

# --- hidden states: walk layer -> batch entry -> token and report each size ---
hidden_states = outputs[2]
hidden_states_type = type(hidden_states)
print('hidden_states type: {}'.format(hidden_states_type))
layer_idx = 0
batch_idx = 0
token_idx = 0
print('hidden_states')
num_layers = len(hidden_states)
print("-레이어 수:{}".format(num_layers))
layer_states = hidden_states[layer_idx]
print("-배치 수: {}".format(len(layer_states)))
chunk_states = layer_states[batch_idx]
print("-토큰 수 : {}".format(len(chunk_states)))
token_vector = chunk_states[token_idx]
print("-hidden 유닛 수 : {}".format(len(token_vector)))

sequence type: <class 'torch.Tensor'>
sequence 길이: torch.Size([2, 512, 768])


pooled type: <class 'torch.Tensor'>
pooled 길이:torch.Size([2, 768])


hidden_states type: <class 'tuple'>
hidden_states
-레이어 수:13
-배치 수: 2
-토큰 수 : 512
-hidden 유닛 수 : 768


In [38]:
# Represent the whole text by one embedding: average the pooled outputs of
# the chunks along the batch (chunk) dimension.
mean_pooled = pooled_output.mean(dim=0)
print(mean_pooled.shape[0])
print(mean_pooled.shape)
print(mean_pooled)


768
torch.Size([768])
tensor([ 0.2065, -0.1807,  0.2499, -0.3467, -0.1316,  0.2024,  0.2877,  0.2766,
        -0.4050,  0.2525, -0.2488, -0.3253, -0.1884, -0.3464,  0.2898, -0.2977,
         0.6143,  0.3040,  0.0855, -0.1333, -0.9998, -0.4182, -0.2352, -0.2241,
        -0.5292,  0.3844, -0.2604,  0.4396,  0.2811, -0.2399,  0.1456, -0.9998,
         0.7237,  0.6057,  0.3402, -0.2484,  0.1226,  0.1253,  0.1584, -0.4195,
        -0.1671,  0.1556, -0.1758,  0.1589, -0.2212, -0.2882, -0.3309,  0.2314,
        -0.3749,  0.2454, -0.0100,  0.1833,  0.4785,  0.1836,  0.1699,  0.1592,
         0.3646,  0.2219,  0.3616, -0.2915,  0.0265,  0.3371,  0.3091, -0.3351,
        -0.2154, -0.3299,  0.2114, -0.1359,  0.2699, -0.3360, -0.3172, -0.2742,
        -0.1866,  0.1523,  0.3244, -0.2772,  0.3102,  0.2228,  0.1599, -0.1991,
        -0.3259, -0.3332, -0.3347,  0.2152, -0.1937,  0.2903,  0.2723, -0.3829,
         0.3020, -0.2513,  0.0803,  0.4680, -0.2964,  0.4038, -0.2600, -0.0880,
        -0.7526, -