In [1]:
#=======================================================================================
# P-TUNING Q&A=기계독해 훈련 테스트 예제
#
# => input_ids : [CLS]질문[SEP]지문[SEP]
# => attention_mask : 1111111111(질문, 지문 모두 1)
# => token_type_ids : 0000000(질문)1111111(지문)
# => start_positions : 45 (질문에 대한 지문에서의 답변 시작 위치)
# => end_positions : 60 (질문에 대한 지문에서의 답변 끝 위치)
#
# prefix-tuning => GPT-2, T5등 LM에서 접두사 prompt를 추가하여 훈련시키는 방식
#
# p-tuning => P-tuning은 prefix-tuning보다 유연 합니다. 
# 시작할 때뿐만 아니라 프롬프트 중간에 학습 가능한 토큰을 삽입하기 때문입니다. 
# https://github.com/THUDM/P-tuning
#
# p-tuing v2 => 새로운 방식이 아니라 NLU 향상을 위해, MLM 모델에 prefix-tuning을 적용한 방식
# https://github.com/THUDM/P-tuning-v2
#=======================================================================================

import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch.nn.functional as F
import os
from os import sys
sys.path.append('..')
from myutils import seed_everything, GPU_info, mlogging
from tqdm.notebook import tqdm


logger = mlogging(loggername="bertnlitest", logfilename="bertnlitest")
device = GPU_info()
seed_everything(111)

# 모델, tokenizer 경로 지정 
model_path = '../../data11/model/bert/bert-multilingual-cased-p-tuing-pp-qa-20'
vocab_path = '../../data11/model/bert/bert-multilingual-cased-p-tuing-pp-qa-20'

# p-tuningv2 prefixt 튜닝일때 eval 할 모델이 기존 훈련시 설정했던값과 동일하게 설정해야 함.
pre_seq_len = 20             # prefix 계수
prefix_projection = True     # True = two-layer MLP 사용함(Multi-layer perceptron(다중퍼셉트론))
prefix_hidden_size = 512     # prefix hidden size


logfilepath:bertnlitest_2022-07-14.log
True
device: cuda:0
cuda index: 0
gpu 개수: 1
graphic name: NVIDIA A30


In [18]:
#==============================================================================
# p-tuing v2 => 새로운 방식이 아니라 NLU 향상을 위해, MLM 모델에 prefix-tuning을 적용한 방식
# 참고 소스 : https://github.com/THUDM/P-tuning-v2
# 
# p-tuning-v2의 주요 기여는 원래 입력 전에 사용자 정의 길이의 레이어 프롬프트를 추가하고 
# 다운스트림 작업에 대한 후속 교육에서 BERT 모델의 모든 매개변수를 고정하고 이러한 프롬프트만 교육하는 것임.
# 설명 : https://zhuanlan.zhihu.com/p/459305102
#
# => P-tuning-v2의 구현 방식은 prefix N 시퀀스를 생성한 다음, 원래 bert 모델과 연결한다. 이때 bert의 past_key_values(*여기서는 decoding 속도 개선 목적이 아님)를 이용함
# => bert의 past_key_values로 prefix에대한 key 와 value 를 넘겨줘서, 기존 입력 key, value와 연결시키도록 함.
# => get_prompt() 함수 : prefix를 past_key_value 형식(batch_size, num_heads, sequence_length - 1, embed_size_per_head)으로 조정(만듬)
# => attention_mask : 기존 attention_mask +  prefix_attention_mask 
#==============================================================================
import copy
import torch
from torch._C import NoopLogger
import torch.nn
import torch.nn.functional as F
from torch import Tensor
from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss
from transformers import BertModel, BertPreTrainedModel
from transformers.modeling_outputs import QuestionAnsweringModelOutput, BaseModelOutput, Seq2SeqLMOutput

# PrefixEncoder 클래스
class PrefixEncoder(torch.nn.Module):
    r'''
    The torch.nn model to encode the prefix
    Input shape: (batch-size, prefix-length)
    Output shape: (batch-size, prefix-length, 2*layers*hidden)
    '''
    def __init__(self, config):
        super().__init__()
        self.prefix_projection = config.prefix_projection
        if self.prefix_projection:
            # Use a two-layer MLP to encode the prefix
            self.embedding = torch.nn.Embedding(config.pre_seq_len, config.hidden_size)
            self.trans = torch.nn.Sequential(
                torch.nn.Linear(config.hidden_size, config.prefix_hidden_size),
                torch.nn.Tanh(),
                torch.nn.Linear(config.prefix_hidden_size, config.num_hidden_layers * 2 * config.hidden_size)
            )
        else:
            self.embedding = torch.nn.Embedding(config.pre_seq_len, config.num_hidden_layers * 2 * config.hidden_size)

    def forward(self, prefix: torch.Tensor):
        if self.prefix_projection:
            prefix_tokens = self.embedding(prefix)
            past_key_values = self.trans(prefix_tokens)
        else:
            past_key_values = self.embedding(prefix)
        return past_key_values
    
# BertPrefixForQuestionAnswering 클래스
class BertPrefixForQuestionAnswering(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.pre_seq_len = config.pre_seq_len
        self.n_layer = config.num_hidden_layers
        self.n_head = config.num_attention_heads
        self.n_embd = config.hidden_size // config.num_attention_heads

        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = torch.nn.Linear(config.hidden_size, config.num_labels)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.prefix_encoder = PrefixEncoder(config)
        self.prefix_tokens = torch.arange(self.pre_seq_len).long()

        for param in self.bert.parameters():
            param.requires_grad = False

        self.init_weights()

    def get_prompt(self, batch_size):
        prefix_tokens = self.prefix_tokens.unsqueeze(0).expand(batch_size, -1).to(self.bert.device)
        past_key_values = self.prefix_encoder(prefix_tokens)
        bsz, seqlen, _ = past_key_values.shape
        past_key_values = past_key_values.view(
            bsz,
            seqlen,
            self.n_layer * 2, 
            self.n_head,
            self.n_embd
        )
        past_key_values = self.dropout(past_key_values)
        past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2)
        return past_key_values

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = input_ids.shape[0]
        past_key_values = self.get_prompt(batch_size=batch_size)
        prefix_attention_mask = torch.ones(batch_size, self.pre_seq_len).to(self.bert.device)
        attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            past_key_values=past_key_values,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [2]:
# tokeniaer 및 model 설정
#tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer = BertTokenizer.from_pretrained(vocab_path, strip_accents=False, do_lower_case=False)
# strip_accents=False : True로 하면, 가자 => ㄱ ㅏ ㅈ ㅏ 식으로 토큰화 되어 버림(*따라서 한국어에서는 반드시 False)
# do_lower_case=False : # 소문자 입력 사용 안함(한국어에서는 반드시 False)
#tokenizer = BertTokenizer(vocab_file=vocab_path, strip_accents=False, do_lower_case=False) 


# tokenier 테스트
print(len(tokenizer))
print(tokenizer.encode("눈에 보이는 반전이었지만 영화의 흡인력은 사라지지 않았다", "정말 재미있다"))
print(tokenizer.convert_ids_to_tokens(131027))
print(tokenizer.convert_tokens_to_ids('정말'))

119547
[101, 9034, 10530, 9356, 31728, 9321, 16617, 10739, 69708, 42428, 10459, 10020, 12030, 28143, 10892, 9405, 17342, 12508, 12508, 49137, 102, 9670, 89523, 9659, 22458, 76820, 102]
[UNK]
100


In [7]:
# 학습 data loader 생성
sys.path.append('..')
from myutils import KorQuADCorpus, QADataset, data_collator
from torch.utils.data import DataLoader, RandomSampler

#############################################################################
# 변수 설정
#############################################################################
max_seq_len = 128  # 질문 + 지문 최대 크기
doc_stride = 64     # 지문이 128을 넘을 경우, 얼만큼씩 다음 지문으로 대체할지
max_query_length = 32  # 질문 최대 크기
batch_size = 32        # 배치 사이즈(64면 GUP Memory 오류 나므로, 32 이하로 설정할것=>max_seq_length 를 줄이면, 64도 가능함)
cache = False   # 캐쉬파일 생성할거면 True로 (True이면 loding할때 캐쉬파일있어도 이용안함)
#############################################################################

# corpus 파일 설정
corpus = KorQuADCorpus()

# 평가 dataloader 생성
print('end train_loader===========================================================')

print('create eval_loader===========================================================')
eval_file_fpath = '../../data11/korpora/korQuAD/KorQuAD_v1.0_dev.json'
eval_dataset = QADataset(file_fpath=eval_file_fpath, tokenizer=tokenizer, corpus=corpus, max_seq_length=max_seq_len, max_query_length = max_query_length, doc_stride= doc_stride, overwrite_cache=cache)

eval_loader = DataLoader(eval_dataset, 
                          batch_size=batch_size, 
                          #shuffle=True, # dataset을 섞음
                          sampler=RandomSampler(eval_dataset, replacement=False), #dataset을 랜덤하게 샘플링함
                          collate_fn=data_collator, # dataset을 tensor로 변환(예시 {'input_ids':tensor[0,1,2,3,1,], 'token_type_id:tensor[0,0,0,0,0], 'attention_mask:tensor[1,1,1,1,1], 'labels':tensor[5]}
                          num_workers=4)
print('end eval_loader===========================================================')

print('eval_loader_len: {}'.format(len(eval_loader)))

Loading features from cached file ../../data11/korpora/korQuAD/cached_BertTokenizer_128_32_64_KorQuAD_v1.0_dev.json [took %.3f s] 1.6221225261688232
eval_loader_len: 870


In [19]:
from transformers import AutoConfig
# config 설정 
config = AutoConfig.from_pretrained(
    model_path,
    num_labels=2,  # q&a 는 2
    revision="main"
)

#========================================================================
# 훈련 모델에 따라 아래값들을 바꿔줘야 함.
#========================================================================
#get_model 에서 --prefix인경우 config 인자 설정해 주고 있음.

config.hidden_dropout_prob = 0.1
config.pre_seq_len = pre_seq_len             # prefix 계수
config.prefix_projection = prefix_projection    # True = two-layer MLP 사용함(Multi-layer perceptron(다중퍼셉트론))
config.prefix_hidden_size = prefix_hidden_size     # prefix hidden size
#========================================================================

print(config.num_hidden_layers)
print(config.num_attention_heads)
print(config.hidden_size)

12
12
768


In [20]:
#############################################################################################
# 변수들 설정
# - model_path : from_pretrained() 로 호출하는 경우에는 모델파일이 있는 폴더 경로나 
#          huggingface에 등록된 모델명(예:'bert-base-multilingual-cased')
#          torch.load(model)로 로딩하는 경우에는 모델 파일 풀 경로
#
# - vocab_path : from_pretrained() 호출하는 경우에는 모델파일이 있는 폴더 경로나
#          huggingface에 등록된 모델명(예:'bert-base-multilingual-cased')   
#          BertTokenizer() 로 호출하는 경우에는 vocab.txt 파일 풀 경로,
#############################################################################################
     
# p-tuning 모델
model = BertPrefixForQuestionAnswering.from_pretrained(model_path, config=config, revision="main")

# 레벨을 1개만 선택하는 경우
#model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)
#model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=6)

# 레벨을 멀티로 선택해야 하는 경우
#model = BertForSequenceClassification.from_pretrained(model_path, problem_type="multi_label_classification",num_labels=6)

model.to(device)

BertPrefixForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, eleme

In [21]:
logger.info(f"=== model: {model_path} ===")
logger.info(f"num_parameters: {model.num_parameters()}")

2022-07-14 13:10:55,838 - bertnlitest - INFO - === model: ../../data11/model/bert/bert-multilingual-cased-p-tuing-pp-qa-20 ===
2022-07-14 13:10:55,838 - bertnlitest - INFO - === model: ../../data11/model/bert/bert-multilingual-cased-p-tuing-pp-qa-20 ===
2022-07-14 13:10:55,844 - bertnlitest - INFO - num_parameters: 187129090
2022-07-14 13:10:55,844 - bertnlitest - INFO - num_parameters: 187129090


In [24]:
import time
from transformers import AdamW, get_linear_schedule_with_warmup

# 평가 시작
model.eval()

total_loss = 0
total_len = 0
total_correct = 0

start = time.time()
logger.info(f'---------------------------------------------------------')

for data in tqdm(eval_loader):
     # 입력 값 설정
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)       
    start_positions = data['start_positions'].to(device)
    end_positions = data['end_positions'].to(device)
 
    # 손실률 계산하는 부분은 no_grade 시켜서, 계산량을 줄임.
    # => torch.no_grad()는 gradient을 계산하는 autograd engine를 비활성화 하여 
    # 필요한 메모리를 줄이고, 연산속도를 증가시키는 역활을 함
    with torch.no_grad():
        # 모델 실행
        outputs = model(input_ids=input_ids, 
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids,
                       start_positions=start_positions,
                       end_positions=end_positions)
    
        # 출력값 loss,logits를 outputs에서 얻어옴
        loss = outputs.loss
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits
            
        # 총 손실류 구함
        # start 포지션 정확도 구함
        start_pred = torch.argmax(F.softmax(start_scores), dim=1)
        start_correct = start_pred.eq(start_positions)
            
        # end 포지션 정확도 구함
        end_pred = torch.argmax(F.softmax(end_scores), dim=1)
        end_correct = start_pred.eq(end_positions)
            
        # start 포지션과 end 포지션 정확도를 더하고 2로 나줌
        total_correct += (start_correct.sum().item() + end_correct.sum().item()) / 2
        total_len += len(start_positions)
    

logger.info(f"eval-accuracy: {total_correct / total_len}")
logger.info(f'---------------------------------------------------------')
logger.info(f'=== 처리시간: {time.time() - start:.3f} 초 ===')
logger.info(f'-END-\n')

2022-07-14 13:12:28,392 - bertnlitest - INFO - ---------------------------------------------------------
2022-07-14 13:12:28,392 - bertnlitest - INFO - ---------------------------------------------------------


  0%|          | 0/870 [00:00<?, ?it/s]

  start_pred = torch.argmax(F.softmax(start_scores), dim=1)
  end_pred = torch.argmax(F.softmax(end_scores), dim=1)
2022-07-14 13:13:01,237 - bertnlitest - INFO - eval-accuracy: 0.8122730051422201
2022-07-14 13:13:01,237 - bertnlitest - INFO - eval-accuracy: 0.8122730051422201
2022-07-14 13:13:01,240 - bertnlitest - INFO - ---------------------------------------------------------
2022-07-14 13:13:01,240 - bertnlitest - INFO - ---------------------------------------------------------
2022-07-14 13:13:01,242 - bertnlitest - INFO - === 처리시간: 32.850 초 ===
2022-07-14 13:13:01,242 - bertnlitest - INFO - === 처리시간: 32.850 초 ===
2022-07-14 13:13:01,244 - bertnlitest - INFO - -END-

2022-07-14 13:13:01,244 - bertnlitest - INFO - -END-

