<a href="https://colab.research.google.com/github/ingabLee/Transformers_book/blob/main/TransformerLearning_Chap56_65.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
# Toy corpus of [first sentence, second sentence, label] rows. Label 1 is used
# on the rows whose second sentence answers the question; label 0 is used on
# the rows whose second sentence is unrelated to it.
dataset = [
    list(row)
    for row in (
        ("What music do you like?", "I like Rock music.", 1),
        ("What is your favorite food?", "I like sushi the best", 1),
        ("What is your favorite color?", "I'm going to be a doctor", 0),
        ("What is your favorite song?", "Tokyo olympic game in 2020 was postponed", 0),
        ("Do you like watching TV show?", "Yeah, I often watch it in my spare time", 1),
    )
]

In [None]:
from transformers import BertPreTrainedModel, BertConfig, BertModel, BertTokenizer
from torch.optim import adamw
from torch import nn

# Two-encoder BERT classifier for sentence pairs.
class BertEnsembleForNextSentencePrediction(BertPreTrainedModel):
  """Two BERT encoders whose outputs are concatenated and classified 2-way.

  ``forward`` reads entry 0 and entry 1 of both ``input_ids`` and
  ``attention_mask``: entry 0 is fed to ``bert_model_1`` and entry 1 to
  ``bert_model_2``.
  """

  def __init__(self, config, *args, **kwargs):
    """Create both encoders and the classification head from ``config``.

    Extra positional and keyword arguments are accepted but not used.
    """
    super().__init__(config)

    # Encoder for entry 0 of the inputs (the "QA" ordering in this notebook).
    self.bert_model_1 = BertModel(config)

    # Encoder for entry 1 of the inputs (the "AQ" ordering in this notebook).
    self.bert_model_2 = BertModel(config)

    # Linear head over the two encoder vectors concatenated side by side,
    # hence twice the hidden size on the input and two classes on the output.
    self.cls = nn.Linear(2 * self.config.hidden_size, 2)

    # Initialise the weights of the freshly created modules.
    self.init_weights()

  def forward(
      self,
      input_ids=None,
      attention_mask=None,
      token_type_ids=None,
      position_ids=None,
      head_mask=None,
      inputs_embeds=None,
      next_sentence_label=None ):
    """Run both encoders and classify the concatenated result.

    Args:
      input_ids: indexable with two entries, one per encoder.
      attention_mask: indexable with two entries, matching ``input_ids``.
      token_type_ids, position_ids, head_mask, inputs_embeds: accepted for
        signature compatibility; this implementation does not use them.
      next_sentence_label: optional integer class targets. Targets equal to
        -1 are skipped by the loss.

    Returns:
      ``logits`` alone when ``next_sentence_label`` is None, otherwise the
      tuple ``(loss, logits)``.
    """
    encoders = (self.bert_model_1, self.bert_model_2)

    # Element 1 of each encoder output is kept; per the transformers
    # documentation this is the pooled representation rather than the full
    # sequence of hidden states.
    pooled = []
    for slot, encoder in enumerate(encoders):
      encoded = encoder(input_ids[slot], attention_mask=attention_mask[slot])
      pooled.append(encoded[1])

    # Join the two vectors along the feature axis and project to two logits.
    features = torch.cat(pooled, dim=1)
    logits = self.cls(features)

    if next_sentence_label is None:
      return logits

    # ignore_index=-1 makes targets equal to -1 contribute nothing to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=-1)

    # Flatten to (N, 2) logits against (N,) targets before computing the loss.
    next_sentence_loss = criterion(logits.view(-1, 2), next_sentence_label.view(-1))
    return next_sentence_loss, logits

In [None]:
import torch

# Use the GPU when one is available (e.g. on Colab), otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the ensemble from a default BertConfig. No checkpoint is loaded here,
# so the model starts from freshly initialised weights.
config = BertConfig()
model = BertEnsembleForNextSentencePrediction(config)

# Move the model parameters to the selected device.
model.to(device)

# Tokenizer used by prepare_data to encode the sentence pairs.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Learning rate for AdamW.
learning_rate = 1e-5

# Name fragments of the parameters that should not receive weight decay.
no_decay = ["bias", "LayerNorm.weight"]

# Two parameter groups so that EVERY parameter is optimised:
#   1. all other parameters, using the optimizer's default weight decay;
#   2. biases and LayerNorm weights, with weight decay switched off.
# With only the first group, biases and LayerNorm weights would never be
# handed to the optimizer and would stay at their initial values.
optimizer_grouped_parameters = [
  {
    "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
  },
  {
    "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
    "weight_decay": 0.0,
  },
]

# AdamW over both groups.
optimizer = adamw.AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [None]:
# prepare_data function
def prepare_data(dataset, qa=True, max_length=128):
  """Tokenize sentence pairs into padded id/mask tensors plus a label list.

  Args:
    dataset: iterable of 3-item rows ``[first_text, second_text, label]``.
    qa: when truthy each row is encoded as (first_text, second_text);
      otherwise the two texts are swapped before encoding.
    max_length: every pair is padded or truncated to this many tokens.

  Returns:
    Tuple ``(input_ids, attention_masks, labels)``. The first two are tensors
    with one row per dataset row and ``max_length`` columns; ``labels`` is a
    plain list holding the last item of every row. An empty dataset yields two
    ``(0, max_length)`` tensors and an empty list.
  """
  input_ids, attention_masks = [], []
  labels = []

  for point in dataset:
    # Each row must hold exactly three items.
    first, second, label = point
    if not qa:
      # Reverse ordering: the second text is encoded first.
      first, second = second, first

    # Encode the pair with [CLS]/[SEP] added, padded and truncated to
    # max_length, and returned as PyTorch tensors.
    encoded_dict = tokenizer(
        first,
        second,
        add_special_tokens=True,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )

    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])
    labels.append(label)

  if not input_ids:
    # torch.cat rejects an empty list, so build the empty result explicitly.
    empty = torch.empty((0, max_length), dtype=torch.long)
    return empty, empty.clone(), labels

  # Stack the per-row (1, max_length) tensors along the first axis.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)

  return input_ids, attention_masks, labels

In [None]:
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, Dataset, SequentialSampler

# QADataset class
class QADataset(Dataset):
  """Dataset of encoded sentence pairs with optional labels.

  Items are ``(input_ids, attention_mask, label)`` when labels were given and
  ``(input_ids, attention_mask)`` when the dataset was built without labels.
  """

  def __init__(self, input_ids, attention_masks, labels=None):
    """Store the encoded inputs as NumPy arrays and the labels as a tensor.

    Args:
      input_ids: array-like with one row of token ids per example.
      attention_masks: array-like with one mask row per example.
      labels: optional sequence of integer labels, one per example.
    """
    self.input_ids = np.array(input_ids)
    self.attention_masks = np.array(attention_masks)
    # torch.long is the integer dtype used for the class labels.
    self.labels = torch.tensor(labels, dtype=torch.long) if labels is not None else None

  def __getitem__(self, index):
    """Return the example at ``index``; the label is omitted if there is none."""
    if self.labels is None:
      # Unlabeled data: indexing self.labels would fail on None.
      return self.input_ids[index], self.attention_masks[index]
    return self.input_ids[index], self.attention_masks[index], self.labels[index]

  def __len__(self):
    """Return the number of examples (rows of ``input_ids``)."""
    return self.input_ids.shape[0]

In [None]:
# Encode the corpus twice: once in the stored order (qa) and once with the two
# texts of every row swapped (aq).
input_ids_qa, attention_masks_qa, labels_qa = prepare_data(dataset, qa=True)
input_ids_aq, attention_masks_aq, labels_aq = prepare_data(dataset, qa=False)

# Wrap each encoding in a QADataset.
train_dataset_qa = QADataset(
    input_ids=input_ids_qa,
    attention_masks=attention_masks_qa,
    labels=labels_qa,
)
train_dataset_aq = QADataset(
    input_ids=input_ids_aq,
    attention_masks=attention_masks_aq,
    labels=labels_aq,
)

# Both loaders read sequentially with the same batch size, so batch i of one
# loader holds the same rows as batch i of the other.
dataloader_qa = DataLoader(
    train_dataset_qa,
    sampler=SequentialSampler(train_dataset_qa),
    batch_size=5,
)
dataloader_aq = DataLoader(
    train_dataset_aq,
    sampler=SequentialSampler(train_dataset_aq),
    batch_size=5,
)

In [None]:
# Training loop. The original author noted a run time of about 8 minutes.
epochs = 30

for epoch in range(epochs):

  # Walk both loaders in lockstep so every step sees the same rows in both
  # orderings.
  paired_batches = zip(dataloader_qa, dataloader_aq)
  for step, (batch_1, batch_2) in enumerate(paired_batches):

    # Put the model in training mode.
    model.train()

    # Move every tensor of both batches to the selected device.
    batch_1 = tuple(tensor.to(device) for tensor in batch_1)
    batch_2 = tuple(tensor.to(device) for tensor in batch_2)

    # Entry 0 of each list feeds the first encoder, entry 1 the second.
    # Labels are taken from the first batch.
    inputs = dict(
        input_ids=[batch_1[0], batch_2[0]],
        attention_mask=[batch_1[1], batch_2[1]],
        next_sentence_label=batch_1[2],
    )

    # With labels supplied the model returns (loss, logits).
    outputs = model(**inputs)
    loss = outputs[0]

    # Back-propagate the loss.
    loss.backward()

    print(f"epoch:{epoch}, loss{loss}")

    # Apply the update, then clear the gradients before the next step.
    optimizer.step()
    model.zero_grad()


In [None]:
# Evaluation: re-encode the corpus in both orderings and compare the model's
# predictions with the labels.
input_ids_qa, attention_masks_qa, labels_qa = prepare_data(dataset)
test_dataset_qa = QADataset(input_ids_qa, attention_masks_qa, labels_qa)

# The masks must come from this same prepare_data call so that they match
# input_ids_aq rather than whatever an earlier cell left in the namespace.
input_ids_aq, attention_masks_aq, labels_aq = prepare_data(dataset, qa=False)
test_dataset_aq = QADataset(input_ids_aq, attention_masks_aq, labels_aq)

# Sequential loaders keep the two orderings aligned batch by batch.
dataloader_qa = DataLoader(dataset=test_dataset_qa,
                           batch_size=16, sampler=SequentialSampler(test_dataset_qa))
dataloader_aq = DataLoader(dataset=test_dataset_aq, batch_size=16,
                           sampler=SequentialSampler(test_dataset_aq))

# Accumulators for the predicted classes and the reference labels.
complete_outputs, complete_label_ids = [], []

# Evaluation mode does not change between batches, so set it once.
model.eval()

for step, combined_batch in enumerate(zip(dataloader_qa, dataloader_aq)):
  batch_1, batch_2 = combined_batch

  # Move every tensor of both batches to the selected device.
  batch_1 = tuple(t.to(device) for t in batch_1)
  batch_2 = tuple(t.to(device) for t in batch_2)

  # Only the forward pass is needed, so gradient tracking is disabled.
  with torch.no_grad():
    inputs = {
        "input_ids":[batch_1[0], batch_2[0]],
        "attention_mask":[batch_1[1], batch_2[1]],
        "next_sentence_label":batch_1[2]
    }

    # With labels supplied the model returns (loss, logits).
    outputs = model(**inputs)
    tmp_eval_loss, logits = outputs[:2]

    # Move the logits to the CPU as a NumPy array.
    logits = logits.detach().cpu().numpy()

    # Predicted class = index of the largest logit in each row.
    outputs = np.argmax(logits, axis=1)

    label_ids = inputs["next_sentence_label"].detach().cpu().numpy()

  complete_outputs.extend(outputs)
  complete_label_ids.extend(label_ids)

# Print the predictions followed by the reference labels.
print(complete_outputs,complete_label_ids)

# The original author recorded the result [1,1,0,0,1] [1,1,0,0,1]:
# predictions first, labels second.



In [None]:
# Score one hand-written pair with the same steps as the evaluation above.
# Note that this rebinds the module-level name `dataset`.
dataset = [["What music to you like?","I like Rock music.", 1]]

# Encode the pair in both orderings.
input_ids_qa, attention_masks_qa, labels_qa = prepare_data(dataset, qa=True)
input_ids_aq, attention_masks_aq, labels_aq = prepare_data(dataset, qa=False)

test_dataset_qa = QADataset(
    input_ids=input_ids_qa,
    attention_masks=attention_masks_qa,
    labels=labels_qa,
)
test_dataset_aq = QADataset(
    input_ids=input_ids_aq,
    attention_masks=attention_masks_aq,
    labels=labels_aq,
)

# Sequential loaders keep the two orderings aligned.
dataloader_qa = DataLoader(
    test_dataset_qa,
    sampler=SequentialSampler(test_dataset_qa),
    batch_size=16,
)
dataloader_aq = DataLoader(
    test_dataset_aq,
    sampler=SequentialSampler(test_dataset_aq),
    batch_size=16,
)

complete_outputs, complete_label_ids = [], []

for step, (batch_1, batch_2) in enumerate(zip(dataloader_qa, dataloader_aq)):
  model.eval()

  # Move every tensor of both batches to the selected device.
  batch_1 = tuple(tensor.to(device) for tensor in batch_1)
  batch_2 = tuple(tensor.to(device) for tensor in batch_2)

  # Forward pass only; no gradients are tracked.
  with torch.no_grad():
    inputs = dict(
        input_ids=[batch_1[0], batch_2[0]],
        attention_mask=[batch_1[1], batch_2[1]],
        next_sentence_label=batch_1[2],
    )

    # With labels supplied the model returns (loss, logits).
    outputs = model(**inputs)
    tmp_eval_loss, logits = outputs[:2]
    logits = logits.detach().cpu().numpy()

    # Predicted class = index of the largest logit in each row.
    outputs = np.argmax(logits, axis=1)
    label_ids = inputs["next_sentence_label"].detach().cpu().numpy()

  complete_outputs.extend(outputs)
  complete_label_ids.extend(label_ids)

print(complete_outputs, complete_label_ids)