In [6]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# SQuAD with BERT
Extractive question answering (QA) on SQuAD v1.1 with BERT.  
Dataset: https://www.kaggle.com/datasets/stanfordu/stanford-question-answering-dataset

## Dataset preprocessing
학습에 용이하도록 데이터셋의 포맷을 변환  
paragraph를 하나의 list에 모으고, 각 데이터는 (paragraph index, question, answer text) pair로 만들어 별도의 list로 구성함 

In [8]:
import json

train_file_path = "/content/drive/MyDrive/Colab Notebooks/Natural_Language_Processing/6일차_배포/dataset/train-v1.1.json"
test_file_path = "/content/drive/MyDrive/Colab Notebooks/Natural_Language_Processing/6일차_배포/dataset/dev-v1.1.json"

# Read both JSON files inside `with` blocks so the file handles are closed as soon as
# parsing is done, and decode explicitly as UTF-8 instead of the platform default.
with open(train_file_path, "r", encoding="utf-8") as train_file:
  train_data = json.load(train_file)
with open(test_file_path, "r", encoding="utf-8") as test_file:
  test_data = json.load(test_file)

# Flatten the first training article into two lists that are easy to index:
#   train_paras - one context string per paragraph
#   train_pairs - one (paragraph index, question, answer text) tuple per question
train_paras = []
train_pairs = []
for p_index, i in enumerate(train_data["data"][0]["paragraphs"]):   # enumerate also yields the paragraph index
  train_paras.append(i["context"])
  # A question can list several reference answers; only the first one is kept.
  train_pairs.extend(
      (p_index, j["question"], j["answers"][0]["text"]) for j in i["qas"]
  )

print(train_paras)
print(train_pairs)

# Same flattening for the first article of the evaluation file:
#   test_paras - one context string per paragraph
#   test_pairs - one (paragraph index, question, answer text) tuple per question
test_paras = []
test_pairs = []
for p_index, i in enumerate(test_data["data"][0]["paragraphs"]):
  test_paras.append(i["context"])
  # Only the first listed answer of each question is kept.
  test_pairs.extend(
      (p_index, j["question"], j["answers"][0]["text"]) for j in i["qas"]
  )

print(test_paras)
print(test_pairs)

['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', "As at most other universities, Notre Dame's students run a number of news media outlets. The nine student-run outlets include three newspapers, both a radio and television station, and several magazines and journals. Begun as a one-page journal in September 1876, the Scholastic magazine is issued 

## Construct dataset

In [9]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import torch

def find_sub_list(sl,l):
    """Locate every occurrence of the list ``sl`` as a contiguous run inside ``l``.

    Args:
        sl: The sub-list to look for.
        l: The list to search in.

    Returns:
        A list of ``(start, end)`` index pairs into ``l``, one per match, ordered
        by ``start``; ``end`` is inclusive. The list is empty when nothing matches
        or when ``sl`` itself is empty.
    """
    sll = len(sl)
    if sll == 0:
        # Nothing to search for; also avoids indexing sl[0] on an empty list.
        return []

    first = sl[0]
    # Compare the full slice only at positions whose element matches the first item.
    return [
        (ind, ind + sll - 1)
        for ind, e in enumerate(l)
        if e == first and l[ind:ind + sll] == sl
    ]

class myDataset(Dataset):
  """Span-extraction QA dataset built from paragraphs and (paragraph index, question, answer) pairs.

  Each item is the BERT encoding of ``question`` + ``paragraph`` (padded or
  truncated to 512 tokens) together with the token positions of the first and
  last answer token inside that encoding.
  """

  def __init__(self, paras, pairs) -> None:
      """Store the data and load the ``bert-base-uncased`` tokenizer.

      Args:
          paras: List of paragraph (context) strings.
          pairs: List of ``(paragraph index, question, answer text)`` tuples,
              where the index points into ``paras``.
      """
      super().__init__()
      self.paras = paras
      self.pairs = pairs
      self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

  def __len__(self):
      """Return the number of question/answer pairs."""
      return len(self.pairs)

  def __getitem__(self, index):
      """Encode one example.

      Returns:
          ``(input_ids, token_type_ids, attention_mask, start_token_index,
          end_token_index)``. The first three are ``torch.IntTensor`` of length
          512; the last two are Python ints. Both indices are 0 when the answer
          tokens cannot be found in the (possibly truncated) paragraph.
      """
      data_pair = self.pairs[index]
      paragraph = self.paras[data_pair[0]]
      question = data_pair[1]
      answer = data_pair[2]

      # Tokenize question + paragraph. Padding alone does not shorten long inputs, so
      # truncation is requested explicitly; "only_second" cuts the paragraph and never
      # the question, which keeps every example at exactly 512 tokens.
      data = self.tokenizer(
          question,
          paragraph,
          max_length=512,
          padding="max_length",
          truncation="only_second",
      )
      input_ids = data["input_ids"]
      segment_ids = data["token_type_ids"]

      # Tokenize the answer on its own (no special tokens) to get the ids to look for.
      tokenized_answer = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(answer))

      # Search only the paragraph part of the input: BERT's tokenizer gives segment id 1
      # to the second sequence, so a coincidental match inside the question is ignored.
      context_start = segment_ids.index(1) if 1 in segment_ids else len(input_ids)
      if tokenized_answer:
        ans = find_sub_list(tokenized_answer, input_ids[context_start:])
      else:
        ans = []

      # If the answer cannot be located, fall back to position 0 for both labels.
      if len(ans) == 0:
        start_token_index = 0
        end_token_index = 0
      else:
        # Shift the match back into coordinates of the full input sequence.
        start_token_index = context_start + ans[0][0]
        end_token_index = context_start + ans[0][1]

      # Convert each BERT input to a tensor.
      input_ids = torch.IntTensor(input_ids)
      token_type_ids = torch.IntTensor(segment_ids)
      attention_mask = torch.IntTensor(data["attention_mask"])

      return input_ids, token_type_ids, attention_mask, start_token_index, end_token_index

# Wrap both splits in the dataset class.
train_dataset = myDataset(train_paras, train_pairs)
test_dataset = myDataset(test_paras, test_pairs)

# Sanity check: show the first encoded training example.
for first_example in train_dataset:
  print(first_example)
  break

# Batch both splits with the same batch size.
batch_size = 3
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Checking shapes early matters: print the shape of each of the five fields of the first batch.
for first_batch in train_dataloader:
  for field in first_batch[:5]:
    print(field.shape)
  break

(tensor([  101,  2000,  3183,  2106,  1996,  6261,  2984,  9382,  3711,  1999,
         8517,  1999, 10223, 26371,  2605,  1029,   102,  6549,  2135,  1010,
         1996,  2082,  2038,  1037,  3234,  2839,  1012, 10234,  1996,  2364,
         2311,  1005,  1055,  2751,  8514,  2003,  1037,  3585,  6231,  1997,
         1996,  6261,  2984,  1012,  3202,  1999,  2392,  1997,  1996,  2364,
         2311,  1998,  5307,  2009,  1010,  2003,  1037,  6967,  6231,  1997,
         4828,  2007,  2608,  2039, 14995,  6924,  2007,  1996,  5722,  1000,
         2310,  3490,  2618,  4748,  2033, 18168,  5267,  1000,  1012,  2279,
         2000,  1996,  2364,  2311,  2003,  1996, 13546,  1997,  1996,  6730,
         2540,  1012,  3202,  2369,  1996, 13546,  2003,  1996, 24665, 23052,
         1010,  1037, 14042,  2173,  1997,  7083,  1998,  9185,  1012,  2009,
         2003,  1037, 15059,  1997,  1996, 24665, 23052,  2012, 10223, 26371,
         1010,  2605,  2073,  1996,  6261,  2984, 22353,  2135,

## Model

In [10]:
from torch import nn
from transformers import BertModel

class MyModel(nn.Module):
  """BERT encoder with two linear heads that score every token as answer start / answer end."""

  def __init__(self) -> None:
      super().__init__()
      # Pretrained BERT encoder.
      self.bert = BertModel.from_pretrained("bert-base-uncased")

      # One linear layer per head, mapping each token's hidden state to a single score.
      # The width is read from the encoder config instead of hard-coding 768.
      hidden_size = self.bert.config.hidden_size
      self.ln_start_score = nn.Linear(hidden_size, 1)
      self.ln_end_score = nn.Linear(hidden_size, 1)

  def forward(self, input_ids, token_type_ids, attention_mask):
      """Compute start / end logits for every token position.

      Args:
          input_ids, token_type_ids, attention_mask: BERT inputs; the values fed
              in this notebook have shape ``(batch, seq_len)``.

      Returns:
          ``(start_logit, end_logit)``, each of shape ``(batch, seq_len)``.
      """
      out = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
      # First element of the encoder output: one hidden vector per token.
      hidden_states = out[0]
      start_logit = self.ln_start_score(hidden_states)
      end_logit = self.ln_end_score(hidden_states)

      # Drop only the trailing size-1 score dimension. A bare squeeze() would also drop
      # the batch dimension when the batch holds a single example.
      return start_logit.squeeze(-1), end_logit.squeeze(-1)

model = MyModel()

# Smoke test: run the first training batch through the model and print the two logit
# shapes next to the target start / end indices of that batch.
for first_batch in train_dataloader:
  input_ids, token_type_ids, attention_mask = first_batch[0], first_batch[1], first_batch[2]
  out = model(input_ids, token_type_ids, attention_mask)
  print(out[0].size(),out[1].size())
  print(first_batch[3])
  print(first_batch[4])
  break

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([3, 512]) torch.Size([3, 512])
tensor([130,  52,  28])
tensor([137,  56,  30])


## Optimization

In [None]:
from torch.optim import Adam

# Fresh model moved to the GPU (cuda() returns the module itself).
model = MyModel().cuda()

# Adam over all parameters, and cross-entropy as the loss for both heads.
optimizer = Adam(params=model.parameters(), lr=1e-4)
lf = nn.CrossEntropyLoss()
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

def _run_batch(batch):
  """Run one batch through ``model`` and score it against the target span.

  Args:
      batch: ``(input_ids, token_type_ids, attention_mask, target_start,
          target_end)`` as yielded by the data loaders.

  Returns:
      ``(loss, start_correct, end_correct)``: the start loss plus the end loss
      (a tensor, usable for ``backward()``), and the number of examples whose
      predicted start / end position equals the target (Python ints).
  """
  input_ids, token_type_ids, attention_mask, target_start, target_end = (
      t.cuda() for t in batch
  )

  start_logit, end_logit = model(input_ids, token_type_ids, attention_mask)

  # Restore the (batch, seq_len) layout in case a batch dimension of size 1 was
  # squeezed away; for larger batches this is a no-op.
  n_examples = target_start.shape[0]
  start_logit = start_logit.reshape(n_examples, -1)
  end_logit = end_logit.reshape(n_examples, -1)

  # Count hits on the tensor and convert once with .item(): the builtin sum() would
  # iterate element by element and keep the counter as a CUDA tensor.
  start_correct = (start_logit.argmax(dim=-1) == target_start.reshape(-1)).sum().item()
  end_correct = (end_logit.argmax(dim=-1) == target_end.reshape(-1)).sum().item()

  # The start and end losses are computed separately and added.
  loss = lf(start_logit, target_start) + lf(end_logit, target_end)
  return loss, start_correct, end_correct


# Train for 100 epochs, evaluating on the test split after each one.
NUM_EPOCHS = 100
for e in range(NUM_EPOCHS):
  print("\n\nepoch ", e)
  epoch_loss = 0
  train_start_correct = 0
  train_end_correct = 0

  # Put the model in training mode.
  model.train()

  # One pass over all training batches.
  for i in train_dataloader:
    # Clear the gradients left over from the previous batch.
    optimizer.zero_grad()

    loss, start_correct, end_correct = _run_batch(i)
    train_start_correct += start_correct
    train_end_correct += end_correct

    # Backpropagate and update the parameters.
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()

  print(train_start_correct)
  print(train_end_correct)
  print("train loss", epoch_loss/len(train_dataloader))
  print("train start acc", train_start_correct/len(train_dataset))
  print("train end acc", train_end_correct/len(train_dataset))

  # Put the model in evaluation mode.
  model.eval()
  test_loss = 0
  test_start_correct = 0
  test_end_correct = 0

  # No gradients are needed for evaluation; skipping them saves time and memory.
  with torch.no_grad():
    for i in test_dataloader:
      loss, start_correct, end_correct = _run_batch(i)
      test_start_correct += start_correct
      test_end_correct += end_correct
      test_loss += loss.item()

  print("test loss", test_loss/len(test_dataloader))
  print("test start acc", test_start_correct/len(test_dataset))
  print("test end acc", test_end_correct/len(test_dataset))
    


KeyboardInterrupt: ignored