## torch, device 선언

In [1]:
import torch
device = torch.device("cuda")

## squad json 파일 다운로드

In [2]:
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

mkdir: `squad' 디렉토리를 만들 수 없습니다: 파일이 있습니다
--2020-12-04 16:41:09--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.110.153, 185.199.109.153, ...
접속 rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... 접속됨.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2020-12-04 16:41:14 (8.58 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2020-12-04 16:41:14--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.111.153, 185.199.110.153, 185.199.109.153, ...
접속 rajpurkar.github.io (rajpurkar.github.io)|185.199.111.153|:443... 접속됨.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2020-12-04 16:41:15 (8.51 MB/s) - ‘squad/dev-v2.0.json’ saved [4370528/437

## json parsing (read_squad)

In [3]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [4]:
print(train_contexts[0], '\n')
print('질문 : ', train_questions[0], '\n')
print(train_answers[0])

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy". 

질문 :  When did Beyonce start becoming popular? 

{'text': 'in the late 1990s', 'answer_start': 269}


## answers에 end index 추가 (add_end_idx)

In [5]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [6]:
print(train_questions[:3])
print(train_answers[:3])
print(len(train_answers))

['When did Beyonce start becoming popular?', 'What areas did Beyonce compete in when she was growing up?', "When did Beyonce leave Destiny's Child and become a solo singer?"]
[{'text': 'in the late 1990s', 'answer_start': 269, 'answer_end': 286}, {'text': 'singing and dancing', 'answer_start': 207, 'answer_end': 226}, {'text': '2003', 'answer_start': 526, 'answer_end': 530}]
86821


## contexts, questions tokenizing (DistilBertTokenizer)

In [7]:
# 빠른 처리를 위해 DistilBertTokenizerFast를 활용

from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)



In [8]:
# encoding의 형태 파악

print(type(train_encodings))
print(train_encodings.keys())

<class 'transformers.tokenization_utils_base.BatchEncoding'>
dict_keys(['input_ids', 'attention_mask'])


## add_token_positions

In [9]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
        # if None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [10]:
# encoding의 형태 파악

print(type(train_encodings))
print(train_encodings.keys())

<class 'transformers.tokenization_utils_base.BatchEncoding'>
dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])


## Dataset 만들기

In [11]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [12]:
# dataset의 크기 확인

print(type(train_dataset))
print(len(train_dataset))
print(len(val_dataset))

<class '__main__.SquadDataset'>
86821
20302


## pretrained model 활용

In [13]:
# QA task의 빠른 학습을 위해 DistilBertForQuestionAnswering 활용

from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [14]:
model.to(device)
model.train()

DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

## dataset을 dataloader로 불러오기

In [15]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=True)

In [16]:
print(type(train_loader))
print(len(train_loader))

<class 'torch.utils.data.dataloader.DataLoader'>
21706


In [17]:
print(type(val_loader))
print(len(val_loader))

<class 'torch.utils.data.dataloader.DataLoader'>
5076


## optimizer 불러오기

In [18]:
from transformers import AdamW
optim = AdamW(model.parameters(), lr=5e-5)

In [19]:
batch_ex = next(iter(val_loader))

input_ids = batch_ex['input_ids'].to(device, dtype=torch.long)
attention_mask = batch_ex['attention_mask'].to(device, dtype=torch.long)
start_positions = batch_ex['start_positions'].to(device, dtype=torch.long)
end_positions = batch_ex['end_positions'].to(device, dtype=torch.long)

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    print(loss)

tensor(6.2189, device='cuda:0')


## train (val_dataset & val_loader)

In [33]:
from tqdm import tqdm
val_tqdm = tqdm(val_loader, total=len(val_loader))

for i, batch in enumerate(val_tqdm):
    input_ids = batch['input_ids'].to(device, dtype=torch.long)
    attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
    start_positions = batch['start_positions'].to(device, dtype=torch.long)
    end_positions = batch['end_positions'].to(device, dtype=torch.long)

    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]

    if i % 100 == 0:
        print(i,'/', len(val_loader),'loss : ', loss)
        
    optim.zero_grad()
    loss.backward()
    optim.step()

  0%|          | 1/5076 [00:00<12:54,  6.55it/s]

0 / 5076 loss :  tensor(4.1757, device='cuda:0', grad_fn=<DivBackward0>)


  2%|▏         | 101/5076 [00:13<10:53,  7.61it/s]

100 / 5076 loss :  tensor(3.5448, device='cuda:0', grad_fn=<DivBackward0>)


  4%|▍         | 201/5076 [00:26<10:46,  7.54it/s]

200 / 5076 loss :  tensor(2.3504, device='cuda:0', grad_fn=<DivBackward0>)


  6%|▌         | 301/5076 [00:39<10:33,  7.54it/s]

300 / 5076 loss :  tensor(2.0208, device='cuda:0', grad_fn=<DivBackward0>)


  8%|▊         | 401/5076 [00:53<10:19,  7.55it/s]

400 / 5076 loss :  tensor(2.2185, device='cuda:0', grad_fn=<DivBackward0>)


 10%|▉         | 501/5076 [01:06<10:10,  7.50it/s]

500 / 5076 loss :  tensor(2.9996, device='cuda:0', grad_fn=<DivBackward0>)


 12%|█▏        | 601/5076 [01:19<09:56,  7.50it/s]

600 / 5076 loss :  tensor(2.9889, device='cuda:0', grad_fn=<DivBackward0>)


 14%|█▍        | 701/5076 [01:32<09:43,  7.49it/s]

700 / 5076 loss :  tensor(0.6843, device='cuda:0', grad_fn=<DivBackward0>)


 16%|█▌        | 801/5076 [01:46<09:29,  7.51it/s]

800 / 5076 loss :  tensor(2.8365, device='cuda:0', grad_fn=<DivBackward0>)


 18%|█▊        | 901/5076 [01:59<09:16,  7.50it/s]

900 / 5076 loss :  tensor(1.6167, device='cuda:0', grad_fn=<DivBackward0>)


 20%|█▉        | 1001/5076 [02:12<09:05,  7.47it/s]

1000 / 5076 loss :  tensor(0.7936, device='cuda:0', grad_fn=<DivBackward0>)


 22%|██▏       | 1101/5076 [02:26<08:50,  7.49it/s]

1100 / 5076 loss :  tensor(1.7962, device='cuda:0', grad_fn=<DivBackward0>)


 24%|██▎       | 1201/5076 [02:39<08:42,  7.42it/s]

1200 / 5076 loss :  tensor(2.7783, device='cuda:0', grad_fn=<DivBackward0>)


 26%|██▌       | 1301/5076 [02:53<08:29,  7.41it/s]

1300 / 5076 loss :  tensor(1.4161, device='cuda:0', grad_fn=<DivBackward0>)


 28%|██▊       | 1401/5076 [03:06<08:10,  7.49it/s]

1400 / 5076 loss :  tensor(2.0730, device='cuda:0', grad_fn=<DivBackward0>)


 30%|██▉       | 1501/5076 [03:19<07:57,  7.48it/s]

1500 / 5076 loss :  tensor(2.1901, device='cuda:0', grad_fn=<DivBackward0>)


 32%|███▏      | 1601/5076 [03:33<07:43,  7.49it/s]

1600 / 5076 loss :  tensor(1.0996, device='cuda:0', grad_fn=<DivBackward0>)


 34%|███▎      | 1701/5076 [03:46<07:33,  7.45it/s]

1700 / 5076 loss :  tensor(1.0656, device='cuda:0', grad_fn=<DivBackward0>)


 35%|███▌      | 1801/5076 [04:00<07:18,  7.48it/s]

1800 / 5076 loss :  tensor(0.5908, device='cuda:0', grad_fn=<DivBackward0>)


 37%|███▋      | 1901/5076 [04:13<07:04,  7.48it/s]

1900 / 5076 loss :  tensor(1.2040, device='cuda:0', grad_fn=<DivBackward0>)


 39%|███▉      | 2001/5076 [04:26<06:49,  7.50it/s]

2000 / 5076 loss :  tensor(0.7265, device='cuda:0', grad_fn=<DivBackward0>)


 41%|████▏     | 2101/5076 [04:40<06:39,  7.45it/s]

2100 / 5076 loss :  tensor(0.4462, device='cuda:0', grad_fn=<DivBackward0>)


 43%|████▎     | 2201/5076 [04:53<06:25,  7.46it/s]

2200 / 5076 loss :  tensor(1.5313, device='cuda:0', grad_fn=<DivBackward0>)


 45%|████▌     | 2301/5076 [05:06<06:11,  7.47it/s]

2300 / 5076 loss :  tensor(0.9298, device='cuda:0', grad_fn=<DivBackward0>)


 47%|████▋     | 2401/5076 [05:20<05:57,  7.49it/s]

2400 / 5076 loss :  tensor(4.1517, device='cuda:0', grad_fn=<DivBackward0>)


 49%|████▉     | 2501/5076 [05:33<05:45,  7.45it/s]

2500 / 5076 loss :  tensor(0.9003, device='cuda:0', grad_fn=<DivBackward0>)


 51%|█████     | 2601/5076 [05:46<05:32,  7.44it/s]

2600 / 5076 loss :  tensor(0.9413, device='cuda:0', grad_fn=<DivBackward0>)


 53%|█████▎    | 2701/5076 [06:00<05:17,  7.48it/s]

2700 / 5076 loss :  tensor(0.5809, device='cuda:0', grad_fn=<DivBackward0>)


 55%|█████▌    | 2801/5076 [06:13<05:05,  7.45it/s]

2800 / 5076 loss :  tensor(1.6859, device='cuda:0', grad_fn=<DivBackward0>)


 57%|█████▋    | 2901/5076 [06:26<04:50,  7.48it/s]

2900 / 5076 loss :  tensor(2.2482, device='cuda:0', grad_fn=<DivBackward0>)


 59%|█████▉    | 3001/5076 [06:40<04:38,  7.46it/s]

3000 / 5076 loss :  tensor(1.7603, device='cuda:0', grad_fn=<DivBackward0>)


 61%|██████    | 3101/5076 [06:53<04:25,  7.45it/s]

3100 / 5076 loss :  tensor(1.3867, device='cuda:0', grad_fn=<DivBackward0>)


 63%|██████▎   | 3201/5076 [07:06<04:21,  7.17it/s]

3200 / 5076 loss :  tensor(2.1190, device='cuda:0', grad_fn=<DivBackward0>)


 65%|██████▌   | 3301/5076 [07:20<03:58,  7.45it/s]

3300 / 5076 loss :  tensor(1.9225, device='cuda:0', grad_fn=<DivBackward0>)


 67%|██████▋   | 3401/5076 [07:33<03:43,  7.49it/s]

3400 / 5076 loss :  tensor(1.8230, device='cuda:0', grad_fn=<DivBackward0>)


 69%|██████▉   | 3501/5076 [07:47<03:29,  7.51it/s]

3500 / 5076 loss :  tensor(2.2925, device='cuda:0', grad_fn=<DivBackward0>)


 71%|███████   | 3601/5076 [08:01<03:16,  7.49it/s]

3600 / 5076 loss :  tensor(0.3657, device='cuda:0', grad_fn=<DivBackward0>)


 73%|███████▎  | 3701/5076 [08:14<03:09,  7.24it/s]

3700 / 5076 loss :  tensor(0.8385, device='cuda:0', grad_fn=<DivBackward0>)


 75%|███████▍  | 3801/5076 [08:28<03:05,  6.87it/s]

3800 / 5076 loss :  tensor(1.0334, device='cuda:0', grad_fn=<DivBackward0>)


 77%|███████▋  | 3901/5076 [08:42<02:49,  6.93it/s]

3900 / 5076 loss :  tensor(3.1731, device='cuda:0', grad_fn=<DivBackward0>)


 79%|███████▉  | 4001/5076 [08:55<02:32,  7.05it/s]

4000 / 5076 loss :  tensor(0.5125, device='cuda:0', grad_fn=<DivBackward0>)


 81%|████████  | 4101/5076 [09:09<02:13,  7.33it/s]

4100 / 5076 loss :  tensor(1.6006, device='cuda:0', grad_fn=<DivBackward0>)


 83%|████████▎ | 4201/5076 [09:23<02:05,  6.99it/s]

4200 / 5076 loss :  tensor(1.4252, device='cuda:0', grad_fn=<DivBackward0>)


 85%|████████▍ | 4301/5076 [09:37<01:50,  7.01it/s]

4300 / 5076 loss :  tensor(0.6163, device='cuda:0', grad_fn=<DivBackward0>)


 87%|████████▋ | 4401/5076 [09:51<01:30,  7.46it/s]

4400 / 5076 loss :  tensor(1.4293, device='cuda:0', grad_fn=<DivBackward0>)


 89%|████████▊ | 4501/5076 [10:05<01:18,  7.31it/s]

4500 / 5076 loss :  tensor(0.5823, device='cuda:0', grad_fn=<DivBackward0>)


 91%|█████████ | 4601/5076 [10:18<01:03,  7.53it/s]

4600 / 5076 loss :  tensor(0.7328, device='cuda:0', grad_fn=<DivBackward0>)


 93%|█████████▎| 4701/5076 [10:31<00:49,  7.54it/s]

4700 / 5076 loss :  tensor(1.1981, device='cuda:0', grad_fn=<DivBackward0>)


 95%|█████████▍| 4801/5076 [10:45<00:36,  7.49it/s]

4800 / 5076 loss :  tensor(0.6213, device='cuda:0', grad_fn=<DivBackward0>)


 97%|█████████▋| 4901/5076 [10:59<00:23,  7.40it/s]

4900 / 5076 loss :  tensor(1.2792, device='cuda:0', grad_fn=<DivBackward0>)


 99%|█████████▊| 5001/5076 [11:13<00:10,  7.49it/s]

5000 / 5076 loss :  tensor(0.8637, device='cuda:0', grad_fn=<DivBackward0>)


100%|██████████| 5076/5076 [11:23<00:00,  7.43it/s]


## eval mode

In [None]:
model.eval()