In [1]:
!pip install -q python-Levenshtein
!pip install -q wandb --upgrade
!pip install -q torch_optimizer

In [2]:
!pip install transformers



In [3]:
!pip install wandb -qqq

In [4]:
import json
import random

import os
import torch
import numpy as np
import pandas as pd
import time

import torch.cuda.amp as amp
from tqdm.notebook import tqdm
from torch.optim.optimizer import Optimizer, required
import torch_optimizer as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from transformers import get_linear_schedule_with_warmup
from Levenshtein import distance

In [5]:
import wandb

wandb.login(key='a1f10601b0c1673c8aa27b6d679d773261ca5fa7')

[34m[1mwandb[0m: Currently logged in as: [33mgoorm-team3[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
config = dict(
    epochs=4,
    dev_ratio=.2,
    num_workers=4,
    batch_size=16,
    learning_rate=5e-5,
    seed=42,
    amp=True,
    optimizer='RAdam',
    scheduler=True,
    architecture="klue/roberta-base")

In [8]:
platform = 'Colab'

if platform == 'Kaggle':
    data_path = 'Your Path'
elif platform == 'Colab':
    data_path = '/content/gdrive/MyDrive/k-digital-goorm-3-korean-mrc/'  
else:
    print('Again!')

In [9]:
wandb.init(project='wonkyung', entity="wonkyung", config=config)
wandb.run.name = 'klue/bert-base-2'
wandb.run.save()



True

In [10]:
sweep_config = dict(
    epochs=4,
    dev_ratio=.2,
    num_workers=4,
    batch_size=16,
    learning_rate=5e-5,
    seed=42,
    amp=True,
    optimizer='RAdam',
    scheduler=True,
    architecture="klue/roberta-base")


In [11]:
sweep_config = {
    "method": "random", # try grid or random
    "metric": {
      "name": "eval_loss",
      "goal": "minimize"   
    },
    "parameters": {
        "epochs": {
            "max": 5,
            "min": 2,
        },
        "learning_rate": {
            "max": 5e-5,
            "min": 1e-5
        },
        "batch_size": {
            "values": [4,8,16]
        },
        "optimizer": {
            "values": ['RAdam, AdamW']
        },
        "scheduler":{
            "values":[True, False]
        }
    }
}

In [13]:
wandb.config.update(config)
config = wandb.config

In [14]:
def set_seeds(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seeds(seed=config.seed)

In [15]:
with open(data_path + "train.json", 'rb') as f:
    data_dict = json.load(f)

In [16]:
def read_data(path):
    with open(path, 'rb') as f:
        data_dict = json.load(f)

    categories = []
    contexts = []
    questions = []
    answers = []
    for group in tqdm(data_dict['data']):
        category = group["news_category"]  
        
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
                    categories.append(category)
  
    return contexts, questions, answers

In [17]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2

In [18]:
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

In [19]:
class CustomedDataset(Dataset):
    def __init__(self, contexts, questions, answers, model_max_position_embedings, tokenizer):
        self.tokenizer = tokenizer
        self.answers = answers
        self.questions = questions
        self.contexts = contexts
        self.model_max_position_embedings = model_max_position_embedings
        print("Tokenizing ...")
        self.encodings = self.tokenizer(self.contexts,
                                        self.questions,
                                        max_length=512,
                                        truncation='only_first',
                                        padding="max_length",
                                        return_token_type_ids=False)

        print("Done !!!")
        self.add_token_positions()
        
    def add_token_positions(self):

        start_positions = []
        end_positions = []
        for i in range(len(self.answers)):
            start_positions.append(self.encodings.char_to_token(i, self.answers[i]['answer_start']))
            end_positions.append(self.encodings.char_to_token(i, self.answers[i]['answer_end'] - 1))
    
            if start_positions[-1] is None:
                start_positions[-1] = self.model_max_position_embedings
            if end_positions[-1] is None:
                end_positions[-1] = self.model_max_position_embedings
        self.encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

        
    def get_data(self):
        return {"contexts":self.contexts, 'questions':self.questions, 'answers':self.answers}
    
    
    def get_encodings(self):
        return self.encodings
        
    
    def __getitem__(self, idx):
        return {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
    def __len__(self):
        return len(self.encodings['input_ids'])

In [20]:
contexts, questions, answers = read_data(data_path + "train.json")
add_end_idx(answers, contexts)


def train_test_split(contexts, questions, answers, ratio =.2):
    num_ratio = int(len(contexts) * ratio)
    return contexts[num_ratio:], questions[num_ratio:], answers[num_ratio:], contexts[:num_ratio], questions[:num_ratio], answers[:num_ratio],

train_contexts, train_questions, train_answers, dev_contexts, dev_questions, dev_answers = train_test_split(contexts, questions, answers, ratio = config.dev_ratio)

  0%|          | 0/9789 [00:00<?, ?it/s]

In [21]:
train_dataset = CustomedDataset(train_contexts, train_questions, train_answers, 512, tokenizer)
dev_dataset = CustomedDataset(dev_contexts, dev_questions, dev_answers, 512, tokenizer)

Tokenizing ...
Done !!!
Tokenizing ...
Done !!!


In [22]:
print(len(train_dataset), len(dev_dataset))

14131 3532


In [23]:
model = AutoModelForQuestionAnswering.from_pretrained("klue/bert-base")

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model chec

In [24]:
def train_runner(model, train_dataset, dev_dataset, dev_answers, config):
    train_loader = DataLoader(dataset=train_dataset,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  pin_memory=True,
                                  num_workers=config.num_workers)
    
    dev_loader = DataLoader(dataset=dev_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model.to(device)
    model.train()

    def get_learning_rate(optimizer):
      lr=[]
      for param_group in optimizer.param_groups:
          lr +=[ param_group['lr'] ]

      assert(len(lr)==1) 
      lr = lr[0]

      return lr

    global_total_step = len(train_loader) * config.epochs
    if config.optimizer == 'AdamW':
        optimizer = AdamW(model.parameters(), lr=config.learning_rate)
    else:
        optimizer = optim.Lookahead(optim.RAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate), alpha=0.5, k=5)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = global_total_step)
        scaler = amp.GradScaler()
    

    wandb.watch(model, log='all', log_freq=10)
    
    preds = []
    
    for epoch in range(config.epochs):
        with tqdm(train_loader, unit="batch", desc='Train') as t:
            total = 0
            total_loss = 0
        
            start = time.time()
            lr = get_learning_rate(optimizer)
            wandb.log({'learning_rate': lr})
            print(f'learning rate : {lr : .6f}')
            for i, batch in enumerate(train_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)

                optimizer.zero_grad()

                if config.amp:
                  with amp.autocast():
                    outputs = model(input_ids,
                                    attention_mask=attention_mask,
                                    start_positions=start_positions,
                                    end_positions=end_positions)

                    loss = outputs.loss
                  scaler.scale(loss).backward()
                  scaler.step(optimizer)
                  scaler.update()
                else:
                  outputs = model(input_ids,
                                  attention_mask=attention_mask,
                                  start_positions=start_positions,
                                  end_positions=end_positions)
                  loss = outputs.loss
                  loss.backward()
                  optimizer.step()
                if config.scheduler:
                    scheduler.step()
                
                batch_loss = loss.item() * len(input_ids)
                total += len(input_ids)
                total_loss += batch_loss
                global_total_step += 1

                wandb.log({'epoch': epoch, 'loss': total_loss / total, 'batch_loss': batch_loss})
                t.set_postfix(loss="{:.6f}".format(total_loss / total), batch_loss="{:.6f}".format(batch_loss))
                t.update(1)

                del input_ids, attention_mask, start_positions, end_positions, outputs, loss
            
            end = time.time()
            print('Train End! Total time spent: ', end-start)

        with torch.no_grad():
            losses = []
            levs = []
            exact_match = 0          
            count = 0

            model.eval()
            for batch in tqdm(dev_loader, unit="batch", desc='Evaluate'):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                
                if config.amp:
                  with amp.autocast():
                    outputs = model(input_ids,
                                    attention_mask=attention_mask,
                                    start_positions=start_positions,
                                    end_positions=end_positions)
                    loss = outputs.loss
                else:
                  outputs = model(input_ids,
                                  attention_mask=attention_mask,
                                  start_positions=start_positions,
                                  end_positions=end_positions)
                  loss = outputs.loss
                
                start_logits, end_logits = outputs.start_logits, outputs.end_logits
                token_start_index, token_end_index = start_logits.argmax(dim=-1), end_logits.argmax(dim=-1)
                for i in range(len(batch)):
                    pred_id = input_ids[i][token_start_index[i]: token_end_index[i] + 1]
                    pred = tokenizer.decode(pred_id)
                 
                    if epoch == config.epochs - 1:
                      preds.append(pred)
                    
                    lev = distance(pred, dev_answers[count]['text'])
                    levs.append(lev)
                  
                    if pred in dev_answers[count]['text']:
                      exact_match += 1
                    
                    count += 1
                
                losses.append(loss.item())
                
                batch_loss = loss.item() * len(input_ids)
                del input_ids, attention_mask, start_positions, end_positions, outputs, loss, lev
            
            em_score = exact_match / len(levs)
            loss = sum(losses) / len(losses)
            lev = sum(levs)/ len(levs)
            wandb.log({'eval_loss': loss,'Levenshtein': lev, 'em_score': em_score })
                       
    model.save_pretrained("/content/gdrive/MyDrive/Question-Answering-main")
    print('Eval Loss: ',loss)
    print("TRAIN END")

    return preds

In [25]:
# Need to know what it works
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [27]:
!nvidia-smi

Thu Apr  7 02:26:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    29W / 250W |      2MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [28]:
preds = []
preds = train_runner(model, train_dataset, dev_dataset, dev_answers,config)

Train:   0%|          | 0/884 [00:00<?, ?batch/s]

learning rate :  0.000050
Train End! Total time spent:  917.4373843669891


Evaluate:   0%|          | 0/221 [00:00<?, ?batch/s]

Train:   0%|          | 0/884 [00:00<?, ?batch/s]

learning rate :  0.000038
Train End! Total time spent:  896.1727697849274


Evaluate:   0%|          | 0/221 [00:00<?, ?batch/s]

Train:   0%|          | 0/884 [00:00<?, ?batch/s]

learning rate :  0.000025
Train End! Total time spent:  896.1790208816528


Evaluate:   0%|          | 0/221 [00:00<?, ?batch/s]

Train:   0%|          | 0/884 [00:00<?, ?batch/s]

learning rate :  0.000013
Train End! Total time spent:  896.4634726047516


Evaluate:   0%|          | 0/221 [00:00<?, ?batch/s]

Eval Loss:  1.7743121490759008
TRAIN END


In [29]:
dev_loader = DataLoader(dataset=dev_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers)

In [30]:
def read_test_data(path):
    with open(data_path + "test.json", 'rb') as f:
        test_data_dict = json.load(f)

    contexts = []
    questions = []
    guids = []
    for group in tqdm(test_data_dict['data']):
        category = group["news_category"]
        
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                guid = qa['guid']
                contexts.append(context)
                questions.append(question)
                guids.append(guid)


    return contexts, questions, guids

In [31]:
def read_data(path):
    with open(path, 'rb') as f:
        aihub_dict = json.load(f)

    categories = []
    contexts = []
    questions = []
    answers = []
    for group in tqdm(aihub_dict['data']):
        category = group["paragraphs"]  
        
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
                    categories.append(category)
  
    return contexts, questions, answers

In [32]:
with open(data_path + "test.json", 'rb') as f:
    test_data_dict = json.load(f)

In [33]:
test_contexts, test_questions, test_guids = read_test_data(data_path + 'test.json')

  0%|          | 0/3709 [00:00<?, ?it/s]

In [34]:
def prediction(contexts, questions, guids):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    model.to(device)
    model.eval()
    
    result = []
    
    with torch.no_grad():
        for context, question, guid in tqdm(zip(contexts, questions, guids)):
            if len(context) > 512:
                context = context[:512-len(question)-3]
            token_type_ids = [1] * (len(context) + 1) + [0] * (len(question) + 2)
            
            encodings = tokenizer(context, question, max_length=512, truncation='only_first',
                                     padding="max_length", return_token_type_ids=False)
            encodings = {key: torch.tensor([val]) for key, val in encodings.items()}
            
            input_ids = encodings["input_ids"].to(device)
            attention_mask = encodings["attention_mask"].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask)
            
            start_logits, end_logits = outputs.start_logits, outputs.end_logits
            token_start_index, token_end_index = start_logits.argmax(dim=-1), end_logits.argmax(dim=-1)

#             pred_ids = input_ids[0][token_start_index: token_end_index + 1]
#             pred = tokenizer.decode(pred_ids)
#             result.append([guid,pred])
            result.append([guid,token_start_index.cpu().item(), token_end_index.cpu().item()])
#             result.append([token_start_index, token_end_index, pred])


            #start_prob = start_logits[token_type_ids.bool()][1:-1].softmax(-1)
            #end_prob = end_logits[token_type_ids.bool()][1:-1].softmax(-1)
            
            #probability = torch.triu(start_prob[:, None] @ end_prob[None, :])
            #index = torch.argmax(probability).cpu()

            #start = index // len(end_prob)
            #end = index % len(end_prob)
            #result.append([guid, start, end])

    return result

In [None]:
pred_idx = prediction(test_contexts, test_questions, test_guids)

0it [00:00, ?it/s]

In [None]:
predictions = []

for pred, context, question in zip(pred_idx, test_contexts, test_questions):
    position = 0
    text = context + '[SEP]' + question
    context_position =[]
    for morph in tokenizer.tokenize(text):
        morph_text_only = morph.replace('#','')
        position = context.find(morph_text_only, position)
        context_position.append((position, position + len(morph_text_only)))
        position += len(morph_text_only)
    
    start = pred[1] - 1
    end = pred[2] - 1
    answer = context[context_position[start][0]: context_position[end][1]]
    
    predictions.append((pred[0], answer))

In [None]:
results = pd.DataFrame(predictions, columns = ['ID', 'Predicted'])
print(results)

In [None]:
results.to_csv('/content/gdrive/MyDrive/wandb_experiment_klue_roberta_base.csv', index=False)

In [None]:
def truncation(text):
    return text if len(text) < 16 else text[:16]

In [None]:
results['Predicted'] = results['Predicted'].apply(truncation)

In [None]:
results.to_csv('/content/gdrive/MyDrive/wandb_experiment_klue_bert_base_tr.csv', index=False)