In [None]:
!pip install -q python-Levenshtein
!pip install -q wandb --upgrade
!pip install -q torch_optimizer

In [None]:
import json
import random

import os
import torch
import numpy as np
import pandas as pd
import time

import torch.cuda.amp as amp
from tqdm.notebook import tqdm
from torch.optim.optimizer import Optimizer, required
import torch_optimizer as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from transformers import get_linear_schedule_with_warmup
from Levenshtein import distance

In [None]:
import wandb

# Authenticate with Weights & Biases without storing the secret in the notebook:
# the key is read from the WANDB_API_KEY environment variable. When the variable
# is unset, `key=None` leaves credential lookup to wandb itself.
wandb.login(key=os.environ.get('WANDB_API_KEY'))

In [None]:
# Settings for a single training run; later cells read them by name.
config = {
    'epochs': 1,
    'dev_ratio': 0.2,
    'num_workers': 4,
    'batch_size': 8,
    'learning_rate': 5e-5,
    'seed': 42,
    'amp': False,
    'optimizer': 'AdamW',
    'scheduler': False,
    'architecture': "klue/bert-base",
}

In [None]:
# Choose the dataset directory for the hosting platform.
platform = 'Kaggle'

if platform == 'Kaggle':
    data_path = 'Your Path'
elif platform == 'Colab':
    data_path = 'Your Path'
else:
    # Stop here with a clear message; merely printing would leave `data_path`
    # undefined and surface later as a NameError in the data-loading cells.
    raise ValueError(f"Unsupported platform {platform!r}; expected 'Kaggle' or 'Colab'.")

In [None]:
# Start the W&B run and record `config` with it. The display name is passed to
# `init` directly; assigning `wandb.run.name` and then calling `wandb.run.save()`
# without arguments is the deprecated way of persisting a run name.
wandb.init(project='Your Name', entity="Your Entity", name='Your Name', config=config)

In [None]:
# Search space for a W&B hyper-parameter sweep: Bayesian optimisation that
# minimises the `eval_loss` metric.
sweep_config = {
    "name": "my-sweep",
    "method": "bayes",
    "metric": {
        "goal": "minimize",
        "name": "eval_loss",
    },
    "parameters": {
        "epochs": {
            "distribution": "int_uniform",
            "min": 2,
            "max": 5,
        },
        "learning_rate": {
            # The lower bound must be the smaller of the two values.
            "min": 1e-5,
            "max": 5e-5,
        },
        "batch_size": {
            "values": [4, 8, 16],
        },
        "optimizer": {
            # Two separate choices, not the single string 'RAdam, AdamW'.
            "values": ["RAdam", "AdamW"],
        },
        "scheduler": {
            "values": [True, False],
        },
    },
}

In [None]:
# Push the local settings into the active W&B run's config, then rebind
# `config` to the W&B config object. Later cells read settings from it with
# attribute access (for example `config.seed`).
wandb.config.update(config)
config = wandb.config

In [None]:
def set_seeds(seed=42):
    """Seed every random number generator this notebook relies on.

    Sets the `PYTHONHASHSEED` environment variable, seeds Python's `random`,
    NumPy and PyTorch (CPU, current CUDA device and all CUDA devices), and
    configures cuDNN with `deterministic=True` and `benchmark=False`.

    Args:
        seed: Integer applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Apply the configured seed to all random number generators.
set_seeds(seed=config.seed)

In [None]:
# Parse the raw training JSON into `data_dict`. The file is opened in binary
# mode and handed to `json.load` as-is.
with open(f"{data_path}/train.json", 'rb') as f:
    data_dict = json.load(f)

In [None]:
def read_data(path):
    """Flatten a nested question-answering JSON file into three parallel lists.

    The file is walked as ``data -> paragraphs -> qas -> answers``. One entry is
    emitted per answer, so a context and its question are repeated for every
    answer attached to that question.

    Args:
        path: Location of the JSON file.

    Returns:
        Tuple ``(contexts, questions, answers)`` of equal-length lists. Each
        element of ``answers`` is the answer object exactly as stored in the file.
    """
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in tqdm(payload['data']):
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_item in paragraph['qas']:
                query = qa_item['question']
                gold_answers = qa_item['answers']
                # Repeat the context/question once per answer.
                contexts.extend([passage_text] * len(gold_answers))
                questions.extend([query] * len(gold_answers))
                answers.extend(gold_answers)

    return contexts, questions, answers

In [None]:
def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character offset to every answer, in place.

    The annotated ``answer_start`` is tried first; if the text found there does
    not equal the answer, the span is also tried one and two characters to the
    left. On a match ``answer_start`` is corrected as well.

    When no alignment matches, ``answer_start`` is left untouched and
    ``answer_end`` is still set to ``answer_start + len(text)`` so that code
    reading ``answer_end`` afterwards never hits a missing key.

    Args:
        answers: Dicts with ``'text'`` and ``'answer_start'``; mutated in place.
        contexts: Context strings, parallel to ``answers``.

    Returns:
        None.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Fallback used when none of the candidate alignments below matches.
        answer['answer_end'] = end_idx

        for shift in (0, 1, 2):
            shifted_start = start_idx - shift
            if shifted_start < 0:
                # A negative index would make the slice count from the end of
                # the string, which is never the intended alignment.
                break
            if context[shifted_start:end_idx - shift] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - shift
                break

In [None]:
# Load the tokenizer for the checkpoint named in the run config, so the
# architecture recorded with the run is the one actually used.
tokenizer = AutoTokenizer.from_pretrained(config.architecture)

In [None]:
class CustomedDataset(Dataset):
    """Torch dataset of tokenized (context, question) pairs with answer token spans.

    All examples are tokenized once in the constructor. Every item is a dict of
    tensors holding whatever the tokenizer returned plus ``start_positions`` and
    ``end_positions``.

    Args:
        contexts: List of context strings.
        questions: List of question strings, parallel to ``contexts``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``
            character offsets into the matching context.
        model_max_position_embedings: Length every example is padded/truncated
            to. The same number is stored as the start/end label when an answer
            character cannot be mapped to a token.
        tokenizer: Callable tokenizer whose result offers ``char_to_token``,
            ``update`` and ``items``.
    """

    def __init__(self, contexts, questions, answers, model_max_position_embedings, tokenizer):
        self.tokenizer = tokenizer
        self.answers = answers
        self.questions = questions
        self.contexts = contexts
        self.model_max_position_embedings = model_max_position_embedings
        print("Tokenizing ...")
        # Use the length passed by the caller rather than a separate hard-coded
        # 512, so the padded length and the "unmapped" label stay in sync.
        # Only the context (first sequence) is truncated.
        self.encodings = self.tokenizer(self.contexts,
                                        self.questions,
                                        max_length=self.model_max_position_embedings,
                                        truncation='only_first',
                                        padding="max_length",
                                        return_token_type_ids=False)

        print("Done !!!")
        self.add_token_positions()

    def add_token_positions(self):
        """Convert character answer offsets to token indices and store them in the encodings."""
        start_positions = []
        end_positions = []
        for i, answer in enumerate(self.answers):
            start_token = self.encodings.char_to_token(i, answer['answer_start'])
            # `answer_end` is exclusive, so the last answer character is at end - 1.
            end_token = self.encodings.char_to_token(i, answer['answer_end'] - 1)

            # No token for that character (presumably truncated away): fall back
            # to the sentinel value supplied by the caller.
            # NOTE(review): this sentinel is one past the last padded index; it
            # presumably relies on the QA head ignoring such labels - confirm.
            if start_token is None:
                start_token = self.model_max_position_embedings
            if end_token is None:
                end_token = self.model_max_position_embedings

            start_positions.append(start_token)
            end_positions.append(end_token)
        self.encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

    def get_data(self):
        """Return the raw contexts, questions and answers this dataset was built from."""
        return {"contexts":self.contexts, 'questions':self.questions, 'answers':self.answers}

    def get_encodings(self):
        """Return the tokenizer output, including the added position labels."""
        return self.encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict mapping each encoding field to a tensor."""
        return {key:torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.encodings['input_ids'])

In [None]:
# Flatten the training file into parallel lists, then add `answer_end`
# offsets to the answers in place.
contexts, questions, answers = read_data(f"{data_path}/train.json")
add_end_idx(answers, contexts)


def train_test_split(contexts, questions, answers, ratio=.2):
    """Split three parallel lists into a train part and a dev part by position.

    The first ``int(len(contexts) * ratio)`` entries become the dev part and the
    remainder the train part; nothing is shuffled.

    Args:
        contexts: Context list; its length determines the cut point.
        questions: Question list, parallel to ``contexts``.
        answers: Answer list, parallel to ``contexts``.
        ratio: Share of the data assigned to the dev part.

    Returns:
        Six-element tuple ``(train_contexts, train_questions, train_answers,
        dev_contexts, dev_questions, dev_answers)``.
    """
    cut = int(len(contexts) * ratio)
    columns = (contexts, questions, answers)
    train_part = tuple(column[cut:] for column in columns)
    dev_part = tuple(column[:cut] for column in columns)
    return train_part + dev_part

# Hold out the leading `config.dev_ratio` share of the examples as the dev set.
(train_contexts, train_questions, train_answers,
 dev_contexts, dev_questions, dev_answers) = train_test_split(
    contexts, questions, answers, ratio=config.dev_ratio)

In [None]:
# Tokenize both splits. 512 is passed as `model_max_position_embedings`,
# the value stored as the label for answers that cannot be mapped to a token.
train_dataset = CustomedDataset(train_contexts, train_questions, train_answers, 512, tokenizer)
dev_dataset = CustomedDataset(dev_contexts, dev_questions, dev_answers, 512, tokenizer)

In [None]:
# Sanity check: number of examples in the train and dev datasets.
print(len(train_dataset), len(dev_dataset))

In [None]:
# Load the question-answering model for the checkpoint named in the run config,
# so the architecture recorded with the run is the one actually trained.
model = AutoModelForQuestionAnswering.from_pretrained(config.architecture)

In [None]:
def train_runner(model, train_dataset, dev_dataset, dev_answers, config):
    """Fine-tune ``model`` and evaluate it on the dev set after every epoch.

    Training and evaluation metrics are logged to W&B, and the model is saved
    with ``save_pretrained`` once all epochs are done. Predictions are decoded
    with the module-level ``tokenizer``.

    Args:
        model: Question-answering model whose output exposes ``loss``,
            ``start_logits`` and ``end_logits``.
        train_dataset: Dataset yielding dicts with ``input_ids``,
            ``attention_mask``, ``start_positions`` and ``end_positions``.
        dev_dataset: Dataset of the same form, used for evaluation.
        dev_answers: Answer dicts (key ``'text'``) in the same order as
            ``dev_dataset``.
        config: Object with attributes ``batch_size``, ``num_workers``,
            ``epochs``, ``optimizer``, ``learning_rate``, ``amp`` and
            ``scheduler``.

    Returns:
        List of decoded dev-set predictions from the final epoch, in dataset
        order (empty when ``config.epochs`` is 0).
    """
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=config.num_workers)

    # Not shuffled: evaluation pairs batch items with `dev_answers` by position.
    dev_loader = DataLoader(dataset=dev_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    use_amp = bool(config.amp)

    def get_learning_rate(optimizer):
        """Return the learning rate of the optimizer's single parameter group."""
        lr = [param_group['lr'] for param_group in optimizer.param_groups]
        assert(len(lr)==1)
        return lr[0]

    def forward(batch):
        """Move one batch to the device and run the model, under autocast when AMP is on."""
        input_ids = batch['input_ids'].to(device)
        with amp.autocast(enabled=use_amp):
            outputs = model(input_ids,
                            attention_mask=batch['attention_mask'].to(device),
                            start_positions=batch['start_positions'].to(device),
                            end_positions=batch['end_positions'].to(device))
        return input_ids, outputs

    # Number of optimizer steps over the whole run; defines the scheduler's decay horizon.
    global_total_step = len(train_loader) * config.epochs
    if config.optimizer == 'AdamW':
        optimizer = AdamW(model.parameters(), lr=config.learning_rate)
    else:
        optimizer = optim.Lookahead(optim.RAdam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate), alpha=0.5, k=5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = global_total_step)
    # A disabled scaler passes the loss through and calls `optimizer.step()`
    # directly, so one code path serves both the AMP and the full-precision case.
    scaler = amp.GradScaler(enabled=use_amp)

    wandb.watch(model, log='all', log_freq=10)

    preds = []
    eval_loss = None

    for epoch in range(config.epochs):
        # Evaluation at the end of the previous epoch left the model in eval
        # mode; switch back so every epoch really trains.
        model.train()

        with tqdm(train_loader, unit="batch", desc='Train') as t:
            total = 0
            total_loss = 0

            start = time.time()
            lr = get_learning_rate(optimizer)
            wandb.log({'learning_rate': lr})
            print(f'learning rate : {lr : .6f}')
            for batch in t:
                optimizer.zero_grad()

                input_ids, outputs = forward(batch)
                loss = outputs.loss
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                if config.scheduler:
                    scheduler.step()

                # `batch_loss` is the summed loss of the batch; `total_loss / total`
                # is the running per-example mean for the epoch.
                batch_loss = loss.item() * len(input_ids)
                total += len(input_ids)
                total_loss += batch_loss

                wandb.log({'epoch': epoch, 'loss': total_loss / total, 'batch_loss': batch_loss})
                t.set_postfix(loss="{:.6f}".format(total_loss / total), batch_loss="{:.6f}".format(batch_loss))

                # Drop references to device tensors before the next batch.
                del input_ids, outputs, loss

            end = time.time()
            print('Train End! Total time spent: ', end-start)

        with torch.no_grad():
            losses = []
            levs = []
            exact_match = 0
            count = 0  # index of the current example within `dev_answers`

            model.eval()
            for batch in tqdm(dev_loader, unit="batch", desc='Evaluate'):
                input_ids, outputs = forward(batch)
                losses.append(outputs.loss.item())

                token_start_index = outputs.start_logits.argmax(dim=-1)
                token_end_index = outputs.end_logits.argmax(dim=-1)
                # Iterate over the examples of the batch. `len(batch)` would be
                # the number of keys in the batch dict, not the batch size.
                for i in range(len(input_ids)):
                    pred_id = input_ids[i][token_start_index[i]: token_end_index[i] + 1]
                    pred = tokenizer.decode(pred_id)

                    # Only the final epoch's predictions are returned.
                    if epoch == config.epochs - 1:
                        preds.append(pred)

                    gold_text = dev_answers[count]['text']
                    levs.append(distance(pred, gold_text))

                    # Exact match means equality; a substring test would count
                    # an empty prediction as correct.
                    if pred == gold_text:
                        exact_match += 1

                    count += 1

                del input_ids, outputs

            em_score = exact_match / len(levs)
            eval_loss = sum(losses) / len(losses)
            lev = sum(levs)/ len(levs)
            wandb.log({'eval_loss': eval_loss,'Levenshtein': lev, 'em_score': em_score })

    model.save_pretrained("Your Path")
    print('Eval Loss: ',eval_loss)
    print("TRAIN END")

    return preds

In [None]:
# Disable the tokenizers library's own parallelism via its environment switch.
# NOTE(review): presumably set to avoid the fork-related warning/deadlock once
# DataLoader worker processes start after the tokenizer has been used - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Show which device is available: CUDA if present, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [None]:
# Run the `nvidia-smi` shell command and show its output in the notebook.
!nvidia-smi

In [None]:
# Train and evaluate. `preds` is pre-initialised and then replaced by the
# value returned from `train_runner`.
preds = []
preds = train_runner(model, train_dataset, dev_dataset, dev_answers, config)

In [None]:
# Module-level loader over the dev set, in dataset order (no shuffling), with
# the same batch size and worker count as configured for training.
dev_loader = DataLoader(dataset=dev_dataset,
                            batch_size=config.batch_size,
                            shuffle=False,
                            pin_memory=True,
                            num_workers=config.num_workers)

In [None]:
def read_test_data(path):
    """Flatten a nested test JSON file into parallel context/question/guid lists.

    The file is walked as ``data -> paragraphs -> qas``; one entry is emitted
    per question.

    Args:
        path: Location of the JSON file.

    Returns:
        Tuple ``(contexts, questions, guids)`` of equal-length lists.
    """
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    rows = []
    for article in tqdm(payload['data']):
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_item in paragraph['qas']:
                rows.append((passage_text, qa_item['question'], qa_item['guid']))

    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    guids = [row[2] for row in rows]
    return contexts, questions, guids

In [None]:
# Flatten the test file into parallel context / question / guid lists.
test_contexts, test_questions, test_guids = read_test_data(f"{data_path}/test.json")

In [None]:
def prediction(contexts, questions, guids):
    """Predict an answer token span inside the context for every question.

    Uses the module-level ``model`` and ``tokenizer``. Each (context, question)
    pair is encoded exactly as for training (pair input, padded to 512, only the
    context truncated), and the best (start, end) pair with ``start <= end`` is
    searched among the context tokens only.

    Args:
        contexts: Context strings.
        questions: Question strings, parallel to ``contexts``.
        guids: Example identifiers, parallel to ``contexts``.

    Returns:
        List of ``[guid, start, end]`` where ``start`` and ``end`` are inclusive
        integer positions in the encoded ``input_ids`` sequence.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model.to(device)
    model.eval()

    result = []

    with torch.no_grad():
        for context, question, guid in tqdm(zip(contexts, questions, guids)):
            # Truncation is left entirely to the tokenizer; cutting the context
            # by character count would mix up characters and tokens.
            encodings = tokenizer(context, question, max_length=512, truncation='only_first',
                                  padding="max_length", return_token_type_ids=False)

            # Sequence id 0 marks tokens of the first sequence (the context);
            # special tokens, the question and padding carry other ids.
            context_positions = [position
                                 for position, sequence_id in enumerate(encodings.sequence_ids(0))
                                 if sequence_id == 0]
            if context_positions:
                first, last = context_positions[0], context_positions[-1]
            else:
                # No context token at all (e.g. empty context): degrade to a
                # single-position window so one result per guid is still produced.
                first = last = 1

            input_ids = torch.tensor([encodings["input_ids"]]).to(device)
            attention_mask = torch.tensor([encodings["attention_mask"]]).to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            start_prob = outputs.start_logits[0, first:last + 1].softmax(-1)
            end_prob = outputs.end_logits[0, first:last + 1].softmax(-1)

            # Outer product = joint score of every (start, end) pair; the upper
            # triangle keeps only pairs with start <= end.
            probability = torch.triu(start_prob[:, None] @ end_prob[None, :])
            index = torch.argmax(probability).item()

            # Unravel the flat argmax and shift back to input_ids positions.
            start = first + index // len(end_prob)
            end = first + index % len(end_prob)
            result.append([guid, start, end])

    return result

In [None]:
# Predict one [guid, start, end] entry per test question.
pred_idx = prediction(test_contexts, test_questions, test_guids)

In [None]:
# Turn every predicted token span into the matching substring of its context.
predictions = []

for pred, context, question in zip(pred_idx, test_contexts, test_questions):
    position = 0  # search cursor: end of the last token located in `context`
    text = context + '[SEP]' + question
    context_position = []  # (start_char, end_char) in `context`, one per token
    for morph in tokenizer.tokenize(text):
        # Drop only a leading '##' continuation prefix; '#' characters that are
        # part of the text itself must stay.
        if morph.startswith('##') and len(morph) > 2:
            morph_text_only = morph[2:]
        else:
            morph_text_only = morph

        found = context.find(morph_text_only, position)
        if found == -1:
            # Token text is not in the remaining context (special or unknown
            # tokens, question tokens). Record an empty span at the cursor and
            # keep the cursor, instead of letting -1 corrupt later offsets.
            context_position.append((position, position))
            continue

        context_position.append((found, found + len(morph_text_only)))
        position = found + len(morph_text_only)

    # NOTE(review): the -1 presumably converts a position in the encoded input
    # (which starts with a special token) to an index into `tokenize` output -
    # confirm against `prediction`.
    last_index = len(context_position) - 1
    start = min(max(int(pred[1]) - 1, 0), last_index)
    end = min(max(int(pred[2]) - 1, 0), last_index)
    if context_position:
        answer = context[context_position[start][0]: context_position[end][1]]
    else:
        answer = ''

    predictions.append((pred[0], answer))

In [None]:
# Tabulate the (guid, answer) pairs under the columns 'ID' and 'Predicted'.
results = pd.DataFrame(predictions, columns = ['ID', 'Predicted'])
print(results)

In [None]:
# Export the predictions as CSV, without the index column.
results.to_csv('Your Path', index=False)

In [None]:
def truncation(text):
    """Return ``text`` limited to its first 16 characters.

    Shorter inputs come back unchanged, since slicing past the end is a no-op.
    """
    return text[:16]

In [None]:
# Cap every predicted answer at 16 characters.
results['Predicted'] = results['Predicted'].apply(truncation)

In [None]:
# Export the truncated predictions as CSV, without the index column.
# With the same target path as the earlier export, this overwrites that file.
results.to_csv('Your Path', index=False)