
# Load Library & Initialize Wandb

In [None]:
## Load Library
import os, sys
import json
import pandas as pd
from tqdm import tqdm
import random

# Weights & Biases
import wandb
from pytorch_lightning.loggers import WandbLogger

# Pytorch modules
import torch
from torch.nn import functional as F
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import Dataset, DataLoader, random_split

# Pytorch-Lightning
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
import pytorch_lightning as pl

In [None]:
wandb.login()
wandb.init(project="qa_contrastive", entity="hannabros")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhannabros[0m (use `wandb login --relogin` to force relogin)


# Load Dataset

In [None]:
def extract_questions_and_answers(file_path):
  with open(file_path) as f:
    data = json.load(f)
  
  data_rows = []
  for doc in data['data']:
    for paragraph in doc['paragraphs']:
      context = paragraph['context']
      for question_and_answers in paragraph['qas']:
        question = question_and_answers['question']
        answers = question_and_answers['answers']
        
        for answer in answers:
          answer_text = answer['text']
          answer_start = answer['answer_start']
          answer_end = answer_start + len(answer_text)

          data_rows.append({
            'question': question,
            'context': context,
            'answer_text': answer_text,
            'answer_start': answer_start,
            'answer_end': answer_end
        })
  return pd.DataFrame(data_rows)

In [None]:
data_path = '/home/ubuntu/workspace/kaist.ir/qa/data'
kor_train = extract_questions_and_answers(os.path.join(data_path, 'korquad/KorQuAD_v1.0_train.json'))
kor_valid = extract_questions_and_answers(os.path.join(data_path, 'korquad/KorQuAD_v1.0_dev.json'))

# Load Pretrained Model

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import WEIGHTS_NAME, CONFIG_NAME
from transformers.tokenization_utils import trim_batch

In [2]:
## AIRC-KETI/ke-t5
# tokenizer = T5Tokenizer.from_pretrained('ke-t5-base')
# model = T5ForConditionalGeneration.from_pretrained('ke-t5-base')

## ET5
et5_pretrained_path = '/home/ubuntu/workspace/kaist.ir/qa/model/et5'
tokenizer = T5Tokenizer.from_pretrained(et5_pretrained_path)
model = T5ForConditionalGeneration.from_pretrained(et5_pretrained_path)

In [None]:
class WarmupLinearSchedule(LambdaLR):
    """ Linear warmup and then linear decay.
        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
        Linearly decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps.
    """
    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))

In [None]:
class KorQADataset(Dataset):
  def __init__(self,
               df: pd.DataFrame,
               tokenizer: T5Tokenizer,
               src_max_len: int = 512,
               tgt_max_len: int = 32):
    self.df = df
    self.src_max_len = src_max_len
    self.tgt_max_len = tgt_max_len
    self.tokenizer = tokenizer

  def __getitem__(self, idx):
    data_row = self.df.iloc[idx]

    source_encoding = self.tokenizer(
      data_row['question'],
      data_row['context'],
      max_length=self.src_max_len,
      padding='max_length',
      truncation='only_second',
      return_attention_mask=True,
      add_special_tokens=True
    )

    target_encoding = self.tokenizer(
      data_row['answer_text'],
      max_length = self.tgt_max_len,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      add_special_tokens=True,
      return_tensors='pt'
    )

    labels = target_encoding['input_ids']
    labels[labels == 0] = -100

    return dict(
        question=data_row['question'],
        context=data_row['context'],
        answer_text=data_row['answer_text'],
        source_ids=source_encoding['input_ids'].squeeze(),
        source_mask=source_encoding['attention_mask'].squeeze(),
        target_ids=target_encoding['input_ids'].squeeze()
    )

  def __len__(self):
    return len(self.df.index)

  def trim_seq2seq_batch(batch, pad_token_id=0, test=False):
    source_ids, source_mask = trim_batch(batch["source_ids"], pad_token_id, attention_mask=batch["source_mask"])
    if test:
      return source_ids, source_mask, None
    y = trim_batch(batch["target_ids"], pad_token_id)
      return source_ids, source_mask, y

In [None]:
def train_log(loss, example_ct, epoch, lr):
    # Where the magic happens
    wandb.log({"epoch": epoch, "loss": loss, "lr": lr}, step=example_ct)
    print(f"Loss after " + str(example_ct).zfill(5) + f" examples: {loss:.3f}" + f" with lr: {lr}")

def train_batch(step, batch, model, optimizer, scheduler, criterion, config):
    for param_group in optimizer.param_groups:
        lr = param_group['lr']
        
    source_ids = batch['source_ids'].to(device)
    attention_mask = batch['source_mask'].to(device)
    target_ids = batch['target_ids'][:, 1:].contiguous()
    labels = target_ids[:, 1:].clone()
    labels[labels == 0] = -100
    labels = labels.to(device)

    # Forward pass ➡
    outputs = model(input_ids=source_ids,
                    attention_mask=attention_mask.
                    labels=labels)
    #loss = criterion(outputs, labels)
    loss = outputs[0]

    if config.gradient_accumulation_steps > 1:
        loss = loss / config.gradient_accumulation_steps

    # Backward pass ⬅
    loss.backward()

    if (step + 1) % config.gradient_accumulation_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

        # Step with optimizer
        optimizer.step()
        optimizer.zero_grad()

        scheduler.step()

    return loss, lr

def save(model, tokenizer, config):
    print(f'saving model to {config.model_path}')
    os.makedirs(config.model_path, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(config.model_path, WEIGHTS_NAME))
    model.config.to_json_file(os.path.join(config.model_path, CONFIG_NAME))
    tokenizer.save_pretrained(config.model_path)

In [None]:
def make(config, train_df, valid_df, tokenizer, model, device):
    # Make the data
    train_dataset = KorQADataset(train_df, max_len=config.max_len, tokenizer=tokenizer)
    valid_dataset = KorQADataset(valid_df, max_len=config.max_len, tokenizer=tokenizer)
    train_loader = DataLoader(train_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=2)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=config.batch_size,
                              shuffle=True,
                              pin_memory=True,
                              num_workers=2)

    # Make the model
    model = model.to(device)

    # Make the loss and optimizer
    criterion = nn.CrossEntropyLoss()
    t_total = len(train_loader) // config.gradient_accumulation_steps * config.epochs
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': config.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total*config.warmup_proportion, t_total=t_total)
    
    return model, train_loader, valid_loader, criterion, optimizer, scheduler

In [None]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))


def test_step(model, tokenizer, batch):
    """
    Runs forward pass on test set and returns calculated loss, predictions, and targets
    Note: this assumes that your test set has targets (doesn't have for kaggle).
    """
    pad_token_id = tokenizer.pad_token_id
    source_ids, source_mask, _ = KorQADataset.trim_seq2seq_batch(batch, pad_token_id, test=True)
    generated_ids = model.generate(
        input_ids=source_ids,
        attention_mask=source_mask,
        num_beams=1,
        max_length=80,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True,
    )
    preds = [
        tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for g in generated_ids
    ]

    return {"preds": preds}

In [None]:
def model_pipeline(hyperparameters, train_df, valid_df, tokenizer, model, device):

    # tell wandb to get started
    with wandb.init(project="qa_contrastive", config=hyperparameters):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # make the model, data, and optimization problem
        model, train_loader, valid_loader, criterion, optimizer, scheduler = make(config, train_df, valid_df, tokenizer, model, device)
        #print(model)

        # Tell wandb to watch what the model gets up to: gradients, weights, and more!
        wandb.watch(model, criterion, log="all", log_freq=10)

        # Run training and track with wandb
        total_batches = len(train_loader) * config.epochs
        example_ct = 0  # number of examples seen
        batch_ct = 0
        best_loss = 10.0
        
        for epoch in tqdm(range(config.epochs)):
            for step, batch in enumerate(train_loader):

                loss, lr = train_batch(step, batch, model, optimizer, scheduler, criterion, config)
                example_ct +=  batch['input_ids'].shape[0]
                batch_ct += 1

                # Report metrics every 25th batch
                if ((batch_ct + 1) % 500) == 0:
                    train_log(loss, example_ct, epoch, lr)

            model.eval()

            # Run the model on some test examples
            with torch.no_grad():
                val_loss = []
                valid_example_ct = 0
                for step, batch in enumerate(valid_loader):
                    valid_example_ct += batch['input_ids'].shape[0]
                    source_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    target_ids = batch['target_ids'][:, 1:].contiguous()
                    labels = target_ids[:, 1:].clone()
                    labels[labels == 0] = -100
                    labels = labels.to(device)

                    outputs = model(input_ids=source_ids,
                            attention_mask=attention_mask,
                            labels=labels)
                    loss = outputs[0].item()
                    val_loss.append(loss)
                    #return target text
                    target_text = [tokenizer.decode(ids) for ids in target_ids]

                    #generate text
                    preds = test_step(model, tokenizer, batch)
                    preds_text = preds['preds']

                    jaccard_score = [jaccard(p. t) for p, t in zip(preds_text, target_text)]

                avg_val_loss = sum(loss)/len(loss)
                print(f"average valid loss: {avg_val_loss}, jaccard_score: {jaccard_score}")
                
                wandb.log({"valid_loss": avg_val_loss, "jaccard_score": jaccard_score})

            # Save the model
            if best_loss > avg_val_loss:
                best_loss = avg_val_loss
                print(f'Best Loss is {avg_val_loss}')
                save(model, tokenizer, config)
            
            model.train()

    return model

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

config = dict(
    epochs=5,
    batch_size=16,
    learning_rate=5e-5,
    max_len=512,
    model_path='/home/ubuntu/workspace/kaist.ir/qa/model/qa_t5',
    weight_decay=0.01,
    adam_epsilon=1e-6,
    warmup_proportion=0.1,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0)

In [None]:
train_dataset = KorQADataset(kor_train_sample, max_len=512, tokenizer=tokenizer)
valid_dataset = KorQADataset(kor_valid_sample, max_len=512, tokenizer=tokenizer)

In [None]:
train_loader = DataLoader(train_dataset,
                            batch_size=16,
                            shuffle=True,
                            pin_memory=True,
                            num_workers=2)
valid_loader = DataLoader(valid_dataset,
                            batch_size=16,
                            shuffle=True,
                            pin_memory=True,
                            num_workers=2)

In [None]:
#kor_train_sample = kor_train.sample(len(kor_train)//100, random_state=1234)
#kor_valid_sample = kor_valid.sample(len(kor_valid)//100, random_state=1234)

#model = model_pipeline(config, kor_train_sample, kor_valid_sample, tokenizer, model, device)
model = model_pipeline(config, kor_train, kor_valid, tokenizer, model, device)

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

  0%|          | 0/5 [00:00<?, ?it/s]

Loss after 07984 examples: 0.454 with lr: 1.31885593220339e-05
Loss after 15984 examples: 0.384 with lr: 2.643008474576271e-05
Loss after 23984 examples: 0.223 with lr: 3.967161016949153e-05
Loss after 31984 examples: 0.228 with lr: 4.967631826741996e-05
Loss after 39984 examples: 0.605 with lr: 4.820503766478343e-05
Loss after 47984 examples: 0.940 with lr: 4.6733757062146894e-05
Loss after 55984 examples: 0.507 with lr: 4.526247645951036e-05
Accuracy of the model on the 5774 test samples: 85.51271272167935%
Best Accuracy is 0.8551271272167935
saving model to /home/ubuntu/workspace/kaist.ir/qa/model/qa_bert


 20%|██        | 1/5 [40:24<2:41:37, 2424.38s/it]

Loss after 63975 examples: 0.277 with lr: 4.3791195856873825e-05
Loss after 71975 examples: 0.806 with lr: 4.2319915254237294e-05
Loss after 79975 examples: 0.235 with lr: 4.0848634651600756e-05
Loss after 87975 examples: 0.220 with lr: 3.9377354048964224e-05
Loss after 95975 examples: 0.293 with lr: 3.7906073446327686e-05
Loss after 103975 examples: 0.358 with lr: 3.643479284369115e-05
Loss after 111975 examples: 0.195 with lr: 3.496351224105461e-05
Loss after 119975 examples: 0.179 with lr: 3.349223163841808e-05
Accuracy of the model on the 5774 test samples: 87.09314404432132%
Best Accuracy is 0.8709314404432132
saving model to /home/ubuntu/workspace/kaist.ir/qa/model/qa_bert


 40%|████      | 2/5 [1:21:48<2:02:59, 2459.75s/it]

Loss after 127966 examples: 0.112 with lr: 3.202095103578154e-05
Loss after 135966 examples: 0.157 with lr: 3.054967043314501e-05
Loss after 143966 examples: 0.045 with lr: 2.9078389830508472e-05
Loss after 151966 examples: 0.233 with lr: 2.760710922787194e-05
Loss after 159966 examples: 0.240 with lr: 2.6135828625235403e-05
Loss after 167966 examples: 0.060 with lr: 2.466454802259887e-05
Loss after 175966 examples: 0.012 with lr: 2.3193267419962337e-05
Accuracy of the model on the 5774 test samples: 87.2279382808717%
Best Accuracy is 0.872279382808717
saving model to /home/ubuntu/workspace/kaist.ir/qa/model/qa_bert


 60%|██████    | 3/5 [2:03:20<1:22:28, 2474.47s/it]

Loss after 183957 examples: 0.014 with lr: 2.1721986817325802e-05
Loss after 191957 examples: 0.354 with lr: 2.0250706214689268e-05
Loss after 199957 examples: 0.014 with lr: 1.877942561205273e-05
