<a href="https://colab.research.google.com/github/jhmlee/when2eat/blob/main/Copy_of_Question_Answering_System_Stencil_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets==2.10.1
#load_dataset sometimes hangs on a higher version
!pip install transformers tqdm

Collecting datasets==2.10.1
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.10.1)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from datasets==2.10.1)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.10.1)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, responses, multiprocess, datasets
  Attemptin

# Preprocessing

In [2]:
from datasets import load_dataset

from tqdm.auto import tqdm
import torch
import numpy as np
import random

# we set up some seeds so that we can reproduce results
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [3]:
"""
Some options for BERT model that can be run in colab:

"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",

"""

'\nSome options for BERT model that can be run in colab:\n\n"distilbert-base-uncased",\n"distilbert-base-uncased-distilled-squad",\n"distilbert-base-cased",\n"distilbert-base-cased-distilled-squad",\n\n'

In [4]:
# Change train.json / dev.json to the appropriate filepaths =====
data_files = {"train": "all_train.json", "dev": "all_dev.json"}
dataset = load_dataset('json', data_files=data_files, chunksize=10<<23)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-1daf74af690ed3cd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-1daf74af690ed3cd/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def preprocess_and_tokenize(data, tokenizer):

  final_data = [] # eventually stores the final list of tokenized examples

  for example in data: # iterates through the data
    # grab all the necessary information from each example in the loop – context, question, and answer info
    context = example["contexts"]
    question = example['questions'][0]['input_text']
    start_answer = example['answers'][0]['span_start']
    end_answer = example['answers'][0]['span_end']

    # tokenize the context and question
    tokenized_input = tokenizer(question, context, return_offsets_mapping = True,
                              truncation = True, padding = "max_length",
                              max_length = 512)

    # take the answer span's start & end positions in the input
    mapped_offset = tokenized_input['offset_mapping']
    start_pos = max(0, min(start_answer, len(mapped_offset) - 1))
    end_pos = max(0, min(end_answer - 1, len(mapped_offset) - 1))

    # store tokenized info for the example using a dict
    tokenized_example = {"input_ids": torch.tensor(tokenized_input['input_ids']),
                       "attention_mask": torch.tensor(tokenized_input['attention_mask']),
                       'start_pos': torch.tensor([start_pos]),
                       'end_pos': torch.tensor([end_pos])}

    final_data.append(tokenized_example)

  return final_data

Training the Data

In [11]:
def end_pt_loss(probs, ground_truth):
  # Based on the index of the ground truth, computes the probability of finding the correct answer
  correct_prob = probs.gather(1, ground_truth.view(-1, 1))
  return torch.log(correct_prob) # returns the logarithmic value of the probability

def total_loss(start_prob, end_prob, start_span, end_span):
  starting_loss = end_pt_loss(start_prob, start_span)
  ending_loss = end_pt_loss(end_prob, end_span)
  # the above calculates the loss for the start and end positions

  loss = -starting_loss - ending_loss
  return torch.mean(loss) # returns the average of the losses

def train_loop(model, optimizer, epochs: int,
               train_data, validation_data, lr_scheduler, device, tokenizer) -> None:

  # keeps track of the various losses
  train_losses = []
  validation_losses = []


  for epoch in range(epochs): # goes through each epoch
    model.train()

    print(f"Epoch {epoch + 1} training:")
    progress_bar = tqdm(train_data)  # keeps track of the progress

    training_loss = 0.0 # initializing the training loss

    for batch_dict in progress_bar:

      batch = {} # avoid possible self-referencing

      # Obtain the predictions, as well as the labels and hook them onto the GPU
      batch['input_ids'] = batch_dict['input_ids'].to(device)
      batch['attention_mask'] = batch_dict['attention_mask'].to(device)
      batch['start_pos'] = batch_dict['start_pos'].to(device)
      batch['end_pos'] = batch_dict['end_pos'].to(device)

      # create a dictionary with test tensors
      input_batch = {
          'input_ids': batch['input_ids'],
          'attention_mask': batch['attention_mask'],
          'start_pos': batch['start_pos'],
          'end_pos': batch['end_pos']
      }

      # forward pass through the model
      outputs = model(input_batch['input_ids'], input_batch['attention_mask'])
      start_logits = outputs.start_logits
      end_logits = outputs.end_logits

      # apply softmax to get the start & end probabilities
      softmax = torch.nn.Softmax(dim=1)
      start_prob = softmax(start_logits)
      end_prob = softmax(end_logits)

      # total loss computation
      loss = total_loss(start_prob, end_prob, input_batch['start_pos'], input_batch['end_pos'])
      training_loss += loss.item() #update the training loss

      loss.backward() # Backpropagate the loss and make a backward pass
      optimizer.zero_grad()
      optimizer.step() # optimization step
      lr_scheduler.step()



    avg_loss = training_loss / len(progress_bar) #find the avg training loss
    progress_bar.set_postfix({'loss': avg_loss})

    train_losses.append(avg_loss) # add to the training loss tracker

    # print useful info about running
    print("Running validation: ")
    validation_metrics = eval_loop(model, validation_data, device, tokenizer)

    print("Validation metrics: ", validation_metrics)
    val_loss = validation_metrics['loss']
    validation_losses.append(val_loss)


  return train_losses, validation_losses





Metric Computation and Evaluation

In [7]:
def calc_metrics(ground_truth, pred):
  if len(ground_truth) >= 1 and len(pred) >= 1: # if ground truth answer and prediction exists
    shared_tokens = 0 # create shared tokens counter

    for token in set(pred): # loop through all prediction tokens
      if token in ground_truth:
        shared_tokens +=1

    # compute the following metrics: precision, recall, and f1
    precision = shared_tokens / len(ground_truth)

    recall = shared_tokens / len(pred)

    f1 = (precision + recall) / 2

  else: # else this means the answer is [CLS]
    precision, recall, f1 = 0, 0, 0 # just return 0 for all the metrics

  return precision, recall, f1

In [12]:
def eval_loop(model: torch.nn.Module, val_data, device, tokenizer):

  model.eval()

  progress_bar = tqdm(val_data) # track the progress here!

  # initialize metrics
  precision_list = []
  recall_list = []
  f1_list = []

  val_loss = 0.0 # initialize the validation loss

  for batch_dict in progress_bar: # go through all the batches

    batch = {} # avoid possible self-referencing

    # hook the parameters onto the GPU
    batch['input_ids'] = batch_dict['input_ids'].to(device)
    batch['attention_mask'] = batch_dict['attention_mask'].to(device)
    batch['start_pos'] = batch_dict['start_pos'].to(device)
    batch['end_pos'] = batch_dict['end_pos'].to(device)

    test_batch = {
          'input_ids': batch['input_ids'],
          'attention_mask': batch['attention_mask'],
          'start_pos': batch['start_pos'],
          'end_pos': batch['end_pos']
      }

    # forward pass through the model
    outputs = model(test_batch['input_ids'], test_batch['attention_mask'])
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # apply softmax to get the start & end probabilities
    softmax = torch.nn.Softmax(dim=1)
    start_prob = softmax(start_logits)
    end_prob = softmax(end_logits)

    # total loss computation
    loss = total_loss(start_prob, end_prob, test_batch['start_pos'], test_batch['end_pos'])

    val_loss += loss.item() #update the validation loss

    # grab the predicted answer and decode it
    start = start_logits.argmax()
    end = end_logits.argmax()
    pred_tokens = test_batch['input_ids'][start: end + 1]
    pred_answer = tokenizer.decode(pred_tokens, skip_special_tokens=True) # with the chosen tokenizer format

    # grab the actual answer (ground truth) and also decode
    true_start = batch['start_pos'].item()
    true_end = batch['end_pos'].item()
    true_tokens = test_batch['input_ids'][true_start: true_end + 1]
    true_answer = tokenizer.decode(true_tokens, skip_special_tokens=True)

    progress_bar.update(1) # update the progress bar lol


    precision, recall, f1 = calc_metrics(true_answer, pred_answer) # use helper function to find the metrics
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

  # compute avg loss of validation
  avg_loss = val_loss / len(progress_bar)

  # find average scores of the metrics
  avg_precision = np.mean(np.array(precision_list))
  avg_recall = np.mean(np.array(recall_list))
  avg_f1 = np.mean(np.array(f1_list))

  # store these scores in a dictionary
  metrics = {'precision': avg_precision, 'recall': avg_recall, 'f1': avg_f1, 'loss': avg_loss}

  return metrics





In [13]:
from transformers import AutoModelForQuestionAnswering, DistilBertTokenizerFast, get_scheduler
device = "cuda" if torch.cuda.is_available() else "cpu"

def load_model():   # loads the model and tokenizer
  model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased").to(device)
  tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
  return model, tokenizer

def load_data():   # load training and validation datasets
  train = dataset["train"]
  val = dataset["dev"]
  return train, val


def main():
  '''Here's the basic structure of the main block -- feel free to add or
  remove parameters/helper functions as you see fit, but all steps here are
  needed and we expect to see precision, recall, and f1 scores printed out'''
  batch_size = 64

  # load in the model
  # load in the data
  # preprocess + tokenize (use an autotokenizer, for preprocessor check to see if u need anything else besides the tokenizer)
    # for tokenizer – find out if it's uncased or cased squad
  # train
    # need to set up optimizer, lr_scheduler and num of epochs here (all given in the research paper)
  # evaluate

  model, tokenizer = load_model()
  train, validation = load_data()

  train_data_loader = preprocess_and_tokenize(train, tokenizer)
  validation_data_loader = preprocess_and_tokenize(validation, tokenizer)

  # define epoch num, optimizer, and learning rate scheduler
  num_epochs = 2

  # instantiate it on the untrained model parameters with a learning rate of 3 * 10e-5 (taken from the paper)
  optimizer = torch.optim.AdamW(model.parameters(), lr=(3 * 10e-5))

  # now, we set up the learning rate scheduler
  lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=50,
    num_training_steps=len(train_data_loader) * num_epochs
  )

  # train!
  train_losses, val_losses = train_loop(model, optimizer, num_epochs, train_data_loader,
                                        validation_data_loader, lr_scheduler, device, tokenizer)

  # evaluate! (on the validation set)
  metrics = eval_loop(model, validation_data_loader, device, tokenizer)

  print("TRAINING LOSSES: ", train_losses)
  print("VALIDATION LOSSES: ", val_losses)
  print("PRECISION: ", metrics['precision'])
  print("RECALL: ", metrics['recall'])
  print("F1-SCORE: ", metrics['f1'])

if __name__ == "__main__":
  main()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 training:


  0%|          | 0/27866 [00:00<?, ?it/s]

Running validation: 


  0%|          | 0/1743 [00:00<?, ?it/s]

Validation metrics:  {'precision': 0.019641156121925053, 'recall': 0.017575826805968666, 'f1': 0.018608491463946858, 'loss': 12.594640320358232}
Epoch 2 training:


  0%|          | 0/27866 [00:00<?, ?it/s]

Running validation: 


  0%|          | 0/1743 [00:00<?, ?it/s]

Validation metrics:  {'precision': 0.019641156121925053, 'recall': 0.017575826805968666, 'f1': 0.018608491463946858, 'loss': 12.594640320358232}


  0%|          | 0/1743 [00:00<?, ?it/s]

TRAINING LOSSES:  [12.599083473170248, 12.599648174137222]
VALIDATION LOSSES:  [12.594640320358232, 12.594640320358232]
PRECISION:  0.019641156121925053
RECALL:  0.017575826805968666
F1-SCORE:  0.018608491463946858
