# Library Installation

In [None]:
# Colab setup: install the Hugging Face libraries that the rest of the notebook imports.
# NOTE(review): `transformers` is installed without a version pin, so a re-run may pick up a
# different release than the one that produced the saved outputs — consider pinning it.
!pip install datasets==2.10.1 # load_dataset sometimes hangs on a higher version
!pip install transformers

Collecting datasets==2.10.1
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/469.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m256.0/469.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.10.1)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets==2.10.1)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from datasets==2.10.1)
  Downloading responses-0.1

# Library Imports / Environment Setting

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import numpy as np
import random
from transformers import PreTrainedTokenizer
from typing import Dict, List, Optional, Tuple, Union, Any
from torch import nn
from tqdm.auto import tqdm
from transformers import get_scheduler
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_outputs import QuestionAnsweringModelOutput
from transformers import DistilBertPreTrainedModel
from transformers import DistilBertModel

In [None]:
# Seed every random number generator used in this notebook (PyTorch CPU/CUDA, NumPy and
# Python's `random`) with the same value so that results can be reproduced.
seed = 123
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

np.random.seed(seed)
random.seed(seed)
# Disable cuDNN benchmark mode and request deterministic cuDNN algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
# Colab-only: mount Google Drive under /content/drive; the dataset cell below reads from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing & Tokenization

In [None]:
# Point FILEPATH at the folder that contains all_train.json and all_dev.json.
# NOTE(review): the path is relative, so it presumably relies on the working directory being
# /content (where Drive was mounted above) — confirm when running outside Colab.
FILEPATH = "drive/My Drive/CSCI_1460_Computational_Linguistics/Final_Project"
# Map split names to their JSON files; the splits are later read as dataset["train"] / dataset["dev"].
data_files = {"train": f"{FILEPATH}/all_train.json", "dev": f"{FILEPATH}/all_dev.json"}
dataset = load_dataset('json', data_files=data_files)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-8ada74f10e7ab365/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-8ada74f10e7ab365/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

## Convert character-based indices to token-based indices

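This function takes the token ids of the context and of the target answer, and returns the `(start, end)` token positions of the first place where the target occurs in the context. The end index is exclusive, and `(0, 1)` is returned when the target is not found.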

In [None]:
def find_target_indices(context, target):
  """
  Locates the first occurrence of a token-id sequence inside another token-id sequence.

  Parameters
  ----------
  context : torch.Tensor
    1-D tensor of token ids to search in.
  target : torch.Tensor
    1-D tensor of token ids to search for.

  Returns
  -------
  start_index : int
    Position of the first token of the match.
  end_index : int
    Position one past the last token of the match (exclusive).

  (0, 1) is returned as the "no answer" span when the target is empty, is longer than the
  context, or does not occur in the context.
  """
  target_len = len(target)

  # An empty target would trivially "match" at position 0 and produce the empty span (0, 0);
  # report the no-answer span instead so callers always get a non-empty, valid span.
  if target_len == 0:
    return 0, 1

  # Slide a window of the target's length over the context; the range is empty when the
  # target is longer than the context.
  for start_index in range(len(context) - target_len + 1):
    end_index = start_index + target_len # exclusive
    if context[start_index:end_index].equal(target):
      return start_index, end_index

  # return 0, 1 if no-answer
  return 0, 1

In [None]:
# Toy check of find_target_indices: the target [7, 2, 9] sits at positions 1..3 of the context.
ex_context = torch.tensor([3, 7, 2, 9, 5, 1])
ex_target = torch.tensor([7, 2, 9])

start_idx, end_idx = find_target_indices(ex_context, ex_target)

print(f"target starts at index {start_idx}")
print(f"target ends at index {end_idx}")
# The end index is exclusive, so this slice reproduces the target exactly.
print(ex_context[start_idx:end_idx])

target starts at index 1
target ends at index 4
tensor([7, 2, 9])


In [None]:
# Pick one training example to inspect; `idx` is reused by the tokenizer check in the next cell.
idx = 25
dataset["train"][idx]

{'name': 'Foreign direct investment',
 'id': '-1906097888956532661',
 'questions': [{'input_text': 'what do you mean by fdi in india'}],
 'answers': [{'candidate_id': 0,
   'input_text': 'short',
   'span_end': 35,
   'span_start': 2,
   'span_text': 'foreign direct investment ( FDI )'}],
 'has_correct_context': True,
 'contexts': 'A foreign direct investment ( FDI ) is an investment in the form of a controlling ownership in a business in one country by an entity based in another country . It is thus distinguished from a foreign portfolio investment by a notion of direct control .'}

In [None]:
# Tokenizer used for this sanity check.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

# Character offsets of the first answer of the example selected above.
char_start = dataset["train"][idx]['answers'][0]['span_start']
char_end = dataset["train"][idx]['answers'][0]['span_end']

print("ans_text (same as 'span_text'): ",dataset["train"][idx]['contexts'][char_start:char_end])

# tokenize the target and context (no [CLS]/[SEP]; each result has shape (1, n_tokens))
target = tokenizer.encode(dataset["train"][idx]['contexts'][char_start:char_end], add_special_tokens=False, return_tensors="pt")
context = tokenizer.encode(dataset["train"][idx]['contexts'], add_special_tokens=False, return_tensors="pt")

print("")
print("Answer token indices from char-based index:", target[0])
# Locate the answer tokens inside the tokenized context; both printed tensors should match.
start, end = find_target_indices(context[0], target[0])
print("Answer token indices found by the function:", context[0][start:end]) # tokenize context and plug in indices

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

ans_text (same as 'span_text'):  foreign direct investment ( FDI )

Answer token indices from char-based index: tensor([ 2880,  2904,  5151,   113,   143, 17243,   114])
Answer token indices found by the function: tensor([ 2880,  2904,  5151,   113,   143, 17243,   114])


## Define our dataset

In [None]:
class QAContextDataset(torch.utils.data.Dataset):
  """
  A PyTorch Dataset for questions and contexts that can be iterated through using __getitem__
  """

  # maps the answer-type string stored in the data to the integer class label
  ANSWER_TYPE_TO_LABEL = {'no_answer': 0, 'short': 1}

  def __init__(self, data: Dict[str, Any], tokenizer: PreTrainedTokenizer, max_len = 512) -> None:
    """
    Initializes the QAContextDataset from a collection of question/context records and a tokenizer.

    Parameters
    ----------
    data : Dict[str, Any]
      An indexable collection of records; each record must provide 'questions', 'contexts' and
      'answers' in the layout read by __getitem__.
    tokenizer : PreTrainedTokenizer
      Any PreTrainedTokenizer from HuggingFace can be used to encode the string inputs for a model.
    max_len : int
      The maximum length of the tokenized question-context pair.

    Returns
    ----------
    None
    """
    self.data = data
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self) -> int:
    """
    Returns the number of question-context pairs in the dataset.
    """
    return len(self.data)

  def __getitem__(self, index: int) -> Dict[str, Any]:
    """
    Tokenizes the question-context pair at `index` and builds its training labels.

    Returns
    -------
    A dictionary with:
      'input_ids'        : token ids of the pair, padded/truncated to max_len
      'attention_mask'   : attention mask matching 'input_ids'
      'token_span_start' : first token position of the answer span
      'token_span_end'   : token position one past the answer span (exclusive)
      'answer_type'      : 0 for 'no_answer', 1 for 'short'

    Raises
    ------
    KeyError
      If the record's answer type is neither 'no_answer' nor 'short'.
    """
    # fetch the record once instead of indexing the underlying data for every field
    example = self.data[index]
    question = example['questions'][0]['input_text']
    context = example['contexts']
    answer = example['answers'][0]
    char_span_start = answer['span_start']
    char_span_end = answer['span_end']
    answer_type = self.ANSWER_TYPE_TO_LABEL[answer['input_text']]

    # tokenize question and context as one pair (tokenizer __call__ replaces the legacy encode_plus)
    encoded_data = self.tokenizer(
      question,
      context,
      return_token_type_ids=False,
      return_attention_mask=True,
      max_length=self.max_len,
      return_tensors="pt",
      padding="max_length",
      truncation=True
    )
    input_ids = encoded_data['input_ids'][0]

    # change token span for no-answer instances to 0, 1 (i.e., [CLS])
    if answer_type == 0:
      token_span_start = 0
      token_span_end = 1
    # change char span to token span for short-answer instances
    else:
      target = self.tokenizer.encode(context[char_span_start:char_span_end], add_special_tokens=False, return_tensors="pt")
      # NOTE: the search runs over the whole encoded pair and takes the first match; if the
      # answer tokens are not found (e.g. they were truncated away), the span silently falls
      # back to 0, 1 while answer_type stays 1.
      token_span_start, token_span_end = find_target_indices(input_ids, target[0])

    return {
      'input_ids': input_ids,
      'attention_mask': encoded_data['attention_mask'][0],
      'token_span_start': torch.tensor(token_span_start, dtype=torch.long),
      'token_span_end': torch.tensor(token_span_end, dtype=torch.long),
      'answer_type': torch.tensor(answer_type, dtype=torch.long)
    }

In [None]:
def preprocess_and_tokenize(dataset : Dict[str, Any], tokenizer : PreTrainedTokenizer, batch_size : int = 64, max_len=512, shuffle : bool = True) -> DataLoader:
  """
  Wraps the raw records in a QAContextDataset and returns a DataLoader over it.

  Tokenization happens lazily, item by item, when the DataLoader is iterated.

  Parameters
  ----------
  dataset : Dict[str, Any]
    An indexable collection of records with questions, contexts and answers.
  tokenizer : PreTrainedTokenizer
    Any PreTrainedTokenizer from HuggingFace can be used to encode the string inputs for a model.
  batch_size : int
    The batch size for training and evaluation.
  max_len : int
    The maximum length of the tokenized question-context pair.
  shuffle : bool
    Whether the DataLoader reshuffles the examples on every pass. Defaults to True (the
    previous fixed behaviour); pass False for evaluation data to get a stable order.

  Returns
  -------
  dataloader : DataLoader
    A DataLoader object that can be used for training and evaluation.
  """

  # initialize the QAContextDataset object (kept under its own name so the argument is not shadowed)
  qa_dataset = QAContextDataset(dataset, tokenizer, max_len)

  # initialize the DataLoader object
  dataloader = DataLoader(qa_dataset, batch_size=batch_size, shuffle=shuffle)

  return dataloader

# Model Configuration

## Modify model output

In [None]:
class MyQuestionAnsweringModelOutput(QuestionAnsweringModelOutput):
  """
  QuestionAnsweringModelOutput extended with the logits of the answer-type classifier.

  The standard fields (loss, start_logits, end_logits, hidden_states, attentions) are passed
  to the parent constructor unchanged; `answer_type_logits` is stored on the instance afterwards.

  NOTE(review): `answer_type_logits` is set as a plain attribute after the parent is
  initialised; whether it also appears in the parent's dict/tuple-style views depends on
  ModelOutput internals — confirm before relying on anything other than attribute access.
  """
  def __init__(self, loss = None, start_logits = None, end_logits = None, answer_type_logits = None, hidden_states = None, attentions = None, **kwargs):
    # collect the fields the parent class already knows about
    parent_fields = {
      "loss": loss,
      "start_logits": start_logits,
      "end_logits": end_logits,
      "hidden_states": hidden_states,
      "attentions": attentions,
    }
    super().__init__(**parent_fields, **kwargs)

    # include ans type
    self.answer_type_logits = answer_type_logits

## Modify model specs

In [None]:
class MyDistilBertForQuestionAnswering(DistilBertPreTrainedModel):
  """
  DistilBERT with two heads on top of the encoder output:

  * `qa_outputs`       - per-token start/end logits for the answer span
  * `ans_type_outputs` - answer-type logits computed from the first token's hidden state
  """
  def __init__(self, config: PretrainedConfig):
    super().__init__(config)

    # both heads are sized by config.num_labels, so validate it before building them
    if config.num_labels != 2:
        raise ValueError(f"config.num_labels should be 2, but it is {config.num_labels}")

    self.distilbert = DistilBertModel(config)
    self.qa_outputs = nn.Linear(config.dim, config.num_labels)
    self.ans_type_outputs = nn.Linear(config.dim, config.num_labels)

    self.dropout = nn.Dropout(config.qa_dropout)

    # Initialize weights and apply final processing
    self.post_init()

  def forward(
    self,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    token_span_start: Optional[torch.Tensor] = None,
    token_span_end: Optional[torch.Tensor] = None,
    answer_type: Optional[torch.Tensor] = None,
  ) -> Union[QuestionAnsweringModelOutput, Tuple[torch.Tensor, ...]]:
    r"""
    token_span_start (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the start of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    token_span_end (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Labels for position (index) of the end of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
        are not taken into account for computing the loss.
    answer_type (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
        Class labels for the answer-type head.

    Returns a `MyQuestionAnsweringModelOutput`. Its `loss` is the sum of the start, end and
    answer-type cross-entropy losses, and is `None` unless all three label tensors are given.
    """

    distilbert_output = self.distilbert(
      input_ids=input_ids,
      attention_mask=attention_mask,
    )

    hidden_states = distilbert_output[0]  # (bs, max_query_len, dim)
    hidden_states = self.dropout(hidden_states)  # (bs, max_query_len, dim)

    logits = self.qa_outputs(hidden_states)  # (bs, max_query_len, 2)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
    end_logits = end_logits.squeeze(-1).contiguous()  # (bs, max_query_len)
    # the answer type is predicted from the first token of each sequence
    ans_type_logits = self.ans_type_outputs(hidden_states[:, 0, :])  # (bs, 2)

    total_loss = None
    if token_span_start is not None and token_span_end is not None and answer_type is not None:
      # accept labels shaped (bs, 1) as well as (bs,)
      if token_span_start.dim() > 1:
        token_span_start = token_span_start.squeeze(-1)
      if token_span_end.dim() > 1:
        token_span_end = token_span_end.squeeze(-1)
      if answer_type.dim() > 1:
        answer_type = answer_type.squeeze(-1)

      # sometimes the start/end positions are outside our model inputs, we ignore these terms
      ignored_index = start_logits.size(1)
      token_span_start = token_span_start.clamp(0, ignored_index)
      token_span_end = token_span_end.clamp(0, ignored_index)

      # the ignore index only makes sense for token positions, so the answer-type loss
      # uses a plain cross-entropy
      span_loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
      ans_type_loss_fct = nn.CrossEntropyLoss()

      start_loss = span_loss_fct(start_logits, token_span_start)
      end_loss = span_loss_fct(end_logits, token_span_end)
      ans_type_loss = ans_type_loss_fct(ans_type_logits, answer_type)

      total_loss = start_loss + end_loss + ans_type_loss

    return MyQuestionAnsweringModelOutput(
      loss=total_loss,
      start_logits=start_logits,
      end_logits=end_logits,
      answer_type_logits=ans_type_logits,
      hidden_states=distilbert_output.hidden_states,
      attentions=distilbert_output.attentions,
    )

# Load the Data

In [None]:
# Load the data
def load_data():
  """
  Returns the (train, validation) splits of the module-level `dataset`.

  The validation split is the one stored under the "dev" key.
  """
  return dataset["train"], dataset["dev"]

# Load the Model / Tokenizer

In [None]:
# Some options for BERT model that can be run in colab:
# "distilbert-base-uncased",
# "distilbert-base-uncased-distilled-squad",
# "distilbert-base-cased",
# "distilbert-base-cased-distilled-squad"

# Load the model and tokenizer
def load_model(checkpoint: str = "distilbert-base-cased"):
  """
  Loads the question-answering model and its matching tokenizer from one checkpoint.

  Parameters
  ----------
  checkpoint : str
    Name or path of the pretrained checkpoint. Defaults to "distilbert-base-cased"; using a
    single argument keeps the tokenizer and the model weights in sync.

  Returns
  -------
  model : MyDistilBertForQuestionAnswering
    The model initialised from the checkpoint.
  tokenizer : PreTrainedTokenizer
    The tokenizer that belongs to the same checkpoint.
  """
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
  model = MyDistilBertForQuestionAnswering.from_pretrained(checkpoint)
  return model, tokenizer

# Define Metrics

In [None]:
def compute_metrics_torch(predicted_indices : List, true_indices : List) -> Tuple[float, float, float]:
  """
  Computes set-based precision, recall and F1 between two collections of integer ids.

  Duplicates are ignored: each distinct id counts once.

  Parameters
  ----------
  predicted_indices : List
    Predicted ids; a list of ints or a tensor of ints (on any device).
  true_indices : List
    Reference ids; a list of ints or a tensor of ints (on any device).

  Returns
  -------
  precision : float
    Share of predicted ids that are also reference ids (0 when nothing was predicted).
  recall : float
    Share of reference ids that were predicted (0 when there are no reference ids).
  f1 : float
    Harmonic mean of precision and recall (0 when both are 0).

  When both inputs are empty, all three values are 1.0.
  """
  def _unique_ids(ids):
    # Converts a list/tensor of ids into a set of plain Python ints.
    if isinstance(ids, torch.Tensor):
      ids = ids.reshape(-1).tolist()
    return {int(i) for i in ids}

  # Plain sets avoid allocating dense masks sized by the largest id and do not care which
  # device the ids live on.
  predicted_set = _unique_ids(predicted_indices)
  true_set = _unique_ids(true_indices)

  # handle the case when both predicted and true indices are empty
  if not predicted_set and not true_set:
    return 1.0, 1.0, 1.0  # perfect precision, recall, and F1 for empty sets

  # calculate true positives, false positives, and false negatives
  true_positives = len(predicted_set & true_set)
  false_positives = len(predicted_set - true_set)
  false_negatives = len(true_set - predicted_set)

  # calculate precision and recall
  precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0 else 0.0
  recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0 else 0.0

  # calculate f1 score
  f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0.0

  return precision, recall, f1

# Example: 8 predicted ids of which 5 are correct, and all 5 reference ids are recovered,
# so precision is 5/8 and recall is 5/5.
predicted = [1, 2, 3, 4, 5, 6, 7, 8]
true = [1, 2, 3, 4, 5]

precision, recall, f1 = compute_metrics_torch(predicted, true)
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1: {f1:.2f}')

Precision: 0.62
Recall: 1.00
F1: 0.77


# Training Loop

In [None]:
def train_loop(model: torch.nn.Module, num_epochs: int, train_dataloader: DataLoader,
               validation_dataloader: DataLoader, device: torch.device,
               learning_rate: float = 1e-4, num_warmup_steps: int = 50) -> Tuple[List[float], List[float]]:
  """
  Trains a model by performing a forward pass and backpropagating on batches to optimize loss.

  After every epoch the model is also run over the validation set (without gradient
  tracking) to record the validation loss.

  Parameters:
  -----------
  model : torch.nn.Module
    The model to be trained. Its forward pass must accept the batch keys as keyword
    arguments and return an object with a `loss` attribute.
  num_epochs : int
    Number of epochs to train for.
  train_dataloader : DataLoader
    DataLoader containing training examples.
  validation_dataloader : DataLoader
    DataLoader containing validation examples.
  device : torch.device
    The device that the training will be performed on.
  learning_rate : float
    Initial learning rate for AdamW. Defaults to 1e-4.
  num_warmup_steps : int
    Number of warm-up steps of the linear learning-rate schedule. Defaults to 50.

  Returns:
  --------
  train_losses : List
    Average training loss per batch, one entry per epoch.
  val_losses : List
    Average validation loss per batch, one entry per epoch.
  """

  # add model to device
  model.to(device)

  # define optimizer, training_steps, lr_scheduler
  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
  num_training_steps = num_epochs * len(train_dataloader)

  lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
  )

  # record training loss and val loss
  train_losses = []
  val_losses = []

  for epoch in range(num_epochs):
    # put the model in training mode
    model.train()

    print(f"Epoch {epoch + 1} training:")

    # total training loss for this epoch
    total_train_loss = 0

    # iterate through batches in training set; wrapping the loader lets tqdm advance and
    # close the progress bar on its own
    for batch in tqdm(train_dataloader):
      # move batch to device
      batch = {k: v.to(device) for k, v in batch.items()}
      # zero gradients
      optimizer.zero_grad()
      # forward pass
      outputs = model(**batch)
      # compute loss
      loss = outputs.loss
      # backpropagate
      loss.backward()
      # update parameters
      optimizer.step()
      # update learning rate
      lr_scheduler.step()
      # add loss to total loss for this epoch
      total_train_loss += loss.item()

    # calculate avg training loss for this epoch and add it to the list
    avg_train_loss = total_train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # validation
    model.eval()

    # total val loss for this epoch
    total_val_loss = 0

    print(f"Epoch {epoch + 1} validation:")
    with torch.no_grad():
      # iterate through batches in validation set
      for batch in tqdm(validation_dataloader):
        # move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        # forward pass
        outputs = model(**batch)
        # add loss to total loss for this epoch
        total_val_loss += outputs.loss.item()

    # calculate avg val loss for this epoch and add it to the list
    avg_val_loss = total_val_loss / len(validation_dataloader)
    val_losses.append(avg_val_loss)

    # switch back to training mode
    model.train()

  # return train_loss and val_loss for each epoch
  return train_losses, val_losses

# Eval Loop

In [None]:
def eval_loop(model: torch.nn.Module, validation_dataloader: torch.utils.data.DataLoader, device: torch.device) -> Tuple[float, float, float]:
  """
  Evaluate a PyTorch Model

  For every example, the predicted span (argmax of the start and end logits) and the
  labelled span are cut out of the input ids and compared with compute_metrics_torch.

  Parameters:
  -----------
  model : torch.nn.Module
    The model to be evaluated.
  validation_dataloader : torch.utils.data.DataLoader
    DataLoader containing validation examples.
  device : torch.device
    The device that the evaluation will be performed on.

  Returns:
  --------
  avg_precision : float
    Average precision across examples.
  avg_recall : float
    Average recall across examples.
  avg_f1 : float
    Average f1 score across examples.

  All three values are 0.0 when the dataloader yields no examples.
  """
  model.eval()

  print("Final Evalutation:")
  precisions = []
  recalls = []
  f1s = []

  # inference only: no gradients are needed, so do not build the autograd graph
  with torch.no_grad():
    # for each batch in validation_dataloader: forward pass, generate predictions, and update metrics
    for batch in tqdm(validation_dataloader):

      # move batch to device
      batch = {k: v.to(device) for k, v in batch.items()}

      # forward pass
      outputs = model(**batch)

      # get predictions from outputs (as plain ints so they can be used as slice bounds)
      predicted_start_index = outputs.start_logits.argmax(dim=-1).tolist()
      predicted_end_index = outputs.end_logits.argmax(dim=-1).tolist()

      # get target span boundaries
      target_start_index = batch['token_span_start'].tolist()
      target_end_index = batch['token_span_end'].tolist()

      # bring the token ids back to the CPU once per batch so the metric computation
      # never mixes devices
      input_ids = batch['input_ids'].cpu()

      # calculate metrics for each example
      for ids, pred_s, pred_e, true_s, true_e in zip(input_ids, predicted_start_index, predicted_end_index,
                                                     target_start_index, target_end_index):
        # an end index that is not after the start index yields an empty span
        pred = ids[pred_s:pred_e]
        target = ids[true_s:true_e]
        precision, recall, f1 = compute_metrics_torch(pred, target)
        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

  # nothing was evaluated: avoid dividing by zero
  if not f1s:
    return 0.0, 0.0, 0.0

  # calculate average metrics
  avg_precision = sum(precisions) / len(precisions)
  avg_recall = sum(recalls) / len(recalls)
  avg_f1 = sum(f1s) / len(f1s)

  # return metrics
  return avg_precision, avg_recall, avg_f1

# Main Function

In [None]:
def main():
  '''Runs the whole pipeline: load the model and data, build the data loaders,
  train, evaluate on the validation split, and print precision, recall and f1.'''
  # prefer the GPU when one is available
  if torch.cuda.is_available():
    device = "cuda"
  else:
    device = "cpu"

  # run settings
  batch_size, num_epochs = 16, 1

  model, tokenizer = load_model()
  train, validation = load_data()

  # one loader per split
  train_data_loader = preprocess_and_tokenize(train, tokenizer, batch_size)
  validation_data_loader = preprocess_and_tokenize(validation, tokenizer, batch_size)

  # fine-tune (the per-epoch losses are returned but not used further here), then score the model
  train_losses, val_losses = train_loop(model, num_epochs, train_data_loader, validation_data_loader, device)
  precision, recall, f1_score = eval_loop(model, validation_data_loader, device)

  # report the final metrics
  print("")
  print("METRICS:")
  print("------------------------------")
  for label, value in (("PRECISION: ", precision), ("RECALL: ", recall), ("F1-SCORE: ", f1_score)):
    print(label, value)

# run the pipeline only when this file is executed as the main module
if __name__ == "__main__":
  main()

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of MyDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'ans_type_outputs.bias', 'ans_type_outputs.weight', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 training:


  0%|          | 0/1742 [00:00<?, ?it/s]

Epoch 1 validation:


  0%|          | 0/109 [00:00<?, ?it/s]

Final Evalutation:


  0%|          | 0/109 [00:00<?, ?it/s]


METRICS:
------------------------------
PRECISION:  0.6626192045707358
RECALL:  0.6935908522962636
F1-SCORE:  0.6558165688619763
