In [1]:
#!pip install datasets==2.10.1
!pip install -U datasets
# NOTE: load_dataset sometimes hangs on newer `datasets` releases; if it does, use the pinned install (datasets==2.10.1) above instead of `-U`.
!pip install transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from datasets import load_dataset

import pandas as pd
import torch
import numpy as np
import random
from transformers import pipeline
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

# Fix every source of randomness up front so that reruns give the same numbers.
seed = 123

# cuDNN: request deterministic kernels and switch the benchmark autotuner off
# (see the PyTorch reproducibility notes).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Python's and NumPy's global generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators: CPU first, then the CUDA ones.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


# 1. Load & Inspect Data

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Locations of the training and dev JSON files on Google Drive
# (Drive is mounted at /content/drive in the previous cell).
train_path = "/content/drive/MyDrive/Bert QA Data/all_train.json"

dev_path = "/content/drive/MyDrive/Bert QA Data/all_dev.json"

The data lives in local JSON files, so we load it with Hugging Face's `load_dataset` using the generic `json` builder (see the Hugging Face `datasets` documentation).

In [5]:
# Change train.json / dev.json to the appropriate filepaths =====
# Map each split name to its JSON file; the loaded object is then indexed by
# those same names, e.g. dataset['train'] and dataset['dev'].
data_files = {"train": train_path, "dev": dev_path}
dataset = load_dataset('json', data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

In [6]:
print(dataset['train'][0])

{'name': "Stephanie Edwards (Grey's Anatomy)", 'id': '5495190773098085777', 'questions': [{'input_text': "when does stephanie die in grey's anatomy"}], 'answers': [{'candidate_id': 0, 'input_text': 'short', 'span_end': 324, 'span_start': 296, 'span_text': "`` Ring of Fire '' ( 13.24 )"}], 'has_correct_context': True, 'contexts': "Dr. Stephanie Edwards Grey 's Anatomy character The Season 12 Promotional Photo of Jerrika Hinton as Stephanie Edwards First appearance Going , Going , Gone ( 9.01 ) September 27 , 2012 ( as recurring cast ) `` Seal Our Fate '' ( 10.01 ) September 26 , 2013 ( as series regular ) Last appearance `` Ring of Fire '' ( 13.24 ) May 18 , 2017 Created by Shonda Rhimes Portrayed by Jerrika Hinton Information Full name Stephanie Edwards Nickname ( s ) Grumpy Steph Dr. Lavender Title M.D. Significant other ( s ) Jackson Avery Kyle Diaz ( deceased )"}


### What do we actually need here?
With limited scope, I want to narrow it down to the fields that we actually need to use.

'questions' -> \\
'input_text': str (query) \\
\
'answers' -> \\
'span_end': int, \\
'span_start': int, \\
~'span_text': str (content/ans) \\
\
~'has_correct_context': Bool, \\
'contexts': str (content)

# 2. Connect to GitHub

In [None]:
!git config --global user.name "hpeter11"
!git config --global user.email "hilton_petersen@brown.edu"

In [None]:
!git clone https://github.com/hpeter11/BERT_QA_System.git

fatal: destination path 'BERT_QA_System' already exists and is not an empty directory.


In [None]:
!git pull origin main

From https://github.com/hpeter11/BERT_QA_System
 * branch            main       -> FETCH_HEAD
Auto-merging BERT_QA.ipynb
CONFLICT (content): Merge conflict in BERT_QA.ipynb
Automatic merge failed; fix conflicts and then commit the result.


In [None]:
!mv /content/drive/MyDrive/BERT_QA.ipynb /content/BERT_QA_System/

In [None]:
%cd /content/BERT_QA_System/

/content/BERT_QA_System


In [None]:
from google.colab import userdata
key = userdata.get('git_key')

In [None]:
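# Commit and push the notebook back to GitHub.
# NOTE(review): putting {key} in the remote URL stores the token in .git/config; prefer a credential helper and clear this cell's output before sharing.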
!git remote set-url origin https://{key}@github.com/hpeter11/BERT_QA_System.git
!git add BERT_QA.ipynb
!git commit -m "Periodic update from Colab"
!git push

[main 767a10e] Periodic update from Colab
 1 file changed, 1 insertion(+), 1 deletion(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 2.98 KiB | 2.98 MiB/s, done.
Total 3 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/hpeter11/BERT_QA_System.git
   1539369..767a10e  main -> main


# Preprocessing

In [8]:
class DistilBertQuestionAnsweringWithType(torch.nn.Module):
  """DistilBERT span-extraction model with an extra 2-way head on the first token.

  The pretrained question-answering model supplies the encoder and the
  start/end span head; a new linear layer maps the hidden state at sequence
  position 0 to two logits.
  NOTE(review): what the two "type" classes mean is not defined in this
  notebook -- confirm before training.

  Attributes:
      qa_model: The pretrained ``DistilBertForQuestionAnswering`` model.
      type_classifier: ``Linear(hidden_size, 2)`` head applied to position 0.
  """

  def __init__(self, model_name="distilbert-base-uncased-distilled-squad") -> None:
    """Load the pretrained QA model and create the classification head.

    Args:
        model_name: Hugging Face checkpoint name or local path.
    """
    super().__init__()
    # Load the *model* here (not the tokenizer): forward() needs its encoder
    # and its qa_outputs layer.
    self.qa_model = DistilBertForQuestionAnswering.from_pretrained(model_name)
    hidden_size = self.qa_model.distilbert.config.dim
    self.type_classifier = torch.nn.Linear(hidden_size, 2)

  def forward(self, input_ids, attention_mask):
    """Compute span and type logits for a batch.

    Args:
        input_ids: Token ids passed straight to the DistilBERT encoder.
        attention_mask: Attention mask passed straight to the encoder.

    Returns:
        Tuple ``(start_logits, end_logits, type_logits)``. The first two have
        the trailing size-1 dimension squeezed away; ``type_logits`` has a
        trailing dimension of 2.
    """
    outputs = self.qa_model.distilbert(input_ids=input_ids, attention_mask=attention_mask)
    seq_output = outputs.last_hidden_state

    # Reuse the pretrained span head: one start logit and one end logit per token.
    logits = self.qa_model.qa_outputs(seq_output)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    # Sequence position 0 serves as the summary vector for the type head.
    cls_rep = seq_output[:, 0, :]
    type_logits = self.type_classifier(cls_rep)

    return start_logits, end_logits, type_logits

In [None]:
"""
Some options for BERT model that can be run in colab:

"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",

"""

https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad

In [9]:
def load_model(model_name='distilbert-base-uncased-distilled-squad'):
  """Load a pretrained DistilBERT QA model together with its tokenizer.

  Args:
      model_name: Hugging Face checkpoint name or local path; the same value
          is used for both the model and the tokenizer so their vocabularies
          match.

  Returns:
      Tuple ``(model, tokenizer)``.
  """
  model = DistilBertForQuestionAnswering.from_pretrained(model_name)
  # The tokenizer must come from DistilBertTokenizer, not from a model class.
  tokenizer = DistilBertTokenizer.from_pretrained(model_name)
  return model, tokenizer

https://huggingface.co/docs/datasets/v1.8.0/processing.html

In [10]:
def load_data(split: str):
  """Split one named split of the global ``dataset`` into train/validation parts.

  Prints the first example of the chosen split, then shuffles it and holds
  out 10% of the rows.

  Args:
      split: Key into the module-level ``dataset`` (e.g. 'train' or 'dev').

  Returns:
      Tuple ``(train, validation)`` taken from the 'train' and 'test' parts
      of the shuffled split.
  """
  selected = dataset[split]
  print(selected[0])

  parts = selected.train_test_split(shuffle=True, test_size=0.1)
  return parts['train'], parts['test']

In [None]:
print(load_data('train')[0][0])

{'name': "Stephanie Edwards (Grey's Anatomy)", 'id': '5495190773098085777', 'questions': [{'input_text': "when does stephanie die in grey's anatomy"}], 'answers': [{'candidate_id': 0, 'input_text': 'short', 'span_end': 324, 'span_start': 296, 'span_text': "`` Ring of Fire '' ( 13.24 )"}], 'has_correct_context': True, 'contexts': "Dr. Stephanie Edwards Grey 's Anatomy character The Season 12 Promotional Photo of Jerrika Hinton as Stephanie Edwards First appearance Going , Going , Gone ( 9.01 ) September 27 , 2012 ( as recurring cast ) `` Seal Our Fate '' ( 10.01 ) September 26 , 2013 ( as series regular ) Last appearance `` Ring of Fire '' ( 13.24 ) May 18 , 2017 Created by Shonda Rhimes Portrayed by Jerrika Hinton Information Full name Stephanie Edwards Nickname ( s ) Grumpy Steph Dr. Lavender Title M.D. Significant other ( s ) Jackson Avery Kyle Diaz ( deceased )"}
{'name': 'Hadley cell', 'id': '2548511657805874478', 'questions': [{'input_text': 'what type of uplift of air is associat

In [9]:
class QuestionDataset(torch.utils.data.Dataset):
  """Map-style dataset of tokenized (question, context) pairs with span labels.

  Every raw example is expected to look like the records printed earlier in
  this notebook: ``questions[0]['input_text']`` is the query, ``contexts`` is
  the passage, and ``answers[0]['span_start']`` / ``['span_end']`` are
  character offsets into ``contexts``.

  Args:
      dataset: Iterable of raw example dicts.
      tokenizer: Hugging Face-style tokenizer: callable on a
          (question, context) pair and providing ``tokenize``.
      max_len: Length every encoded pair is padded or truncated to.
  """

  def __init__(self, dataset, tokenizer, max_len=512) -> None:
      self.dataset = dataset
      self.tokenizer = tokenizer
      self.max_length = max_len
      # Tokenize everything once up front so __getitem__ is a plain lookup.
      self.features = self._prepare_features()

  def __len__(self) -> int:
      return len(self.features)

  def __getitem__(self, index : int) -> dict[str, torch.Tensor]:
      return self.features[index]

  def _answer_token_span(self, question, context, answers):
      """Convert the first answer's character span to token indices.

      Returns ``(0, 0)`` when there is no usable answer or when the answer
      falls outside the truncated window.
      """
      if not answers:
          return 0, 0
      char_start = answers[0].get("span_start")
      char_end = answers[0].get("span_end")
      if char_start is None or char_end is None or char_start < 0 or char_end <= char_start:
          return 0, 0

      # Assumes the encoded layout is [CLS] question [SEP] context [SEP], so
      # context tokens start after the question tokens plus two specials.
      context_offset = len(self.tokenizer.tokenize(question)) + 2
      # Counting tokens in the text before the answer gives its first token
      # index. Assumes span boundaries fall on word boundaries -- TODO confirm.
      start = context_offset + len(self.tokenizer.tokenize(context[:char_start]))
      answer_len = len(self.tokenizer.tokenize(context[char_start:char_end]))
      end = start + max(answer_len, 1) - 1

      # The final position is reserved for the closing special token, so an
      # answer reaching it was cut off by truncation.
      if end > self.max_length - 2:
          return 0, 0
      return start, end

  def _prepare_features(self) -> list[dict[str, torch.Tensor]]:
      """Encode every example into fixed-length tensors plus span labels.

      Returns:
          One dict per example with keys ``input_ids``, ``attention_mask``,
          ``start_positions`` and ``end_positions``.
      """
      features = []
      for example in self.dataset:
          question = example["questions"][0]["input_text"]
          context = example["contexts"]
          # Truncate only the context so the question is always kept whole.
          encoded = self.tokenizer(
              question,
              context,
              truncation="only_second",
              max_length=self.max_length,
              padding="max_length",
          )
          start, end = self._answer_token_span(question, context, example.get("answers"))
          features.append({
              "input_ids": torch.tensor(encoded["input_ids"], dtype=torch.long),
              "attention_mask": torch.tensor(encoded["attention_mask"], dtype=torch.long),
              "start_positions": torch.tensor(start, dtype=torch.long),
              "end_positions": torch.tensor(end, dtype=torch.long),
          })
      return features


In [None]:
def preprocess_and_tokenize(data=None):
  """Placeholder: turn a raw data split into a data loader.

  Accepts the split that ``main`` passes in; the original zero-argument
  signature raised ``TypeError`` at that call site.

  Args:
      data: Raw examples to tokenize and batch (currently unused).

  Returns:
      ``None`` until the tokenization and batching logic is implemented.
  """
  # TODO: tokenize `data` and wrap it in a data loader.
  data_loader = None
  return data_loader

In [None]:
def train_loop(train_data_loader=None, validation_data_loader=None):
  """Placeholder: train the model and track losses.

  Accepts the two loaders that ``main`` passes in; the original
  zero-argument signature raised ``TypeError`` at that call site.

  Args:
      train_data_loader: Batches to train on (currently unused).
      validation_data_loader: Batches to validate on (currently unused).

  Returns:
      Tuple ``(train_losses, val_losses)``; both are ``None`` until the
      training logic is implemented.
  """
  # TODO: implement the optimisation loop and collect per-epoch losses.
  train_losses = None
  val_losses = None
  return train_losses, val_losses

In [None]:
def eval_loop(data_loader=None):
  """Placeholder: evaluate the model and report metrics.

  Accepts the loader that ``main`` passes in; the original zero-argument
  signature raised ``TypeError`` at that call site.

  Args:
      data_loader: Batches to evaluate on (currently unused).

  Returns:
      Tuple ``(precision, recall, f1_score)``; all are ``None`` until the
      evaluation logic is implemented.
  """
  # TODO: run the model over `data_loader` and compute the metrics.
  precision = None
  recall = None
  f1_score = None
  return precision, recall, f1_score

In [None]:
def main():
  '''Here's the basic structure of the main block -- feel free to add or
  remove parameters/helper functions as you see fit, but all steps here are
  needed and we expect to see precision, recall, and f1 scores printed out'''
  # NOTE(review): device and batch_size are not consumed by any step yet.
  device = "cuda" if torch.cuda.is_available() else "cpu"
  batch_size = 64

  train_split = 'train'
  dev_split = 'dev'  # not used below; presumably reserved for a final evaluation

  model, tokenizer = load_model()
  # Fit on the training file; load_data holds out part of it for validation.
  # Previously this passed dev_split, leaving train_split unused.
  train, validation = load_data(split=train_split)

  train_data_loader = preprocess_and_tokenize(train)
  validation_data_loader = preprocess_and_tokenize(validation)

  train_losses, val_losses = train_loop(train_data_loader, validation_data_loader)
  precision, recall, f1_score  = eval_loop(validation_data_loader)

  print("PRECISION: ", precision)
  print("RECALL: ", recall)
  print("F1-SCORE: ", f1_score)

if __name__ == "__main__":
  main()