In [1]:
#!pip install datasets==2.10.1
!pip install -U datasets
#load_dataset sometimes hangs on a higher version
!pip install transformers



In [6]:
from datasets import load_dataset

import pandas as pd
import torch
import numpy as np
import random

# Reproducibility: give every random number generator used in this notebook
# the same seed, then configure cuDNN.
seed = 123

# Python's and NumPy's global generators.
random.seed(seed)
np.random.seed(seed)

# PyTorch generators (CPU and CUDA).
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# cuDNN flags: request deterministic behaviour and disable benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# 1. Load & Inspect Data

In [3]:
# Mount Google Drive at /content/drive; the dataset paths used below live
# under this mount point.
# force_remount=True: presumably forces a fresh mount when the cell is re-run
# — TODO confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# Folder on the mounted Drive that holds the QA dataset. It is defined once so
# the location only has to be changed in one place.
data_dir = "/content/drive/MyDrive/Bert QA Data"

# JSON files for the training and development splits.
train_path = data_dir + "/all_train.json"
dev_path = data_dir + "/all_dev.json"

Both files are JSON, so we load them with Hugging Face's `load_dataset`, using its generic `json` loader as described in the `datasets` documentation.

In [12]:
# Map each split name to its JSON file (edit train_path / dev_path to point at
# the appropriate files), then load both splits in a single call.
data_files = dict(train=train_path, dev=dev_path)
dataset = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating dev split: 0 examples [00:00, ? examples/s]

In [15]:
# Look at the first training record to see which fields each example carries.
first_train_example = dataset["train"][0]
print(first_train_example)

{'name': "Stephanie Edwards (Grey's Anatomy)", 'id': '5495190773098085777', 'questions': [{'input_text': "when does stephanie die in grey's anatomy"}], 'answers': [{'candidate_id': 0, 'input_text': 'short', 'span_end': 324, 'span_start': 296, 'span_text': "`` Ring of Fire '' ( 13.24 )"}], 'has_correct_context': True, 'contexts': "Dr. Stephanie Edwards Grey 's Anatomy character The Season 12 Promotional Photo of Jerrika Hinton as Stephanie Edwards First appearance Going , Going , Gone ( 9.01 ) September 27 , 2012 ( as recurring cast ) `` Seal Our Fate '' ( 10.01 ) September 26 , 2013 ( as series regular ) Last appearance `` Ring of Fire '' ( 13.24 ) May 18 , 2017 Created by Shonda Rhimes Portrayed by Jerrika Hinton Information Full name Stephanie Edwards Nickname ( s ) Grumpy Steph Dr. Lavender Title M.D. Significant other ( s ) Jackson Avery Kyle Diaz ( deceased )"}


### What do we actually need here?
To keep the scope manageable, I want to narrow the data down to the fields that we actually need:

'questions' -> \\
'input_text': str (query) \\
\
'answers' -> \\
'span_end': int, \\
'span_start': int, \\
~'span_text': str (content/ans) \\
\
~'has_correct_context': Bool, \\
'contexts': str (content)

# 2. Connect to GitHub

In [16]:
# Set the global git user name and email that commits made from this runtime
# will carry.
!git config --global user.name "hpeter11"
!git config --global user.email "hilton_petersen@brown.edu"

In [17]:
!git clone https://github.com/hpeter11/BERT_QA_System.git

fatal: destination path 'BERT_QA_System' already exists and is not an empty directory.


In [52]:
!git pull origin main

From https://github.com/hpeter11/BERT_QA_System
 * branch            main       -> FETCH_HEAD
Auto-merging BERT_QA.ipynb
CONFLICT (content): Merge conflict in BERT_QA.ipynb
Automatic merge failed; fix conflicts and then commit the result.


In [21]:
# Move the notebook from Drive into the cloned repository so it can be staged
# and committed from there.
# NOTE(review): mv removes the Drive copy, so running this cell a second time
# fails because the source file is gone — confirm whether a copy was intended.
!mv /content/drive/MyDrive/BERT_QA.ipynb /content/BERT_QA_System/

mv: cannot stat '/content/drive/MyDrive/Colab_Notebooks/BERT_QA.ipynb': No such file or directory


In [58]:
# Change the working directory to the repository so the git commands in the
# following cells operate on it.
%cd /content/BERT_QA_System/

/content/BERT_QA_System


In [None]:
# Read the credential stored in Colab's user data under the name 'git_key'
# rather than writing it into the notebook; it is used below to authenticate
# against github.com.
from google.colab import userdata
key = userdata.get('git_key')

In [59]:
# Publish the notebook: point 'origin' at a URL that embeds the credential,
# stage BERT_QA.ipynb, commit it, and push.
# NOTE(review): set-url records the credential-bearing URL as the remote's
# URL in the repository configuration — confirm that keeping the token there
# is acceptable for this runtime.
# NOTE(review): the four commands run unconditionally, so the push is
# attempted even if an earlier step fails — confirm this is intended.
!git remote set-url origin https://{key}@github.com/hpeter11/BERT_QA_System.git
!git add BERT_QA.ipynb
!git commit -m "Periodic update from Colab"
!git push

[main cf69ef4] Periodic update from Colab
Enumerating objects: 10, done.
Counting objects: 100% (10/10), done.
Delta compression using up to 2 threads
Compressing objects: 100% (6/6), done.
Writing objects: 100% (6/6), 7.05 KiB | 3.53 MiB/s, done.
Total 6 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/hpeter11/BERT_QA_System.git
   da6a4d6..cf69ef4  main -> main


# Preprocessing

In [None]:
# Reference note listing candidate model names. The string below is not
# assigned to a variable or passed to any call in this cell.
"""
Some options for BERT model that can be run in colab:

"distilbert-base-uncased",
"distilbert-base-uncased-distilled-squad",
"distilbert-base-cased",
"distilbert-base-cased-distilled-squad",

"""

In [None]:
def main():
  """Run the whole pipeline and print the evaluation scores.

  Steps, in order: load the model and tokenizer, load the train and validation
  data, turn each split into a data loader, train, evaluate, and print
  precision, recall and F1. Parameters and helper functions may be added or
  removed as needed, but every step is required and the three scores are
  expected to be printed.
  """
  # NOTE(review): device, batch_size, model and tokenizer are assigned here but
  # are not passed to any of the helpers below — confirm how the helpers are
  # meant to receive them.
  batch_size = 64
  device = "cuda" if torch.cuda.is_available() else "cpu"

  model, tokenizer = load_model()
  train, validation = load_data()

  # One data loader per split.
  train_loader = preprocess_and_tokenize(train)
  val_loader = preprocess_and_tokenize(validation)

  # Train first, then score the validation split.
  train_losses, val_losses = train_loop(train_loader, val_loader)
  precision, recall, f1_score = eval_loop(val_loader)

  report = (
      ("PRECISION: ", precision),
      ("RECALL: ", recall),
      ("F1-SCORE: ", f1_score),
  )
  for label, score in report:
    print(label, score)


if __name__ == "__main__":
  main()