### Install transformers and torch

In [1]:
#!pip install git+https://github.com/huggingface/transformers@main

In [2]:
#!pip install torch

### Set up Cloud Storage as place to store checkpoints and avoid disk space issues
### Note: steps must be run from terminal, not Jupyter Notebook 

In [3]:
#https://cloud.google.com/blog/topics/developers-practitioners/cloud-storage-file-system-vertex-ai-workbench-notebooks

In [4]:
#!gcsfuse --implicit-dirs --rename-dir-limit=100 --disable-http2 --max-conns-per-host=100 $MY_BUCKET "/home/jupyter/gcs/"

### Collect JSON file of QA pairs generated using Haystack and formatted to match Squad 

In [2]:
!gsutil cp gs://haystack_question_test/2022_11_12/jupyter/haystack/json_qa_pairs.json .

Copying gs://haystack_question_test/2022_11_12/jupyter/haystack/json_qa_pairs.json...
- [1 files][  9.2 MiB/  9.2 MiB]                                                
Operation completed over 1 objects/9.2 MiB.                                      


In [3]:
import json

In [5]:
# Parse the downloaded QA-pair file into a plain dict so its layout can be
# inspected in the following cells.
with open('json_qa_pairs.json', encoding="utf-8") as f:
    cdcdata = json.load(f)

In [6]:
cdcdata['id']['0']

'100000000000000000000001'

In [7]:
len(cdcdata['id'])

10156

### Using SQuAD data loader (modified for our data format)

In [11]:
    def generate_examples(filepath):
        """Yield ``(key, example)`` pairs read from a column-oriented QA JSON file.

        The file is expected to hold one mapping per column (``id``, ``title``,
        ``context``, ``question``, ``answer_text``, ``answer_start``), each keyed
        by the row number as a string (``"0"``, ``"1"``, ...).

        Args:
            filepath: Path to the UTF-8 encoded JSON file.

        Yields:
            Tuples ``(key, example)`` where ``key`` is the integer row number and
            ``example`` is a dict with ``title``, ``context``, ``question``, ``id``
            and ``answers``. ``answers`` holds the row's single ``answer_start``
            and ``text`` values exactly as stored in the file.

        Raises:
            KeyError: If a column or a row key is missing from the file.
        """
        # Parse the whole file up front so the handle is closed before the first
        # yield; a generator that is only partially consumed would otherwise keep
        # the file open until it is garbage collected.
        with open(filepath, encoding="utf-8") as f:
            cdcdata = json.load(f)

        for key in range(len(cdcdata['id'])):
            curr = str(key)  # row keys are stringified integers in the JSON
            yield key, {
                "title": cdcdata['title'][curr],
                "context": cdcdata['context'][curr],
                "question": cdcdata['question'][curr],
                "id": cdcdata['id'][curr],
                "answers": {
                    "answer_start": cdcdata['answer_start'][curr],
                    "text": cdcdata['answer_text'][curr],
                },
            }

In [12]:
cdctest = generate_examples('json_qa_pairs.json')

In [13]:
print(next(cdctest))

(0, {'title': 'f059e215ee14a89be75d577ec5ad4eb2', 'context': 'cdc.gov/coronavirus\nWhat to Expect after Getting a COVID-19 Vaccine\nThe COVID-19 shot may cause side effects in some people. Side effects should go away in a few days. On the arm where you got the shot:\n• Pain\n• Redness\n• Swelling\nIf you are sore where you got the shot:\n• Apply a clean, cool, wet washcloth over the area\n• Use or move your arm gently\nIf you have a fever:\n• Drink a lot of water\n• Get plenty of rest\n• Dress lightly\nIf you have pain, headache, or fever, ask a healthcare provider (or facility\nstaff) if you can have medicine.', 'question': 'What can happen to some people after getting a COVID-19 Vaccine?', 'id': '100000000000000000000001', 'answers': {'answer_start': 96, 'text': 'side effects'}})


### This appears to match the format for SQuAD data

In [14]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
#testing123 = datasets.load_dataset("cdctest") ##Does not work without a local script

### Next to create the loading script, modified from squad.py 

In [22]:
#https://huggingface.co/docs/datasets/loading#local-loading-script

from datasets import load_dataset
dataset = load_dataset("/home/jupyter/cdc_test.py")

Downloading and preparing dataset cdc_test/plain_text to /home/jupyter/.cache/huggingface/datasets/cdc_test/plain_text/1.0.0/c93b7d49b570ab9ee2be29b6ed1ca87b47821f3ae95cf486f90660d66c33bd71...



Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4466.78it/s]

Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 714.53it/s]


NotImplementedError: 

### Hmm. Have to provide train/dev split. 

In [25]:
!pip install pandas



In [38]:
import pandas as pd
import json
#with open('json_qa_pairs.json') as f:
#    train = json.load(f)

In [294]:
df = pd.read_json('json_qa_pairs.json', dtype={'id':'object'})

In [295]:
type(df['answer_start'][0])

numpy.int64

In [243]:
#df["answer_start"] = pd.to_numeric(df["answer_start"], downcast = "integer")

In [296]:
#type(df['answer_start'][0]) #numpy.int16 not json serializable either... ugh

In [279]:
#df.astype({'answer_start': 'int8'}).dtypes

In [278]:
#df["answer_start"] = df["answer_start"].astype('int8')  #Converts some number to negative??

In [286]:
#type(df['answer_start'][0])

numpy.int8

In [298]:
#type(df["answer_start"][0].item())

In [297]:
#df["answer_start"]

In [299]:
#df["answer_start"] = df["answer_start"].item()

In [289]:
#len(df["answer_start"])

10156

In [300]:
#for x in range(0,len(df["answer_start"])):
#    df["answer_start"][x] = df["answer_start"][x].item()

In [301]:
#type(df['answer_start'][0]) #sigh

In [302]:
ans_st = df["answer_start"].tolist() #convert to python list

In [303]:
ans_st

[96,
 125,
 375,
 471,
 281,
 375,
 477,
 114,
 156,
 259,
 228,
 436,
 455,
 542,
 8,
 132,
 8,
 326,
 185,
 140,
 209,
 346,
 531,
 577,
 29,
 239,
 46,
 230,
 297,
 306,
 0,
 103,
 345,
 103,
 369,
 83,
 408,
 209,
 408,
 490,
 634,
 589,
 150,
 165,
 280,
 552,
 84,
 66,
 208,
 208,
 128,
 99,
 271,
 196,
 64,
 193,
 224,
 371,
 430,
 140,
 452,
 343,
 32,
 107,
 283,
 283,
 374,
 514,
 28,
 58,
 156,
 323,
 375,
 463,
 29,
 173,
 112,
 450,
 136,
 11,
 388,
 712,
 602,
 582,
 62,
 104,
 34,
 356,
 600,
 817,
 876,
 260,
 316,
 309,
 528,
 252,
 322,
 729,
 516,
 516,
 45,
 92,
 295,
 443,
 443,
 525,
 542,
 619,
 270,
 59,
 201,
 270,
 420,
 491,
 636,
 588,
 86,
 110,
 232,
 275,
 426,
 529,
 489,
 581,
 0,
 0,
 323,
 50,
 367,
 435,
 573,
 538,
 0,
 62,
 4,
 218,
 254,
 490,
 557,
 0,
 82,
 330,
 437,
 538,
 27,
 39,
 119,
 300,
 265,
 472,
 565,
 8,
 196,
 3,
 556,
 700,
 712,
 0,
 186,
 281,
 268,
 419,
 0,
 182,
 14,
 14,
 14,
 141,
 177,
 309,
 309,
 336,
 0,
 221,
 266,
 14

In [304]:
type(ans_st[0])

int

In [305]:
df["answer_start"] = ans_st

In [306]:
type(df["answer_start"][0])

numpy.int64

In [307]:
df

Unnamed: 0,id,title,context,question,answer_text,answer_start
0,100000000000000000000001,f059e215ee14a89be75d577ec5ad4eb2,cdc.gov/coronavirus\nWhat to Expect after Gett...,What can happen to some people after getting a...,side effects,96
1,100000000000000000000002,f059e215ee14a89be75d577ec5ad4eb2,cdc.gov/coronavirus\nWhat to Expect after Gett...,What should go away in a few days?,Side effects,125
2,100000000000000000000003,f059e215ee14a89be75d577ec5ad4eb2,cdc.gov/coronavirus\nWhat to Expect after Gett...,What should you do if you have a fever?,Drink a lot of water,375
3,100000000000000000000004,f059e215ee14a89be75d577ec5ad4eb2,cdc.gov/coronavirus\nWhat to Expect after Gett...,How do you treat a headache?,ask a healthcare provider (or facility staff) ...,471
4,100000000000000000000005,f059e215ee14a89be75d577ec5ad4eb2,cdc.gov/coronavirus\nWhat to Expect after Gett...,What is the name of the washcloth that you app...,"clean, cool, wet",281
...,...,...,...,...,...,...
10151,100000000000000000010152,feefd4010c8db878038b3867b0739ecb,30 minutes: Vaccination providers should consi...,How long should a person have an allergic reac...,30 minutes,109
10152,100000000000000000010153,feefd4010c8db878038b3867b0739ecb,30 minutes: Vaccination providers should consi...,What is not recommended for vaccine decision-m...,Antibody testing,454
10153,100000000000000000010154,feefd4010c8db878038b3867b0739ecb,30 minutes: Vaccination providers should consi...,What should be reported to VAERS?,Adverse events that occur following COVID-19 v...,586
10154,100000000000000000010155,89ea205deddff8b58ad49f5babca9ee9,COVID-19 providers are required to\nreport:\n...,What are providers of COVID-19 required to rep...,Vaccine administration errors  Serious advers...,45


In [268]:
#!pip install sklearn

In [309]:
import random
from sklearn.model_selection import train_test_split

SPLIT_SEED = 10

# Kept so any later use of the stdlib RNG stays seeded as before.
random.seed(SPLIT_SEED)

# train_test_split does not use the stdlib `random` module, so seeding it alone
# leaves the shuffle different on every run. Passing random_state makes the
# 70/30 train/dev split reproducible.
# NOTE(review): rows are split individually, so questions sharing a context may
# land on both sides of the split - confirm this is acceptable for evaluation.
train, test = train_test_split(df, test_size=0.3, random_state=SPLIT_SEED)
#val, test = train_test_split(test, test_size=0.5)

In [310]:
len(train)

7109

In [311]:
train

Unnamed: 0,id,title,context,question,answer_text,answer_start
5383,100000000000000000005384,e94a57098c4a84b0d6c7a56d4069cd31,Should you decide not\nto receive the Janssen ...,What is only authorized if other COVID-19 vacc...,The Janssen COVID-19 Vaccine,319
6834,100000000000000000006835,a3b7d0642ecafbe24abc57f5e6852ba7,Remember to bring the card when your child ret...,What may your provider include your child's va...,Immunization Information System,781
5201,100000000000000000005202,37202414135829f322ab4bcd44939b6c,CDC and FDA will use this information to guide...,Who will be looking at health effects after re...,scientists,181
3003,100000000000000000003004,ba220eeb6826ab5196a0b8d77503b8ce,The monovalent Novavax booster dose is adminis...,How many months after the last monovalent boos...,2 months,495
7693,100000000000000000007694,8a32db8038628066eb9e74798b1f93bb,They are:\nAnaphylaxis\nAnaphylaxis is a sever...,What is pericarditis inflammation of?,the outer lining of the heart,725
...,...,...,...,...,...,...
1659,100000000000000000001660,940280692f10d1951f8b172a84ec1292,COVID-19 Vaccine Administration Fees (updated ...,What does a program or plan cover?,COVID-19 Vaccine administration fees,649
7563,100000000000000000007564,530270efddf24e51ee98178e4beb4f40,You will need to show official documentation (...,What is the name of the vaccine candidate that...,COVID-19,267
4610,100000000000000000004611,81089b699b38110a7eb908ae7af13085,"10/30/22, 9:41 AM Interactive Home Ventilation...",What can help prevent you from getting and spr...,Good ventilation,354
3622,100000000000000000003623,9f09ca29a2171f252451a2b54b3e7b00,"10/30/22, 9:42 AM Domestic Travel During COVID...",What is effective at protecting people from ge...,COVID-19 vaccines,392


In [272]:
len(test)

3047

In [312]:
train.to_json('json_qa_pairs_train.json')

In [313]:
test.to_json('json_qa_pairs_dev.json')

### train/dev split established, try again

In [275]:
#https://huggingface.co/docs/datasets/loading#local-loading-script

#from datasets import load_dataset
#dataset = load_dataset("/home/jupyter/cdc_test.py")

### So discovered a misunderstanding - the loading script does not actually do the formatting
### like I thought - that must be preprocessed to match SQuAD format. Ugh.

In [315]:
#train._get_value(5383, 'title')

In [316]:
train[:1]

Unnamed: 0,id,title,context,question,answer_text,answer_start
5383,100000000000000000005384,e94a57098c4a84b0d6c7a56d4069cd31,Should you decide not\nto receive the Janssen ...,What is only authorized if other COVID-19 vacc...,The Janssen COVID-19 Vaccine,319


In [354]:
test[:1]

Unnamed: 0,id,title,context,question,answer_text,answer_start
674,100000000000000000000675,20e562776262c6cebe19de217cd8baa5,"February 3, 2022: Alaska made updates to data ...",What was the decrease in doses?,3945,105


In [317]:
# Take a copy of the index labels: the next cell sorts `values` in place, and
# sorting the array handed back by `.values` could reorder the labels of
# `train` itself (or fail if that array is read-only).
values = train.index.to_numpy(copy=True)

In [318]:
values.sort()

In [319]:
values

array([    2,     4,     6, ..., 10153, 10154, 10155])

In [350]:
# Build a SQuAD v2.0-style dict from the 20 training rows with the smallest
# index labels (a smoke test before converting the whole split).
#
# Output layout:
#   {"version": "v2.0",
#    "data": [{"title": ..., "paragraphs": [{"qas": [...], "context": ...}]}]}
#
# Rows are visited in index order. Each run of consecutive rows sharing a title
# becomes one entry in `data`; inside that entry, every distinct context gets
# its own paragraph so each answer_start stays paired with the context it was
# computed against.
#
# sort_index() returns a sorted copy, so the labels of `train` are never
# reordered in place, and an empty frame simply produces an empty `data` list.
train_sample = train.sort_index().head(20)  # used to test on smaller dataset

cdcdict = {"version": "v2.0"}
datalist = []
prevtitle = None
paragraphs_by_context = {}

for row in train_sample.itertuples(index=False):
    if not datalist or row.title != prevtitle:
        # New title: open a fresh entry and forget the previous title's contexts.
        datalist.append({"title": row.title, "paragraphs": []})
        paragraphs_by_context = {}
        prevtitle = row.title

    if row.context not in paragraphs_by_context:
        paragraph = {"qas": [], "context": row.context}
        paragraphs_by_context[row.context] = paragraph
        datalist[-1]["paragraphs"].append(paragraph)

    paragraphs_by_context[row.context]["qas"].append({
        "question": row.question,
        "id": row.id,
        # int() guarantees a built-in int, which json.dump can serialize.
        "answers": [{"text": row.answer_text, "answer_start": int(row.answer_start)}],
        "is_impossible": False,
    })

cdcdict['data'] = datalist


In [337]:
len(cdcdict['data'])

8

In [327]:
cdcdict['data']

[{'title': 'f059e215ee14a89be75d577ec5ad4eb2',
  'paragraphs': {'qas': [{'question': 'What should you do if you have a fever?',
     'id': '100000000000000000000003',
     'answers': [{'text': 'Drink a lot of water', 'answer_start': 375}],
     'is_impossible': False},
    {'question': 'What is the name of the washcloth that you apply to the area of the shot?',
     'id': '100000000000000000000005',
     'answers': [{'text': 'clean, cool, wet', 'answer_start': 281}],
     'is_impossible': False},
    {'question': 'What is the name of the person who can give you medicine?',
     'id': '100000000000000000000007',
     'answers': [{'text': 'healthcare provider', 'answer_start': 477}],
     'is_impossible': False}],
   'context': 'cdc.gov/coronavirus\nWhat to Expect after Getting a COVID-19 Vaccine\nThe COVID-19 shot may cause side effects in some people. Side effects should go away in a few days. On the arm where you got the shot:\n• Pain\n• Redness\n• Swelling\nIf you are sore where you 

In [328]:
cdcdict

{'version': 'v2.0',
 'data': [{'title': 'f059e215ee14a89be75d577ec5ad4eb2',
   'paragraphs': {'qas': [{'question': 'What should you do if you have a fever?',
      'id': '100000000000000000000003',
      'answers': [{'text': 'Drink a lot of water', 'answer_start': 375}],
      'is_impossible': False},
     {'question': 'What is the name of the washcloth that you apply to the area of the shot?',
      'id': '100000000000000000000005',
      'answers': [{'text': 'clean, cool, wet', 'answer_start': 281}],
      'is_impossible': False},
     {'question': 'What is the name of the person who can give you medicine?',
      'id': '100000000000000000000007',
      'answers': [{'text': 'healthcare provider', 'answer_start': 477}],
      'is_impossible': False}],
    'context': 'cdc.gov/coronavirus\nWhat to Expect after Getting a COVID-19 Vaccine\nThe COVID-19 shot may cause side effects in some people. Side effects should go away in a few days. On the arm where you got the shot:\n• Pain\n• Redne

In [351]:
import json
with open("first20train.json", "w") as outfile:
    json.dump(cdcdict, outfile)

## Success! Our format now matches the SQuAD data exactly. Or so it seems...

## Now to create the real files

### First the train files

In [353]:
import json

# Convert the whole training split to a SQuAD v2.0-style file.
#
# Output layout:
#   {"version": "v2.0",
#    "data": [{"title": ..., "paragraphs": [{"qas": [...], "context": ...}]}]}
#
# Rows are visited in index order. Each run of consecutive rows sharing a title
# becomes one entry in `data`; inside that entry, every distinct context gets
# its own paragraph so each answer_start stays paired with the context it was
# computed against.
#
# sort_index() returns a sorted copy, so the labels of `train` are never
# reordered in place, and an empty frame simply produces an empty `data` list.
datalist = []
prevtitle = None
paragraphs_by_context = {}

for row in train.sort_index().itertuples(index=False):
    if not datalist or row.title != prevtitle:
        # New title: open a fresh entry and forget the previous title's contexts.
        datalist.append({"title": row.title, "paragraphs": []})
        paragraphs_by_context = {}
        prevtitle = row.title

    if row.context not in paragraphs_by_context:
        paragraph = {"qas": [], "context": row.context}
        paragraphs_by_context[row.context] = paragraph
        datalist[-1]["paragraphs"].append(paragraph)

    paragraphs_by_context[row.context]["qas"].append({
        "question": row.question,
        "id": row.id,
        # int() guarantees a built-in int, which json.dump can serialize.
        "answers": [{"text": row.answer_text, "answer_start": int(row.answer_start)}],
        "is_impossible": False,
    })

cdcdict_train = {"version": "v2.0", "data": datalist}

with open("final_qa_pairs_train.json", "w") as outfile:
    json.dump(cdcdict_train, outfile)

### Now the dev files - same treatment

In [355]:
import json

# Convert the whole dev split to a SQuAD v2.0-style file.
#
# Output layout:
#   {"version": "v2.0",
#    "data": [{"title": ..., "paragraphs": [{"qas": [...], "context": ...}]}]}
#
# Rows are visited in index order. Each run of consecutive rows sharing a title
# becomes one entry in `data`; inside that entry, every distinct context gets
# its own paragraph so each answer_start stays paired with the context it was
# computed against.
#
# sort_index() returns a sorted copy, so the labels of `test` are never
# reordered in place, and an empty frame simply produces an empty `data` list.
datalist = []
prevtitle = None
paragraphs_by_context = {}

for row in test.sort_index().itertuples(index=False):
    if not datalist or row.title != prevtitle:
        # New title: open a fresh entry and forget the previous title's contexts.
        datalist.append({"title": row.title, "paragraphs": []})
        paragraphs_by_context = {}
        prevtitle = row.title

    if row.context not in paragraphs_by_context:
        paragraph = {"qas": [], "context": row.context}
        paragraphs_by_context[row.context] = paragraph
        datalist[-1]["paragraphs"].append(paragraph)

    paragraphs_by_context[row.context]["qas"].append({
        "question": row.question,
        "id": row.id,
        # int() guarantees a built-in int, which json.dump can serialize.
        "answers": [{"text": row.answer_text, "answer_start": int(row.answer_start)}],
        "is_impossible": False,
    })

cdcdict_test = {"version": "v2.0", "data": datalist}

with open("final_qa_pairs_dev.json", "w") as outfile:
    json.dump(cdcdict_test, outfile)

## Have modified cdc_test2.py to point to these new files, going to attempt to load without further modification

In [356]:
from datasets import load_dataset

In [357]:
dataset = load_dataset("cdc_test2.py", split="train")

Downloading and preparing dataset cdc_test2/plain_text to /home/jupyter/.cache/huggingface/datasets/cdc_test2/plain_text/1.0.0/cb09324d802e9c68ea628d81253aca6f30dca08142b40269a52e9fbb856273eb...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 6904.20it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 1275.25it/s]
                                                                       

Dataset cdc_test2 downloaded and prepared to /home/jupyter/.cache/huggingface/datasets/cdc_test2/plain_text/1.0.0/cb09324d802e9c68ea628d81253aca6f30dca08142b40269a52e9fbb856273eb. Subsequent calls will reuse this data.




# SUCCESS!

In [367]:
dataset

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 7109
})

In [358]:
ls gcs

CDC_Covid19_Data_2022_10_30.zip


In [360]:
import transformers

In [359]:
mkdir gcs/cdcmodel_train01 #Storing our checkpoints on cloud storage

In [361]:
cd QA_tune 

/home/jupyter/QA_tune


In [9]:
#!git clone https://github.com/huggingface/transformers/
# SECURITY: this line previously embedded a GitHub personal access token in the
# clone URL. The token has been removed here; revoke it, and supply credentials
# through the environment or a credential helper rather than in the notebook.

Cloning into 'transformers'...
remote: Enumerating objects: 112247, done.[K
remote: Total 112247 (delta 0), reused 0 (delta 0), pack-reused 112247[K
Receiving objects: 100% (112247/112247), 105.67 MiB | 32.73 MiB/s, done.
Resolving deltas: 100% (83385/83385), done.


In [362]:
cd transformers/examples/pytorch/question-answering/

/home/jupyter/QA_tune/transformers/examples/pytorch/question-answering


In [363]:
pwd

'/home/jupyter/QA_tune/transformers/examples/pytorch/question-answering'

## Note: had to move the loading script and data files to this folder. When "dataset" is passed as the dataset name, the QA training script looks for a subfolder named dataset containing a script named dataset.py, so I renamed cdc_test2.py accordingly and made a copy of all of the data in that folder.

In [364]:
#!pip install datasets



In [12]:
#!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.3.0


In [365]:
ls

README.md         [0m[01;32mrun_qa_beam_search.py[0m*            trainer_qa.py
[01;34m__pycache__[0m/      run_qa_beam_search_no_trainer.py  trainer_seq2seq_qa.py
requirements.txt  [01;32mrun_qa_no_trainer.py[0m*             utils_qa.py
[01;32mrun_qa.py[0m*        run_seq2seq_qa.py


In [366]:
"""#!python run_qa.py \   
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name squad_v2 \
  --do_train \
  --per_device_train_batch_size 12 \  #Batch size 12 = out of memory issues
  --learning_rate 3e-5 \
  --num_train_epochs 2 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /tmp/debug_bloom_squad/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16"""

'#!python run_qa.py \\   \n  --model_name_or_path bigscience/bloom-560m   --dataset_name squad_v2   --do_train   --per_device_train_batch_size 12 \\  #Batch size 12 = out of memory issues\n  --learning_rate 3e-5   --num_train_epochs 2   --max_seq_length 384   --doc_stride 128   --output_dir /tmp/debug_bloom_squad/   --eval_accumulation_steps 1   --version_2_with_negative   --overwrite_output_dir   --fp16'

In [15]:
import sys

In [16]:
sys.maxsize #setting token value for no-id as maxsize -1 in run_qa.py

9223372036854775807

In [369]:
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 2 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train01/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/13/2022 21:50:48 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False

## Successfully recreated SQuAD test on CDC data. Now to see if my 'cleaner' data allows for some additional functionality

In [372]:
mkdir /home/jupyter/gcs/cdcmodel_train02

In [373]:
#Attempt --do_eval
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 2 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train02/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/14/2022 01:54:56 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

## Now let's compare this to the baseline bert-base-uncased model with the same parameters (and we may try the bert-large option as well)

In [374]:
mkdir /home/jupyter/gcs/cdcmodel_bert_base_uncased01

In [375]:
!python run_qa.py \
  --model_name_or_path bert-base-uncased \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 2 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_bert_base_uncased01 \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/14/2022 02:18:27 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

### Compare the above with the BLOOM 560M model results

In [384]:
#Bert
#  epoch                  =     2.0
#  eval_HasAns_exact      = 56.8428
#  eval_HasAns_f1         = 71.2395
#  eval_HasAns_total      =    3047
#  eval_best_exact        = 56.8428
#  eval_best_exact_thresh =     0.0
#  eval_best_f1           = 71.2395
#  eval_best_f1_thresh    =     0.0
#  eval_exact             = 56.8428
#  eval_f1                = 71.2395
#  eval_samples           =    3214
#  eval_total             =    3047

#Equivalent BLOOM-560M (same 0-100 scale as the BERT numbers above, so F1 here is under 1%)
#  epoch                  =    2.0
#  eval_HasAns_exact      = 0.4923
#  eval_HasAns_f1         = 0.9263
#  eval_HasAns_total      =   3047
#  eval_best_exact        = 0.4923
#  eval_best_exact_thresh =    0.0
#  eval_best_f1           = 0.9263
#  eval_best_f1_thresh    =    0.0
#  eval_exact             = 0.4923
#  eval_f1                = 0.9263
#  eval_samples           =   3229
#  eval_total             =   3047

### The small BLOOM model with light training is getting kicked pretty good by bert-base-uncased, so let's see if we can improve on this. Can we try the next larger model with the same settings on our single Workbench instance?

In [379]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /home/jupyter/gcs/cdcmodel_train03/

In [382]:
#First attempt failed with an out-of-memory error. Tried smaller batch sizes (3 instead of 6 failed; 1 also failed).
#!python run_qa.py \
#  --model_name_or_path bigscience/bloom-1b1 \
#  --dataset_name dataset \
#  --do_train \
#  --do_eval \
#  --per_device_train_batch_size 1 \
#  --learning_rate 3e-5 \
#  --num_train_epochs 2 \
#  --max_seq_length 384 \
#  --doc_stride 128 \
#  --output_dir /home/jupyter/gcs/cdcmodel_train03/ \
#  --eval_accumulation_steps 1 \
#  --version_2_with_negative \
#  --overwrite_output_dir \
#  --fp16

11/14/2022 02:45:19 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

### No dice. Going up to the next model size will require more hardware (even batch size of 1 is failing), or different settings that I don't know yet.

### OK, so now let's try more epochs on the smaller dataset and see if that helps us out.

In [383]:
#Try double training epochs to see if any improvement is made (from 2 to 4)
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 4 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train03/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/14/2022 02:49:19 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

In [386]:
### Hmm, interesting. F1 score went down. Overfitting?

#  epoch                  =    4.0
#  eval_HasAns_exact      = 0.1969
#  eval_HasAns_f1         = 0.7233
#  eval_HasAns_total      =   3047
#  eval_best_exact        = 0.1969
#  eval_best_exact_thresh =    0.0
#  eval_best_f1           = 0.7233
#  eval_best_f1_thresh    =    0.0
#  eval_exact             = 0.1969
#  eval_f1                = 0.7233
#  eval_samples           =   3229
#  eval_total             =   3047

In [387]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /home/jupyter/gcs/cdcmodel_train04/

In [388]:
#Try with only one epoch - see what happens
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 1 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train04/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/14/2022 03:29:23 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

In [389]:
#Interesting. A single epoch did result in a slightly higher F1 score of 0.928 vs. 0.926 (2 epochs)

#  epoch                  =    1.0
#  eval_HasAns_exact      = 0.4266
#  eval_HasAns_f1         =  0.928
#  eval_HasAns_total      =   3047
#  eval_best_exact        = 0.4266
#  eval_best_exact_thresh =    0.0
#  eval_best_f1           =  0.928
#  eval_best_f1_thresh    =    0.0
#  eval_exact             = 0.4266
#  eval_f1                =  0.928
#  eval_samples           =   3229
#  eval_total             =   3047

In [390]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /home/jupyter/gcs/cdcmodel_train05/

In [392]:
#Can we do 0 epochs, or will that error out? It errors out. What about less than 1 epoch (0.1)?
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 0.1 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train05/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/14/2022 03:45:49 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

In [394]:
#OK, so 0 training, or close to it, is not better than 1 epoch.

#  epoch                  =    0.1
#  eval_HasAns_exact      = 0.1313
#  eval_HasAns_f1         = 0.4958
#  eval_HasAns_total      =   3047
#  eval_best_exact        = 0.1313
#  eval_best_exact_thresh =    0.0
#  eval_best_f1           = 0.4958
#  eval_best_f1_thresh    =    0.0
#  eval_exact             = 0.1313
#  eval_f1                = 0.4958
#  eval_samples           =   3229
#  eval_total             =   3047

### OK, final test - let's do a whole bunch of epochs and see what happens. This will need to run for a while.

In [395]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /home/jupyter/gcs/cdcmodel_train06/

In [396]:
# Long run: fine-tune bigscience/bloom-560m for 100 epochs, writing to cdcmodel_train06.
# NOTE(review): no --save_steps / --save_total_limit is passed, so every periodic
# checkpoint is kept on the GCS mount — confirm there is enough space for a run this long.
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 100 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train06/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/14/2022 03:52:07 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

In [398]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /home/jupyter/gcs/cdcmodel_train07/

In [399]:
# Repeat of the 100-epoch bigscience/bloom-560m run with identical flags, writing to a
# fresh output directory (cdcmodel_train07).
# NOTE(review): no --save_steps / --save_total_limit is passed; checkpoints are
# presumably written every 500 steps (Trainer default) and all retained — confirm.
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 100 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train07/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/15/2022 14:54:54 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



{'loss': 2.3121, 'learning_rate': 2.5465843949044585e-05, 'epoch': 15.13}       
 15%|████▊                           | 19000/125600 [2:08:30<7:40:13,  3.86it/s][INFO|trainer.py:2671] 2022-11-15 17:03:43,170 >> Saving model checkpoint to /home/jupyter/gcs/cdcmodel_train07/checkpoint-19000
[INFO|configuration_utils.py:447] 2022-11-15 17:03:43,514 >> Configuration saved in /home/jupyter/gcs/cdcmodel_train07/checkpoint-19000/config.json
[INFO|modeling_utils.py:1624] 2022-11-15 17:04:03,489 >> Model weights saved in /home/jupyter/gcs/cdcmodel_train07/checkpoint-19000/pytorch_model.bin
[INFO|tokenization_utils_base.py:2125] 2022-11-15 17:04:03,882 >> tokenizer config file saved in /home/jupyter/gcs/cdcmodel_train07/checkpoint-19000/tokenizer_config.json
[INFO|tokenization_utils_base.py:2132] 2022-11-15 17:04:04,252 >> Special tokens file saved in /home/jupyter/gcs/cdcmodel_train07/checkpoint-19000/special_tokens_map.json
{'loss': 1.8979, 'learning_rate': 2.5346417197452228e-05, 'epoch': 15.

In [None]:
#100 epochs breaks eval - trying 50

In [400]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /home/jupyter/gcs/cdcmodel_train08/

In [402]:
# Fine-tune bigscience/bloom-560m for 50 epochs (half of the previous attempt), writing
# to cdcmodel_train08. All other flags match the earlier bloom-560m runs.
# NOTE(review): the log below shows checkpoints 500 steps apart (checkpoint-57500,
# checkpoint-58000) and no --save_total_limit is set — confirm storage on the GCS mount.
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 50 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train08/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/16/2022 15:23:57 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



{'loss': 1.6448, 'learning_rate': 1.1620222929936305e-05, 'epoch': 30.65}       
 61%|████████████████████▏            | 38500/62800 [4:53:09<1:45:47,  3.83it/s][INFO|trainer.py:2671] 2022-11-16 20:17:24,800 >> Saving model checkpoint to /home/jupyter/gcs/cdcmodel_train08/checkpoint-38500
[INFO|configuration_utils.py:447] 2022-11-16 20:17:25,086 >> Configuration saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-38500/config.json
[INFO|modeling_utils.py:1624] 2022-11-16 20:17:46,337 >> Model weights saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-38500/pytorch_model.bin
[INFO|tokenization_utils_base.py:2125] 2022-11-16 20:17:46,650 >> tokenizer config file saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-38500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2132] 2022-11-16 20:17:47,157 >> Special tokens file saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-38500/special_tokens_map.json
{'loss': 1.7762, 'learning_rate': 1.1381369426751592e-05, 'epoch': 31.

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[INFO|modeling_utils.py:1624] 2022-11-16 22:27:11,333 >> Model weights saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-57500/pytorch_model.bin
[INFO|tokenization_utils_base.py:2125] 2022-11-16 22:27:11,655 >> tokenizer config file saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-57500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2132] 2022-11-16 22:27:11,949 >> Special tokens file saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-57500/special_tokens_map.json
{'loss': 1.8263, 'learning_rate': 2.3087579617834395e-06, 'epoch': 46.18}       
 92%|████████████████████████████████▎  | 58000/62800 [7:06:02<20:53,  3.83it/s][INFO|trainer.py:2671] 2022-11-16 22:30:18,322 >> Saving model checkpoint to /home/jupyter/gcs/cdcmodel_train08/checkpoint-58000
[INFO|configuration_utils.py:447] 2022-11-16 22:30:18,786 >> Configuration saved in /home/jupyter/gcs/cdcmodel_train08/checkpoint-58000/config.json
[INFO|modeling_utils.py:1624] 2022-11-16 22:30:41,182 >> Model weights

In [None]:
#Try fewer epochs - 10? (the run below uses 20)

In [408]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /home/jupyter/gcs/cdcmodel_train11/

In [409]:
# Fine-tune bigscience/bloom-560m for 20 epochs, writing to cdcmodel_train11. All other
# flags match the earlier bloom-560m runs.
# NOTE(review): no --save_steps / --save_total_limit is passed, so every periodic
# checkpoint is kept on the GCS mount — confirm this is intended.
!python run_qa.py \
  --model_name_or_path bigscience/bloom-560m \
  --dataset_name dataset \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 6 \
  --learning_rate 3e-5 \
  --num_train_epochs 20 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir /home/jupyter/gcs/cdcmodel_train11/ \
  --eval_accumulation_steps 1 \
  --version_2_with_negative \
  --overwrite_output_dir \
  --fp16

11/21/2022 01:51:09 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



{'loss': 1.8845, 'learning_rate': 7.927547770700637e-06, 'epoch': 14.73}        
 74%|█████████████████████████▊         | 18500/25120 [2:04:13<28:44,  3.84it/s][INFO|trainer.py:2671] 2022-11-21 03:55:41,867 >> Saving model checkpoint to /home/jupyter/gcs/cdcmodel_train11/checkpoint-18500
[INFO|configuration_utils.py:447] 2022-11-21 03:55:42,227 >> Configuration saved in /home/jupyter/gcs/cdcmodel_train11/checkpoint-18500/config.json
[INFO|modeling_utils.py:1624] 2022-11-21 03:56:02,495 >> Model weights saved in /home/jupyter/gcs/cdcmodel_train11/checkpoint-18500/pytorch_model.bin
[INFO|tokenization_utils_base.py:2125] 2022-11-21 03:56:02,776 >> tokenizer config file saved in /home/jupyter/gcs/cdcmodel_train11/checkpoint-18500/tokenizer_config.json
[INFO|tokenization_utils_base.py:2132] 2022-11-21 03:56:03,097 >> Special tokens file saved in /home/jupyter/gcs/cdcmodel_train11/checkpoint-18500/special_tokens_map.json
{'loss': 1.8036, 'learning_rate': 7.330414012738853e-06, 'epoch': 15.1

### Previous output from SQuAD training POC

In [2]:
#SUCCESS!! Final output and details below

In [3]:
"""
Training completed. Do not forget to share your model on huggingface.co/models =)


{'train_runtime': 15323.5402, 'train_samples_per_second': 17.209, 'train_steps_per_second': 2.868, 'train_loss': 3.5304283500368654, 'epoch': 2.0}
100%|███████████████████████████████████| 43952/43952 [4:15:23<00:00,  2.87it/s]
[INFO|trainer.py:2671] 2022-10-22 22:33:05,661 >> Saving model checkpoint to /home/jupyter/tmp/debug_bloom_squad/
[INFO|configuration_utils.py:447] 2022-10-22 22:33:05,662 >> Configuration saved in /home/jupyter/tmp/debug_bloom_squad/config.json
[INFO|modeling_utils.py:1624] 2022-10-22 22:33:11,183 >> Model weights saved in /home/jupyter/tmp/debug_bloom_squad/pytorch_model.bin
[INFO|tokenization_utils_base.py:2125] 2022-10-22 22:33:11,183 >> tokenizer config file saved in /home/jupyter/tmp/debug_bloom_squad/tokenizer_config.json
[INFO|tokenization_utils_base.py:2132] 2022-10-22 22:33:11,184 >> Special tokens file saved in /home/jupyter/tmp/debug_bloom_squad/special_tokens_map.json
***** train metrics *****
  epoch                    =        2.0
  train_loss               =     3.5304
  train_runtime            = 4:15:23.54
  train_samples            =     131854
  train_samples_per_second =     17.209
  train_steps_per_second   =      2.868
[INFO|modelcard.py:444] 2022-10-22 22:33:13,465 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Question Answering', 'type': 'question-answering'}, 'dataset': {'name': 'squad_v2', 'type': 'squad_v2', 'config': 'squad_v2', 'split': 'train', 'args': 'squad_v2'}}
"""

"\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n\n{'train_runtime': 15323.5402, 'train_samples_per_second': 17.209, 'train_steps_per_second': 2.868, 'train_loss': 3.5304283500368654, 'epoch': 2.0}\n100%|███████████████████████████████████| 43952/43952 [4:15:23<00:00,  2.87it/s]\n[INFO|trainer.py:2671] 2022-10-22 22:33:05,661 >> Saving model checkpoint to /home/jupyter/tmp/debug_bloom_squad/\n[INFO|configuration_utils.py:447] 2022-10-22 22:33:05,662 >> Configuration saved in /home/jupyter/tmp/debug_bloom_squad/config.json\n[INFO|modeling_utils.py:1624] 2022-10-22 22:33:11,183 >> Model weights saved in /home/jupyter/tmp/debug_bloom_squad/pytorch_model.bin\n[INFO|tokenization_utils_base.py:2125] 2022-10-22 22:33:11,183 >> tokenizer config file saved in /home/jupyter/tmp/debug_bloom_squad/tokenizer_config.json\n[INFO|tokenization_utils_base.py:2132] 2022-10-22 22:33:11,184 >> Special tokens file saved in /home/jupyter/tmp/debug_bloom_squad/special_to

## Model is now built. Let's see if we can figure out how to interact with it.

In [1]:
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import BloomForQuestionAnswering

In [3]:
from transformers import BloomTokenizerFast

In [4]:
model = BloomForQuestionAnswering.from_pretrained("/home/jupyter/tmp/debug_bloom_squad/")

In [5]:
tokenizer = BloomTokenizerFast.from_pretrained("/home/jupyter/tmp/debug_bloom_squad/")

In [6]:
import torch
# Tokenize a bare question and print the resulting encoding.
# NOTE(review): no context passage is supplied, so the only positions the QA head can
# score are the question's own tokens. An extractive-QA call would normally look like
# tokenizer(question, context, return_tensors="pt") — confirm whether that was intended.
prompt = ["What is the capital of North Dakota"] 
inputs = tokenizer(prompt, return_tensors="pt") 
# Leftover experiments below (the only place this cell would use torch).
#inIDs = torch.LongTensor(inputs["input_ids"])
#attnIDs = torch.LongTensor(inputs["attention_mask"])
#print(inIDs, attnIDs)
print(inputs)

{'input_ids': tensor([[ 10560,    632,    368,   9213,    461,  17527, 129602]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [22]:
tokenizer.decode(inputs['input_ids'][0])

'What is the capital of North Dakota'

In [23]:
# NOTE(review): attention_mask is a per-position mask (all 1s here), not a sequence of
# token ids, so decoding it just renders the token with id 1 once per position. The
# result is not a meaningful round-trip check of the input text.
tokenizer.decode(inputs['attention_mask'][0])

'<s><s><s><s><s><s><s>'

In [7]:
model(**inputs)



QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[110.5513, 107.8777, 111.6836, 111.9656, 107.3340, 111.9346, 110.6411]],
       grad_fn=<CloneBackward0>), end_logits=tensor([[51.6394, 47.1113, 48.6247, 51.8243, 46.2082, 51.1148, 51.0818]],
       grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)

In [12]:
outputs = model(**inputs)

In [16]:
outputs['start_logits']

tensor([[110.5513, 107.8777, 111.6836, 111.9656, 107.3340, 111.9346, 110.6411]],
       grad_fn=<CloneBackward0>)

In [18]:
outputs['end_logits']

tensor([[51.6394, 47.1113, 48.6247, 51.8243, 46.2082, 51.1148, 51.0818]],
       grad_fn=<CloneBackward0>)

In [60]:
tokenizer.decode([129602])

' Dakota'

In [32]:
outputs.start_logits.shape, outputs.end_logits.shape

(torch.Size([1, 7]), torch.Size([1, 7]))

In [35]:
# Highest-scoring start and end positions. Both resolve to index 3, i.e. a one-token
# span inside the question itself, because the input contained no context passage.
outputs.start_logits.argmax(dim=-1), outputs.end_logits.argmax(dim=-1)

(tensor([3]), tensor([3]))