In [1]:
from datasets import load_dataset, load_dataset_builder, get_dataset_split_names

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset identifier and the configuration (subset) used throughout the notebook.
DATASET_NAME = "bigbio/med_qa"
DATASET_CONFIG = "med_qa_en_source"

# The builder exposes dataset metadata (its description is printed below).
ds_builder = load_dataset_builder(DATASET_NAME, DATASET_CONFIG)

In [3]:
# Print the free-text description stored in the dataset builder's info.
print(ds_builder.info.description)

In this work, we present the first free-form multiple-choice OpenQA dataset for solving medical problems, MedQA,
collected from the professional medical board exams. It covers three languages: English, simplified Chinese, and
traditional Chinese, and contains 12,723, 34,251, and 14,123 questions for the three languages, respectively. Together
with the question data, we also collect and release a large-scale corpus from medical textbooks from which the reading
comprehension models can obtain necessary knowledge for answering the questions.



In [4]:
# Load only the 'train' split of the configured dataset.
train_ds = load_dataset(DATASET_NAME, DATASET_CONFIG, split='train')

Found cached dataset med_qa (/Users/michael/.cache/huggingface/datasets/bigbio___med_qa/med_qa_en_source/1.0.0/cfb3883fd412613f1938bbb3449a43e18bd2428b691726183a0d3c9b590f885d)


In [5]:
# Inspect the first record to see the available fields
# (question, answer, answer_idx, options, meta_info).
train_ds[0]

{'meta_info': 'step2&3',
 'question': 'A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?',
 'answer_idx': 'E',
 'answer': 'Nitrofurantoin',
 'options': [{'key': 'A', 'value': 'Ampicillin'},
  {'key': 'B', 'value': 'Ceftriaxone'},
  {'key': 'C', 'value': 'Ciprofloxacin'},
  {'key': 'D', 'value': 'Doxycycline'},
  {'key': 'E', 'value': 'Nitrofurantoin'}]}

In [6]:
from mingpt.bpe import BPETokenizer

In [7]:
# Tokenizer imported from mingpt.bpe, constructed with its default arguments.
bpe_tokenizer = BPETokenizer()

In [8]:
# Tokenize the first question; the result displays as a tensor with a single
# row of token ids.
bpe_tokenizer(train_ds[0]['question'])

tensor([[   32,  2242,    12,  1941,    12,   727, 10423,  2415,   379,  2534,
          2745, 47110, 10969,   351,  9482,  2402,  2956,  1883,    13,  1375,
          2585,   340,  2067,   352,  1110,  2084,   290,   468,   587, 42373,
          3805,  7722,   517,  1660,   290,  2263, 41286,  8396,  7925,    13,
          1375,  4306,  5300,   880,   290,   318,  3940,   416,   257,  6253,
           329,   607, 10241,    13,  2332,  5951,   318, 10111,    13,    22,
          7200,    37,   357,  2623,    13,    20,  7200,    34,   828,  2910,
          3833,   318, 19409,    14,  3324,  8085,    39,    70,    11, 19445,
           318,  4019,    14,  1084,    11, 21483,   602,   389,   678,    14,
          1084,    11,   290, 11863, 36275,   318,  9661,     4,   319,  2119,
          1633,    13, 16331,  2814,   318, 12411,   329,   281,  8889,   286,
          1575,  2502,   660, 24427,  9848, 15403,  1108,   290,   257,  9067,
           312, 41303,    13,  9022,   286,   262,  

In [9]:
# Ask the underlying encoder for its intermediate results on the first
# question, then print the text of each entry in 'parts', one per line, to see
# how the question is split into tokens.
results = bpe_tokenizer.encoder.encode_and_show_work(train_ds[0]['question'])
for part in results['parts']:
    print(part['token'])

A
 23
-
year
-
old
 pregnant
 woman
 at
 22
 weeks
 gestation
 presents
 with
 burning
 upon
 urination
.
 She
 states
 it
 started
 1
 day
 ago
 and
 has
 been
 worsening
 despite
 drinking
 more
 water
 and
 taking
 cranberry
 extract
.
 She
 otherwise
 feels
 well
 and
 is
 followed
 by
 a
 doctor
 for
 her
 pregnancy
.
 Her
 temperature
 is
 97
.
7
°
F
 (
36
.
5
°
C
),
 blood
 pressure
 is
 122
/
77
 mmHg
,
 pulse
 is
 80
/
min
,
 respirations
 are
 19
/
min
,
 and
 oxygen
 saturation
 is
 98
%
 on
 room
 air
.
 Physical
 exam
 is
 notable
 for
 an
 absence
 of
 costovertebral
 angle
 tenderness
 and
 a
 gravid
 uterus
.
 Which
 of
 the
 following
 is
 the
 best
 treatment
 for
 this
 patient
?


In [10]:
def encode_examples(example):
    """Turn one dataset record into a token sequence for training.

    The record's 'question' and 'answer' fields are combined into the text
    "<question>\\nAnswer: <answer>\\n", which is passed to the module-level
    ``bpe_tokenizer``. The first element of the tokenizer's result is returned.
    """
    training_sentence = "{}\nAnswer: {}\n".format(example['question'], example['answer'])
    encoded = bpe_tokenizer(training_sentence)
    return encoded[0]

In [11]:
# Tokenize every training record up front.
tokenizer_examples = list(map(encode_examples, train_ds))

# Keep only sequences with at least 129 tokens and truncate each to its final
# 129 tokens. The answer is appended at the end of every sequence, so the tail
# always contains it; 129 tokens give 128 inputs plus 128 shifted targets.
tokenized_train = []
for token_ids in tokenizer_examples:
    if len(token_ids) < 129:
        continue
    tokenized_train.append(token_ids[-129:])

In [12]:
from torch.utils.data import Dataset

class SimpleMedQADataset(Dataset):
    """Dataset over pre-tokenized sequences for next-token prediction.

    Each item is a pair ``(sequence[:-1], sequence[1:])``: the inputs and the
    same sequence shifted left by one position as targets.
    """

    def __init__(self, tokenized_examples):
        """Store the indexable collection of token sequences."""
        self.tokenized_examples = tokenized_examples

    def __len__(self):
        """Number of stored sequences."""
        return len(self.tokenized_examples)

    def __getitem__(self, idx):
        """Return the (inputs, targets) pair for the sequence at ``idx``."""
        sequence = self.tokenized_examples[idx]
        inputs, targets = sequence[:-1], sequence[1:]
        return inputs, targets

In [13]:
# Wrap the truncated token sequences so the trainer can draw
# (inputs, targets) pairs from them.
train_dataset = SimpleMedQADataset(tokenized_train)

In [15]:
from mingpt.model import GPT

# Start from minGPT's default configuration and override what this run needs.
model_config = GPT.get_default_config()
# presumably the GPT-2 BPE vocabulary size — confirm against the tokenizer
model_config.vocab_size = 50257
# NOTE(review): training sequences built earlier are 128 tokens long, so 256
# leaves headroom for longer prompts at generation time — confirm intent.
model_config.block_size = 256
model_config.model_type = 'gpt2'
model = GPT(model_config)

number of parameters: 123.85M


In [16]:
from mingpt.trainer import Trainer

# Start from the trainer defaults and override the settings for this run.
train_config = Trainer.get_default_config()
train_config.num_workers = 0
train_config.max_iters = 2000
# NOTE(review): the model built above reports 123.85M parameters, so it is not
# a "small" model; treat this learning rate as a value to tune.
train_config.learning_rate = 5e-4
trainer = Trainer(train_config, model, train_dataset)

running on device cpu


In [None]:
def batch_end_callback(trainer):
    """Print timing and training loss on every 10th iteration.

    Reads ``iter_num``, ``iter_dt`` and ``loss`` from ``trainer``; ``iter_dt``
    is multiplied by 1000 and reported with an ``ms`` suffix.
    """
    if trainer.iter_num % 10 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {trainer.iter_num}: train loss {loss_value:.5f}")
# Register the logging function for the trainer's 'on_batch_end' event.
trainer.set_callback('on_batch_end', batch_end_callback)

# Start training. NOTE(review): the log below shows roughly 60 s per iteration
# on CPU, so the configured 2000 iterations would take a very long time.
trainer.run()

iter_dt 0.00ms; iter 0: train loss 10.97411
iter_dt 60800.91ms; iter 10: train loss 6.92168
iter_dt 54592.10ms; iter 20: train loss 6.47418
iter_dt 58699.42ms; iter 30: train loss 6.10319
iter_dt 64744.87ms; iter 40: train loss 5.61564
iter_dt 62520.68ms; iter 50: train loss 5.25659
iter_dt 57952.78ms; iter 60: train loss 4.84473
iter_dt 60909.48ms; iter 70: train loss 4.51911
iter_dt 58607.89ms; iter 80: train loss 4.47657
iter_dt 58216.79ms; iter 90: train loss 4.31750
iter_dt 62382.39ms; iter 100: train loss 4.20844
iter_dt 59524.22ms; iter 110: train loss 3.79563
iter_dt 58650.43ms; iter 120: train loss 3.84761
iter_dt 59997.55ms; iter 130: train loss 3.56417
iter_dt 68485.96ms; iter 140: train loss 3.73688


In [None]:
idx = 80
max_new_tokens = 20
newline_token_id = 198  # id the tokenizer returns for "\n"

# The prompt stops right after the colon. Training text is "Answer: <answer>",
# where GPT-2 style BPE attaches the space to the first answer token; a prompt
# ending in a bare space would end on a token the model never saw there.
inputs = bpe_tokenizer(train_ds[idx]['question']+"\nAnswer:")

# Training leaves the model in train mode; switch to eval so dropout is
# disabled while sampling.
model.eval()
outputs = model.generate(inputs, max_new_tokens=max_new_tokens, temperature=1.1, top_k=20, do_sample=True)

# Only the newly generated tokens, as plain ints so list.index works directly.
generated = outputs[0][len(inputs[0]):].tolist()

# `offset` is the number of generated tokens to keep: everything before the
# first newline. If no newline was sampled, keep the whole continuation
# instead of discarding it.
try:
    offset = generated.index(newline_token_id)
except ValueError:
    offset = len(generated)

In [None]:
# Show the prompt question, then decode only the generated continuation: the
# tokens after the prompt, limited to `offset` tokens (set in the previous cell).
print(train_ds[idx]['question'])
bpe_tokenizer.decode(outputs[0][len(inputs[0]):len(inputs[0])+offset])

In [183]:
# Scratch check: the tokenizer result has a leading dimension of size 1, which
# is why the token sequence is accessed as inputs[0].
# NOTE(review): the execution count is out of order with the cells above.
len(inputs)

1

In [122]:
# Scratch check: look up the token id of a newline (displays 198); that id is
# used to find where a generated answer ends.
bpe_tokenizer("\n")

tensor([[198]])

In [124]:
# Scratch check: position of the first newline token within the last 50 tokens
# of the output; raises ValueError if none is present.
# NOTE(review): the execution count is out of order, so `outputs` here may
# come from a different run than the generation cell above.
list(outputs[0][-50:]).index(198)

37