In [None]:
from datasets import load_dataset, load_dataset_builder, get_dataset_split_names

In [None]:
# Dataset identifier and configuration name passed to the `datasets` loaders.
DATASET_NAME = "bigbio/med_qa"
DATASET_CONFIG = "med_qa_en_source"
# Create only the builder here; its metadata is inspected before the split is loaded.
ds_builder = load_dataset_builder(DATASET_NAME,DATASET_CONFIG)

In [None]:
# Show the description text stored in the builder's dataset info.
print(ds_builder.info.description)

In [None]:
# Load the 'train' split of the configured dataset.
train_ds = load_dataset(DATASET_NAME, DATASET_CONFIG, split='train')

In [None]:
# Peek at the first training record to see which fields it carries
# (the cells below read its 'question' and 'answer' entries).
train_ds[0]

In [None]:
from mingpt.bpe import BPETokenizer

In [None]:
# Tokenizer instance reused throughout the notebook: it is called on strings to
# encode them and its `decode` method turns generated token ids back into text.
bpe_tokenizer = BPETokenizer()

In [None]:
# Tokenize the first question to inspect what the tokenizer returns.
bpe_tokenizer(train_ds[0]['question'])

In [None]:
# Ask the tokenizer's underlying encoder for its step-by-step breakdown of the
# first question, then print the 'token' entry of every part, one per line.
results = bpe_tokenizer.encoder.encode_and_show_work(train_ds[0]['question'])
# Uncomment to also show the raw question text next to the token list:
# print(train_ds[0]['question'])
for a in results['parts']:
    print(a['token'])

In [None]:
def encode_examples(example):
    """Turn one dataset record into the token sequence used for training.

    The record's 'question' and 'answer' fields are joined into a single
    string of the form "<question>\\nAnswer: <answer>\\n", which is then passed
    to the notebook-level ``bpe_tokenizer``.

    Args:
        example: Mapping with 'question' and 'answer' entries.

    Returns:
        The first element of the tokenizer's output (presumably the 1-D token
        id sequence from a batch of one — confirm against the tokenizer).
    """
    question, answer = example['question'], example['answer']
    prompt_and_answer = f"{question}\nAnswer: {answer}\n"
    token_batch = bpe_tokenizer(prompt_and_answer)
    return token_batch[0]

In [None]:
# Tokenize every training record (question + answer text) up front.
tokenizer_examples = list(map(encode_examples, train_ds))

# Discard sequences shorter than 129 tokens, then keep only the trailing
# 129 tokens of each remaining sequence so all training windows share one length.
tokenized_train = [
    tokens[-129:]
    for tokens in filter(lambda seq: len(seq) >= 129, tokenizer_examples)
]

In [None]:
from torch.utils.data import Dataset

class SimpleMedQADataset(Dataset):
    """Dataset of next-token prediction pairs built from token sequences.

    Each item is an ``(inputs, targets)`` pair taken from one stored sequence:
    ``inputs`` is the sequence without its last element and ``targets`` is the
    same sequence without its first element, i.e. shifted by one position.
    """

    def __init__(self, tokenized_examples):
        # Indexable collection of token sequences; kept by reference, not copied.
        self.tokenized_examples = tokenized_examples

    def __len__(self):
        """Number of stored sequences."""
        example_count = len(self.tokenized_examples)
        return example_count

    def __getitem__(self, idx):
        """Return the (inputs, targets) pair for the sequence at ``idx``."""
        sequence = self.tokenized_examples[idx]
        inputs = sequence[:-1]   # everything except the final token
        targets = sequence[1:]   # everything except the first token
        return inputs, targets

In [None]:
# Wrap the fixed-length token windows so they can be indexed as (input, target) pairs.
train_dataset = SimpleMedQADataset(tokenized_train)

In [None]:
from mingpt.model import GPT

# Start from the library's default configuration and override what this run needs.
model_config = GPT.get_default_config()
model_config.model_type = 'gpt2'
# presumably the vocabulary size of the BPE tokenizer used above — TODO confirm
model_config.vocab_size = 50257
# Context length; the training windows built earlier are 128 tokens long, which fits within it.
model_config.block_size = 256
model = GPT(model_config)

In [None]:
from mingpt.trainer import Trainer

# Trainer settings: library defaults with a few overrides.
train_config = Trainer.get_default_config()
# NOTE(review): the remark previously on this line said the model is "so small that we can
# go a bit faster", but model_type is 'gpt2' in this notebook — that remark looks stale;
# confirm that 5e-4 is really the intended learning rate for this model.
train_config.learning_rate = 5e-4
train_config.max_iters = 2000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

In [None]:
def batch_end_callback(trainer):
    """Print a one-line progress report on every 10th iteration.

    Args:
        trainer: Object exposing ``iter_num``, ``iter_dt`` and ``loss``
            (``loss`` must provide ``.item()``).

    The report shows ``iter_dt`` multiplied by 1000 (labelled "ms"), the
    iteration number and the current loss value. Nothing is printed on
    iterations that are not multiples of 10.
    """
    step = trainer.iter_num
    if step % 10 != 0:
        return
    elapsed_ms = trainer.iter_dt * 1000
    loss_value = trainer.loss.item()
    print(f"iter_dt {elapsed_ms:.2f}ms; iter {step}: train loss {loss_value:.5f}")
# Register the progress logger for the 'on_batch_end' event, then start training.
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

In [None]:
# Index of the training example used as a qualitative generation check.
idx = 5

import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Switch dropout off before sampling: the model has just been trained, and
# generating in training mode would randomly perturb every forward pass.
model.eval()

# Build the prompt exactly like the training text ("<question>\nAnswer: <answer>\n")
# but stop right after the colon. GPT-2 style BPE attaches a space to the word that
# follows it, so a prompt ending in a bare space would end in a stand-alone space
# token that never follows "Answer:" in the training sequences.
inputs = bpe_tokenizer(train_ds[idx]['question'] + "\nAnswer:").to(device)
outputs = model.generate(inputs, max_new_tokens=20, temperature=1.2, top_k=40, do_sample=True)

# Locate the first occurrence of token id 198 (presumably the newline that ends an
# answer in the training format — confirm with bpe_tokenizer("\n")) among the last
# 20 tokens. Converting to plain ints avoids comparing 0-d tensors one by one.
generated_ids = outputs[0][-20:].tolist()
try:
    offset = generated_ids.index(198)
except ValueError:
    # Token 198 was not generated; keep the original fallback cut-off.
    offset = 10

In [None]:
# Print the prompt question, then decode only the newly generated tokens:
# the slice skips as many leading tokens as the prompt contained.
print(train_ds[idx]['question'])
bpe_tokenizer.decode(outputs[0][len(inputs[0]):])

In [None]:
# Reference answer of the same example, for comparison with the generated text.
train_ds[idx]['answer']

In [None]:
# Inspect how a lone newline is tokenized (presumably to confirm the id 198
# that the generation cell searches for — verify against this output).
bpe_tokenizer("\n")