# Preprocess USMLE for finetuning and evaluation

## Create version for finetuning

In [2]:
from datasets import load_dataset

# Name of the GCS bucket that the processed datasets are written to.
gcs_bucket_name = "open-llm-finetuning"

# Load the MedQA-USMLE (4-option) dataset and keep its train split for finetuning.
usml_raw = load_dataset("GBaker/MedQA-USMLE-4-options")
usml_train = usml_raw['train']

# Show the available splits, their columns and row counts.
print(usml_raw)

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'options', 'meta_info', 'answer_idx', 'metamap_phrases'],
        num_rows: 10178
    })
    test: Dataset({
        features: ['question', 'answer', 'options', 'meta_info', 'answer_idx', 'metamap_phrases'],
        num_rows: 1273
    })
})


In [3]:
# Preprocess dataset for finetuning
def format_mcf_finetuning(example):
    """Turn one multiple-choice record into a prompt/completion pair.

    Args:
        example: Mapping with a 'question' string, an 'options' mapping from
            option label to option text, and an 'answer' value.

    Returns:
        dict with two keys:
            "prompt": the question, then one "<label>. <text>" line per option
                (labels in sorted order), then a trailing "Answer:" line.
            "completion": the value of example['answer'], unchanged.
    """
    options = example['options']

    # Emit the options in sorted label order so the prompt layout is stable.
    option_lines = []
    for label in sorted(options.keys()):
        option_lines.append(f"{label}. {options[label]}")
    options_block = "\n".join(option_lines)

    return {
        "prompt": f"Question: {example['question']}\n{options_block}\nAnswer:",
        "completion": example['answer'],
    }

# Format every training example; all original columns are removed so only
# the fields returned by format_mcf_finetuning ("prompt", "completion") remain.
usml_train_data = usml_train.map(
    format_mcf_finetuning,
    remove_columns=usml_train.column_names
)

# Save the formatted training set to the GCS bucket.
usml_train_data.save_to_disk(f"gcs://{gcs_bucket_name}/data/finetuning/train_usml")

Saving the dataset (0/1 shards):   0%|          | 0/10178 [00:00<?, ? examples/s]

In [4]:
# Sanity check: show the first formatted training example (prompt, then completion).
print(usml_train_data[0]['prompt'], usml_train_data[0]['completion'], sep="\n")

Question: A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. She otherwise feels well and is followed by a doctor for her pregnancy. Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. Which of the following is the best treatment for this patient?
A. Ampicillin
B. Ceftriaxone
C. Doxycycline
D. Nitrofurantoin
Answer:
Nitrofurantoin


## Version for evaluation

In [4]:
# Use the dataset's test split as the evaluation set.
usml_eval = usml_raw['test']

# Apply the same prompt/completion formatting as the training split; all
# original columns are removed so only "prompt" and "completion" remain.
usml_eval_data = usml_eval.map(
    format_mcf_finetuning,
    remove_columns=usml_eval.column_names
)

# Save the formatted evaluation set to the GCS bucket.
# NOTE(review): this is written under data/finetuning/ although it is the
# evaluation split — confirm the destination path is intended.
usml_eval_data.save_to_disk(f"gcs://{gcs_bucket_name}/data/finetuning/eval_usml")

Map:   0%|          | 0/1273 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1273 [00:00<?, ? examples/s]