In [21]:
import json
import os
import random
import re

import fitz
from datasets import load_dataset
from sentence_splitter import split_text_into_sentences
from tqdm import tqdm


In [10]:
def clean_page_text(text: str):
    """Turn the raw text of one page into a list of sentences.

    The text is stripped of three private-use code points (U+E04A, U+E04B,
    U+E01E) and of every hyphen that is immediately followed by a newline
    (presumably words broken across lines). Remaining newlines become
    spaces, the result is split into English sentences, and sentences of
    10 characters or fewer are discarded.

    Args:
        text: Raw text of a single page.

    Returns:
        The sentences longer than 10 characters, in their original order.
    """
    # Drop the unwanted glyphs and the "-\n" line-break hyphenation.
    without_noise = re.sub("\ue04a|\ue04b|\ue01e|-\n", "", text)
    # Collapse the page onto a single line before sentence splitting.
    single_line = without_noise.replace("\n", " ")
    kept = []
    for sentence in split_text_into_sentences(single_line, language="en"):
        # Very short fragments are treated as noise.
        if len(sentence) > 10:
            kept.append(sentence)
    return kept

# Source books to mine for sentences.
pdfs = ['../data/out/medical/datasets/medbook.pdf','../data/out/medical/datasets/medbook2.pdf']
# One entry per retained page; each entry is that page's list of cleaned sentences.
pages = []
for pdf_path in pdfs:
    # Open the PDF as a context manager so the document is closed as soon as
    # its pages have been read (the original never closed it).
    with fitz.open(pdf_path) as doc:
        # Iterate over each page in the document
        for page in tqdm(doc):
            clean_page = clean_page_text(page.get_text())
            # Keep only pages with more than 10 usable sentences.
            if len(clean_page) > 10:
                # first line is usually noisy with the page header or the end of a sentence from the previous page
                pages.append(clean_page[1:])



  0%|          | 0/1428 [00:00<?, ?it/s]

100%|██████████| 1428/1428 [00:26<00:00, 53.20it/s]
100%|██████████| 2783/2783 [00:52<00:00, 52.79it/s]


In [35]:
# For each page, slide over its sentences and emit prompt/completion pairs:
# the prompt is a single sentence and the completion is the sentences that
# follow it. The completion size is drawn uniformly from COMPLETION_SIZES,
# so 3 (listed twice) is twice as likely as 4, 5 or 6.
COMPLETION_SIZES = [3, 3, 4, 5, 6]

# Dedicated seeded generator: re-running the notebook reproduces the same
# split, and the global `random` state is left untouched.
split_rng = random.Random(42)

train_jsonl = []
for page in pages:
    # Stopping 4 sentences before the end guarantees at least 4 sentences
    # after every prompt; a larger size drawn near the end of the page is
    # silently truncated by the slice below.
    for i in range(len(page)-4):
        prompt = page[i]
        completion_size = split_rng.choice(COMPLETION_SIZES)
        completion = " ".join(page[i+1:i+1+completion_size])
        train_jsonl.append({'prompt': prompt, 'completion': completion})

In [37]:
# Persist the training pairs as JSON Lines: one JSON object per line.
with open('../data/out/medical/train.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in train_jsonl)

In [36]:
def get_average_lengths(dataset_jsonl):
    """Print the mean number of sentences per prompt and per completion.

    Each example's 'prompt' and 'completion' text is split into English
    sentences and the per-example sentence counts are averaged.

    Args:
        dataset_jsonl: Iterable of dicts, each with 'prompt' and
            'completion' entries holding text.

    Returns:
        None. The two averages are printed; if there are no examples a
        short notice is printed instead of dividing by zero.
    """
    prompt_sentences = []
    completion_sentences = []
    for example in tqdm(dataset_jsonl):
        prompt_sentences.append(len(split_text_into_sentences(example['prompt'], language="en")))
        completion_sentences.append(len(split_text_into_sentences(example['completion'], language="en")))
    # Checked after the loop so that any iterable (not only sized ones) works.
    if not prompt_sentences:
        print("No examples: cannot compute average sentence counts")
        return
    print(f"Prompt sentences: {sum(prompt_sentences)/len(prompt_sentences)}")
    print(f"Completion sentences: {sum(completion_sentences)/len(completion_sentences)}")

get_average_lengths(train_jsonl)


100%|██████████| 146927/146927 [08:10<00:00, 299.65it/s]

Prompt sentences: 1.0
Completion sentences: 4.178156499486139





In [26]:
# Prepare the medical test sets: export the 'train' split of each QA dataset
# as prompt/completion JSON Lines, one file per dataset, under dev_path.
dev_path = "../data/out/medical/datasets/"

medical_qa_datasets = [
    "medalpaca/medical_meadow_medical_flashcards",
    "medalpaca/medical_meadow_wikidoc_patient_information",
    "medalpaca/medical_meadow_wikidoc"
]


for dataset_name in medical_qa_datasets:
    print(f"Processing {dataset_name}")
    dataset = load_dataset(dataset_name)
    # Map the dataset's 'input'/'output' fields onto the prompt/completion
    # schema used for the training file.
    dataset_jsonl = [
        {
            "prompt": example['input'],
            "completion": example['output'],
        }
        for example in dataset['train']
    ]
    get_average_lengths(dataset_jsonl)
    # The file is named after the part of the dataset id following the "/".
    # os.path.join adds a separator only when needed, so the trailing "/" in
    # dev_path no longer yields a doubled "//" in the output path.
    output_path = os.path.join(dev_path, dataset_name.split("/")[1] + ".jsonl")
    with open(output_path, 'w') as f:
        for example in dataset_jsonl:
            f.write(json.dumps(example) + '\n')



Processing medalpaca/medical_meadow_medical_flashcards


100%|██████████| 33955/33955 [01:40<00:00, 336.64it/s]


Prompt sentences: 0.9972610808422913
Completion sentences: 2.691032248564276
Processing medalpaca/medical_meadow_wikidoc_patient_information


100%|██████████| 5942/5942 [00:18<00:00, 327.95it/s]


Prompt sentences: 1.000673174015483
Completion sentences: 4.819757657354426
Processing monology/medical_meadow_alpaca


100%|██████████| 5942/5942 [00:18<00:00, 328.05it/s]


Prompt sentences: 1.000673174015483
Completion sentences: 4.819757657354426
Processing medalpaca/medical_meadow_wikidoc


100%|██████████| 10000/10000 [00:35<00:00, 282.73it/s]


Prompt sentences: 1.0238
Completion sentences: 6.6465
