In [1]:
# pip install transformers
# pip install torch
# pip install python-dotenv 
# ! pip install lxml
import os
import dotenv
import joblib
from huggingface_hub import hf_hub_download
import torch 

dotenv.load_dotenv()
HUGGING_FACE_API_KEY = os.getenv("HUGGING_FACE_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


### Fill Mask Model

In [15]:
print([os.walk(root_dir)])


[<generator object _walk at 0x13004d5b0>]


In [22]:
import os
import json
from bs4 import BeautifulSoup
import requests

def process_xml_files(root_dir):
    data = []
    for subdir, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.xml'):
                filepath = subdir + os.sep + file
                with open(filepath, 'r', encoding='utf-8') as xml_file:
                    soup = BeautifulSoup(xml_file, 'xml')
                    qapairs = soup.find_all('QAPair')
                    for qapair in qapairs:
                        question = qapair.find('Question')
                        answer = qapair.find('Answer')
                        # Both question and answer exist and are non-empty
                        if question and question.text.strip() and answer and answer.text.strip():
                            # Clean up the text by replacing tabs, newlines, and multiple spaces with a single space
                            clean_question = ' '.join(question.text.split())
                            clean_answer = ' '.join(answer.text.split())
                            data.append({
                                "instruction" : "You are a medical expert and you will answer questions related to medical inquiries.",
                                "input": clean_question,
                                "output": clean_answer,
                            })
    return data

def write_json_file(data, output_file):
    with open(output_file, 'w') as json_file:
        json.dump(data, json_file, indent=4)

root_dir = '/Users/jsauza/medical/MedQuad'  # Change this to the root directory of your XML files
output_file = 'output.json'  # The file where you want to store your JSON data

data = process_xml_files(root_dir)
write_json_file(data, output_file)

In [25]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalGPT-base-zh")
model = AutoModelForCausalLM.from_pretrained("medicalai/ClinicalGPT-base-zh")


Downloading (…)okenizer_config.json: 100%|██████████| 286/286 [00:00<00:00, 685kB/s]
Downloading tokenizer.json: 100%|██████████| 14.5M/14.5M [00:00<00:00, 32.9MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 96.0/96.0 [00:00<00:00, 439kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 784/784 [00:00<00:00, 7.34MB/s]
Downloading (…)model.bin.index.json: 100%|██████████| 31.9k/31.9k [00:00<00:00, 32.2MB/s]
Downloading (…)l-00002-of-00016.bin: 100%|██████████| 2.06G/2.06G [00:46<00:00, 44.1MB/s]
Downloading (…)l-00003-of-00016.bin: 100%|██████████| 940M/940M [00:21<00:00, 44.3MB/s]
Downloading (…)l-00004-of-00016.bin: 100%|██████████| 940M/940M [00:21<00:00, 44.1MB/s]
Downloading (…)l-00005-of-00016.bin: 100%|██████████| 940M/940M [00:21<00:00, 43.1MB/s]
Downloading (…)l-00006-of-00016.bin: 100%|██████████| 940M/940M [00:21<00:00, 43.8MB/s]
Downloading (…)l-00007-of-00016.bin: 100%|██████████| 940M/940M [00:21<00:00, 44.2MB/s]
Downloading (…)l-00008-of-00016.bin: 100%

In [8]:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = "medicalai/ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_id, legacy=False)
model_checkpoint = model_id #"distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
pipeline = pipeline("fill-mask", model=model, device=-1, tokenizer=tokenizer)

In [9]:
distilbert_num_parameters = model.num_parameters() / 1_000_000
print(f"'>>> ClinicalBERT number of parameters: {round(distilbert_num_parameters)}M'")
print(f"'>>> BERT number of parameters: 110M'")

'>>> ClinicalBERT number of parameters: 135M'
'>>> BERT number of parameters: 110M'


In [10]:
text = "The patient has a high fever which was indicative of [MASK]."

In [11]:
inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

'>>> The patient has a high fever which was indicative of infection.'
'>>> The patient has a high fever which was indicative of pain.'
'>>> The patient has a high fever which was indicative of symptoms.'
'>>> The patient has a high fever which was indicative of fever.'
'>>> The patient has a high fever which was indicative of influenza.'


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModelForSeq2SeqLM

model_id = "google/flan-t5-xl"
tokenizer = AutoTokenizer.from_pretrained(model_id, legacy=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipeline = pipeline("text2text-generation", model=model, device=-1, tokenizer=tokenizer, max_length=1000)

Loading checkpoint shards: 100%|██████████| 2/2 [00:25<00:00, 12.80s/it]


In [13]:
pipeline("What do you build a simple electronic engine?")
#pipeline("What are competitors to Apache Kafka?")

[{'generated_text': 'You can build a simple electronic engine by using a battery and a circuit board.'}]

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel

model_name = "WizardLM/WizardCoder-3B-V1.0"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

inputs = tokenizer("How do you make a lemonade?", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))