In [1]:
# Load model directly
from transformers import AutoTokenizer, pipeline
import torch
import json
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
import utils

  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <CAF361F5-1CAC-3EBE-9FC4-4B823D275CAA> /opt/homebrew/anaconda3/envs/llm-medical-bias/lib/python3.8/site-packages/torchvision/image.so
  warn(


In [2]:
# Hugging Face checkpoint shared by the pipeline and the tokenizer; defined once
# so the two can never be loaded from different models by accident.
NER_MODEL_NAME = "alvaroalon2/biobert_diseases_ner"

# Prefer Apple's Metal backend (the original target), but fall back to CUDA or
# CPU so the notebook also runs on machines without MPS support.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

ner_pipeline = pipeline("ner", model=NER_MODEL_NAME, device=device)
tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)

In [3]:
# Smoke test: run the NER pipeline on a short phrase and display the raw per-token predictions.
ner_pipeline("cardiovascular is dead")

[{'entity': 'B-DISEASE',
  'score': 0.9856285,
  'index': 1,
  'word': 'card',
  'start': 0,
  'end': 4},
 {'entity': 'I-DISEASE',
  'score': 0.98711467,
  'index': 2,
  'word': '##iovascular',
  'start': 4,
  'end': 14},
 {'entity': 'I-DISEASE',
  'score': 0.99860257,
  'index': 3,
  'word': 'is',
  'start': 15,
  'end': 17},
 {'entity': 'I-DISEASE',
  'score': 0.9709489,
  'index': 4,
  'word': 'dead',
  'start': 18,
  'end': 22}]

In [48]:
from collections import defaultdict

# Helpers that stitch WordPiece sub-tokens ("##...") back into whole words
def retrieve_prefix(idx, word_ids, ner_output):
    """Rebuild the word that ends at token ``idx`` by walking left over its sub-tokens.

    Args:
        idx: Position in ``ner_output`` of the right-most token to include.
        word_ids: Per-token word indices. ``word_ids[i + 1]`` is read as the
            word index of ``ner_output[i]``, i.e. one extra leading entry is
            expected ahead of the NER tokens.
        ner_output: Sequence of token dicts, each with a ``'word'`` key.

    Returns:
        ``ner_output[idx]['word']`` preceded by every directly preceding token
        that shares its word index, with ``"##"`` markers removed. Returns
        ``""`` when ``idx`` is negative, because there is no token to the left
        of the first one.
    """
    # A negative index would silently wrap around to the END of the sequence
    # (Python negative indexing) and return text from an unrelated token.
    if idx < 0:
        return ""

    index = idx
    prefix = ner_output[index]['word'].replace("##", "")
    # ``index > 0`` keeps the walk from stepping past the first token.
    while index > 0 and word_ids[index + 1] == word_ids[index]:
        prefix = ner_output[index - 1]['word'].replace("##", "") + prefix
        index -= 1
    return prefix

def retrieve_suffix(idx, word_ids, ner_output):
    """Collect the remainder of a word, starting at token ``idx`` and walking right.

    ``word_ids[i + 1]`` is read as the word index of ``ner_output[i]``. Starting
    with the token at ``idx``, following tokens are appended for as long as the
    next token shares the current token's word index. ``"##"`` markers are
    removed from every piece.

    Returns:
        The concatenated text of the collected tokens.
    """
    pieces = [ner_output[idx]['word'].replace("##", "")]
    cursor = idx
    # Compare the word index of the current token with that of its right neighbour.
    while word_ids[cursor + 1] == word_ids[cursor + 2]:
        cursor += 1
        pieces.append(ner_output[cursor]['word'].replace("##", ""))
    return "".join(pieces)

def there_is_same_word_after(idx, word_ids):
    """Tell whether the token after position ``idx`` belongs to the same word.

    Positions are shifted by one: the entries compared are ``word_ids[idx + 1]``
    and ``word_ids[idx + 2]``.
    """
    current_word_id = word_ids[idx + 1]
    following_word_id = word_ids[idx + 2]
    return current_word_id == following_word_id

def chunk_text(text, tokenizer, max_length=512, stride=128):
    """Split ``text`` into overlapping, token-bounded chunks.

    The text is tokenized without special tokens, cut into windows of at most
    ``max_length`` token ids, and each window is decoded back into a string.
    Consecutive windows start ``max_length - stride`` tokens apart, so
    ``stride`` is the number of tokens two neighbouring chunks share.

    Args:
        text: The text to split.
        tokenizer: Callable returning a mapping with an ``'input_ids'`` list,
            and providing ``decode(ids, skip_special_tokens=...)``.
        max_length: Maximum number of tokens per chunk. Must be positive.
        stride: Overlap, in tokens, between consecutive chunks. Must satisfy
            ``0 <= stride < max_length``.

    Returns:
        A list of decoded chunk strings; empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is out of range. Previously
            such values either dropped the whole text silently or failed with
            an unrelated ``range()`` error.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not 0 <= stride < max_length:
        raise ValueError(
            f"stride must satisfy 0 <= stride < max_length, got stride={stride}, "
            f"max_length={max_length}"
        )

    # NOTE(review): max_length counts tokens WITHOUT special tokens; if the
    # downstream model adds its own and has a hard input limit, a smaller
    # max_length may be needed - confirm against the model in use.
    input_ids = tokenizer(text, add_special_tokens=False)['input_ids']
    total = len(input_ids)
    step = max_length - stride

    token_chunks = []
    for start in range(0, total, step):
        window = input_ids[start:start + max_length]
        token_chunks.append(tokenizer.decode(window, skip_special_tokens=True))
        # Once a window reaches the end of the text, any later window would be
        # entirely contained in this one, so stop instead of emitting a
        # redundant (and possibly word-splitting) tail fragment.
        if start + max_length >= total:
            break
    return token_chunks

# Function to extract disease entities from a single question chunk
def extract_diseases_from_chunk(text):
    """Extract disease mentions from one chunk of text.

    Runs the module-level ``ner_pipeline`` on ``text`` and merges consecutive
    ``B-DISEASE`` / ``I-DISEASE`` tokens into entity strings, re-joining
    ``"##"`` sub-tokens into whole words.

    Args:
        text: The text chunk to analyse.

    Returns:
        A list of entity strings in order of appearance (duplicates are kept).

    Raises:
        Exception: If, while an entity is being tracked, a token carries a
            label other than ``'B-DISEASE'``, ``'I-DISEASE'`` or ``'0'``.
    """
    # NOTE(review): the helpers treat word_ids[i + 1] as the word index of
    # ner_output[i], which presumes the pipeline returns one entry per
    # non-special token - confirm for the pipeline configuration in use.
    word_ids = tokenizer(text).word_ids()
    ner_output = ner_pipeline(text)
    disease_entities = []
    current_entity = ""
    disease_detected = False

    for idx, token in enumerate(ner_output):
        label = token['entity']
        word = token['word']
        # Sub-tokens are detected the same way in every branch (prefix check).
        is_subword = word.startswith("##")

        # If disease is currently being tracked
        if disease_detected:
            if label == 'B-DISEASE':
                if is_subword:
                    current_entity += word[2:]  # Remove the '##' from subwords
                else:
                    # A new entity begins: close the one in progress first.
                    disease_entities.append(current_entity)
                    current_entity = word
            elif label == 'I-DISEASE':
                if is_subword:
                    current_entity += word[2:]  # Remove the '##' from subwords
                else:
                    current_entity += " " + word
            elif label == '0':
                # The entity ended mid-word: complete the word with its
                # remaining sub-tokens so no truncated fragment is stored.
                if there_is_same_word_after(idx - 1, word_ids):
                    disease_entities.append(current_entity + retrieve_suffix(idx, word_ids, ner_output))
                else:
                    disease_entities.append(current_entity)
                current_entity = ''
                disease_detected = False
            else:
                raise Exception(f'Unknown NER Type Detected: {token}')
        # If disease is not being tracked
        else:
            if label == 'B-DISEASE':
                disease_detected = True
                current_entity = word
            elif label == 'I-DISEASE':
                disease_detected = True
                # An inside-tag with no open entity: recover the text to the
                # left. For the very first token there is nothing to recover;
                # asking for index -1 would wrap around to the last token.
                prefix = retrieve_prefix(idx - 1, word_ids, ner_output) if idx > 0 else ""
                if is_subword:
                    current_entity = prefix + word[2:]  # Remove the '##' from subwords
                elif prefix:
                    current_entity = prefix + " " + word
                else:
                    # Avoid a leading space when there is no prefix.
                    current_entity = word

    # Add the last entity if it exists
    if current_entity:
        disease_entities.append(current_entity)

    return disease_entities

# Main function to process text and extract disease entities
def extract_diseases(text):
    """Return the distinct disease entities found in ``text``, in first-seen order.

    The text is split with ``chunk_text`` (using the module-level tokenizer),
    each chunk is passed to ``extract_diseases_from_chunk``, and repeated
    entities are dropped while the order of first appearance is kept.
    """
    first_seen = {}
    for piece in chunk_text(text, tokenizer):
        for disease in extract_diseases_from_chunk(piece):
            # Dict keys keep insertion order, so this doubles as an ordered set.
            first_seen.setdefault(disease, None)
    return list(first_seen)

# Add disease NER to data
def add_disease_ner(data):
    """Annotate every item of ``data`` in place with its extracted diseases.

    For each item, the value under ``"question"`` (empty string when the key is
    missing) is passed to ``extract_diseases`` and the result is stored under
    the ``'disease'`` key. Nothing is returned.
    """
    for record in data:
        record['disease'] = extract_diseases(record.get("question", ""))
   
# Dataset path, expressed relative to the parent directory; the path itself is
# looked up under the 'MedQA' key of utils._DATASETS.
input_filepath = "../"+utils._DATASETS['MedQA']
# NOTE(review): the second argument (2000) presumably caps how many records are
# loaded - confirm against utils.load_dataset.
data = utils.load_dataset(input_filepath, 2000)
# Mutates `data` in place: every item gains a 'disease' list.
add_disease_ner(data)

# Technical Debt -- Converge this with llm-categorize.py

In [49]:
import datetime 
import time 

# One initially empty set per metadata category; save_global_pools() serialises
# this mapping to JSON.
global_pools = {
    category: set()
    for category in (
        "diseases",
        "question_types",
        "medical_specialties",
        "severity_urgency",
        "patient_demographics",
    )
}

def save_dataset(data, input_filepath, date):
    """Write ``data`` as JSON Lines next to ``input_filepath``.

    The output file lives in the same directory as ``input_filepath`` and is
    named ``<input stem>_with_category_metadata_<date>.jsonl``. Each item of
    ``data`` is serialised with ``json.dumps`` onto its own line, and the
    destination path is printed when done.
    """
    directory, source_name = os.path.split(input_filepath)
    stem = os.path.splitext(source_name)[0]
    output_filepath = os.path.join(directory, f"{stem}_with_category_metadata_{date}.jsonl")

    with open(output_filepath, 'w') as f:
        f.writelines(json.dumps(item) + '\n' for item in data)

    print(f"New dataset with metadata saved to {output_filepath}")

def save_global_pools(input_filepath, date):
    """Dump the module-level ``global_pools`` to ``global_pools_<date>.json``.

    The file is written to the directory that contains ``input_filepath``.
    Every pool is converted to a list so it can be JSON-encoded, and the
    destination path is printed when done.
    """
    target_dir = os.path.dirname(input_filepath)
    output_filepath = os.path.join(target_dir, f"global_pools_{date}.json")

    # Sets are not JSON-serialisable, so convert each pool to a list first.
    serializable_pools = {}
    for pool_name, members in global_pools.items():
        serializable_pools[pool_name] = list(members)

    with open(output_filepath, 'w') as f:
        json.dump(serializable_pools, f, indent=2)
    print(f"Global pools saved to {output_filepath}")
    
# Write the annotated dataset next to the input file; 'disease_ner_two' is passed
# as the `date` argument and therefore becomes the filename suffix.
save_dataset(data, input_filepath, 'disease_ner_two')

# Save global pools of each metadata category
# NOTE(review): nothing visible in this notebook adds entries to global_pools, so
# the saved file may hold only empty lists - confirm that this is intended.
save_global_pools(input_filepath, 'disease_ner_two')


New dataset with metadata saved to .././data/medqa/questions/US/4_options/phrases_no_exclude_test_with_category_metadata_disease_ner_two.jsonl
Global pools saved to .././data/medqa/questions/US/4_options/global_pools_disease_ner_two.json


In [59]:
# Inspect how the tokenizer splits a short string, including the special tokens it adds.
tokenizer("a yo").tokens()

['[CLS]', 'a', 'yo', '[SEP]']

In [45]:
# NOTE(review): this rebinds `data`, replacing the annotated list built by the
# add_disease_ner cell - use a different name if both are needed.
data = utils.load_dataset(input_filepath, 10)

# Run NER on the first question and display the tokenizer's word index for each
# token, so the two sequences can be compared side by side.
ner_output = ner_pipeline(data[0]['question'])
tokenizer(data[0]['question']).word_ids()


[None,
 0,
 1,
 2,
 2,
 2,
 2,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 29,
 29,
 30,
 30,
 31,
 32,
 33,
 33,
 34,
 35,
 36,
 37,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 78,
 78,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 None]

In [51]:
import requests

def get_pubmed_article_count(disease_term, timeout=10):
    """Return the number of PubMed records matching ``disease_term``.

    Queries the NCBI E-utilities ESearch endpoint in JSON mode and reads the
    ``count`` field of the ``esearchresult`` object.

    Args:
        disease_term: Search term sent as the ESearch ``term`` parameter.
        timeout: Seconds to wait for the HTTP request before giving up. Without
            a timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The match count as an ``int``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the JSON payload lacks ``esearchresult`` / ``count``.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": disease_term,
        "retmode": "json"
    }
    response = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    payload = response.json()
    return int(payload["esearchresult"]["count"])

# Example usage
# The term is kept in one variable so the printed label always matches the
# query that was actually sent (the message previously said 'diabetes').
example_term = "osteoarthritis"
count = get_pubmed_article_count(example_term)
print(f"Number of articles for '{example_term}': {count}")

Number of articles for 'diabetes': 124113


In [57]:
# Ask the model for the biomedical concepts tested by one sample question.
# The prompt previously contained `\.`, an invalid escape sequence that Python
# keeps as a literal backslash (with a DeprecationWarning); the stray backslash
# is removed so the question is followed by a plain full stop.
utils.query_gpt("A 39-year-old woman is brought to the emergency department because of fevers, chills, and left lower quadrant pain. Her temperature is 39.1°C (102.3°F), pulse is 126/min, respirations are 28/min, and blood pressure is 80/50 mm Hg. There is blood oozing around the site of a peripheral intravenous line. Pelvic examination shows mucopurulent discharge from the cervical os and left adnexal tenderness. Laboratory studies show:\nPlatelet count 14,200/mm3\nFibrinogen 83 mg/mL (N = 200–430 mg/dL)\nD-dimer 965 ng/mL (N < 500 ng/mL)\nWhen phenol is applied to a sample of the patient's blood at 90°C, a phosphorylated N-acetylglucosamine dimer with 6 fatty acids attached to a polysaccharide side chain is identified. A blood culture is most likely to show which of the following?. For the above question, extract the knowledge concepts that this question is trying to test. Give me the biomedical concept names alone. Answer in json format under the key 'concepts'.", json=True)

'{\n    "concepts": ["Pelvic inflammatory disease", "Sepsis", "Disseminated intravascular coagulation", "Endotoxin", "Lipopolysaccharide", "Gram-negative bacteria"]\n}'

In [58]:
# Sample question used to try out the concept-extraction prompt.
question_text="A 39-year-old woman is brought to the emergency department because of fevers, chills, and left lower quadrant pain. Her temperature is 39.1°C (102.3°F), pulse is 126/min, respirations are 28/min, and blood pressure is 80/50 mm Hg. There is blood oozing around the site of a peripheral intravenous line. Pelvic examination shows mucopurulent discharge from the cervical os and left adnexal tenderness. Laboratory studies show:\nPlatelet count 14,200/mm3\nFibrinogen 83 mg/mL (N = 200–430 mg/dL)\nD-dimer 965 ng/mL (N < 500 ng/mL)\nWhen phenol is applied to a sample of the patient's blood at 90°C, a phosphorylated N-acetylglucosamine dimer with 6 fatty acids attached to a polysaccharide side chain is identified. A blood culture is most likely to show which of the following?"
# Prompt variant that asks for the "key" concepts "in their most concise form".
# The leading blank line and the indentation are part of the string sent to the model.
prompt = f"""

    {question_text}.\n For the above question, extract the key knowledge concepts that this question is trying to test. Give me the biomedical concept names alone in their most concise form. Answer in json format under the key 'concepts'.
    """
# json=True is forwarded to utils.query_gpt; the prompt asks for a JSON object keyed by 'concepts'.
utils.query_gpt(prompt, json=True)

'{\n    "concepts": [\n        "Pelvic inflammatory disease",\n        "Sepsis",\n        "Disseminated intravascular coagulation",\n        "Endotoxin-induced coagulation cascade",\n        "Gram-negative bacterial infection"\n    ]\n}'

In [60]:
import requests

def get_bing_search_count(query, api_key, timeout=10):
    """Return Bing's estimated number of web pages matching ``query``.

    Args:
        query: Search string sent as the ``q`` parameter.
        api_key: Subscription key sent in the ``Ocp-Apim-Subscription-Key``
            header.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value of ``webPages.totalEstimatedMatches`` from the JSON
        response, or ``0`` when the response has no such field.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status (for
            example an invalid key). Such failures were previously reported as
            ``0`` matches, indistinguishable from a genuine empty result.
        requests.RequestException: On connection problems or timeout.
    """
    url = "https://api.bing.microsoft.com/v7.0/search"
    headers = {"Ocp-Apim-Subscription-Key": api_key}
    params = {"q": query}
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Surface authentication/quota errors instead of silently returning 0.
    response.raise_for_status()
    payload = response.json()
    return payload.get("webPages", {}).get("totalEstimatedMatches", 0)

# Read the subscription key from the environment rather than embedding it in the
# notebook, where it ends up in version control and shared copies.
# NOTE(review): the key that was hard-coded here has been exposed in plain text
# and should be rotated.
api_key = os.environ["BING_SEARCH_API_KEY"]
count = get_bing_search_count("alzheimer's", api_key)
print(f"Estimated mentions on the web: {count}")

Estimated mentions on the web: 4710000
