In [5]:

# filename1= 'Triage-Counterfactual_CoT_gpt-4o_json0_99_20241125_173640.csv'
# filename2 = 'Triage-Counterfactual_CoT_gpt-4o_json100_1000_20241124_173026.csv'
# filename3 = 'Triage-Counterfactual_CoT_gpt-4o_json1000_3000_20241124_211048.csv'
# filepaths = [
#     '../results/Triage-Counterfactual/' + filename1,
#     '../results/Triage-Counterfactual/' + filename2,
#     '../results/Triage-Counterfactual/' + filename3
# ]

# # Read and combine all dataframes
# dfs = [pd.read_csv(filepath) for filepath in filepaths]
# dfs[2] = dfs[2].loc[1:len(dfs[2])-1]
# combined_df = pd.concat(dfs, ignore_index=True)

# # Save the combined dataframe to a new file if needed
# output_filepath = '../results/Triage-Counterfactual/Triage-Counterfactual_CoT_gpt-4o_json0_3000.csv'
# combined_df.to_csv(output_filepath, index=False)

### Create Embeddings for KATE Few Shot Prediction

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import numpy as np
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
data = pd.read_csv('./data/mimic-iv-private/triage_stratified_training.csv')

In [3]:
data['chiefcomplaint'] = data['chiefcomplaint'].astype(str)

# Load the pre-trained SentenceTransformer model for clinical text
# This model is fine-tuned for biomedical and clinical text embeddings
model_name = 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb'
model = SentenceTransformer(model_name)

# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


In [18]:
import requests 
hf_token = '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'
model_name = 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb'

api_url = f'https://api-inference.huggingface.co/pipeline/feature-extraction/{model_name}'
headers = {"Authorization": f"Bearer {hf_token}"}

def query(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
    return response.json()

In [19]:
texts = ["How do I get a replacement Medicare card?",
        "What is the monthly premium for Medicare Part B?",
        "How do I terminate my Medicare Part B (medical insurance)?",
        "How do I sign up for Medicare?",
        "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
        "How do I sign up for Medicare Part B if I already have Part A?",
        "What are Medicare late enrollment penalties?",
        "What is Medicare and who can get it?",
        "How can I get help with my Medicare Part A and Part B premiums?",
        "What are the different parts of Medicare?",
        "Will my Medicare premiums be higher because of my higher income?",
        "What is TRICARE ?",
        "Should I sign up for Medicare Part B if I have Veterans' Benefits?"]

output = query(texts)

In [21]:
# List of vital signs columns
vital_signs_cols = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain']

# Ensure vital signs are numeric and handle missing values
data[vital_signs_cols] = data[vital_signs_cols].apply(pd.to_numeric, errors='coerce')

# Normalize vital signs using Min-Max scaling
scaler = MinMaxScaler()
vital_signs_normalized = scaler.fit_transform(data[vital_signs_cols])

In [30]:
len(data['chiefcomplaint'].unique())

56073

In [40]:
# Process chief complaints in batches of 10,000
batch_size = 100
symptom_embeddings = []

print("Computing symptom embeddings in batches...")
for i in tqdm(range(0, 100, batch_size)):
    batch = data['chiefcomplaint'].iloc[i:i + batch_size].tolist()
    print(batch)
    embeddings = query(batch)
    symptom_embeddings.extend(embeddings)

    # Save intermediate embeddings
    np.save(f'./data/mimic-iv-private/symptom_embeddings_batch_{i//batch_size}.npy', np.array(embeddings))

# Convert symptom embeddings to a numpy array
symptom_embeddings = np.array(symptom_embeddings)

# Perform PCA on symptom embeddings to reduce dimensionality to 10
print("Performing PCA on symptom embeddings...")
pca = PCA(n_components=10)
symptom_embeddings_reduced = pca.fit_transform(symptom_embeddings)

# Save the reduced embeddings in batches of 10,000
for i in range(0, len(symptom_embeddings_reduced), batch_size):
    batch_reduced = symptom_embeddings_reduced[i:i + batch_size]
    np.save(f'./data/mimic-iv-private/symptom_embeddings_reduced_batch_{i//batch_size}.npy', batch_reduced)

# Concatenate reduced symptom embeddings and vital signs to create comprehensive embeddings
print("Creating comprehensive embeddings...")
comprehensive_embeddings = np.hstack((symptom_embeddings_reduced, vital_signs_normalized))

# Save comprehensive embeddings in batches of 10,000
for i in range(0, len(comprehensive_embeddings), batch_size):
    batch_comprehensive = comprehensive_embeddings[i:i + batch_size]
    np.save(f'./data/mimic-iv-private/comprehensive_embeddings_batch_{i//batch_size}.npy', batch_comprehensive)

Computing symptom embeddings in batches...


  0%|          | 0/1 [00:00<?, ?it/s]

['R Ankle pain', 'Headache', 'Diarrhea, Presyncope', 'RLQ abdominal pain', 'SOB', 'Agitation, SI', 'Headache', 'Back pain', 'Buttock pain, L Leg pain', 'R Hand injury', 'KIDNEY STONES', 'Fever, Cough', 'N/V, RENAL FAILURE', 'Abd pain', 'Laceration, Altered mental status', 'Chest pain, Dyspnea', 'Abd pain, Abnormal ultrasound', 'SYNCOPE', 'n/v/d', 'Chest pain, Dyspnea', 'Abd pain, s/p colonoscopy', 'MEDICAL DEVICE PROBLEM', 'Chest pain', 'Epigastric pain', 'SI', 'BACK PAIN', 'HTN', 'RLQ ABDOMINAL PAIN', 'Buttock pain, Pelvic pain', 'MS CHANGES', 'Body pain', 'RT ARM PAIN/SWELLING', 'Altered mental status', 'Back pain', 'Dyspnea', 'L Shoulder pain', 'Lip swelling', 'Wound eval, Transfer', 'SHORTNESS OF BREATH', 'S/P MVC', 'ABNL LABS/THROMBOCYTOPENIA', 'R Hand injury, Syncope', 'Chest pain, Cough', 'Dyspnea, Leg swelling', 'R Wrist injury, s/p Fall', 'L Leg numbness, s/p Fall', 'LEFT KNEE PAIN', 'Abd pain, Nausea', 'Rectal pain, Abscess', 'ETOH', 'Lower back pain', 'LEFT SHOULDER PAIN', '

100%|██████████| 1/1 [00:26<00:00, 26.98s/it]


Performing PCA on symptom embeddings...
Creating comprehensive embeddings...


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 100 and the array at index 1 has size 386776

In [64]:
vital_signs_normalized.shape
symptom_embeddings_reduced.shape

(100, 10)

In [55]:
embeddings = np.load('./data/mimic-iv-private/symptom_embeddings_batch_0.npy', allow_pickle=True)
similarities = model.similarity(embeddings, embeddings)


In [60]:
# Iterate through the first 100 rows
num_samples = 100
for i in range(num_samples):
    # Get the similarity scores for the current sample
    similarity_row = similarities[i]

    # Exclude self-similarity by setting its score to a very low value
    similarity_row[i] = -np.inf

    # Find the index of the most similar sample
    closest_index = np.argmax(similarity_row)

    # Retrieve and print the symptoms
    symptom = data['chiefcomplaint'].loc[i]
    closest_symptom = data['chiefcomplaint'].loc[closest_index.item()]

    print(f"Sample {i}: {symptom}")
    print(f"Most similar symptom: {closest_symptom}")
    print()

Sample 0: R Ankle pain
Most similar symptom: R Flank pain

Sample 1: Headache
Most similar symptom: Headache

Sample 2: Diarrhea, Presyncope
Most similar symptom: Diarrhea, Weakness

Sample 3: RLQ abdominal pain
Most similar symptom: RLQ ABDOMINAL PAIN

Sample 4: SOB
Most similar symptom: SI

Sample 5: Agitation, SI
Most similar symptom: SI

Sample 6: Headache
Most similar symptom: Headache

Sample 7: Back pain
Most similar symptom: BACK PAIN

Sample 8: Buttock pain, L Leg pain
Most similar symptom: Buttock pain, Pelvic pain

Sample 9: R Hand injury
Most similar symptom: R Hand injury, Syncope

Sample 10: KIDNEY STONES
Most similar symptom: Dysuria

Sample 11: Fever, Cough
Most similar symptom: Dyspnea, Fever

Sample 12: N/V, RENAL FAILURE
Most similar symptom: N/V

Sample 13: Abd pain
Most similar symptom: ABD PAIN

Sample 14: Laceration, Altered mental status
Most similar symptom: Altered mental status

Sample 15: Chest pain, Dyspnea
Most similar symptom: Chest pain, Dyspnea

Sample 

In [None]:

# Save the final combined embeddings to CSV
print("Saving final comprehensive embeddings to CSV...")
comprehensive_embeddings_df = pd.DataFrame(comprehensive_embeddings)
comprehensive_embeddings_df.to_csv('./data/mimic-iv-private/comprehensive_embeddings.csv', index=False)

print("Process completed successfully!")

In [None]:
# Compute embeddings for 'chiefcomplaint'
print("Computing symptom embeddings...")
symptom_embeddings = model.encode(
    data['chiefcomplaint'].tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True
)


Computing symptom embeddings...


Batches:  96%|█████████▌| 5781/6044 [06:09<00:08, 30.46it/s]

In [None]:
# Perform PCA on symptom embeddings to reduce dimensionality to 30
print("Performing PCA on symptom embeddings...")
pca = PCA(n_components=10)
symptom_embeddings_reduced = pca.fit_transform(symptom_embeddings)

# Concatenate reduced symptom embeddings and vital signs to create comprehensive embeddings
print("Creating comprehensive embeddings...")
comprehensive_embeddings = np.hstack((symptom_embeddings_reduced, vital_signs_normalized))

# Optionally, save the embeddings for later use
np.save('./data/mimic-iv-private/symptom_embeddings_reduced.npy', symptom_embeddings_reduced)
np.save('./data/mimic-iv-private/comprehensive_embeddings.npy', comprehensive_embeddings)

# If you want to store the embeddings back into the DataFrame
# Create DataFrames from the embeddings
symptom_embeddings_df = pd.DataFrame(symptom_embeddings_reduced, index=data.index)
vital_signs_normalized_df = pd.DataFrame(vital_signs_normalized, columns=vital_signs_cols, index=data.index)
comprehensive_embeddings_df = pd.concat([symptom_embeddings_df, vital_signs_normalized_df], axis=1)

# Now, 'comprehensive_embeddings_df' contains the combined embeddings
# You can save it to a file if needed
comprehensive_embeddings_df.to_csv('./data/mimic-iv-private/comprehensive_embeddings.csv', index=False)

In [5]:
embeddings = np.load('./data/mimic-iv-private/comprehensive_embeddings.npy',allow_pickle=True)

In [None]:
# Step 1: Validate the embeddings array
print("Checking for NaN values...")
print(f"NaN in embeddings: {np.isnan(embeddings).any()}")
print(f"NaN in query embedding: {np.isnan(embeddings[0]).any()}")

print("Checking for Inf values...")
print(f"Inf in embeddings: {np.isinf(embeddings).any()}")
print(f"Inf in query embedding: {np.isinf(embeddings[0]).any()}")

print("Checking for zero vectors...")
zero_vector_indices = np.where(np.linalg.norm(embeddings, axis=1) == 0)[0]
if len(zero_vector_indices) > 0:
    print(f"Found zero vectors at indices: {zero_vector_indices}")
else:
    print("No zero vectors found!")

In [11]:
embeddings.shape

(386776, 17)

In [18]:
import numpy as np
import pandas as pd
from collections import Counter


def compute_cosine_similarity(embedding, embeddings, epsilon=1e-8):
    """
    Compute cosine similarity between a single embedding and a set of embeddings.
    Avoids division by zero by adding epsilon to the norm.
    """
    # Compute cosine similarities
    similarities = np.dot(embeddings, embedding)
    return similarities

def get_top_k_similar(embedding, embeddings, k=5):
    """
    Find the top-k most similar samples to a given embedding.
    """
    similarities = compute_cosine_similarity(embedding, embeddings)
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    return top_k_indices, similarities[top_k_indices], similarities

# Get top 5 similar embeddings
top_k_indices, top_k_similarities, similarities = get_top_k_similar(embeddings[0], embeddings, k=5)

# Retrieve and print the corresponding symptoms
print(f"Querying {data['chiefcomplaint'].iloc[0]} most similar samples:")
for i, index in enumerate(top_k_indices):
    symptom = data['chiefcomplaint'].iloc[index]
    similarity = top_k_similarities[i]
    print(f"Rank {i+1}: {symptom} (Similarity: {similarity:.4f})")


Querying R Ankle pain most similar samples:
Rank 1: Sore throat, Headache, Cough (Similarity: nan)
Rank 2: ETOH, Unable to ambulate (Similarity: nan)
Rank 3: Agitation, Altered mental status (Similarity: nan)
Rank 4: Lightheaded, Presyncope (Similarity: nan)
Rank 5: Altered mental status (Similarity: nan)


Unnamed: 0,subject_id,stay_id,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,chiefcomplaint
0,15289989,36797979,97.9,92.0,20.0,100.0,143.0,82.0,10,3.0,R Ankle pain
1,18074917,38851499,97.1,66.0,17.0,100.0,135.0,94.0,9,3.0,Headache
2,15540381,36065198,97.8,67.0,16.0,100.0,111.0,70.0,4,3.0,"Diarrhea, Presyncope"
3,10360824,33721541,98.6,118.0,18.0,100.0,131.0,73.0,8,3.0,RLQ abdominal pain
4,13884765,36117985,98.0,80.0,20.0,98.0,119.0,46.0,13,3.0,SOB
...,...,...,...,...,...,...,...,...,...,...,...
386771,14197003,39304285,98.0,71.0,16.0,100.0,111.0,66.0,7,3.0,"Abd pain, Pregnant"
386772,13702399,33756340,97.8,85.0,18.0,97.0,123.0,86.0,6,3.0,LLQ abdominal pain
386773,11073871,32890162,98.3,64.0,16.0,98.0,121.0,71.0,7,2.0,Chest pain
386774,18985761,34442172,98.2,101.0,14.0,99.0,116.0,76.0,8,3.0,ABD PAIN WITH N/V
