In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

In [10]:
def generate_embeddings(texts, tokenizer, model, batch_size=32):
    """
    Generate embeddings for one or more texts using ClinicalBERT.

    The texts are encoded in mini-batches rather than in a single forward
    pass, so memory use is bounded by ``batch_size`` instead of by the size
    of the whole corpus.

    Args:
        texts: A single string or a sequence of strings to embed.
        tokenizer: Callable invoked as ``tokenizer(batch, padding=True,
            truncation=True, return_tensors="pt", max_length=512)``; its
            result is unpacked as keyword arguments for ``model``.
        model: Callable whose output exposes ``last_hidden_state``.
        batch_size: Number of texts encoded per forward pass (default 32).

    Returns:
        NumPy array with one row per input text, holding the hidden state at
        sequence position 0 (the [CLS] token). For empty input the array has
        zero rows; its width is ``model.config.hidden_size`` when the model
        exposes that attribute, otherwise 0.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be sliced character-by-character below.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    if not texts:
        # Nothing to encode: return an empty array instead of calling the tokenizer.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size)).numpy()

    # Models that advertise a device get their inputs moved onto it;
    # plain callables without a `device` attribute are fed the tokens as-is.
    device = getattr(model, "device", None)

    cls_chunks = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize this batch only (padding is to the longest text in the batch)
        tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512)
        if device is not None:
            tokens = {name: tensor.to(device) for name, tensor in tokens.items()}

        with torch.no_grad():  # Disable gradient computation
            output = model(**tokens)

        # Keep the position-0 ([CLS]) vector per text; bring it to CPU so
        # the final NumPy conversion works regardless of the model's device.
        cls_chunks.append(output.last_hidden_state[:, 0, :].cpu())

    # Shape: (len(texts), embedding_dim)
    return torch.cat(cls_chunks, dim=0).numpy()

In [11]:
# Fetch the ClinicalBERT tokenizer and encoder from Hugging Face.
# Both must come from the same checkpoint, so the identifier is named once.
CLINICAL_BERT_CHECKPOINT = "emilyalsentzer/Bio_ClinicalBERT"

tokenizer = AutoTokenizer.from_pretrained(CLINICAL_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(CLINICAL_BERT_CHECKPOINT)

In [13]:
# Read the dummy EHR records and the dummy SNOMED term table from CSV
ehr_data = pd.read_csv("../data/ehr/dummy_ehr_data.csv")
snomed_data = pd.read_csv("../data/snomed/dummy_snomed_data.csv")

# Tip: inspect ehr_data.head() / snomed_data.head() to preview the tables.

# Pull out the free-text columns that will be embedded:
#   EHR    -> 'Diagnosis_Text'
#   SNOMED -> 'Term'
ehr_texts = ehr_data["Diagnosis_Text"].to_list()
snomed_texts = snomed_data["Term"].to_list()

In [14]:
# Encode both text collections with the loaded tokenizer and model
ehr_embeddings = generate_embeddings(texts=ehr_texts, tokenizer=tokenizer, model=model)
snomed_embeddings = generate_embeddings(texts=snomed_texts, tokenizer=tokenizer, model=model)

# Sanity check: report the shape of each embedding matrix
print("EHR Embeddings Shape:", ehr_embeddings.shape)
print("SNOMED Embeddings Shape:", snomed_embeddings.shape)


EHR Embeddings Shape: (1000, 768)
SNOMED Embeddings Shape: (100, 768)


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def find_closest_match(ehr_embedding, snomed_embeddings):
    """
    Return the index of the SNOMED embedding most similar to one EHR embedding.

    Similarity is measured with cosine similarity. ``ehr_embedding`` is
    reshaped into a single row before comparison, and the result is the
    position of the highest score among the rows of ``snomed_embeddings``.
    """
    # cosine_similarity expects 2-D inputs, so present the query as one row
    query_row = ehr_embedding.reshape(1, -1)

    # One row of scores, one column per SNOMED embedding
    scores = cosine_similarity(query_row, snomed_embeddings)

    # With a single row, the flat argmax is the winning column index
    return np.argmax(scores)


In [18]:
# Persist both embedding matrices to disk in NumPy's .npy format
import numpy as np

np.save(file="../data/ehr/ehr_embeddings.npy", arr=ehr_embeddings)
np.save(file="../data/snomed/snomed_embeddings.npy", arr=snomed_embeddings)


In [None]:
# Read the stored embedding matrices back from their .npy files
ehr_embeddings = np.load(file="../data/ehr/ehr_embeddings.npy")
snomed_embeddings = np.load(file="../data/snomed/snomed_embeddings.npy")

# Confirm what was loaded by printing each matrix's shape
print("EHR Embeddings Shape:", ehr_embeddings.shape)
print("SNOMED Embeddings Shape:", snomed_embeddings.shape)

In [16]:
# Preview the closest SNOMED term for the first few EHR texts.
N_PREVIEW = 10  # Adjust this number if you want more or fewer texts

# The embeddings may have been reloaded from disk while ehr_texts comes from
# the in-memory CSV, so bound the loop by both lengths (and by N_PREVIEW)
# instead of assuming at least 10 aligned rows exist.
for i in range(min(N_PREVIEW, len(ehr_embeddings), len(ehr_texts))):
    ehr_embedding = ehr_embeddings[i]  # Select the i-th EHR embedding

    # Find the closest matching SNOMED embedding
    closest_index = find_closest_match(ehr_embedding, snomed_embeddings)

    # Print the EHR text and the matching SNOMED text
    print(f"EHR Text: {ehr_texts[i]}")
    print(f"Closest SNOMED Text: {snomed_texts[closest_index]}")
    print("-" * 50)  # Separator for readability


EHR Text: migraine with nausea
Closest SNOMED Text: Migraine
--------------------------------------------------
EHR Text: sore throat
Closest SNOMED Text: Chronic low back pain
--------------------------------------------------
EHR Text: asthma attack
Closest SNOMED Text: Upper respiratory infection
--------------------------------------------------
EHR Text: depression symptoms
Closest SNOMED Text: Depressive disorder
--------------------------------------------------
EHR Text: dizziness
Closest SNOMED Text: Chest pain
--------------------------------------------------
EHR Text: sore throat
Closest SNOMED Text: Chronic low back pain
--------------------------------------------------
EHR Text: type 2 diabetes
Closest SNOMED Text: Diabetes mellitus type 2
--------------------------------------------------
EHR Text: type 2 diabetes
Closest SNOMED Text: Diabetes mellitus type 2
--------------------------------------------------
EHR Text: shortness of breath
Closest SNOMED Text: Shortness 