In [5]:
import pandas as pd
ehr_data = pd.read_csv("../data/ehr/dummy_ehr_data.csv")
snomed_data = pd.read_csv("../data/snomed/dummy_snomed_data.csv")

# Display the first few rows of each dataset
print("EHR Dataset:")
print(ehr_data.head())
print("SNOMED Dataset:")
print(snomed_data.head())

# Extract the 'text' column for embedding generation
ehr_texts = ehr_data['Diagnosis_Text'].tolist()  # Assuming a 'text' column exists
snomed_texts = snomed_data['Term'].tolist()

EHR Dataset:
    EHR_ID Patient_ID        Diagnosis_Text        Date
0  EHR0001       P370  migraine with nausea  2025-04-17
1  EHR0002       P253           sore throat  2025-03-21
2  EHR0003       P493         asthma attack  2025-04-09
3  EHR0004       P267   depression symptoms  2025-03-23
4  EHR0005       P129             dizziness  2025-03-21
SNOMED Dataset:
   SNOMED_ID                      Term                             Description
0  386661006                Chest pain                       Pain in the chest
1  267036007                   Fatigue      Feeling of tiredness or exhaustion
2  386705008       Shortness of breath                    Difficulty breathing
3  195967001                    Asthma  Chronic inflammatory disease of airway
4   44054006  Diabetes mellitus type 2                         Type 2 diabetes


In [6]:
import torch
def generate_embeddings(texts, tokenizer, model):
    """
    Generate embeddings for a list of texts using ClinicalBERT.
    """
    # Tokenize texts
    tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
    # Generate embeddings using the ClinicalBERT model
    with torch.no_grad():  # Disable gradient computation
        output = model(**tokens)
    # Extract the [CLS] token embeddings (representing the entire sequence)
    embeddings = output.last_hidden_state[:, 0, :]  # Shape: (batch_size, embedding_dim)
    return embeddings.numpy()  # Convert to NumPy array for easier handling

In [19]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
# Load ClinicalBERT from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Generate embeddings for EHR and SNOMED texts
# ehr_embeddings = generate_embeddings(ehr_texts, tokenizer, model)
snomed_embeddings = generate_embeddings(snomed_texts, tokenizer, model)
print ( type(snomed_embeddings))
# Verify the shape of the embeddings
# print("EHR Embeddings Shape:", ehr_embeddings.shape)
print("SNOMED Embeddings Shape:", snomed_embeddings.shape)

# Save embeddings to files
# np.save("../data/ehr/ehr_embeddings.npy", ehr_embeddings)
# np.save("../data/snomed/snomed_embeddings.npy", snomed_embeddings)

  from .autonotebook import tqdm as notebook_tqdm


<class 'numpy.ndarray'>
SNOMED Embeddings Shape: (100, 768)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity
def find_closest_match(query_vector, candidate_vectors):
    """
    Find the closest SNOMED embedding to the given EHR embedding using cosine similarity.
    """
    # Compute cosine similarity between the EHR embedding and all SNOMED embeddings
    similarities = cosine_similarity(query_vector.reshape(1, -1), candidate_vectors)
    # print(similarities)
    # Get the index of the closest SNOMED embedding
    closest_index = np.argmax(similarities)
    return closest_index


In [9]:
# Load EHR embeddings
import numpy as np
ehr_embeddings = np.load("../data/ehr/ehr_embeddings.npy")

# Load SNOMED embeddings
snomed_embeddings = np.load("../data/snomed/snomed_embeddings.npy")

# Print their shapes to verify
print("EHR Embeddings Shape:", ehr_embeddings.shape)
print("SNOMED Embeddings Shape:", snomed_embeddings.shape)

EHR Embeddings Shape: (1000, 768)
SNOMED Embeddings Shape: (100, 768)


In [10]:
print("EHR Embeddings Shape:", ehr_embeddings.shape)
print("SNOMED Embeddings Shape:", snomed_embeddings.shape)



EHR Embeddings Shape: (1000, 768)
SNOMED Embeddings Shape: (100, 768)


In [None]:
print (ehr_texts[1])
print (snomed_texts[9])
# print (ehr_embeddings[1])
# print (snomed_embeddings[9])
e=ehr_embeddings[1].reshape(1, -1)
s=snomed_embeddings[9].reshape(1, -1)
result=cosine_similarity(e, s)
print (result)

In [None]:
for i in range(5):  # Adjust this number if you want more or fewer texts
    ehr_embedding = ehr_embeddings[i]  # Select the i-th EHR embedding
    
    # Find the closest matching SNOMED embedding
    closest_index = find_closest_match(ehr_embedding, snomed_embeddings)
    
    # Print the EHR text and the matching SNOMED text
    print(f"EHR Text: {ehr_texts[i]}")
    print(f"ClosestIndex:{closest_index}")
    print(f"Closest SNOMED Text: {snomed_texts[closest_index]}")
    print("-" * 50)  # Separator for readability


In [None]:
import pymilvus
print(pymilvus.__version__)



2.5.7


In [3]:
from pymilvus import MilvusClient

client = MilvusClient("milvus_demo.db")

In [21]:
import random

# Text strings to search from.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]
# Use fake representation with random vectors (768 dimension).
vectors = [[random.uniform(-1, 1) for _ in range(768)] for _ in docs]
# data = [
#     {"id": i, "vector": vectors[i], "text": docs[i], "source": "history"}
#     for i in range(len(vectors))
# ]

# vector_list = vector.tolist()
data = [
    {"id": i, "vector": snomed_embeddings[i], "text": "", "source": "history"}
    for i in range(len(snomed_embeddings))
]


print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

print(type(data))
print (type(vectors))

Data has 100 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'source'])
Vector dim: 768
<class 'list'>
<class 'list'>


In [22]:
if client.has_collection(collection_name="demo_collection"):
    client.drop_collection(collection_name="demo_collection")
client.create_collection(
    collection_name="demo_collection",
    dimension=768,  # The vectors we will use in this demo has 768 dimensions
)
print (type(snomed_embeddings))
res = client.insert(collection_name="demo_collection", data=data)

print(res)

<class 'numpy.ndarray'>
{'insert_count': 100, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]}


In [None]:
print(type(client.list_collections()))


<class 'list'>


In [28]:
# from pymilvus import connections, Collection

# # Connect to Milvus
# connections.connect(alias="default", host="localhost", port=19530)

# Load your collection
# collection = Collection("demo_collection")

# collection = client.load_collection("demo_collection")
# Query for records where id >= 0 and id < 10
# results = collection.query(
#     expr="id >= 0 and id < 10",
#     output_fields=["id", "vector", "text", "source"]
# )

# results = client.query(
#     collection_name="demo_collection",
#     expr="id >= 0 and id < 10",
#     output_fields=["id", "vector", "text", "source"]
# )
results = client.query(
    collection_name="demo_collection",
    ids=[0, 2],
    output_fields=["vector", "text", "subject"],
)


# Print results
for record in results:
    print(record)


{'id': 0, 'text': '', 'vector': [np.float32(0.3405214), np.float32(0.36036423), np.float32(-0.22068807), np.float32(-0.14411251), np.float32(-0.25699162), np.float32(0.08009442), np.float32(0.44356662), np.float32(-0.4560798), np.float32(0.46227187), np.float32(0.014782628), np.float32(0.05373027), np.float32(0.3938559), np.float32(-0.3395977), np.float32(-0.33930612), np.float32(-0.15733853), np.float32(0.7126083), np.float32(-0.27955377), np.float32(0.05432962), np.float32(0.17572357), np.float32(-0.45829368), np.float32(0.22844367), np.float32(-0.14025429), np.float32(-0.25017694), np.float32(-0.11067615), np.float32(-0.64208275), np.float32(-0.29416597), np.float32(0.7285691), np.float32(0.7012455), np.float32(-0.19706284), np.float32(0.5994214), np.float32(-0.04877547), np.float32(0.23671852), np.float32(0.028442131), np.float32(0.69277555), np.float32(-0.5860519), np.float32(0.41877043), np.float32(-0.18204178), np.float32(-0.090329945), np.float32(0.24416728), np.float32(-0.1348

In [12]:
print (type(snomed_embeddings))


<class 'numpy.ndarray'>


In [2]:
from pymilvus import connections
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType

connections.connect(alias="default", host="localhost", port="19530")

2025-04-25 16:27:41,640 [ERROR][handler]: RPC error: [__internal_register], <MilvusException: (code=1, message=Incorrect port or sdk is incompatible with server, please check your port or downgrade your sdk or upgrade your server)>, <Time:{'RPC start': '2025-04-25 16:27:41.638014', 'RPC error': '2025-04-25 16:27:41.640026'}> (decorators.py:140)


MilvusException: <MilvusException: (code=1, message=Incorrect port or sdk is incompatible with server, please check your port or downgrade your sdk or upgrade your server)>

In [9]:
from pymilvus import connections
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType

connections.connect(alias="default", host="localhost", port="19530")

# Define fields
id_field = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)
type_field = FieldSchema(name="type", dtype=DataType.INT64)  # EHR, SNOMED, etc.
source_field = FieldSchema(name="source", dtype=DataType.INT64)  # Bio_ClinicalBERT, etc.
embedding_field = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768)  # Adjust if needed

# Create collection schema
schema = CollectionSchema(fields=[id_field, type_field, source_field, embedding_field])

# Initialize collection
collection = Collection(name="medical_embeddings", schema=schema)

MilvusException: <MilvusException: (code=2, message=Fail connecting to server on localhost:19530, illegal connection params or server unavailable)>