In [4]:
import torch
import chromadb
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from chromadb.utils import embedding_functions

In [6]:
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

# === Function to get [CLS] embedding ===
def get_biobert_embedding(text: str):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
        return cls_embedding.squeeze(0).tolist()

# === Connect to ChromaDB ===
client = chromadb.PersistentClient(path="../vector_db/chroma")
collection = client.get_or_create_collection("patient_embeddings")

# === Sample Query ===
query_text = "The patient is a 45-year-old male with hypertension and chronic kidney disease, admitted for shortness of breath."

query_embedding = get_biobert_embedding(query_text)

# === Search in Vector DB ===
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3  # return top 3 matches
)

# === Print Results ===
print("🔍 Top Matches:")
for i in range(len(results["documents"][0])):
    print(f"\nMatch {i+1}")
    print("Input Text:", results["documents"][0][i])
    print("Summary:", results["metadatas"][0][i].get("summary", "[no summary]"))

🔍 Top Matches:

Match 1
Input Text: with abdominal pain and found to have likely metastatic colon
Summary: **Discharge Summary**

**Admission Date:** ___  
**Discharge Date:** ___  
**Service:** Medicine  
**Attending:** ___  

**Chief Complaint:** Fall  

**History of Present Illness:**  
Female with history of follicular lymphoma in CR and recurrent UTI presented with a fall and 2 days of fatigue. Felt lightheaded and fell, hitting the back of her head. No loss of consciousness. CT head and neck, CXR unremarkable. U/A mildly positive, given IV cipro.

**Past Medical History:**  
1. Follicular lymphoma in CR  
2. Lumbar spinal stenosis s/p XLIF  
3. Cervical spinal stenosis  
4. Recurrent UTIs with chronic cystitis  
5. Hypertension  
6. History of breast cancer  
7. History of migraines  
8. Right upper extremity nerve damage  
9. Left shoulder shingles  
10. Moderate aortic regurgitation and aortic root dilatation  

**Physical Exam:**  
- **Admission:** NAD, vitals stable, no acute

In [5]:
MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"

# === Initialize Model ===
print("🔄 Loading BioClinicalBERT model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model.eval()

# === Improved Embedding Function ===
def get_biobert_embedding(text: str, show_progress: bool = False):
    """Get embedding with optional progress indication"""
    if show_progress:
        print(f"🔧 Processing query: '{text[:50]}...'")
    
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return cls_embedding.squeeze(0).tolist()

# === Connect to ChromaDB ===
print("🔄 Connecting to ChromaDB...")
try:
    client = chromadb.PersistentClient(path="../vector_db/chroma")
    collection = client.get_or_create_collection("patient_embeddings")
    print(f"✅ Connected to collection with {collection.count()} entries")
except Exception as e:
    print(f"❌ ChromaDB connection failed: {str(e)}")
    raise

# === Sample Query ===
query_text = "The patient is a 45-year-old male with hypertension and chronic kidney disease, admitted for shortness of breath."

print("\n🔍 Running RAG Verification...")
try:
    # Get embedding with progress indication
    query_embedding = get_biobert_embedding(query_text, show_progress=True)
    
    # Search with progress bar
    print("\n🔎 Searching for similar cases...")
    with tqdm(total=1, desc="Query Processing") as pbar:
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=3
        )
        pbar.update(1)
    
    # Print results with formatting
    print("\n" + "="*50)
    print("🧪 RAG Verification Results")
    print("="*50)
    
    for i in range(len(results["documents"][0])):
        print(f"\n🏥 Match #{i+1}")
        print("-"*40)
        print("📝 Clinical Text:")
        print(results["documents"][0][i][:200] + "...")
        print("\n📋 Summary:")
        print(results["metadatas"][0][i].get("summary", "[no summary available]"))
        print(f"\n⭐ Similarity Score: {results['distances'][0][i]:.4f}")
    
    print("\n" + "="*50)
    print("✅ RAG Verification Complete")
    print("="*50)

except Exception as e:
    print(f"\n❌ Verification failed: {str(e)}")

🔄 Loading BioClinicalBERT model...
🔄 Connecting to ChromaDB...
✅ Connected to collection with 800 entries

🔍 Running RAG Verification...
🔧 Processing query: 'The patient is a 45-year-old male with hypertensio...'

🔎 Searching for similar cases...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Query Processing: 100%|██████████| 1/1 [00:00<00:00, 220.61it/s]


🧪 RAG Verification Results

🏥 Match #1
----------------------------------------
📝 Clinical Text:
with abdominal pain and found to have likely metastatic colon...

📋 Summary:
**Discharge Summary**

**Admission Date:** ___  
**Discharge Date:** ___  
**Service:** Medicine  
**Attending:** ___  

**Chief Complaint:** Fall  

**History of Present Illness:**  
Female with history of follicular lymphoma in CR and recurrent UTI presented with a fall and 2 days of fatigue. Felt lightheaded and fell, hitting the back of her head. No loss of consciousness. CT head and neck, CXR unremarkable. U/A mildly positive, given IV cipro.

**Past Medical History:**  
1. Follicular lymphoma in CR  
2. Lumbar spinal stenosis s/p XLIF  
3. Cervical spinal stenosis  
4. Recurrent UTIs with chronic cystitis  
5. Hypertension  
6. History of breast cancer  
7. History of migraines  
8. Right upper extremity nerve damage  
9. Left shoulder shingles  
10. Moderate aortic regurgitation and aortic root dilatation  


