In [1]:
# Cell 1 - Import necessary libraries and modules
# hpomapper: Extracting HPO Terms from Clinical Notes
# ====================================================

# This notebook demonstrates the full workflow of hpomapper:
# 1. Setting up the environment
# 2. Vectorizing the HPO database (if needed)
# 3. Running hpomapper on clinical notes
# 4. Analyzing and visualizing the results

import os
# Switch the working directory to ../src before anything else. The relative
# paths used later in this notebook (e.g. '../db/hp.json') and the '.' entry
# added to sys.path below are all resolved against this directory.
os.chdir("../src")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import json
import sys

# Add current directory to path to import hpomapper modules
# ('.' is the directory selected by os.chdir above, so this must run after it
# and before the project imports that follow)
sys.path.append('.')

# Import hpomapper modules - make sure these match your actual file names
from hpomapper_main import HPOVectorDB, BedrockLLM, hpomapper, parse_hpo_json
from hpo_vectorization import HPOVectorizer, create_test_embeddings_file

# Import AWS credential helper
import aws_helper
import logging

# Configure logging
# INFO level; each record is rendered as
# "<timestamp> - <logger name> - <level> - <message>"
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

## 1. Setting up the environment
First, let's make sure we have the necessary files and AWS credentials.

In [7]:
# Cell 2 - Setting up the environment
# ## 1. Setting up the environment
# First, let's make sure we have the necessary files and AWS credentials

# Verify AWS access: obtain a session through the helper, confirm the caller
# identity with STS, then make sure a Bedrock runtime client can be created.
try:
    # The helper returns a session (the original notes say it handles SSO)
    aws_session = aws_helper.get_aws_session()

    # A cheap STS round-trip proves the credentials are actually usable
    sts = aws_session.client("sts")
    identity = sts.get_caller_identity()

    print("✅ AWS credentials verified successfully")
    print(f"Account: {identity['Account']}")
    print(f"User: {identity['Arn']}")

    # Bedrock is checked separately so that a failure here is reported as a
    # warning instead of as a credential failure
    try:
        bedrock = aws_session.client("bedrock-runtime")
        print("✅ Bedrock client initialized successfully")
    except Exception as bedrock_err:
        print(f"⚠️ Bedrock client initialization failed: {bedrock_err}")
        print("Make sure your AWS role has access to Bedrock")

except Exception as cred_err:
    print(f"❌ AWS credential check failed: {cred_err}")
    # Static remediation hints, emitted one per line
    print(
        "Please ensure your AWS SSO session is active:",
        "  aws sso login --profile plm-dev",
        "Or set environment variables:",
        "  export AWS_PROFILE=plm-dev",
        "  export AWS_DEFAULT_REGION=us-west-2",
        sep="\n",
    )

# Locations of the HPO artifacts (relative to the current working directory)
hpo_embedding_file = '../db/G2GHPO_metadata_0.2k.npy'
hpo_json_file = '../db/hp.json'

# Report on each artifact in turn: (label, path, hint printed when missing)
for artifact_label, artifact_path, missing_hint in (
    ("HPO embeddings", hpo_embedding_file, "   Vectorization will be required."),
    ("HPO JSON", hpo_json_file, "   Please download it from https://hpo.jax.org/app/"),
):
    if not os.path.exists(artifact_path):
        print(f"⚠️ {artifact_label} file not found: {artifact_path}")
        print(missing_hint)
        continue
    print(f"✅ {artifact_label} file found: {artifact_path}")

# Make sure a small clinical-notes CSV exists for the demonstration
sample_notes_file = 'sample_clinical_notes.csv'

if os.path.exists(sample_notes_file):
    print(f"Sample clinical notes file found: {sample_notes_file}")
else:
    # No file yet: write three synthetic notes so the rest of the demo can run
    print(f"Creating sample clinical notes file: {sample_notes_file}")

    demo_records = [
        (
            'PT001',
            'Patient presents with delayed motor development, hypotonia, and macrocephaly. MRI shows periventricular leukomalacia.',
        ),
        (
            'PT002',
            'A 5-year-old male with severe intellectual disability, epilepsy, and autistic features. He also has a history of recurrent respiratory infections.',
        ),
        (
            'PT003',
            'Adolescent female with short stature, bilateral cataracts, and sensorineural hearing loss. Family history of similar features in maternal lineage.',
        ),
    ]
    sample_notes = pd.DataFrame(demo_records, columns=['patient_id', 'note'])

    sample_notes.to_csv(sample_notes_file, index=False)
    print(f"Created sample file with {len(sample_notes)} clinical notes")

2025-03-30 17:18:30,692 - aws_helper - INFO - Loaded configuration from ../config/hpomapper_config.yaml
2025-03-30 17:18:30,694 - aws_helper - INFO - Using AWS profile from config: plm-dev
2025-03-30 17:18:30,694 - aws_helper - INFO - Using AWS region from config: us-west-2
2025-03-30 17:18:30,695 - aws_helper - INFO - Creating AWS session with profile: plm-dev
2025-03-30 17:18:32,453 - aws_helper - INFO - AWS session created successfully. Using identity: arn:aws:sts::346034459362:assumed-role/AWSReservedSSO_CHLA_PowerUserAccess_f216beaca69a9496/hhakimjavadi@chla.usc.edu


✅ AWS credentials verified successfully
Account: 346034459362
User: arn:aws:sts::346034459362:assumed-role/AWSReservedSSO_CHLA_PowerUserAccess_f216beaca69a9496/hhakimjavadi@chla.usc.edu
✅ Bedrock client initialized successfully
✅ HPO embeddings file found: ../db/G2GHPO_metadata_0.2k.npy
✅ HPO JSON file found: ../db/hp.json
Sample clinical notes file found: sample_clinical_notes.csv


In [8]:
# Cell 4 - Test multiple embedding and LLM models
import boto3
import json
import aws_helper
import time
from tqdm import tqdm

# Obtain a session via the helper and build the Bedrock runtime client from it
aws_session = aws_helper.get_aws_session()
bedrock_runtime = aws_session.client("bedrock-runtime")

print("Testing access to various AWS Bedrock models...")

# Embedding models to probe
embedding_models = ["amazon.titan-embed-text-v2:0"]

print("\n== TESTING EMBEDDING MODELS ==")
for model_id in embedding_models:
    print(f"\nTesting {model_id}...")

    try:
        # Titan embed request: input text plus a requested output size of 512
        # and normalization switched on
        request_body = dict(
            inputText="Test input for embedding model",
            dimensions=512,
            normalize=True,
        )

        response = bedrock_runtime.invoke_model(
            modelId=model_id,
            body=json.dumps(request_body),
        )

        # The body is a stream; read it fully and decode the JSON payload
        response_body = json.loads(response.get("body").read())

        if "embedding" not in response_body:
            print(f"⚠️ Got response from {model_id}, but no embedding found")
        else:
            print(f"✅ Successfully accessed {model_id}")
            print(f"   Embedding dimension: {len(response_body['embedding'])}")
        # Printed for both outcomes, to show what the model returned
        print(f"   Response keys: {list(response_body.keys())}")

    except Exception as err:
        print(f"❌ Error accessing {model_id}: {err}")

# Probe each LLM with a tiny prompt and report whether text came back
print("\n== TESTING LLM MODELS ==")

# Models to test; "type" selects the request/response format used below
llm_models = [
    {"id": "anthropic.claude-v2:1", "name": "Claude v2.1", "type": "claude-v2"},
    {"id": "anthropic.claude-3-5-sonnet-20241022-v2:0", "name": "Claude 3.5 Sonnet", "type": "claude-3"}
]

# Request payload for each known model type
llm_request_formats = {
    # Claude 3.x models (messages API)
    "claude-3": {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 200,
        "top_k": 250,
        "stop_sequences": [],
        "temperature": 1,
        "top_p": 0.999,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "hello world"
                    }
                ]
            }
        ]
    },
    # Claude v2 models (prompt/completion API)
    "claude-v2": {
        "prompt": "\n\nHuman: Hello world\n\nAssistant:",
        "max_tokens_to_sample": 300,
        "temperature": 0.5,
        "top_k": 250,
        "top_p": 1,
        "stop_sequences": ["\n\nHuman:"],
        "anthropic_version": "bedrock-2023-05-31"
    },
    # Llama models
    "llama": {
        "prompt": "Hello world",
        "max_gen_len": 512,
        "temperature": 0.5,
        "top_p": 0.9
    },
}

# Payload used for any model type that is not listed above
generic_llm_request = {
    "inputText": "Hello world",
    "textGenerationConfig": {
        "maxTokenCount": 200,
        "temperature": 0.5,
        "topP": 0.9
    }
}

# Response field carrying the generated text for the single-field formats
llm_text_fields = {"claude-v2": "completion", "llama": "generation"}

for model in llm_models:
    model_id = model["id"]
    model_type = model["type"]
    print(f"\nTesting {model_id}...")

    try:
        request_body = llm_request_formats.get(model_type, generic_llm_request)

        # Time only the model invocation itself
        start_time = time.time()
        response = bedrock_runtime.invoke_model(
            modelId=model_id,
            body=json.dumps(request_body),
        )
        end_time = time.time()

        response_body = json.loads(response.get('body').read())

        # Pull the generated text out of the response; stays None when the
        # expected field is absent or the model type has no known format
        response_text = None
        if model_type == "claude-3":
            if "content" in response_body:
                # First content part of type "text", if there is one
                response_text = next(
                    (
                        part.get("text", "")
                        for part in response_body.get("content", [])
                        if part.get("type") == "text"
                    ),
                    None,
                )
        else:
            text_field = llm_text_fields.get(model_type)
            if text_field is not None and text_field in response_body:
                response_text = response_body.get(text_field, "")

        if not response_text:
            print(f"⚠️ Got response from {model_id}, but couldn't extract text")
            print(f"   Response keys: {list(response_body.keys())}")
        else:
            print(f"✅ Successfully accessed {model_id}")
            print(f"   Response: '{response_text[:50]}...'")
            print(f"   Response time: {end_time - start_time:.2f}s")

    except Exception as err:
        print(f"❌ Error accessing {model_id}: {err}")

# Closing guidance for the model-access checks, printed one line at a time
for recommendation_line in (
    "\n== RECOMMENDATIONS ==",
    "Based on the test results, update the hpomapper config to use:",
    "1. For LLM extraction: Use the working Claude model",
    "2. For embeddings: Use amazon.titan-embed-text-v2:0 with the correct format",
):
    print(recommendation_line)

2025-03-30 17:18:34,725 - aws_helper - INFO - Loaded configuration from ../config/hpomapper_config.yaml
2025-03-30 17:18:34,726 - aws_helper - INFO - Using AWS profile from config: plm-dev
2025-03-30 17:18:34,727 - aws_helper - INFO - Using AWS region from config: us-west-2
2025-03-30 17:18:34,728 - aws_helper - INFO - Creating AWS session with profile: plm-dev
2025-03-30 17:18:36,231 - aws_helper - INFO - AWS session created successfully. Using identity: arn:aws:sts::346034459362:assumed-role/AWSReservedSSO_CHLA_PowerUserAccess_f216beaca69a9496/hhakimjavadi@chla.usc.edu


Testing access to various AWS Bedrock models...

== TESTING EMBEDDING MODELS ==

Testing amazon.titan-embed-text-v2:0...
✅ Successfully accessed amazon.titan-embed-text-v2:0
   Embedding dimension: 512
   Response keys: ['embedding', 'embeddingsByType', 'inputTextTokenCount']

== TESTING LLM MODELS ==

Testing anthropic.claude-v2:1...
✅ Successfully accessed anthropic.claude-v2:1
   Response: ' Hello!...'
   Response time: 0.97s

Testing anthropic.claude-3-5-sonnet-20241022-v2:0...
✅ Successfully accessed anthropic.claude-3-5-sonnet-20241022-v2:0
   Response: 'Hi there! How can I help you today?...'
   Response time: 0.61s

== RECOMMENDATIONS ==
Based on the test results, update the hpomapper config to use:
1. For LLM extraction: Use the working Claude model
2. For embeddings: Use amazon.titan-embed-text-v2:0 with the correct format



## 2. Vectorizing the HPO database (if needed)
If the HPO embeddings file is not available, we need to create it by vectorizing the HPO terms.


In [9]:
# Cell 5 - Vectorizing the HPO database (if needed)
# ## 2. Vectorizing the HPO database (if needed)
# If the HPO embeddings file is not available, we need to create it by vectorizing the HPO terms.

# Decide what to do based on which HPO artifacts are on disk:
#   * embeddings present          -> reuse them
#   * embeddings and hp.json both absent -> nothing can be done; say so explicitly
#   * only hp.json present        -> vectorize it
DEMO_TERM_COUNT = 200  # number of HPO terms embedded for the demonstration
MOCK_TERM_COUNT = 50   # number of terms in the mock fallback file

if os.path.exists(hpo_embedding_file):
    print(f"Using existing HPO embeddings file: {hpo_embedding_file}")

elif not os.path.exists(hpo_json_file):
    # Previously this case fell into the "using existing file" message even
    # though no embeddings file existed.
    print(f"❌ Cannot vectorize: neither {hpo_embedding_file} nor {hpo_json_file} exists")
    print("   Please download hp.json from https://hpo.jax.org/app/")

else:
    print("Vectorizing HPO database...")

    # First, let's verify we can access the embedding model
    try:
        # Get AWS session
        aws_session = aws_helper.get_aws_session()

        # Test access to the embedding model
        model_id = "amazon.titan-embed-text-v2:0"
        print(f"Testing access to {model_id}...")

        # Create bedrock-runtime client
        bedrock_runtime = aws_session.client('bedrock-runtime')

        # Simplified request format for Titan embed text v2
        request_body = {
            "inputText": "Test input for embedding model"
        }

        # Try to invoke the model
        response = bedrock_runtime.invoke_model(
            modelId=model_id,
            body=json.dumps(request_body)
        )

        # If we get here, we have access
        response_body = json.loads(response.get('body').read())
        print(f"✅ Successfully accessed {model_id}")
        print(f"Embedding dimension: {len(response_body.get('embedding', []))}")

        # Initialize the vectorizer with the working model
        vectorizer = HPOVectorizer(
            model_id=model_id  # Use the model we just verified
        )

        # Parse HPO JSON
        hpo_df = vectorizer.parse_hpo_json(hpo_json_file)

        # We'll use a small subset for demonstration
        # In production, you'd use the full dataset
        sample_size = min(DEMO_TERM_COUNT, len(hpo_df))  # at most DEMO_TERM_COUNT terms
        sample_hpo_df = hpo_df.head(sample_size)
        print(f"Using {len(sample_hpo_df)} HPO terms for demonstration")

        # Prepare data for vectorization
        prepared_df = vectorizer.prepare_hpo_data(sample_hpo_df)

        # Generate embeddings (this will call the Bedrock API)
        # This can take some time depending on the number of terms
        with_embeddings_df = vectorizer.generate_embeddings(prepared_df)

        # Save embeddings
        vectorizer.save_embeddings(with_embeddings_df, hpo_embedding_file)

        print("HPO vectorization completed with real embeddings")

    except Exception as e:
        # Deliberate best-effort fallback: any failure in the steps above
        # switches the demo to mock embeddings instead of stopping.
        print(f"❌ Error with Bedrock API: {str(e)}")
        print("Creating test embeddings with mock data instead...")

        # Create test embeddings file with mock data
        create_test_embeddings_file(hpo_embedding_file, num_terms=MOCK_TERM_COUNT)

        print("Test HPO vectorization completed with mock embeddings")

        # The mock file is requested at the same path as the real embeddings,
        # so a later run of this cell would take the "existing file" branch.
        if os.path.exists(hpo_embedding_file):
            print(f"⚠️ {hpo_embedding_file} now exists and will be reused on the next run;")
            print("   delete it before generating real embeddings.")

Using existing HPO embeddings file: ../db/G2GHPO_metadata_0.2k.npy


In [10]:
# Cell 7 - Create a function to debug embedding files
def debug_embedding_file(embedding_file):
    """Print a short summary of an embeddings file for debugging.

    Loads the ``.npy`` file (expected to contain a single pickled dict) and
    prints its ``model_id``, ``created_at`` and the number of ``items``
    entries, followed by the HPO ID, name and embedding shape / dtype /
    preview of the first item.

    Args:
        embedding_file: Path of the ``.npy`` file to inspect.

    Returns:
        None. Everything is printed. Any exception raised while loading or
        inspecting the file is caught and reported as a printed message, so
        this helper never raises.
    """
    try:
        # Local import keeps the helper usable on its own
        import numpy as np

        # SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects,
        # which can execute code. Only use this on embedding files you trust.
        data = np.load(embedding_file, allow_pickle=True).item()

        print(f"Embedding file: {embedding_file}")
        print(f"Model ID: {data.get('model_id', 'Unknown')}")
        print(f"Created at: {data.get('created_at', 'Unknown')}")

        # Look the items up once; a missing or None entry counts as empty
        items = data.get('items')
        if items is None:
            items = []
        print(f"Number of items: {len(items)}")

        # Display info about the first item
        if len(items) > 0:
            first_item = items[0]
            print("\nFirst item:")
            print(f"  HPO ID: {first_item.get('hpo_id', 'Unknown')}")
            print(f"  Name: {first_item.get('name', 'Unknown')}")

            # Check embedding
            if 'embedding' in first_item:
                # np.asarray returns ndarrays unchanged and converts plain
                # lists, so embeddings stored as lists can be inspected too
                # instead of failing on the missing .shape attribute.
                embedding = np.asarray(first_item['embedding'])
                print(f"  Embedding shape: {embedding.shape}")
                print(f"  Embedding type: {embedding.dtype}")
                print(f"  Embedding preview: {embedding[:5]}...")
            else:
                print("  No embedding found in item")
    except Exception as e:
        print(f"Error reading embedding file: {str(e)}")

In [13]:
# Print the metadata and first-item embedding details of the HPO embeddings file
debug_embedding_file("../db/G2GHPO_metadata_0.2k.npy")

Embedding file: ../db/G2GHPO_metadata_0.2k.npy
Model ID: amazon.titan-embed-text-v2:0
Created at: 2025-03-30 17:11:08
Number of items: 200

First item:
  HPO ID: HP:0000001
  Name: All
  Embedding shape: (1024,)
  Embedding type: float64
  Embedding preview: [-0.06037616  0.01139749  0.0227799  -0.001469    0.06384783]...




## 6. Conclusion

We've demonstrated the hpomapper workflow:

1. Setting up the environment
2. Vectorizing the HPO database