In [45]:
import os
import glob
import pandas as pd
import ast
import re
from collections import defaultdict
import base64
from PIL import Image
from dotenv import load_dotenv
from google import genai
import json
import io
import base64
import traceback
import datetime

In [23]:
def get_latest_aggregated_files(output_dir):
    # Get all aggregated prediction files
    pattern = os.path.join(output_dir, "aggregated_predictions_*.csv")
    print(f"Searching for files with pattern: {pattern}")
    
    agg_files = glob.glob(pattern)
    print(f"Found {len(agg_files)} aggregated prediction files")
    
    if len(agg_files) == 0:
        return []
    
    # Dictionary to keep track of latest file for each model
    latest_files = {}
    
    # Process each file
    for file_path in agg_files:
        # Extract file name
        file_name = os.path.basename(file_path)
        
        # Extract model name (between "aggregated_predictions_" and "_base_")
        parts = file_name.split("_base_")
        if len(parts) != 2:
            print(f"Warning: Unexpected filename format: {file_name}")
            continue
        
        model_part = parts[0].replace("aggregated_predictions_", "")
        model_name = model_part
        
        # Extract timestamp (second numeric part after _base_)
        timestamps = re.findall(r'(\d+)', parts[1])
        if len(timestamps) < 2:
            print(f"Warning: Could not find timestamps in {file_name}")
            continue
        
        # Use the second timestamp (should be the file timestamp)
        timestamp = int(timestamps[1])
        
        print(f"Found model: {model_name}, timestamp: {timestamp}")
        
        # If this model isn't in our dictionary yet, or this file is newer
        if model_name not in latest_files or timestamp > latest_files[model_name]['timestamp']:
            latest_files[model_name] = {
                'file_path': file_path,
                'timestamp': timestamp
            }
    
    # Print the selected files
    print("\nSelected latest file for each model:")
    for model, info in latest_files.items():
        print(f"  {model}: {os.path.basename(info['file_path'])}")
    
    # Return just the file paths
    return [info['file_path'] for model, info in latest_files.items()]

def load_all_model_predictions(output_dir):
    latest_files = get_latest_aggregated_files(output_dir)
    
    if not latest_files:
        print("No aggregated prediction files found. Cannot proceed.")
        return {}
    
    # Dictionary to store dataframes by model name
    model_predictions = {}
    
    # Load each file into a dataframe
    for file_path in latest_files:
        file_name = os.path.basename(file_path)
        
        # Extract model name from filename
        parts = file_name.split("_base_")
        if len(parts) != 2:
            print(f"Warning: Unexpected filename format: {file_name}")
            continue
            
        model_name = parts[0].replace("aggregated_predictions_", "")
        
        # Load the dataframe
        try:
            df = pd.read_csv(file_path)
            
            # Add a column identifying which model this came from
            df['model_name'] = model_name
            
            # Store in dictionary
            model_predictions[model_name] = df
            
            print(f"Successfully loaded {model_name} predictions with {len(df)} rows")
            
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    
    print(f"Loaded {len(model_predictions)} model prediction sets")
    return model_predictions

In [24]:
output_dir = os.path.join(os.getcwd(), "outputs", "outputs-akshay-2", "outputs")
model_predictions = load_all_model_predictions(output_dir)

if model_predictions:
    # Show which models we loaded
    print("\nLoaded models:")
    for model_name, df in model_predictions.items():
        print(f"  {model_name}: {len(df)} rows")
    
    # Create a combined dataframe with all model predictions
    all_models_df = pd.concat(model_predictions.values(), ignore_index=True)
    print(f"\nCombined dataframe has {len(all_models_df)} rows")
    
    # Create an encounter-level dataframe
    encounter_groups = all_models_df.groupby(['encounter_id', 'base_qid'])
    print(f"Number of unique encounter-question pairs: {len(encounter_groups)}")
    
    # Sample of the data structure
    print("\nExample of data structure:")
    print(f"Columns: {all_models_df.columns.tolist()}")
    
    # First encounter-question pair as example
    first_group = next(iter(encounter_groups))
    print(f"\nSample data for encounter {first_group[0][0]}, question {first_group[0][1]}:")
    for model_name in model_predictions.keys():
        model_rows = first_group[1][first_group[1]['model_name'] == model_name]
        if not model_rows.empty:
            row = model_rows.iloc[0]
            print(f"\n  Model: {model_name}")
            print(f"  Combined prediction: {row['combined_prediction']}")
else:
    print("No model predictions were loaded. Please check the directory path and file patterns.")

Searching for files with pattern: /storage/scratch1/2/kthakrar3/mediqa-magic-v2/outputs/outputs-akshay-2/outputs/aggregated_predictions_*.csv
Found 12 aggregated prediction files
Found model: gemma-3-12b-it, timestamp: 115333
Found model: gemma-3-12b-it, timestamp: 151119
Found model: Qwen2-VL-7B-Instruct, timestamp: 120954
Found model: Qwen2-VL-2B-Instruct, timestamp: 124256
Found model: gemma-3-4b-it, timestamp: 151631
Found model: Llama-3.2-11B-Vision-Instruct, timestamp: 111944
Found model: Llama-3.2-11B-Vision-Instruct, timestamp: 124737
Found model: Qwen2-VL-2B-Instruct, timestamp: 112949
Found model: Qwen2.5-VL-7B-Instruct, timestamp: 134915
Found model: Qwen2.5-VL-3B-Instruct, timestamp: 132244
Found model: Qwen2-VL-7B-Instruct, timestamp: 124322
Found model: Qwen2.5-VL-7B-Instruct, timestamp: 140634

Selected latest file for each model:
  gemma-3-12b-it: aggregated_predictions_gemma-3-12b-it_base_20250428_151119_20250428_151119.csv
  Qwen2-VL-7B-Instruct: aggregated_prediction

In [25]:
all_models_df.head()

Unnamed: 0,encounter_id,base_qid,image_ids,unique_predictions,combined_prediction,all_raw_predictions,all_sorted_predictions,options_en,model_name
0,ENC00852,CQID010,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...",[''],,"['', '']","[('', 2)]","['single spot', 'limited area', 'widespread', ...",gemma-3-12b-it
1,ENC00852,CQID011,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...","['face', 'upper extremities', 'back of the hand']","face, upper extremities, back of the hand","['back of the hand', 'face', 'upper extremitie...","[('face', 2), ('upper extremities', 2), ('back...","['head', 'neck', 'upper extremities', 'lower e...",gemma-3-12b-it
2,ENC00852,CQID012,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...",['larger area'],larger area,"['larger area', 'larger area']","[('larger area', 2)]","['size of thumb nail', 'size of palm', 'larger...",gemma-3-12b-it
3,ENC00852,CQID015,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...",['within months'],within months,"['within months', 'within months']","[('within months', 2)]","['within hours', 'within days', 'within weeks'...",gemma-3-12b-it
4,ENC00852,CQID020,"['IMG_ENC00852_00001.jpg', 'IMG_ENC00852_00002...","['flat', 'thin or close to the surface']","flat, thin or close to the surface","['flat', 'thin or close to the surface', 'flat']","[('flat', 2), ('thin or close to the surface',...","['raised or bumpy', 'flat', 'skin loss or sunk...",gemma-3-12b-it


In [26]:
def load_validation_dataset(val_dataset_path):
    """
    Load the validation dataset while preserving all images and ground truth answers
    for each encounter-question pair.
    """
    print(f"Loading validation dataset from {val_dataset_path}")
    val_df = pd.read_csv(val_dataset_path)
    print(f"Loaded validation dataset with {len(val_df)} rows")
    
    # Process and clean the validation dataset
    val_df = process_validation_dataset(val_df)
    
    # Create a dictionary to store all data grouped by encounter and question
    encounter_question_data = defaultdict(lambda: {
        'images': [],
        'data': None,
        'ground_truth': None
    })
    
    # Process each row in the validation dataset
    for _, row in val_df.iterrows():
        encounter_id = row['encounter_id']
        base_qid = row['base_qid']
        key = (encounter_id, base_qid)
        
        # Add image path to the images list
        if 'image_path' in row and row['image_path']:
            encounter_question_data[key]['images'].append(row['image_path'])
        elif 'image_id' in row and row['image_id']:
            # If only image_id is available, construct the path
            image_dir = os.path.join(os.getcwd(), "2025_dataset", "valid", "images_valid")
            image_path = os.path.join(image_dir, row['image_id'])
            encounter_question_data[key]['images'].append(image_path)
        
        # Store row data if not already stored
        if encounter_question_data[key]['data'] is None:
            encounter_question_data[key]['data'] = row.to_dict()
            
        # Process ground truth answers if available
        if 'valid_answers' in row and row['valid_answers'] and encounter_question_data[key]['ground_truth'] is None:
            # Extract ground truth answers
            raw_answers = row['valid_answers']
            
            # Convert string representation to list if needed
            if isinstance(raw_answers, str):
                try:
                    raw_answers = ast.literal_eval(raw_answers)
                except:
                    # If eval fails, handle as comma-separated string
                    if ',' in raw_answers:
                        raw_answers = [ans.strip() for ans in raw_answers.split(',')]
                    else:
                        raw_answers = [raw_answers.strip()]
            
            # Clean answers similar to how model predictions are cleaned
            cleaned_answers = []
            for ans in raw_answers:
                if isinstance(ans, str):
                    # Remove any quotes and trailing/leading spaces
                    cleaned_ans = ans.strip("'\" ")
                    # Remove "(please specify)"
                    cleaned_ans = cleaned_ans.replace(" (please specify)", "")
                    cleaned_answers.append(cleaned_ans)
                else:
                    cleaned_answers.append(str(ans).strip("'\" "))
            
            # Create formatted answer string
            combined_answer = ", ".join(cleaned_answers)
            
            # Store ground truth data
            encounter_question_data[key]['ground_truth'] = {
                'raw_answers': raw_answers,
                'cleaned_answers': cleaned_answers,
                'combined_answer': combined_answer
            }
    
    print(f"Created grouped validation dataset with {len(encounter_question_data)} unique encounter-question pairs")
    
    # Convert to a list of dictionaries for easier handling
    grouped_data = []
    for (encounter_id, base_qid), data in encounter_question_data.items():
        entry = data['data'].copy()
        entry['all_images'] = data['images']
        entry['ground_truth'] = data['ground_truth']
        entry['encounter_id'] = encounter_id
        entry['base_qid'] = base_qid
        grouped_data.append(entry)
    
    return pd.DataFrame(grouped_data)

def safe_convert_options(options_str):
    """Safely convert a string representation of a list to an actual list."""
    if not isinstance(options_str, str):
        return options_str
        
    try:
        return ast.literal_eval(options_str)
    except (SyntaxError, ValueError):
        if options_str.startswith('[') and options_str.endswith(']'):
            return [opt.strip().strip("'\"") for opt in options_str[1:-1].split(',')]
        elif ',' in options_str:
            return [opt.strip() for opt in options_str.split(',')]
        else:
            return [options_str]

def process_validation_dataset(val_df):
    """Process and clean the validation dataset."""
    # Convert string representations to actual Python objects
    if 'options_en' in val_df.columns:
        val_df['options_en'] = val_df['options_en'].apply(safe_convert_options)
        
        # Clean options by removing "(please specify)" phrases
        def clean_options(options):
            if not isinstance(options, list):
                return options
                
            cleaned_options = []
            for opt in options:
                if isinstance(opt, str):
                    cleaned_opt = opt.strip("'\" ").replace(" (please specify)", "")
                    cleaned_options.append(cleaned_opt)
                else:
                    cleaned_options.append(str(opt).strip("'\" "))
            return cleaned_options
            
        val_df['options_en_cleaned'] = val_df['options_en'].apply(clean_options)
    
    # Clean question text
    if 'question_text' in val_df.columns:
        # Remove "Please specify which affected area for each selection." from CQID012
        val_df['question_text_cleaned'] = val_df['question_text'].apply(
            lambda q: q.replace(" Please specify which affected area for each selection.", "") 
                      if isinstance(q, str) and "Please specify which affected area for each selection" in q 
                      else q
        )
        
        # Remove leading numbers like "1 " from the beginning of questions
        val_df['question_text_cleaned'] = val_df['question_text_cleaned'].apply(
            lambda q: re.sub(r'^\d+\s+', '', q) if isinstance(q, str) else q
        )
    
    # Extract base_qid if it doesn't exist
    if 'base_qid' not in val_df.columns and 'qid' in val_df.columns:
        val_df['base_qid'] = val_df['qid'].apply(
            lambda q: q.split('-')[0] if isinstance(q, str) and '-' in q else q
        )
    
    return val_df

def create_query_context(row):
    """Create query context from validation data similar to the inference process."""
    # Get cleaned question text
    question = row.get('question_text_cleaned', row.get('question_text', 'What do you see in this image?'))
    
    # Create metadata string
    metadata = ""
    if 'question_type_en' in row:
        metadata += f"Type: {row['question_type_en']}"
        
    if 'question_category_en' in row:
        metadata += f", Category: {row['question_category_en']}"
    
    # Get clinical context from query title and content
    query_title = row.get('query_title_en', '')
    query_content = row.get('query_content_en', '')
    
    # Create the clinical context section
    clinical_context = ""
    if query_title or query_content:
        clinical_context += "Background Clinical Information (to help with your analysis):\n"
        if query_title:
            clinical_context += f"{query_title}\n"
        if query_content:
            clinical_context += f"{query_content}\n"
    
    # Get options text
    options = row.get('options_en_cleaned', row.get('options_en', ['Yes', 'No', 'Not mentioned']))
    if isinstance(options, list):
        options_text = ", ".join(options)
    else:
        options_text = str(options)
    
    # Create the full query text with clinical context
    query_text = (f"MAIN QUESTION TO ANSWER: {question}\n"
                 f"Question Metadata: {metadata}\n"
                 f"{clinical_context}"
                 f"Available Options (choose from these): {options_text}")
    
    return query_text

In [27]:
# Class to provide an interface for retrieving combined data
class AgenticRAGData:
    def __init__(self, all_models_df, validation_df):
        self.all_models_df = all_models_df
        self.validation_df = validation_df
        
        # Create lookup dictionaries for faster access
        self.model_predictions = {}
        for (encounter_id, base_qid), group in all_models_df.groupby(['encounter_id', 'base_qid']):
            self.model_predictions[(encounter_id, base_qid)] = group
        
        self.validation_data = {}
        for _, row in validation_df.iterrows():
            self.validation_data[(row['encounter_id'], row['base_qid'])] = row
    
    def get_combined_data(self, encounter_id, base_qid):
        """Retrieve combined data for a specific encounter and question."""
        # Get model predictions
        model_preds = self.model_predictions.get((encounter_id, base_qid), None)
        
        # Get validation data
        val_data = self.validation_data.get((encounter_id, base_qid), None)
        
        if model_preds is None:
            print(f"No model predictions found for encounter {encounter_id}, question {base_qid}")
            return None
            
        if val_data is None:
            print(f"No validation data found for encounter {encounter_id}, question {base_qid}")
            return None
        
        # Create query context if it doesn't exist
        if 'query_context' not in val_data:
            val_data['query_context'] = create_query_context(val_data)
        
        # Format model predictions as a dictionary
        model_predictions_dict = {}
        for _, row in model_preds.iterrows():
            model_name = row['model_name']
            
            # Handle different data types for predictions
            unique_preds = row.get('unique_predictions', [])
            if isinstance(unique_preds, str):
                try:
                    unique_preds = ast.literal_eval(unique_preds)
                except:
                    unique_preds = [unique_preds]
                    
            raw_preds = row.get('all_raw_predictions', [])
            if isinstance(raw_preds, str):
                try:
                    raw_preds = ast.literal_eval(raw_preds)
                except:
                    raw_preds = [raw_preds]
                    
            sorted_preds = row.get('all_sorted_predictions', [])
            if isinstance(sorted_preds, str):
                try:
                    sorted_preds = ast.literal_eval(sorted_preds)
                except:
                    sorted_preds = [(str(raw_preds[0]), 1)] if raw_preds else []
            
            model_predictions_dict[model_name] = {
                'combined_prediction': row.get('combined_prediction', ''),
                'unique_predictions': unique_preds,
                'all_raw_predictions': raw_preds,
                'all_sorted_predictions': sorted_preds
            }
        
        # Get ground truth data
        ground_truth = val_data.get('ground_truth', {})
        if not ground_truth:
            # If ground_truth is not already processed, extract it from valid_answers
            valid_answers = val_data.get('valid_answers', [])
            if valid_answers:
                # Process valid_answers similarly to how it's done in load_validation_dataset
                if isinstance(valid_answers, str):
                    try:
                        valid_answers = ast.literal_eval(valid_answers)
                    except:
                        if ',' in valid_answers:
                            valid_answers = [ans.strip() for ans in valid_answers.split(',')]
                        else:
                            valid_answers = [valid_answers.strip()]
                
                # Clean answers
                cleaned_answers = []
                for ans in valid_answers:
                    if isinstance(ans, str):
                        cleaned_ans = ans.strip("'\" ").replace(" (please specify)", "")
                        cleaned_answers.append(cleaned_ans)
                    else:
                        cleaned_answers.append(str(ans).strip("'\" "))
                
                ground_truth = {
                    'raw_answers': valid_answers,
                    'cleaned_answers': cleaned_answers,
                    'combined_answer': ", ".join(cleaned_answers)
                }
        
        # Return combined data with all images and ground truth
        return {
            'encounter_id': encounter_id,
            'base_qid': base_qid,
            'query_context': val_data['query_context'],
            'images': val_data.get('all_images', []),  # Use all_images that contains multiple image paths
            'options': val_data.get('options_en_cleaned', val_data.get('options_en', [])),
            'question_type': val_data.get('question_type_en', ''),
            'question_category': val_data.get('question_category_en', ''),
            'ground_truth': ground_truth,  # Add ground truth to the result
            'model_predictions': model_predictions_dict
        }
    
    def get_all_encounter_question_pairs(self):
        """Return a list of all unique encounter_id, base_qid pairs."""
        return list(self.validation_data.keys())
    
    def get_sample_data(self, n=5):
        """Get a sample of combined data for n random encounter-question pairs."""
        import random
        
        all_pairs = self.get_all_encounter_question_pairs()
        sample_pairs = random.sample(all_pairs, min(n, len(all_pairs)))
        
        return [self.get_combined_data(encounter_id, base_qid) for encounter_id, base_qid in sample_pairs]

In [28]:
val_data_dir = os.path.join(os.getcwd(), "outputs")
val_dataset_path = os.path.join(val_data_dir, "val_dataset.csv")
validation_df = load_validation_dataset(val_dataset_path)
agentic_data = AgenticRAGData(all_models_df, validation_df)

Loading validation dataset from /storage/scratch1/2/kthakrar3/mediqa-magic-v2/outputs/val_dataset.csv
Loaded validation dataset with 1413 rows
Created grouped validation dataset with 504 unique encounter-question pairs


In [29]:
# To see the modified output with ground truth:
def print_sample_with_ground_truth(sample_data):
    """Print sample data including ground truth for validation"""
    print(f"\nSample combined data:")
    print(f"Encounter: {sample_data['encounter_id']}, Question: {sample_data['base_qid']}")
    print(f"Question type: {sample_data['question_type']}")
    print(f"Options: {sample_data['options']}")
    
    print("\nQuery context:")
    print(sample_data['query_context'])
    
    # Print ground truth
    if 'ground_truth' in sample_data and sample_data['ground_truth']:
        print("\nGround Truth:")
        print(f"  Answer: {sample_data['ground_truth'].get('combined_answer', 'Not available')}")
    else:
        print("\nGround Truth: Not available")
    
    print("\nModel predictions:")
    for model_name, predictions in sample_data['model_predictions'].items():
        print(f"\n  Model: {model_name}")
        print(f"  Prediction: {predictions['combined_prediction']}")
        
    # Print image paths
    print("\nAvailable images:")
    for i, img_path in enumerate(sample_data['images']):
        print(f"  Image {i+1}: {os.path.basename(img_path)}")

# Example usage:
sample_data = agentic_data.get_sample_data(1)[0]
print_sample_with_ground_truth(sample_data)


Sample combined data:
Encounter: ENC00862, Question: CQID012
Question type: Size
Options: ['size of thumb nail', 'size of palm', 'larger area', 'Not mentioned']

Query context:
MAIN QUESTION TO ANSWER: How large are the affected areas?
Question Metadata: Type: Size, Category: General
Background Clinical Information (to help with your analysis):
See picture.  Is this Mucoid cyst?
It happens on the thumb, for 8 years already.  It is not itchy or painful.  It doesn't affect the sense there.  The size is 1 cm X 1 cm.  The nail has been deformed to a wavy shape becasue of the pressing.  Is this myxoid cyst?
Available Options (choose from these): size of thumb nail, size of palm, larger area, Not mentioned

Ground Truth:
  Answer: size of thumb nail

Model predictions:

  Model: gemma-3-12b-it
  Prediction: nan

  Model: Qwen2-VL-7B-Instruct
  Prediction: size of thumb nail, size of palm, 1 cm x 1 cm, larger area

  Model: Qwen2-VL-2B-Instruct
  Prediction: not mentioned(4, 287), (731, 996)

In [30]:
sample_data

{'encounter_id': 'ENC00862',
 'base_qid': 'CQID012',
 'query_context': "MAIN QUESTION TO ANSWER: How large are the affected areas?\nQuestion Metadata: Type: Size, Category: General\nBackground Clinical Information (to help with your analysis):\nSee picture.  Is this Mucoid cyst?\nIt happens on the thumb, for 8 years already.  It is not itchy or painful.  It doesn't affect the sense there.  The size is 1 cm X 1 cm.  The nail has been deformed to a wavy shape becasue of the pressing.  Is this myxoid cyst?\nAvailable Options (choose from these): size of thumb nail, size of palm, larger area, Not mentioned",
 'images': ['/storage/scratch1/2/kthakrar3/mediqa-magic-v2/2025_dataset/valid/images_valid/IMG_ENC00862_00001.jpg',
  '/storage/scratch1/2/kthakrar3/mediqa-magic-v2/2025_dataset/valid/images_valid/IMG_ENC00862_00002.jpg',
  '/storage/scratch1/2/kthakrar3/mediqa-magic-v2/2025_dataset/valid/images_valid/IMG_ENC00862_00003.jpg',
  '/storage/scratch1/2/kthakrar3/mediqa-magic-v2/2025_datase

In [31]:
# Load environment variables
load_dotenv()
API_KEY = os.getenv("API_KEY")

# Initialize the Gemini client
client = genai.Client(api_key=API_KEY)

In [32]:
def extract_structured_dermatological_analysis(sample_data):
    """
    Extract structured analysis of all images for an encounter, covering all question types.
    
    Args:
        sample_data: Dictionary containing encounter data with images
        
    Returns:
        Dictionary with structured dermatological analysis
    """
    encounter_id = sample_data['encounter_id']
    image_paths = sample_data['images']
    
    # Create a list to store analysis of each image
    image_analyses = []
    
    # Modified prompt for structured dermatological analysis
        
    structured_prompt = f"""As dermatology specialist analyzing skin images, extract and structure all clinically relevant information from this dermatological image.

    Organize your response in a JSON dictionary:

    1. SIZE: Approximate dimensions of lesions/affected areas, size comparison (thumbnail, palm, larger), Relative size comparisons for multiple lesions
    2. SITE_LOCATION: Visible body parts in the image, body areas showing lesions/abnormalities, Specific anatomical locations affected
    3. SKIN_DESCRIPTION: Lesion morphology (flat, raised, depressed), Texture of affected areas, Surface characteristics (scales, crust, fluid), Appearance of lesion boundaries
    4. LESION_COLOR: Predominant color(s) of affected areas, Color variations within lesions, Color comparison to normal skin, Color distribution patterns
    5. LESION_COUNT: Number of distinct lesions/affected areas, Single vs multiple presentation, Distribution pattern if multiple, Any counting limitations
    6. EXTENT: How widespread the condition appears, Localized vs widespread assessment, Approximate percentage of visible skin affected, Limitations in determining full extent
    7. TEXTURE: Expected tactile qualities, Smooth vs rough assessment, Notable textural features, Texture consistency across affected areas
    8. ONSET_INDICATORS: Visual clues about condition duration, Acute vs chronic presentation features, Healing/progression/chronicity signs, Note: precise timing cannot be determined from images
    9. ITCH_INDICATORS: Scratch marks/excoriations/trauma signs, Features associated with itchy conditions, Pruritic vs non-pruritic visual indicators, Note: sensation cannot be directly observed
    10. OVERALL_IMPRESSION: Brief description (1-2 sentences), Key diagnostic features, Potential diagnoses (2-3)
    
    Be concise and use medical terminology where appropriate. If information for a section is 
    cannot be determined, state "Cannot determine from image".
    """
    
    # Process each image
    for idx, img_path in enumerate(image_paths):
        try:
            # Load image
            image = Image.open(img_path)
            
            # Analyze image with structured prompt
            print(f"Analyzing image {idx+1}/{len(image_paths)} for encounter {encounter_id}")
            
            # Create a request for Gemini
            response = client.models.generate_content(
                model="gemini-2.5-flash-preview-04-17",
                contents=[structured_prompt, image]
            )
            
            # Extract the analysis text and parse as JSON
            analysis_text = response.text
            print(f"Analysis text received (length: {len(analysis_text)})")
            
            # Clean up the response to ensure it's valid JSON
            # Remove markdown code block indicators and any non-JSON text
            cleaned_text = analysis_text
            if "```json" in cleaned_text:
                cleaned_text = cleaned_text.split("```json")[1]
            if "```" in cleaned_text:
                cleaned_text = cleaned_text.split("```")[0]
            
            # Try to parse as JSON
            try:
                import json
                structured_analysis = json.loads(cleaned_text)
                print(f"Successfully parsed structured analysis for image {idx+1}")
            except json.JSONDecodeError:
                print(f"Warning: Could not parse analysis as JSON for image {idx+1}")
                structured_analysis = {"parse_error": "Could not parse as JSON", "raw_text": analysis_text}
            
            # Store result
            image_analyses.append({
                "image_index": idx + 1,
                "image_path": os.path.basename(img_path),
                "structured_analysis": structured_analysis
            })
            
        except Exception as e:
            print(f"Error analyzing image {img_path}: {str(e)}")
            image_analyses.append({
                "image_index": idx + 1,
                "image_path": os.path.basename(img_path),
                "error": str(e)
            })
    
    # Generate aggregated analysis across all images
    aggregated_analysis = aggregate_structured_analyses(image_analyses, encounter_id)
    
    # Return combined results
    return {
        "encounter_id": encounter_id,
        "image_count": len(image_paths),
        "individual_analyses": image_analyses,
        "aggregated_analysis": aggregated_analysis
    }

def aggregate_structured_analyses(image_analyses, encounter_id):
    """Aggregate structured analyses from multiple images into a single structured analysis"""
    
    # Skip if there are no valid analyses
    valid_analyses = [a for a in image_analyses if "error" not in a and "structured_analysis" in a]
    print(f"Aggregating {len(valid_analyses)} valid structured analyses for encounter {encounter_id}")
    
    if not valid_analyses:
        return {
            "error": "No valid analyses to aggregate",
            "message": "Unable to generate aggregated analysis due to errors in individual analyses."
        }
    
    # If there's only one image, return its analysis as the aggregated result
    if len(valid_analyses) == 1:
        return valid_analyses[0]["structured_analysis"]
    
    # For multiple images, create a prompt for aggregation
    # Extract analysis JSONs to include in the prompt
    analysis_jsons = []
    for analysis in valid_analyses:
        import json
        analysis_json = json.dumps(analysis["structured_analysis"])
        analysis_jsons.append(f"Image {analysis['image_index']} ({analysis['image_path']}): {analysis_json}")
    
    # Create an aggregation prompt
    aggregation_prompt = f"""As dermatology specialist reviewing multiple skin image analyses for the same patient, combine these analyses and organize your response in a JSON dictionary:

    1. SIZE: Approximate dimensions of lesions/affected areas, size comparison (thumbnail, palm, larger), Relative size comparisons for multiple lesions
    2. SITE_LOCATION: Visible body parts in the image, body areas showing lesions/abnormalities, Specific anatomical locations affected
    3. SKIN_DESCRIPTION: Lesion morphology (flat, raised, depressed), Texture of affected areas, Surface characteristics (scales, crust, fluid), Appearance of lesion boundaries
    4. LESION_COLOR: Predominant color(s) of affected areas, Color variations within lesions, Color comparison to normal skin, Color distribution patterns
    5. LESION_COUNT: Number of distinct lesions/affected areas, Single vs multiple presentation, Distribution pattern if multiple, Any counting limitations
    6. EXTENT: How widespread the condition appears, Localized vs widespread assessment, Approximate percentage of visible skin affected, Limitations in determining full extent
    7. TEXTURE: Expected tactile qualities, Smooth vs rough assessment, Notable textural features, Texture consistency across affected areas
    8. ONSET_INDICATORS: Visual clues about condition duration, Acute vs chronic presentation features, Healing/progression/chronicity signs, Note: precise timing cannot be determined from images
    9. ITCH_INDICATORS: Scratch marks/excoriations/trauma signs, Features associated with itchy conditions, Pruritic vs non-pruritic visual indicators, Note: sensation cannot be directly observed
    10. OVERALL_IMPRESSION: Brief description (1-2 sentences), Key diagnostic features, Potential diagnoses (2-3)
        
    {' '.join(analysis_jsons)}
    """
    
    try:
        # Call Gemini for the aggregated analysis
        import json
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-04-17",
            contents=[aggregation_prompt]
        )
        
        aggregation_text = response.text
        print(f"Aggregated analysis received (length: {len(aggregation_text)})")
        
        # Clean up the response to ensure it's valid JSON
        cleaned_text = aggregation_text
        if "```json" in cleaned_text:
            cleaned_text = cleaned_text.split("```json")[1]
        if "```" in cleaned_text:
            cleaned_text = cleaned_text.split("```")[0]
        
        # Try to parse as JSON
        try:
            aggregated_analysis = json.loads(cleaned_text)
            print(f"Successfully parsed aggregated analysis")
        except json.JSONDecodeError:
            print(f"Warning: Could not parse aggregated analysis as JSON")
            aggregated_analysis = {"parse_error": "Could not parse as JSON", "raw_text": aggregation_text}
        
        return aggregated_analysis
        
    except Exception as e:
        print(f"Error creating aggregated analysis for encounter {encounter_id}: {str(e)}")
        return {
            "error": str(e),
            "aggregation_error": "Failed to generate aggregated analysis"
        }

In [33]:
def process_encounter_image_analyses(agentic_data):
    """
    Process comprehensive image analyses for all unique encounters.
    
    Args:
        agentic_data: AgenticRAGData instance containing all encounter data
        
    Returns:
        Dictionary mapping encounter_ids to comprehensive image analyses
    """
    # Get all unique encounter IDs
    all_pairs = agentic_data.get_all_encounter_question_pairs()
    unique_encounters = set(pair[0] for pair in all_pairs)
    
    print(f"Processing image analyses for {len(unique_encounters)} unique encounters")
    
    # Process each encounter (using the first question for that encounter)
    encounter_image_analyses = {}
    for encounter_id in unique_encounters:
        # Find the first question for this encounter
        for pair in all_pairs:
            if pair[0] == encounter_id:
                # Get the full data for this encounter-question pair
                sample_data = agentic_data.get_combined_data(pair[0], pair[1])
                
                # Extract and store comprehensive image analysis
                image_analysis = extract_comprehensive_image_analysis(sample_data)
                encounter_image_analyses[encounter_id] = image_analysis
                
                # Print progress
                print(f"Processed comprehensive image analysis for encounter {encounter_id}")
                
                # Since we only need one question per encounter, break after finding the first
                break
    
    return encounter_image_analyses

# Usage example for a single encounter
def extract_single_encounter_image_analysis(sample_data):
    """Extract comprehensive image analysis for a single encounter"""
    encounter_id = sample_data['encounter_id']
    print(f"Extracting comprehensive image analysis for encounter {encounter_id}")
    
    image_analysis = extract_comprehensive_image_analysis(sample_data)
    
    # Save to a file for future reference
    import json
    with open(f"image_analysis_{encounter_id}.json", "w") as f:
        json.dump(image_analysis, f, indent=2)
    
    print(f"Extracted and saved comprehensive image analysis for encounter {encounter_id}")
    return image_analysis

In [34]:
# def process_specific_encounter(agentic_data, target_encounter_id):
#     """
#     Process comprehensive image analysis for a specific encounter ID.
    
#     Args:
#         agentic_data: AgenticRAGData instance containing all encounter data
#         target_encounter_id: The specific encounter ID to process
        
#     Returns:
#         Dictionary with comprehensive image analysis for the specified encounter
#     """
#     # Get all pairs for this encounter
#     all_pairs = agentic_data.get_all_encounter_question_pairs()
#     encounter_pairs = [pair for pair in all_pairs if pair[0] == target_encounter_id]
    
#     if not encounter_pairs:
#         print(f"No data found for encounter {target_encounter_id}")
#         return None
    
#     # Get the full data for the first question of this encounter
#     first_pair = encounter_pairs[0]
#     sample_data = agentic_data.get_combined_data(first_pair[0], first_pair[1])
    
#     # Extract comprehensive image analysis
#     image_analysis = extract_comprehensive_image_analysis(sample_data)
    
#     # Save to a file for future reference
#     import json
#     with open(f"image_analysis_{target_encounter_id}.json", "w") as f:
#         json.dump(image_analysis, f, indent=2)
    
#     print(f"Processed comprehensive image analysis for encounter {target_encounter_id}")
#     return {target_encounter_id: image_analysis}

# # Usage
# specific_analysis = process_specific_encounter(agentic_data, "ENC00902")

In [35]:
# specific_analysis

In [None]:
# For a single sample
sample_data = agentic_data.get_sample_data(1)[0]
comprehensive_image_analysis = extract_single_encounter_image_analysis(sample_data)

In [None]:
comprehensive_image_analysis

In [36]:
def extract_clinical_context(sample_data):
    """
    Extract structured clinical information from an encounter's query title and content.
    
    Args:
        sample_data: Dictionary containing encounter data with query_context
        
    Returns:
        Dictionary with structured clinical information
    """
    # Extract encounter details
    encounter_id = sample_data['encounter_id']
    
    # Get the clinical context from query_context
    query_context = sample_data['query_context']
    
    # Extract just the clinical part
    clinical_lines = []
    capturing = False
    for line in query_context.split('\n'):
        if "Background Clinical Information" in line:
            capturing = True
            continue
        elif "Available Options" in line:
            capturing = False
        elif capturing:
            clinical_lines.append(line)
    
    clinical_text = "\n".join(clinical_lines).strip()
    
    # If there's no clinical information, return empty structure
    if not clinical_text:
        return {
            "encounter_id": encounter_id,
            "clinical_summary": "No clinical information available"
        }
    
    # Create prompt for Gemini to extract structured information
    prompt = f"""You are a dermatology specialist analyzing patient information. 
    Extract and structure all clinically relevant information from this patient description:

    {clinical_text}

    Organize your response in the following JSON structure:

    1. DEMOGRAPHICS: Age, sex, and any other demographic data
    2. SITE_LOCATION: Body parts affected by the condition as described in the text
    3. SKIN_DESCRIPTION: Any mention of lesion morphology (flat, raised, depressed), texture, surface characteristics (scales, crust, fluid), appearance of lesion boundaries
    4. LESION_COLOR: Any description of color(s) of affected areas, color variations, comparison to normal skin
    5. LESION_COUNT: Any information about number of lesions, single vs multiple presentation, distribution pattern
    6. EXTENT: How widespread the condition appears based on the description, localized vs widespread
    7. TEXTURE: Any description of tactile qualities, smooth vs rough, notable textural features
    8. ONSET_INDICATORS: Information about onset, duration, progression, or evolution of symptoms
    9. ITCH_INDICATORS: Mentions of scratching, itchiness, or other sensory symptoms
    10. OTHER_SYMPTOMS: Any additional symptoms mentioned (pain, burning, etc.)
    11. TRIGGERS: Identified factors that worsen/improve the condition
    12. HISTORY: Relevant past medical history or previous treatments
    13. DIAGNOSTIC_CONSIDERATIONS: Any mentioned or suggested diagnoses in the text

    Be concise and use medical terminology where appropriate. If information for a section is 
    not available, indicate "Not mentioned".
    """
    
    try:
        # Call Gemini
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-04-17",
            contents=[prompt]
        )
        
        # Return structured result
        return {
            "encounter_id": encounter_id,
            "raw_clinical_text": clinical_text,
            "structured_clinical_context": response.text
        }
            
    except Exception as e:
        print(f"Error extracting clinical context for encounter {encounter_id}: {str(e)}")
        return {
            "encounter_id": encounter_id,
            "raw_clinical_text": clinical_text,
            "error": str(e)
        }

def process_encounter_clinical_contexts(agentic_data):
    """
    Process clinical contexts for all unique encounters.
    
    Args:
        agentic_data: AgenticRAGData instance containing all encounter data
        
    Returns:
        Dictionary mapping encounter_ids to structured clinical contexts
    """
    # Get all unique encounter IDs
    all_pairs = agentic_data.get_all_encounter_question_pairs()
    unique_encounters = set(pair[0] for pair in all_pairs)
    
    print(f"Processing clinical contexts for {len(unique_encounters)} unique encounters")
    
    # Process each encounter (using the first question for that encounter)
    encounter_contexts = {}
    for encounter_id in unique_encounters:
        # Find the first question for this encounter
        for pair in all_pairs:
            if pair[0] == encounter_id:
                # Get the full data for this encounter-question pair
                sample_data = agentic_data.get_combined_data(pair[0], pair[1])
                
                # Extract and store clinical context
                clinical_context = extract_clinical_context(sample_data)
                encounter_contexts[encounter_id] = clinical_context
                
                # Print progress
                print(f"Processed clinical context for encounter {encounter_id}")
                
                # Since we only need one question per encounter, break after finding the first
                break
    
    return encounter_contexts

# Usage example
def extract_all_clinical_contexts(agentic_data):
    """Extract clinical contexts for all encounters"""
    clinical_contexts = process_encounter_clinical_contexts(agentic_data)
    
    # Save to a file for future use
    import json
    with open("clinical_contexts.json", "w") as f:
        json.dump(clinical_contexts, f, indent=2)
    
    print(f"Extracted and saved clinical contexts for {len(clinical_contexts)} encounters")
    return clinical_contexts

In [37]:
# Get a sample data point (one encounter-question pair)
sample_data = agentic_data.get_sample_data(1)[0]

# Extract clinical context for just this one encounter
clinical_context = extract_clinical_context(sample_data)

# Print the results
print(f"Encounter ID: {clinical_context['encounter_id']}")
print(f"Raw clinical text:\n{clinical_context['raw_clinical_text']}")
print(f"Structured clinical context:\n{clinical_context['structured_clinical_context']}")

Encounter ID: ENC00857
Raw clinical text:
Would experts here please help take a look.
Would experts here take a look.
Patient is 34 years old, female.  There grows figure like pattern all over the body, with the exception of palms and soles.  Has been like that for 20 years, and growing more.  Would experts here take a look and advise what disease it is, and the treatment.  Thanks.
Structured clinical context:
```json
{
  "DEMOGRAPHICS": {
    "Age": "34 years",
    "Sex": "Female"
  },
  "SITE_LOCATION": "Widespread distribution, excluding palms and soles.",
  "SKIN_DESCRIPTION": "Lesions described as forming a \"figure like pattern\". Morphology (e.g., papular, plaque), surface characteristics (e.g., scaling), and boundaries are not described.",
  "LESION_COLOR": "Not mentioned",
  "LESION_COUNT": "Multiple lesions, distributed in a pattern.",
  "EXTENT": "Diffuse distribution over the body surface area, sparing acral sites (palms and soles).",
  "TEXTURE": "Not mentioned",
  "ONSET_

In [48]:
def apply_reasoning_layer(encounter_id, base_qid, image_analysis, clinical_context, sample_data):
    """
    Apply a reasoning layer to determine the best answer(s) for a specific encounter-question pair.
    
    Args:
        encounter_id: The encounter ID
        base_qid: The question ID
        image_analysis: Structured image analysis for this encounter
        clinical_context: Structured clinical context for this encounter
        sample_data: Combined data for this encounter-question pair
    
    Returns:
        Dictionary with reasoning and final answer(s)
    """
    # Extract relevant data
    question_text = sample_data['query_context'].split("MAIN QUESTION TO ANSWER:")[1].split("\n")[0].strip()
    print("question_text: ", question_text)
    options = sample_data['options']
    print("options: ", options)
    question_type = sample_data['question_type']
    print("question_type: ", question_type)
    model_predictions = sample_data['model_predictions']
    print("model_predictions: ", model_predictions)
    
    # Format model predictions for the prompt
    model_prediction_text = ""
    for model_name, predictions in model_predictions.items():
        combined_pred = predictions.get('combined_prediction', '')
        if isinstance(combined_pred, float) and pd.isna(combined_pred):
            combined_pred = "No prediction"
        model_prediction_text += f"- {model_name}: {combined_pred}\n"
    print(model_prediction_text)
    
    # Create a prompt for the reasoning layer
    prompt = f"""You are a medical expert analyzing dermatological images. Use the provided evidence to determine the most accurate answer(s) for the following question:

    QUESTION: {question_text}
    QUESTION TYPE: {question_type}
    OPTIONS: {", ".join(options)}

    IMAGE ANALYSIS:
    {json.dumps(image_analysis['aggregated_analysis'], indent=2)}

    CLINICAL CONTEXT:
    {clinical_context['structured_clinical_context']}

    MODEL PREDICTIONS:
    {model_prediction_text}

    Based on all the evidence above, determine the most accurate answer(s) to the question. Your task is to:
    1. Analyze the evidence from the image analysis and clinical context
    2. Consider the model predictions, noting any consensus or disagreement
    3. Provide a brief reasoning for your conclusion
    4. Select the final answer(s) from the available options

    If selecting multiple answers is appropriate, provide them in a comma-separated list. If no answer can be determined, select "Not mentioned".

    Format your response as a JSON object with these fields:
    1. "reasoning": Your step-by-step reasoning process
    2. "answer": Your final answer(s) as a single string or comma-separated list of options

    When providing your answer, strictly adhere to the available options and only select from them.
    """
    
    print(f"prompt: {prompt}")
    try:
        # Call Gemini for reasoning
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-04-17",
            contents=[prompt]
        )
        
        # Parse the response
        reasoning_text = response.text
        print("reasoning_text: ", reasoning_text)
        
        # Clean up the response to ensure it's valid JSON
        cleaned_text = reasoning_text
        if "```json" in cleaned_text:
            cleaned_text = cleaned_text.split("```json")[1]
        if "```" in cleaned_text:
            cleaned_text = cleaned_text.split("```")[0]
            
        print("cleaned_text: ", cleaned_text)
        
        # Parse JSON
        try:
            reasoning_result = json.loads(cleaned_text)
            print(f"Successfully parsed reasoning for {encounter_id}, {base_qid}")
            print("reasoning_result: ", reasoning_result)
            
            # Validate the answer against options
            answer = reasoning_result.get('answer', '').lower()
            print("answer: ", answer)
            valid_answers = []
            
            # Handle comma-separated answers
            if ',' in answer:
                answer_parts = [part.strip() for part in answer.split(',')]
                for part in answer_parts:
                    for option in options:
                        if part == option.lower():
                            valid_answers.append(option)
            else:
                for option in options:
                    if answer == option.lower():
                        valid_answers.append(option)
            
            # Use "Not mentioned" if no valid answers found
            if not valid_answers:
                if "not mentioned" in answer:
                    valid_answers = ["Not mentioned"]
                else:
                    print(f"Warning: No valid answers found for {encounter_id}, {base_qid}")
                    valid_answers = ["Not mentioned"]
            
            # Update the result with validated answers
            reasoning_result['validated_answer'] = ", ".join(valid_answers)
            
        except json.JSONDecodeError:
            print(f"Warning: Could not parse reasoning as JSON for {encounter_id}, {base_qid}")
            reasoning_result = {
                "reasoning": "Error parsing response.",
                "answer": "Not mentioned",
                "validated_answer": "Not mentioned",
                "raw_text": reasoning_text
            }
        
        print("reasoning_result: ", reasoning_result)
        
        return reasoning_result
        
    except Exception as e:
        print(f"Error applying reasoning layer for {encounter_id}, {base_qid}: {str(e)}")
        return {
            "reasoning": f"Error: {str(e)}",
            "answer": "Not mentioned",
            "validated_answer": "Not mentioned",
            "error": str(e)
        }

In [49]:
def process_all_encounters_with_reasoning(agentic_data):
    """
    Process all encounters and questions with reasoning layer.
    
    Args:
        agentic_data: AgenticRAGData instance containing all encounter data
        
    Returns:
        Dictionary with all encountered processed with reasoning
    """
    # Get all encounter-question pairs
    all_pairs = agentic_data.get_all_encounter_question_pairs()
    print(f"Processing {len(all_pairs)} encounter-question pairs with reasoning layer")
    
    # Dictionary to store all results
    all_results = {}
    
    # Dictionary to store image analyses and clinical contexts (to avoid recomputing)
    image_analyses = {}
    clinical_contexts = {}
    
    # Process each encounter-question pair
    for i, (encounter_id, base_qid) in enumerate(all_pairs):
        print(f"Processing pair {i+1}/{len(all_pairs)}: {encounter_id}, {base_qid}")
        
        # Get the combined data for this pair
        sample_data = agentic_data.get_combined_data(encounter_id, base_qid)
        if not sample_data:
            print(f"Warning: No data found for {encounter_id}, {base_qid}")
            continue
        
        # Get or compute image analysis for this encounter
        if encounter_id not in image_analyses:
            print(f"  Computing image analysis for {encounter_id}")
            image_analyses[encounter_id] = extract_structured_dermatological_analysis(sample_data)
        
        # Get or compute clinical context for this encounter
        if encounter_id not in clinical_contexts:
            print(f"  Computing clinical context for {encounter_id}")
            clinical_contexts[encounter_id] = extract_clinical_context(sample_data)
        
        # Apply reasoning layer
        print(f"  Applying reasoning layer for {encounter_id}, {base_qid}")
        reasoning_result = apply_reasoning_layer(
            encounter_id,
            base_qid,
            image_analyses[encounter_id],
            clinical_contexts[encounter_id],
            sample_data
        )
        
        # Store the result
        if encounter_id not in all_results:
            all_results[encounter_id] = {}
        
        all_results[encounter_id][base_qid] = {
            "query_context": sample_data['query_context'],
            "options": sample_data['options'],
            "model_predictions": sample_data['model_predictions'],
            "reasoning_result": reasoning_result,
            "final_answer": reasoning_result.get('validated_answer', 'Not mentioned')
        }
        
        # Save intermediate results to avoid losing progress
        if (i+1) % 10 == 0:
            with open(f"reasoning_results_{i+1}.json", "w") as f:
                json.dump(all_results, f, indent=2)
            print(f"Saved intermediate results after processing {i+1} pairs")
    
    # Save full results
    with open("final_reasoning_results.json", "w") as f:
        json.dump(all_results, f, indent=2)
    
    print(f"Processed all {len(all_pairs)} encounter-question pairs with reasoning layer")
    return all_results

In [50]:
def format_reasoning_results_for_eval(all_results, output_file):
    """
    Format results from reasoning layer for official evaluation.
    
    Args:
        all_results: Dictionary with all results from reasoning layer
        output_file: Path to save the formatted results
    
    Returns:
        List of formatted predictions for evaluation
    """
    # Define the question IDs and their allowed variants
    QIDS = [
        "CQID010-001",  # how much of body is affected (single answer)
        "CQID011-001", "CQID011-002", "CQID011-003", "CQID011-004", "CQID011-005", "CQID011-006",  # multiple answers allowed
        "CQID012-001", "CQID012-002", "CQID012-003", "CQID012-004", "CQID012-005", "CQID012-006",  # multiple answers allowed
        "CQID015-001",  # single answer
        "CQID020-001", "CQID020-002", "CQID020-003", "CQID020-004", "CQID020-005", 
        "CQID020-006", "CQID020-007", "CQID020-008", "CQID020-009",  # multiple answers allowed
        "CQID025-001",  # single answer
        "CQID034-001",  # single answer
        "CQID035-001",  # single answer
        "CQID036-001",  # single answer
    ]
    
    # Create a mapping of question base IDs to their allowed variants
    qid_variants = {}
    for qid in QIDS:
        base_qid, variant = qid.split('-')
        if base_qid not in qid_variants:
            qid_variants[base_qid] = []
        qid_variants[base_qid].append(qid)
    
    # Get all required base QIDs for a complete encounter
    required_base_qids = set(qid.split('-')[0] for qid in QIDS)
    
    # Process each encounter
    formatted_predictions = []
    for encounter_id, questions in all_results.items():
        # Check if encounter has all required questions
        encounter_base_qids = set(questions.keys())
        if not required_base_qids.issubset(encounter_base_qids):
            print(f"Skipping encounter {encounter_id} - missing required questions")
            continue
        
        # Create prediction entry for this encounter
        pred_entry = {'encounter_id': encounter_id}
        
        # Process each question
        for base_qid, question_data in questions.items():
            # Skip if no variants defined
            if base_qid not in qid_variants:
                continue
            
            # Get final answer from reasoning layer
            final_answer = question_data['final_answer']
            
            # Get options for this question
            options = question_data['options']
            
            # Find "Not mentioned" index
            not_mentioned_index = None
            for i, opt in enumerate(options):
                if opt.lower() == "not mentioned":
                    not_mentioned_index = i
                    break
            
            # Default to last option if "Not mentioned" not found
            if not_mentioned_index is None:
                not_mentioned_index = len(options) - 1
            
            # Process answer(s)
            if ',' in final_answer:
                # Multiple answers
                answer_parts = [part.strip() for part in final_answer.split(',')]
                answer_indices = []
                
                for part in answer_parts:
                    found = False
                    for i, opt in enumerate(options):
                        if part.lower() == opt.lower():
                            answer_indices.append(i)
                            found = True
                            break
                    
                    if not found:
                        # Default to "Not mentioned" if answer not found
                        answer_indices.append(not_mentioned_index)
                
                # Distribute across variants
                available_variants = qid_variants[base_qid]
                
                # Distribute answers across available variants
                for i, idx in enumerate(answer_indices):
                    if i < len(available_variants):
                        pred_entry[available_variants[i]] = idx
                
                # Fill remaining variants with "Not mentioned"
                for i in range(len(answer_indices), len(available_variants)):
                    pred_entry[available_variants[i]] = not_mentioned_index
                
            else:
                # Single answer
                found = False
                answer_index = not_mentioned_index
                
                for i, opt in enumerate(options):
                    if final_answer.lower() == opt.lower():
                        answer_index = i
                        found = True
                        break
                
                # Assign to first variant
                pred_entry[qid_variants[base_qid][0]] = answer_index
                
                # Fill remaining variants with "Not mentioned" if this is a multi-variant question
                if len(qid_variants[base_qid]) > 1:
                    for i in range(1, len(qid_variants[base_qid])):
                        pred_entry[qid_variants[base_qid][i]] = not_mentioned_index
        
        # Add to formatted predictions
        formatted_predictions.append(pred_entry)
    
    # Save to JSON file
    with open(output_file, 'w') as f:
        json.dump(formatted_predictions, f, indent=2)
    
    print(f"Formatted predictions saved to {output_file} ({len(formatted_predictions)} complete encounters)")
    return formatted_predictions

In [51]:
# # Run the end-to-end pipeline
# all_results = process_all_encounters_with_reasoning(agentic_data)

# # Format the results for evaluation
# formatted_predictions = format_reasoning_results_for_eval(
#     all_results,
#     os.path.join(output_dir, f"data_cvqa_sys_reasoned_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
# )

In [52]:
def process_single_encounter_with_reasoning(agentic_data, target_encounter_id):
    """
    Process a single encounter with all its questions using the reasoning layer.
    
    Args:
        agentic_data: AgenticRAGData instance containing all encounter data
        target_encounter_id: The specific encounter ID to process
        
    Returns:
        Dictionary with all questions processed with reasoning for this encounter
    """
    # Get all pairs for this encounter
    all_pairs = agentic_data.get_all_encounter_question_pairs()
    encounter_pairs = [pair for pair in all_pairs if pair[0] == target_encounter_id]
    
    if not encounter_pairs:
        print(f"No data found for encounter {target_encounter_id}")
        return None
    
    print(f"Processing {len(encounter_pairs)} questions for encounter {target_encounter_id}")
    
    # Dictionary to store all results for this encounter
    encounter_results = {target_encounter_id: {}}
    
    # Get or compute image analysis once for this encounter
    print(f"Computing image analysis for {target_encounter_id}")
    sample_data = agentic_data.get_combined_data(encounter_pairs[0][0], encounter_pairs[0][1])
    image_analysis = extract_structured_dermatological_analysis(sample_data)
    
    # Get or compute clinical context once for this encounter
    print(f"Computing clinical context for {target_encounter_id}")
    clinical_context = extract_clinical_context(sample_data)
    
    # Process each question for this encounter
    for i, (encounter_id, base_qid) in enumerate(encounter_pairs):
        print(f"Processing question {i+1}/{len(encounter_pairs)}: {base_qid}")
        
        # Get the combined data for this question
        sample_data = agentic_data.get_combined_data(encounter_id, base_qid)
        if not sample_data:
            print(f"Warning: No data found for {encounter_id}, {base_qid}")
            continue
        
        # Apply reasoning layer
        print(f"Applying reasoning layer for {encounter_id}, {base_qid}")
        reasoning_result = apply_reasoning_layer(
            encounter_id,
            base_qid,
            image_analysis,
            clinical_context,
            sample_data
        )
        
        # Store the result
        encounter_results[encounter_id][base_qid] = {
            "query_context": sample_data['query_context'],
            "options": sample_data['options'],
            "model_predictions": sample_data['model_predictions'],
            "reasoning_result": reasoning_result,
            "final_answer": reasoning_result.get('validated_answer', 'Not mentioned')
        }
    
    # Save results for this encounter
    with open(f"reasoning_results_{target_encounter_id}.json", "w") as f:
        json.dump(encounter_results, f, indent=2)
    
    print(f"Processed all {len(encounter_pairs)} questions for encounter {target_encounter_id}")
    return encounter_results

In [53]:
# Process a single encounter
encounter_id = "ENC00852"  # Example encounter ID
encounter_results = process_single_encounter_with_reasoning(agentic_data, encounter_id)

# Format the results for evaluation (optional)
formatted_predictions = format_reasoning_results_for_eval(
    encounter_results,
    os.path.join(output_dir, f"data_cvqa_sys_reasoned_{encounter_id}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
)

Processing 9 questions for encounter ENC00852
Computing image analysis for ENC00852
Analyzing image 1/2 for encounter ENC00852
Analysis text received (length: 1655)
Successfully parsed structured analysis for image 1
Analyzing image 2/2 for encounter ENC00852
Analysis text received (length: 1669)
Successfully parsed structured analysis for image 2
Aggregating 2 valid structured analyses for encounter ENC00852
Aggregated analysis received (length: 2581)
Successfully parsed aggregated analysis
Computing clinical context for ENC00852
Processing question 1/9: CQID010
Applying reasoning layer for ENC00852, CQID010
question_text:  How much of the body is affected?
options:  ['single spot', 'limited area', 'widespread', 'Not mentioned']
question_type:  Site
model_predictions:  {'gemma-3-12b-it': {'combined_prediction': nan, 'unique_predictions': [''], 'all_raw_predictions': ['', ''], 'all_sorted_predictions': [('', 2)]}, 'Qwen2-VL-7B-Instruct': {'combined_prediction': 'limited area', 'unique_

reasoning_text:  ```json
{
  "reasoning": "The image analysis explicitly states that lesions are located on the 'dorsum of the hand', which falls under the 'upper extremities' category. The clinical context confirms involvement of the 'Back of the hand' (upper extremities) and explicitly mentions lesions on the 'Face', which falls under the 'head' category. The clinical context also notes the condition is becoming more widespread, but does not specify other locations from the given options. Therefore, based directly on the provided evidence, the affected areas are the head and upper extremities.",
  "answer": "head, upper extremities"
}
```
cleaned_text:  
{
  "reasoning": "The image analysis explicitly states that lesions are located on the 'dorsum of the hand', which falls under the 'upper extremities' category. The clinical context confirms involvement of the 'Back of the hand' (upper extremities) and explicitly mentions lesions on the 'Face', which falls under the 'head' category. 

reasoning_text:  ```json
{
 "reasoning": "The image analysis states that the visible lesions on the hand are small, approximately 5-10 mm, and smaller than a thumbnail. However, the clinical context provides additional information about the progression of the condition. It mentions that the 'Rashes are getting bigger' and becoming 'more widespread over time', affecting the hands and later the face. The question asks about the size of the affected areas, which, in light of the clinical history, refers not just to the current size of the depicted macules but to the overall extent and growth of the condition. The description 'getting bigger' and 'more widespread' indicates that the affected areas are expanding, fitting the description of a 'larger area' compared to the initial presentation or the size of the individual small lesions shown. While individual lesions might be small, the condition as a whole involves a larger or growing area. The option 'size of palm' is not explicitly mentio

reasoning_text:  ```json
{
 "reasoning": "The clinical context states, 'Symptoms are getting worse, rashes are getting bigger, macula appeared on face one month later.' This sentence describes the progression of the issue, starting from the onset on the hand. The most logical interpretation is that the macula appeared on the face one month *after the onset* of the hand lesions. Therefore, the initial onset of the issue on the hand was approximately one month ago. One month falls within the 'within months' timeframe. While the term 'gradually turn into Leukoplakia' and 'getting bigger' also imply a duration longer than days or hours, the specific mention of 'one month later' provides the most concrete timeline anchor relative to the onset.",
 "answer": "within months"
}
```
cleaned_text:  
{
 "reasoning": "The clinical context states, 'Symptoms are getting worse, rashes are getting bigger, macula appeared on face one month later.' This sentence describes the progression of the issue, st

reasoning_text:  ```json
{
 "reasoning": "The question asks for the best label to describe the affected area. The image analysis explicitly states that 'The lesions on the hand are macular, flat relative to surrounding skin, with a smooth surface'. The clinical context also mentions 'Macula (flat spots) on the face'. Both sources describe the lesions as 'flat' or 'macular', which means flat. The options include 'flat'. Other options like 'raised or bumpy', 'skin loss or sunken', 'thick or raised', 'warty', 'crust', 'scab', and 'weeping' are either directly contradicted by the image analysis ('No scales or crust') or not supported by the evidence. The option 'thin or close to the surface' could potentially apply to a macule (as there is no thickening), but 'flat' is the primary descriptor used in both sources to describe the elevation relative to surrounding skin, which is the distinguishing feature of a macule compared to raised or sunken lesions. Therefore, 'flat' is the most accurate

reasoning_text:  ```json
{
  "reasoning": "The question asks if there is any associated itching with the skin problem. The image analysis notes that itching (pruritus) cannot be determined from the images and finds no visible signs like scratch marks. The clinical context, under the heading 'ITCH_INDICATORS', explicitly states 'None mentioned (Patient feels no specific symptoms)'. This indicates that the patient does not experience itching. Based on the direct statement in the clinical context, the answer is no.",
  "answer": "no"
}
```
cleaned_text:  
{
  "reasoning": "The question asks if there is any associated itching with the skin problem. The image analysis notes that itching (pruritus) cannot be determined from the images and finds no visible signs like scratch marks. The clinical context, under the heading 'ITCH_INDICATORS', explicitly states 'None mentioned (Patient feels no specific symptoms)'. This indicates that the patient does not experience itching. Based on the direct s

reasoning_text:  ```json
{
  "reasoning": "The image analysis explicitly states that the lesions on the hand are 'hypopigmented (lighter than surrounding skin), appearing whitish or pale pinkish-white compared to the surrounding tan skin.' This description directly answers the question 'Compared to the normal surrounding skin, what is the color of the skin lesion?' by stating they are 'hypopigmented' and describing their appearance as 'whitish'. The clinical context also mentions the rashes gradually turn into 'Leukoplakia (whitish/hypopigmented patches)'. Both the terms 'white' and 'hypopigmentation' are among the provided options and are directly supported by the evidence.",
  "answer": "white, hypopigmentation"
}
```
cleaned_text:  
{
  "reasoning": "The image analysis explicitly states that the lesions on the hand are 'hypopigmented (lighter than surrounding skin), appearing whitish or pale pinkish-white compared to the surrounding tan skin.' This description directly answers the q

reasoning_text:  ```json
{
 "reasoning": "The image analysis explicitly states that there are 'Two distinct lesions' visible on the hand in the provided images. The clinical context describes the patient's condition as having 'Multiple (rashes, macula)', initially localized to hands and later appearing on the face, becoming 'more widespread over time'. The question asks 'How many skin lesions are there?'. While the image shows two lesions, the clinical context indicates a broader picture of the patient's condition with multiple lesions overall. The options provided are 'single', 'multiple', and 'Not mentioned'. Since the image shows more than one lesion (two) and the clinical context states 'Multiple', the most accurate answer within the given options is 'multiple'. The majority of the successful model predictions also align with 'multiple'.",
 "answer": "multiple"
}
```
cleaned_text:  
{
 "reasoning": "The image analysis explicitly states that there are 'Two distinct lesions' visible 

reasoning_text:  ```json
{
  "reasoning": "The question asks about the skin lesion texture based on the provided evidence. The IMAGE ANALYSIS section, which describes the lesions visible in the images, explicitly states under 'SKIN_DESCRIPTION' that the lesions on the hand have a 'smooth surface' and that 'No scales or crust are noted'. It also explicitly states under 'TEXTURE' that 'The lesions appear smooth, similar to the texture of the surrounding unaffected skin'. The clinical context mentions that the texture is 'Not specified for hand lesions' but notes the absence of skin scales on face macula. Since the images show the lesions on the hand, the description from the IMAGE ANALYSIS is the most relevant and direct evidence for the texture of the depicted lesions. This evidence clearly indicates the texture is smooth. The models' predictions are split, but the detailed description in the image analysis strongly supports 'smooth'.",
  "answer": "smooth"
}
```
cleaned_text:  
{
  "re

In [54]:
formatted_predictions

[{'encounter_id': 'ENC00852',
  'CQID010-001': 2,
  'CQID011-001': 0,
  'CQID011-002': 2,
  'CQID011-003': 7,
  'CQID011-004': 7,
  'CQID011-005': 7,
  'CQID011-006': 7,
  'CQID012-001': 2,
  'CQID012-002': 3,
  'CQID012-003': 3,
  'CQID012-004': 3,
  'CQID012-005': 3,
  'CQID012-006': 3,
  'CQID015-001': 3,
  'CQID020-001': 1,
  'CQID020-002': 9,
  'CQID020-003': 9,
  'CQID020-004': 9,
  'CQID020-005': 9,
  'CQID020-006': 9,
  'CQID020-007': 9,
  'CQID020-008': 9,
  'CQID020-009': 9,
  'CQID025-001': 1,
  'CQID034-001': 7,
  'CQID035-001': 1,
  'CQID036-001': 0}]

Reframe questions + their meanings
Define options more clearly
See if removing clinical context helps