In [None]:
import os
import glob
import pandas as pd
import ast
import re
from collections import defaultdict
import json
import datetime
import time
from PIL import Image
from dotenv import load_dotenv
from google import genai

In [None]:
class Config:
    """Notebook-wide settings: file-system locations, the Gemini model id and reflection tuning."""

    # Root folder for generated artefacts, resolved against the current working directory.
    OUTPUT_DIR = os.path.join(os.getcwd(), "outputs")
    # Model prediction outputs live in a nested folder underneath OUTPUT_DIR.
    MODEL_PREDICTIONS_DIR = os.path.join(OUTPUT_DIR, "outputs-akshay-2", "outputs")
    # CSV holding the validation dataset.
    VAL_DATASET_PATH = os.path.join(OUTPUT_DIR, "val_dataset.csv")
    # Folder containing the validation images.
    IMAGES_DIR = os.path.join(os.getcwd(), "2025_dataset", "valid", "images_valid")

    # Model identifier passed to the Gemini client.
    GEMINI_MODEL = "gemini-2.5-flash-preview-04-17"

    # Upper bound on reflection passes (consumer not visible in this part of the notebook).
    MAX_REFLECTION_CYCLES = 2
    # Threshold for accepting an answer without reflection.
    CONFIDENCE_THRESHOLD = 0.8

In [None]:
class DataLoader:
    """Static helpers that locate and load prediction CSVs and the validation dataset."""

    @staticmethod
    def _split_aggregated_filename(file_name):
        """
        Split an aggregated-predictions filename around its ``_base_`` marker.

        Returns:
            ``(model_name, remainder)`` on success, or ``None`` (after printing a
            warning) when the name does not contain exactly one ``_base_`` marker.
        """
        parts = file_name.split("_base_")
        if len(parts) != 2:
            print(f"Warning: Unexpected filename format: {file_name}")
            return None
        return parts[0].replace("aggregated_predictions_", ""), parts[1]

    @staticmethod
    def _has_value(value):
        """Return True for a usable cell value; None, NaN and empty values count as absent."""
        if value is None:
            return False
        # pandas represents empty CSV cells as float NaN, which is *truthy*;
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return False
        return bool(value)

    @staticmethod
    def get_latest_aggregated_files(model_predictions_dir):
        """
        Get the latest aggregated prediction file for each model.

        Args:
            model_predictions_dir: Directory searched for ``aggregated_predictions_*.csv``.

        Returns:
            List of file paths, one per model name (empty when nothing matches).
        """
        pattern = os.path.join(model_predictions_dir, "aggregated_predictions_*.csv")

        latest_files = {}
        for file_path in glob.glob(pattern):
            file_name = os.path.basename(file_path)

            parsed = DataLoader._split_aggregated_filename(file_name)
            if parsed is None:
                continue
            model_name, suffix = parsed

            timestamps = re.findall(r'(\d+)', suffix)
            if len(timestamps) < 2:
                print(f"Warning: Could not find timestamps in {file_name}")
                continue

            # NOTE(review): only the SECOND numeric group decides recency. If the
            # first group is a date, files from different days are compared by
            # that second group alone -- confirm against the real filename format.
            timestamp = int(timestamps[1])

            current = latest_files.get(model_name)
            if current is None or timestamp > current['timestamp']:
                latest_files[model_name] = {
                    'file_path': file_path,
                    'timestamp': timestamp
                }

        return [info['file_path'] for info in latest_files.values()]

    @staticmethod
    def load_all_model_predictions(model_predictions_dir):
        """
        Load the latest aggregated predictions of every model.

        Args:
            model_predictions_dir: Directory containing the aggregated CSV files.

        Returns:
            Dict mapping model name to its DataFrame (with a ``model_name`` column
            added). Files that fail to load are reported and skipped.
        """
        latest_files = DataLoader.get_latest_aggregated_files(model_predictions_dir)

        if not latest_files:
            print("No aggregated prediction files found. Cannot proceed.")
            return {}

        model_predictions = {}
        for file_path in latest_files:
            parsed = DataLoader._split_aggregated_filename(os.path.basename(file_path))
            if parsed is None:
                continue
            model_name = parsed[0]

            try:
                df = pd.read_csv(file_path)
                df['model_name'] = model_name
                model_predictions[model_name] = df
            except Exception as e:
                # Best effort: one unreadable file must not block the other models.
                print(f"Error loading {file_path}: {e}")

        return model_predictions

    @staticmethod
    def load_validation_dataset(val_dataset_path, images_dir=None):
        """
        Load the validation dataset, one row per (encounter_id, base_qid) pair.

        All image references of a pair are collected into an ``all_images`` list;
        the remaining columns come from the first row seen for that pair.

        Args:
            val_dataset_path: Path to the validation CSV.
            images_dir: Folder used to resolve ``image_id`` values into paths.
                Defaults to ``Config.IMAGES_DIR`` (looked up only when needed).

        Returns:
            DataFrame with one row per (encounter_id, base_qid) pair.
        """
        val_df = pd.read_csv(val_dataset_path)
        val_df = DataLoader.process_validation_dataset(val_df)

        grouped = {}
        for _, row in val_df.iterrows():
            key = (row['encounter_id'], row['base_qid'])
            bucket = grouped.setdefault(key, {'images': [], 'data': None})

            image_path = row.get('image_path')
            image_id = row.get('image_id')
            # Missing cells arrive as NaN; a plain truthiness test would accept them.
            if DataLoader._has_value(image_path):
                bucket['images'].append(image_path)
            elif DataLoader._has_value(image_id):
                if images_dir is None:
                    images_dir = Config.IMAGES_DIR
                bucket['images'].append(os.path.join(images_dir, image_id))

            if bucket['data'] is None:
                bucket['data'] = row.to_dict()

        grouped_data = []
        for (encounter_id, base_qid), bucket in grouped.items():
            entry = bucket['data'].copy()
            entry['all_images'] = bucket['images']
            entry['encounter_id'] = encounter_id
            entry['base_qid'] = base_qid
            grouped_data.append(entry)

        return pd.DataFrame(grouped_data)

    @staticmethod
    def safe_convert_options(options_str):
        """
        Safely convert a string representation of a list to an actual list.

        Non-string input is returned unchanged. Strings are parsed with
        ``ast.literal_eval`` first; when that fails, a bracketed or
        comma-separated string is split manually, and anything else becomes a
        single-element list.
        """
        if not isinstance(options_str, str):
            return options_str

        try:
            return ast.literal_eval(options_str)
        except (SyntaxError, ValueError):
            if options_str.startswith('[') and options_str.endswith(']'):
                return [opt.strip().strip("'\"") for opt in options_str[1:-1].split(',')]
            if ',' in options_str:
                return [opt.strip() for opt in options_str.split(',')]
            return [options_str]

    @staticmethod
    def _clean_options(options):
        """Strip quotes/whitespace and the " (please specify)" suffix from each option."""
        if not isinstance(options, list):
            return options

        cleaned_options = []
        for opt in options:
            if isinstance(opt, str):
                cleaned_options.append(opt.strip("'\" ").replace(" (please specify)", ""))
            else:
                cleaned_options.append(str(opt).strip("'\" "))
        return cleaned_options

    @staticmethod
    def process_validation_dataset(val_df):
        """
        Process and clean the validation dataset.

        Adds, when the source columns exist: ``options_en_cleaned``,
        ``question_text_cleaned`` and (if absent) ``base_qid`` derived from
        ``qid``. ``options_en`` is converted from strings to lists.

        Note: the frame is modified in place and also returned.
        """
        if 'options_en' in val_df.columns:
            val_df['options_en'] = val_df['options_en'].apply(DataLoader.safe_convert_options)
            val_df['options_en_cleaned'] = val_df['options_en'].apply(DataLoader._clean_options)

        if 'question_text' in val_df.columns:
            val_df['question_text_cleaned'] = val_df['question_text'].apply(
                lambda q: q.replace(" Please specify which affected area for each selection.", "")
                          if isinstance(q, str) and "Please specify which affected area for each selection" in q
                          else q
            )

            # Drop a leading numeric prefix such as "1 " from the question text.
            val_df['question_text_cleaned'] = val_df['question_text_cleaned'].apply(
                lambda q: re.sub(r'^\d+\s+', '', q) if isinstance(q, str) else q
            )

        if 'base_qid' not in val_df.columns and 'qid' in val_df.columns:
            # The base question id is whatever precedes the first '-' in qid.
            val_df['base_qid'] = val_df['qid'].apply(
                lambda q: q.split('-')[0] if isinstance(q, str) and '-' in q else q
            )

        return val_df

In [None]:
class DataProcessor:
    """Builds the textual query context handed to the LLM from a validation row."""

    @staticmethod
    def _is_missing(value):
        """Return True for None or float NaN (how pandas encodes an empty CSV cell)."""
        return value is None or (isinstance(value, float) and value != value)

    @staticmethod
    def create_query_context(row):
        """
        Create query context from validation data similar to the inference process.

        Args:
            row: Mapping-like object supporting ``.get`` (dict or pandas Series)
                with the question, optional metadata, optional clinical
                background and the answer options.

        Returns:
            Multi-line string containing the question, its metadata, the clinical
            background section (only when a title or content is present) and the
            available options.
        """
        question = row.get('question_text_cleaned', row.get('question_text', 'What do you see in this image?'))

        # Collect the metadata pieces first so the separator is only emitted
        # between items (no leading ", " when the type is absent).
        metadata_parts = []
        question_type = row.get('question_type_en')
        if not DataProcessor._is_missing(question_type):
            metadata_parts.append(f"Type: {question_type}")
        question_category = row.get('question_category_en')
        if not DataProcessor._is_missing(question_category):
            metadata_parts.append(f"Category: {question_category}")
        metadata = ", ".join(metadata_parts)

        # NaN is truthy, so normalise missing cells to '' before the truth tests;
        # otherwise the literal text "nan" would be presented as clinical history.
        query_title = row.get('query_title_en', '')
        if DataProcessor._is_missing(query_title):
            query_title = ''
        query_content = row.get('query_content_en', '')
        if DataProcessor._is_missing(query_content):
            query_content = ''

        clinical_context = ""
        if query_title or query_content:
            clinical_context += "Background Clinical Information (to help with your analysis):\n"
            if query_title:
                clinical_context += f"{query_title}\n"
            if query_content:
                clinical_context += f"{query_content}\n"

        options = row.get('options_en_cleaned', row.get('options_en', ['Yes', 'No', 'Not mentioned']))
        if isinstance(options, list):
            # Options parsed from literals may contain non-strings (e.g. numbers).
            options_text = ", ".join(str(opt) for opt in options)
        else:
            options_text = str(options)

        query_text = (f"MAIN QUESTION TO ANSWER: {question}\n"
                     f"Question Metadata: {metadata}\n"
                     f"{clinical_context}"
                     f"Available Options (choose from these): {options_text}")

        return query_text

In [None]:
class AgenticRAGData:
    """Joins per-model predictions with validation rows, keyed by (encounter_id, base_qid)."""

    def __init__(self, all_models_df, validation_df):
        """
        Index both frames by (encounter_id, base_qid).

        Args:
            all_models_df: Predictions of all models; grouped by the pair key.
            validation_df: Validation rows; a later row with the same key
                replaces an earlier one.
        """
        self.all_models_df = all_models_df
        self.validation_df = validation_df

        self.model_predictions = {
            (encounter_id, base_qid): group
            for (encounter_id, base_qid), group in all_models_df.groupby(['encounter_id', 'base_qid'])
        }

        self.validation_data = {
            (record['encounter_id'], record['base_qid']): record
            for _, record in validation_df.iterrows()
        }

    def get_combined_data(self, encounter_id, base_qid):
        """
        Retrieve combined data for a specific encounter and question.

        Returns:
            Dict with the query context, images, options, question metadata and
            a per-model prediction mapping, or ``None`` (after printing why) when
            either the predictions or the validation row are missing.
        """
        key = (encounter_id, base_qid)
        predictions_frame = self.model_predictions.get(key, None)
        validation_row = self.validation_data.get(key, None)

        if predictions_frame is None:
            print(f"No model predictions found for encounter {encounter_id}, question {base_qid}")
            return None

        if validation_row is None:
            print(f"No validation data found for encounter {encounter_id}, question {base_qid}")
            return None

        # Build the query context on first access and keep it on the stored row.
        if 'query_context' not in validation_row:
            validation_row['query_context'] = DataProcessor.create_query_context(validation_row)

        per_model = {}
        for _, prediction_row in predictions_frame.iterrows():
            per_model[prediction_row['model_name']] = self._process_model_predictions(prediction_row)

        raw_options = validation_row.get('options_en', [])
        combined = {
            'encounter_id': encounter_id,
            'base_qid': base_qid,
            'query_context': validation_row['query_context'],
            'images': validation_row.get('all_images', []),
            'options': validation_row.get('options_en_cleaned', raw_options),
            'question_type': validation_row.get('question_type_en', ''),
            'question_category': validation_row.get('question_category_en', ''),
            'model_predictions': per_model
        }
        return combined

    def _process_model_predictions(self, row):
        """Reduce a prediction row to its ``combined_prediction`` value ('' when absent)."""
        prediction = row.get('combined_prediction', '')
        return {'model_prediction': prediction}

    def get_all_encounter_question_pairs(self):
        """Return a list of all (encounter_id, base_qid) keys known from the validation data."""
        return [pair for pair in self.validation_data.keys()]

    def get_sample_data(self, n=5):
        """
        Get combined data for up to ``n`` randomly chosen encounter-question pairs.

        Entries are ``None`` for pairs that ``get_combined_data`` cannot resolve.
        """
        import random

        candidates = self.get_all_encounter_question_pairs()
        sample_size = min(n, len(candidates))
        chosen = random.sample(candidates, sample_size)

        samples = []
        for encounter_id, base_qid in chosen:
            samples.append(self.get_combined_data(encounter_id, base_qid))
        return samples

In [None]:
class ImageAnalysisService:
    """Service for analyzing dermatological images."""

    def __init__(self, client):
        """Store the client whose ``models.generate_content`` method is used for all requests."""
        self.client = client

    def analyze_images(self, image_paths, encounter_id):
        """
        Analyze multiple dermatological images for an encounter.

        Args:
            image_paths: List of paths to images
            encounter_id: Encounter identifier

        Returns:
            Dictionary with individual and aggregated analyses
        """
        structured_prompt = self._create_dermatology_prompt()
        total_images = len(image_paths)

        image_analyses = [
            self._analyze_single_image(img_path, structured_prompt, encounter_id, idx, total_images)
            for idx, img_path in enumerate(image_paths)
        ]

        aggregated_analysis = self._aggregate_analyses(image_analyses, encounter_id)

        return {
            "encounter_id": encounter_id,
            "image_count": total_images,
            "individual_analyses": image_analyses,
            "aggregated_analysis": aggregated_analysis
        }

    def _create_dermatology_prompt(self):
        """Create the structured dermatology analysis prompt."""
        return """As dermatology specialist analyzing skin images, extract and structure all clinically relevant information from this dermatological image.

Organize your response in a JSON dictionary:

1. SIZE: Approximate dimensions of lesions/affected areas, size comparison (thumbnail, palm, larger), Relative size comparisons for multiple lesions
2. SITE_LOCATION: Visible body parts in the image, body areas showing lesions/abnormalities, Specific anatomical locations affected
3. SKIN_DESCRIPTION: Lesion morphology (flat, raised, depressed), Texture of affected areas, Surface characteristics (scales, crust, fluid), Appearance of lesion boundaries
4. LESION_COLOR: Predominant color(s) of affected areas, Color variations within lesions, Color comparison to normal skin, Color distribution patterns
5. LESION_COUNT: Number of distinct lesions/affected areas, Single vs multiple presentation, Distribution pattern if multiple, Any counting limitations
6. EXTENT: How widespread the condition appears, Localized vs widespread assessment, Approximate percentage of visible skin affected, Limitations in determining full extent
7. TEXTURE: Expected tactile qualities, Smooth vs rough assessment, Notable textural features, Texture consistency across affected areas
8. ONSET_INDICATORS: Visual clues about condition duration, Acute vs chronic presentation features, Healing/progression/chronicity signs, Note: precise timing cannot be determined from images
9. ITCH_INDICATORS: Scratch marks/excoriations/trauma signs, Features associated with itchy conditions, Pruritic vs non-pruritic visual indicators, Note: sensation cannot be directly observed
10. OVERALL_IMPRESSION: Brief description (1-2 sentences), Key diagnostic features, Potential diagnoses (2-3)

Be concise and use medical terminology where appropriate. If information for a section is cannot be determined, state "Cannot determine from image".
"""

    def _analyze_single_image(self, img_path, prompt, encounter_id, idx, total_images):
        """
        Analyze a single dermatological image.

        Returns:
            Dict with ``image_index`` (1-based), ``image_path`` (basename only) and
            either ``structured_analysis`` or, on any failure, ``error``.
        """
        # Computed up front (and via str) so the error branch cannot itself fail
        # on a non-string path.
        image_name = os.path.basename(str(img_path))
        try:
            # The context manager releases the underlying file handle even when
            # the request fails.
            with Image.open(img_path) as image:
                print(f"Analyzing image {idx+1}/{total_images} for encounter {encounter_id}")

                response = self.client.models.generate_content(
                    model=Config.GEMINI_MODEL,
                    contents=[prompt, image]
                )

            structured_analysis = self._parse_json_response(response.text)

            return {
                "image_index": idx + 1,
                "image_path": image_name,
                "structured_analysis": structured_analysis
            }

        except Exception as e:
            print(f"Error analyzing image {img_path}: {str(e)}")
            return {
                "image_index": idx + 1,
                "image_path": image_name,
                "error": str(e)
            }

    def _parse_json_response(self, text):
        """
        Parse JSON from LLM response.

        Tries, in order: the text after a ```json fence (up to the next fence),
        the content of an un-labelled fenced block, and the span from the first
        "{" to the last "}". Returns a ``parse_error`` dict carrying the raw text
        when nothing parses or ``text`` is not a string.
        """
        failure = {"parse_error": "Could not parse as JSON", "raw_text": text}
        if not isinstance(text, str):
            print("Warning: Could not parse as JSON")
            return failure

        candidates = []

        primary = text
        if "```json" in primary:
            primary = primary.split("```json")[1]
        if "```" in primary:
            primary = primary.split("```")[0]
        candidates.append(primary)

        # Un-labelled fence: the payload sits between the first pair of fences.
        fence_parts = text.split("```")
        if len(fence_parts) >= 3:
            candidates.append(fence_parts[1])

        # JSON object embedded in surrounding prose.
        first_brace, last_brace = text.find("{"), text.rfind("}")
        if 0 <= first_brace < last_brace:
            candidates.append(text[first_brace:last_brace + 1])

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        print("Warning: Could not parse as JSON")
        return failure

    def _aggregate_analyses(self, image_analyses, encounter_id):
        """
        Aggregate structured analyses from multiple images.

        Entries carrying an ``error`` key are ignored. With no usable entry an
        error dict is returned; with exactly one, its analysis is returned as is;
        otherwise the model is asked to merge them.
        """
        valid_analyses = [a for a in image_analyses if "error" not in a and "structured_analysis" in a]

        if not valid_analyses:
            return {
                "error": "No valid analyses to aggregate",
                "message": "Unable to generate aggregated analysis due to errors in individual analyses."
            }

        if len(valid_analyses) == 1:
            return valid_analyses[0]["structured_analysis"]

        analysis_jsons = [
            f"Image {analysis['image_index']} ({analysis['image_path']}): "
            f"{json.dumps(analysis['structured_analysis'])}"
            for analysis in valid_analyses
        ]

        aggregation_prompt = self._create_aggregation_prompt(analysis_jsons)

        try:
            response = self.client.models.generate_content(
                model=Config.GEMINI_MODEL,
                contents=[aggregation_prompt]
            )

            return self._parse_json_response(response.text)

        except Exception as e:
            print(f"Error creating aggregated analysis for encounter {encounter_id}: {str(e)}")
            return {
                "error": str(e),
                "aggregation_error": "Failed to generate aggregated analysis"
            }

    def _create_aggregation_prompt(self, analysis_jsons):
        """Create a prompt for aggregating multiple image analyses."""
        return f"""As dermatology specialist reviewing multiple skin image analyses for the same patient, combine these analyses and organize your response in a JSON dictionary:

1. SIZE: Approximate dimensions of lesions/affected areas, size comparison (thumbnail, palm, larger), Relative size comparisons for multiple lesions
2. SITE_LOCATION: Visible body parts in the image, body areas showing lesions/abnormalities, Specific anatomical locations affected
3. SKIN_DESCRIPTION: Lesion morphology (flat, raised, depressed), Texture of affected areas, Surface characteristics (scales, crust, fluid), Appearance of lesion boundaries
4. LESION_COLOR: Predominant color(s) of affected areas, Color variations within lesions, Color comparison to normal skin, Color distribution patterns
5. LESION_COUNT: Number of distinct lesions/affected areas, Single vs multiple presentation, Distribution pattern if multiple, Any counting limitations
6. EXTENT: How widespread the condition appears, Localized vs widespread assessment, Approximate percentage of visible skin affected, Limitations in determining full extent
7. TEXTURE: Expected tactile qualities, Smooth vs rough assessment, Notable textural features, Texture consistency across affected areas
8. ONSET_INDICATORS: Visual clues about condition duration, Acute vs chronic presentation features, Healing/progression/chronicity signs, Note: precise timing cannot be determined from images
9. ITCH_INDICATORS: Scratch marks/excoriations/trauma signs, Features associated with itchy conditions, Pruritic vs non-pruritic visual indicators, Note: sensation cannot be directly observed
10. OVERALL_IMPRESSION: Brief description (1-2 sentences), Key diagnostic features, Potential diagnoses (2-3)
    
{' '.join(analysis_jsons)}
"""

In [None]:
class ClinicalContextAnalyzer:
    """Service for analyzing clinical context."""

    def __init__(self, client):
        """Store the client whose ``models.generate_content`` method is used for requests."""
        self.client = client

    def extract_clinical_context(self, query_context, encounter_id):
        """
        Extract structured clinical information from an encounter's query context.

        Args:
            query_context: The query context text
            encounter_id: Encounter identifier

        Returns:
            Dictionary with structured clinical information. When the context has
            no clinical section, only a ``clinical_summary`` placeholder is
            returned; when the request fails, an ``error`` entry is returned
            alongside the raw clinical text.
        """
        clinical_text = self._extract_clinical_text(query_context)

        if not clinical_text:
            return {
                "encounter_id": encounter_id,
                "clinical_summary": "No clinical information available"
            }

        prompt = self._create_clinical_context_prompt(clinical_text)

        try:
            response = self.client.models.generate_content(
                model=Config.GEMINI_MODEL,
                contents=[prompt]
            )

            structured_context = self._parse_json_response(response.text)

            return {
                "encounter_id": encounter_id,
                "raw_clinical_text": clinical_text,
                "structured_clinical_context": structured_context
            }

        except Exception as e:
            print(f"Error extracting clinical context for encounter {encounter_id}: {str(e)}")
            return {
                "encounter_id": encounter_id,
                "raw_clinical_text": clinical_text,
                "error": str(e)
            }

    def _extract_clinical_text(self, query_context):
        """
        Extract clinical text from query context.

        Collects the lines between the "Background Clinical Information" header
        and the "Available Options" line. Returns '' when the section is absent
        or ``query_context`` is not a string.
        """
        # This runs outside the caller's try block, so a missing context (None/NaN)
        # must not raise here.
        if not isinstance(query_context, str):
            return ""

        clinical_lines = []
        capturing = False
        for line in query_context.split('\n'):
            if "Background Clinical Information" in line:
                capturing = True
                continue
            elif "Available Options" in line:
                capturing = False
            elif capturing:
                clinical_lines.append(line)

        return "\n".join(clinical_lines).strip()

    def _create_clinical_context_prompt(self, clinical_text):
        """Create prompt for extracting structured clinical information."""
        return f"""You are a dermatology specialist analyzing patient information. 
Extract and structure all clinically relevant information from this patient description:

{clinical_text}

Organize your response in the following JSON structure:

1. DEMOGRAPHICS: Age, sex, and any other demographic data
2. SITE_LOCATION: Body parts affected by the condition as described in the text
3. SKIN_DESCRIPTION: Any mention of lesion morphology (flat, raised, depressed), texture, surface characteristics (scales, crust, fluid), appearance of lesion boundaries
4. LESION_COLOR: Any description of color(s) of affected areas, color variations, comparison to normal skin
5. LESION_COUNT: Any information about number of lesions, single vs multiple presentation, distribution pattern
6. EXTENT: How widespread the condition appears based on the description, localized vs widespread
7. TEXTURE: Any description of tactile qualities, smooth vs rough, notable textural features
8. ONSET_INDICATORS: Information about onset, duration, progression, or evolution of symptoms
9. ITCH_INDICATORS: Mentions of scratching, itchiness, or other sensory symptoms
10. OTHER_SYMPTOMS: Any additional symptoms mentioned (pain, burning, etc.)
11. TRIGGERS: Identified factors that worsen/improve the condition
12. HISTORY: Relevant past medical history or previous treatments
13. DIAGNOSTIC_CONSIDERATIONS: Any mentioned or suggested diagnoses in the text

Be concise and use medical terminology where appropriate. If information for a section is 
not available, indicate "Not mentioned".
"""

    def _parse_json_response(self, text):
        """
        Parse JSON from LLM response.

        Tries, in order: the text after a ```json fence (up to the next fence),
        the content of an un-labelled fenced block, and the span from the first
        "{" to the last "}". Returns a ``parse_error`` dict carrying the raw text
        when nothing parses or ``text`` is not a string.
        """
        failure = {"parse_error": "Could not parse as JSON", "raw_text": text}
        if not isinstance(text, str):
            print("Warning: Could not parse as JSON")
            return failure

        candidates = []

        primary = text
        if "```json" in primary:
            primary = primary.split("```json")[1]
        if "```" in primary:
            primary = primary.split("```")[0]
        candidates.append(primary)

        # Un-labelled fence: the payload sits between the first pair of fences.
        fence_parts = text.split("```")
        if len(fence_parts) >= 3:
            candidates.append(fence_parts[1])

        # JSON object embedded in surrounding prose.
        first_brace, last_brace = text.find("{"), text.rfind("}")
        if 0 <= first_brace < last_brace:
            candidates.append(text[first_brace:last_brace + 1])

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        print("Warning: Could not parse as JSON")
        return failure

In [None]:
class EvidenceIntegrator:
    """Integrates visual and clinical evidence."""

    # Relative trust in image vs. clinical evidence per question type.
    _QUESTION_TYPE_WEIGHTS = {
        "Site Location": {"image": 0.8, "clinical": 0.2},
        "Lesion Color": {"image": 0.9, "clinical": 0.1},
        "Size": {"image": 0.8, "clinical": 0.2},
        "Skin Description": {"image": 0.7, "clinical": 0.3},
        "Duration of Symptoms": {"image": 0.3, "clinical": 0.7},
        "Itch": {"image": 0.4, "clinical": 0.6},
        "Extent": {"image": 0.7, "clinical": 0.3},
        "Treatment": {"image": 0.1, "clinical": 0.9},
        "Lesion Evolution": {"image": 0.3, "clinical": 0.7},
        "Texture": {"image": 0.6, "clinical": 0.4},
        "Specific Diagnosis": {"image": 0.5, "clinical": 0.5},
        "Count": {"image": 0.8, "clinical": 0.2},
        "Differential": {"image": 0.5, "clinical": 0.5},
    }
    # Used when the question type is not listed above.
    _DEFAULT_WEIGHTS = {"image": 0.5, "clinical": 0.5}

    def __init__(self, client):
        """Store the client whose ``models.generate_content`` method is used for requests."""
        self.client = client

    def integrate_evidence(self, image_analysis, clinical_context, question_type):
        """
        Integrate image analysis with clinical context.

        Args:
            image_analysis: Structured image analysis
            clinical_context: Structured clinical context
            question_type: Type of question being asked

        Returns:
            Dictionary with integrated evidence, or a dict with ``error`` and
            ``message`` keys when building the prompt or the request fails.
        """
        # Determine weighting based on question type
        weights = self._get_weights_for_question(question_type)

        try:
            # Prompt construction serialises the evidence to JSON, which can
            # fail on unexpected values; keep it inside the guarded section so
            # the caller always receives a dict.
            prompt = self._create_integration_prompt(
                image_analysis,
                clinical_context,
                question_type,
                weights
            )

            response = self.client.models.generate_content(
                model=Config.GEMINI_MODEL,
                contents=[prompt]
            )

            return self._parse_json_response(response.text)

        except Exception as e:
            print(f"Error integrating evidence: {str(e)}")
            return {
                "error": str(e),
                "message": "Failed to integrate evidence"
            }

    def _get_weights_for_question(self, question_type):
        """
        Determine evidence weighting based on question type.

        Returns:
            Dictionary with weights for each evidence type (a fresh copy, so
            callers may modify it freely). Unknown types get equal weights.
        """
        weights = self._QUESTION_TYPE_WEIGHTS.get(question_type, self._DEFAULT_WEIGHTS)
        return dict(weights)

    def _create_integration_prompt(self, image_analysis, clinical_context, question_type, weights):
        """
        Create prompt for evidence integration.

        ``None`` for either evidence argument is treated as an empty dict.
        """
        image_findings = (image_analysis or {}).get("aggregated_analysis", {})
        clinical_findings = (clinical_context or {}).get("structured_clinical_context", {})
        return f"""As a dermatology specialist, integrate the visual findings from images with the clinical history.

IMAGE ANALYSIS:
{json.dumps(image_findings, indent=2)}

CLINICAL CONTEXT:
{json.dumps(clinical_findings, indent=2)}

For this {question_type} question, image evidence has {weights['image']*100}% weight and clinical evidence has {weights['clinical']*100}% weight.

Pay special attention to potential contradictions between visual findings and clinical history. Even minor inconsistencies should be noted as contradictions. Look for cases where clinical context suggests features not visible in images or where visual findings seem to contradict patient-reported symptoms or history.

Organize your response in a JSON structure with the following elements:

1. INTEGRATED_FINDINGS: For each key dermatological feature, combine visual and clinical evidence
   - SIZE
   - SITE_LOCATION
   - SKIN_DESCRIPTION
   - LESION_COLOR
   - LESION_COUNT
   - EXTENT
   - TEXTURE
   - ONSET_DURATION
   - SYMPTOMS

2. CONCORDANCE_ASSESSMENT: For each feature, assess if visual and clinical evidence are:
   - CONCORDANT: Visual and clinical evidence agree
   - DISCORDANT: Visual and clinical evidence conflict (explain the conflict)
   - COMPLEMENTARY: Evidence sources provide different but non-conflicting information
   - MISSING_VISUAL: Clinical description present but not visible in images
   - MISSING_CLINICAL: Visible in images but not mentioned in clinical context

3. CONTRADICTIONS: List any specific contradictions between visual and clinical evidence
   - For each contradiction, explain what the conflict is and assess which source is more reliable

4. WEIGHTED_EVIDENCE_PROFILE: Synthesize the most reliable information for each category
   - Apply the provided weights to determine the most reliable facts for each feature
   - Explain where you've prioritized one source over another

5. CONFIDENCE_SCORES: Score the confidence (0.0-1.0) in the integrated evidence for each feature

Be specific, concise, and use medical terminology where appropriate.
"""

    def _parse_json_response(self, text):
        """
        Parse JSON from LLM response.

        Tries, in order: the text after a ```json fence (up to the next fence),
        the content of an un-labelled fenced block, and the span from the first
        "{" to the last "}". Returns a ``parse_error`` dict carrying the raw text
        when nothing parses or ``text`` is not a string.
        """
        failure = {"parse_error": "Could not parse as JSON", "raw_text": text}
        if not isinstance(text, str):
            print("Warning: Could not parse as JSON")
            return failure

        candidates = []

        primary = text
        if "```json" in primary:
            primary = primary.split("```json")[1]
        if "```" in primary:
            primary = primary.split("```")[0]
        candidates.append(primary)

        # Un-labelled fence: the payload sits between the first pair of fences.
        fence_parts = text.split("```")
        if len(fence_parts) >= 3:
            candidates.append(fence_parts[1])

        # JSON object embedded in surrounding prose.
        first_brace, last_brace = text.find("{"), text.rfind("}")
        if 0 <= first_brace < last_brace:
            candidates.append(text[first_brace:last_brace + 1])

        for candidate in candidates:
            try:
                return json.loads(candidate)
            except json.JSONDecodeError:
                continue

        print("Warning: Could not parse as JSON")
        return failure

In [None]:
class ReasoningEngine:
    """Produces the first-pass answer for a question by prompting the LLM.

    The engine builds a prompt from the question, its options, the integrated
    evidence and the upstream model predictions, sends it to the configured
    Gemini model and normalises the reply into a dict whose
    ``validated_answer`` only contains entries of ``options`` (or
    "Not mentioned").
    """

    # Question types for which a comma-separated list of answers is requested.
    MULTI_ANSWER_QUESTION_TYPES = ("Site Location", "Size", "Skin Description")

    def __init__(self, client):
        # client must expose ``models.generate_content(model=..., contents=[...])``.
        self.client = client

    def apply_initial_reasoning(self, question_text, question_type, options, integrated_evidence, model_predictions):
        """
        Apply initial reasoning to determine the most likely answer.

        Args:
            question_text: The question text
            question_type: The type of question
            options: Available answer options
            integrated_evidence: Integrated evidence from images and clinical context
            model_predictions: Mapping of model name to a dict holding 'model_prediction'

        Returns:
            The parsed LLM response (a dict) with an added 'validated_answer'
            key. On any failure a fallback dict answering "Not mentioned" with
            confidence 0.0 and an 'error' message is returned instead.
        """
        try:
            # Prompt construction is inside the try block so that unserialisable
            # evidence degrades to the same fallback as an API failure.
            prompt = self._create_reasoning_prompt(
                question_text,
                question_type,
                options,
                integrated_evidence,
                self._format_model_predictions(model_predictions),
                question_type in self.MULTI_ANSWER_QUESTION_TYPES
            )

            response = self.client.models.generate_content(
                model=Config.GEMINI_MODEL,
                contents=[prompt]
            )

            reasoning_result = self._parse_json_response(response.text)
            reasoning_result['validated_answer'] = self._validate_answer(
                reasoning_result.get('answer', ''), options
            )
            return reasoning_result

        except Exception as e:
            print(f"Error applying initial reasoning: {str(e)}")
            return {
                "reasoning": f"Error: {str(e)}",
                "answer": "Not mentioned",
                "validated_answer": "Not mentioned",
                "confidence": 0.0,
                "error": str(e)
            }

    def _format_model_predictions(self, model_predictions):
        """Render model predictions as one "- name: prediction" line per model.

        A NaN float prediction is shown as "No prediction". ``None`` or an
        empty mapping yields an empty string.
        """
        lines = []
        for model_name, predictions in (model_predictions or {}).items():
            combined_pred = predictions.get('model_prediction', '')
            if isinstance(combined_pred, float) and pd.isna(combined_pred):
                combined_pred = "No prediction"
            lines.append(f"- {model_name}: {combined_pred}\n")
        return "".join(lines)

    def _create_reasoning_prompt(self, question_text, question_type, options, integrated_evidence, model_prediction_text, multiple_answers_allowed):
        """Create the prompt for the reasoning layer.

        Extra guidance is inserted for "Size" questions offering the three
        thumb-nail/palm/larger-area options and for "Lesion Color" questions
        offering a "combination" option. The task description differs
        depending on whether several answers may be selected.
        """
        options_text = ", ".join(options)
        specialized_guidance = ""

        size_markers = ["size of thumb nail", "size of palm", "larger area"]
        if question_type == "Size" and all(marker in options_text for marker in size_markers):
            specialized_guidance = """
SPECIALIZED GUIDANCE FOR SIZE ASSESSMENT:
When answering this size-related question, interpret the options as follows:
- "size of thumb nail": Individual lesions or affected areas approximately 1-2 cm in diameter
- "size of palm": Affected areas larger than the size of a thumb nail and roughly the size of a palm (approximately 1% of body surface area), which may include multiple smaller lesions across a region
- "larger area": Widespread involvement significantly larger than a palm, affecting a substantial portion(s) of the body

IMPORTANT: For cases with multiple small lesions that are visible in the images, but without extensive widespread involvement across large body regions, "size of palm" is likely the most appropriate answer.
"""
        elif question_type == "Lesion Color" and "combination" in options_text:
            specialized_guidance = """
SPECIALIZED GUIDANCE FOR LESION COLOR:
When answering color-related questions, pay careful attention to whether there are multiple distinct colors present across the affected areas. "Combination" would be appropriate when different lesions display different colors (e.g., some lesions appear red while others appear white), or when individual lesions show mixed or varied coloration patterns.
"""

        if multiple_answers_allowed:
            task_description = """
Based on all the evidence above, determine the most accurate answer(s) to the question. Your task is to:
1. Analyze the integrated evidence
2. Consider the model predictions, noting any consensus or disagreement, but maintain your critical judgment
3. Provide a detailed reasoning for your conclusion
4. Select the final answer(s) from the available options
5. Provide a confidence score from 0.0 to 1.0 for your answer. Be conservative in your confidence assessment. Consider all possible sources of uncertainty, including image quality limitations, interpretation ambiguity, and potential contradictions. Confidence scores should rarely exceed 0.8 unless evidence is absolutely conclusive and unambiguous.

If selecting multiple answers is appropriate, provide them in a comma-separated list. If no answer can be determined, select "Not mentioned".
"""
        else:
            task_description = """
Based on all the evidence above, determine the SINGLE most accurate answer to the question. Your task is to:
1. Analyze the integrated evidence
2. Consider the model predictions, noting any consensus or disagreement, but maintain your critical judgment
3. Provide a detailed reasoning for your conclusion
4. Select ONLY ONE answer option that is most accurate
5. Provide a confidence score from 0.0 to 1.0 for your answer. Be conservative in your confidence assessment. Consider all possible sources of uncertainty, including image quality limitations, interpretation ambiguity, and potential contradictions. Confidence scores should rarely exceed 0.8 unless evidence is absolutely conclusive and unambiguous.

For this question type, you must select ONLY ONE option as your answer. If no answer can be determined, select "Not mentioned".
"""

        return f"""You are a medical expert analyzing dermatological findings. Use the provided evidence to determine the most accurate answer(s) for the following question:

QUESTION: {question_text}
QUESTION TYPE: {question_type}
OPTIONS: {options_text}

INTEGRATED EVIDENCE:
{json.dumps(integrated_evidence, indent=2)}

MODEL PREDICTIONS:
{model_prediction_text}

{specialized_guidance}

IMPORTANT: While multiple model predictions are provided, be aware that these predictions can be inaccurate or inconsistent. Do not assume majority agreement equals correctness. Evaluate the evidence critically and independently from these predictions. Your job is to determine the correct answer based primarily on the integrated evidence, treating model predictions as secondary suggestions that may contain errors.

{task_description}

Format your response as a JSON object with these fields:
1. "reasoning": Your step-by-step reasoning process
2. "answer": Your final answer(s) as a single string or comma-separated list of options
3. "confidence": A score from 0.0 to 1.0 representing your confidence level in this answer
4. "evidence_used": The key evidence that supports your answer
5. "uncertainty_factors": Any factors that reduce your confidence
6. "counterfactual": What evidence would make you choose a different answer

When providing your answer, strictly adhere to the available options and only select from them.
"""

    def _parse_json_response(self, text):
        """Extract the JSON object contained in an LLM response.

        Tried in order: the whole text, the content of each Markdown code
        fence (with or without a "json" tag, closed or unterminated), and the
        span from the first "{" to the last "}".

        Returns:
            The first candidate that parses to a dict. When none does
            (including valid JSON that is not an object, since callers use
            ``.get`` on the result) a dict with the keys "parse_error" and
            "raw_text" is returned.
        """
        raw_text = text
        if not isinstance(text, str):
            # Guard against an empty/None response instead of raising TypeError.
            text = "" if text is None else str(text)

        candidates = [text]
        candidates.extend(
            match.group(1)
            for match in re.finditer(r"```(?:json)?(.*?)(?:```|\Z)", text, re.DOTALL | re.IGNORECASE)
        )
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        print("Warning: Could not parse as JSON")
        return {"parse_error": "Could not parse as JSON", "raw_text": raw_text}

    def _validate_answer(self, answer, options):
        """Map a free-form answer onto the canonical spelling of ``options``.

        Args:
            answer: A string (single option or comma-separated options) or a
                list/tuple of option strings, matched case-insensitively and
                ignoring surrounding whitespace.
            options: The allowed answer options.

        Returns:
            The matching options joined with ", " (order of appearance,
            duplicates removed), or "Not mentioned" when nothing matches.
        """
        if not answer:
            return "Not mentioned"

        # Lower-cased option -> original spelling; first occurrence wins.
        canonical = {}
        for option in options:
            canonical.setdefault(str(option).strip().lower(), option)

        if isinstance(answer, (list, tuple)):
            # The model sometimes returns a JSON array instead of a string.
            candidates = [str(item) for item in answer]
        else:
            answer_text = str(answer)
            whole = answer_text.strip().lower()
            if whole in canonical:
                # Full match first so an option that itself contains a comma survives.
                return canonical[whole]
            candidates = answer_text.split(',')

        valid_answers = []
        for candidate in candidates:
            match = canonical.get(candidate.strip().lower())
            if match is not None and match not in valid_answers:
                valid_answers.append(match)

        if not valid_answers:
            return "Not mentioned"
        return ", ".join(valid_answers)

In [None]:
class SelfReflectionEngine:
    """Asks the LLM to critique an initial reasoning result.

    The reply is parsed into a dict; when it carries a 'revised_answer' that
    answer is additionally normalised against the allowed options and stored
    under 'validated_revised_answer'.
    """

    def __init__(self, client):
        # client must expose ``models.generate_content(model=..., contents=[...])``.
        self.client = client

    def apply_reflection(self, question_text, question_type, options, integrated_evidence, reasoning_result):
        """
        Apply self-reflection to the initial reasoning result.

        Args:
            question_text: The question text
            question_type: The type of question
            options: Available answer options
            integrated_evidence: Integrated evidence
            reasoning_result: Initial reasoning result

        Returns:
            The parsed reflection dict. On any failure a fallback dict with
            'requires_revision' set to False, the initial confidence and an
            'error' message is returned.
        """
        try:
            # Prompt construction is inside the try block so that unserialisable
            # inputs degrade to the same fallback as an API failure.
            prompt = self._create_reflection_prompt(
                question_text,
                question_type,
                options,
                integrated_evidence,
                reasoning_result
            )

            response = self.client.models.generate_content(
                model=Config.GEMINI_MODEL,
                contents=[prompt]
            )

            reflection_result = self._parse_json_response(response.text)

            if 'revised_answer' in reflection_result:
                reflection_result['validated_revised_answer'] = self._validate_answer(
                    reflection_result.get('revised_answer', ''), options
                )

            return reflection_result

        except Exception as e:
            print(f"Error applying reflection: {str(e)}")
            return {
                "reflection": f"Error: {str(e)}",
                "requires_revision": False,
                "confidence": reasoning_result.get('confidence', 0.0),
                "error": str(e)
            }

    def _create_reflection_prompt(self, question_text, question_type, options, integrated_evidence, reasoning_result):
        """Create the prompt for the self-reflection layer."""
        options_text = ", ".join(options)
        return f"""You are a medical expert critically reviewing your own reasoning about a dermatological question. 
Carefully examine the initial reasoning and check for errors, biases, and inconsistencies:

QUESTION: {question_text}
QUESTION TYPE: {question_type}
OPTIONS: {options_text}

INTEGRATED EVIDENCE:
{json.dumps(integrated_evidence, indent=2)}

INITIAL REASONING:
{json.dumps(reasoning_result, indent=2)}

Your task is to:
1. Critically examine the initial reasoning for errors, biases, or incomplete analysis
2. Identify any evidence that was overlooked or misinterpreted
3. Evaluate whether the confidence level was appropriate
4. Determine if a different answer would be more accurate
5. Check if the evidence truly supports the chosen answer

Format your response as a JSON object with these fields:
1. "reflection": Your critical review of the initial reasoning
2. "overlooked_evidence": Any important evidence that was missed or undervalued
3. "misinterpreted_evidence": Any evidence that was incorrectly interpreted
4. "reasoning_gaps": Logical gaps or assumptions in the initial reasoning
5. "confidence_assessment": Was the confidence level appropriate? Why or why not?
6. "requires_revision": Boolean indicating if the answer needs to be revised (true/false)
7. "revised_answer": If revision is needed, the corrected answer
8. "revised_confidence": If revision is needed, the corrected confidence level (0.0-1.0)
9. "revision_explanation": If revision is needed, the explanation for the change

Be particularly careful to identify:
- Cherry-picking: Did the initial reasoning focus only on evidence supporting its conclusion?
- Overconfidence: Was the confidence level too high given the available evidence?
- Alternative explanations: Are there valid alternative interpretations of the evidence?
- Implicit assumptions: Were there unstated assumptions in the reasoning process?

Be honest and thorough in your self-reflection, even if it means acknowledging errors in the initial reasoning.
"""

    def _parse_json_response(self, text):
        """Extract the JSON object contained in an LLM response.

        Tried in order: the whole text, the content of each Markdown code
        fence (with or without a "json" tag, closed or unterminated), and the
        span from the first "{" to the last "}".

        Returns:
            The first candidate that parses to a dict. When none does
            (including valid JSON that is not an object, since callers use
            ``.get`` on the result) a dict with the keys "parse_error" and
            "raw_text" is returned.
        """
        raw_text = text
        if not isinstance(text, str):
            # Guard against an empty/None response instead of raising TypeError.
            text = "" if text is None else str(text)

        candidates = [text]
        candidates.extend(
            match.group(1)
            for match in re.finditer(r"```(?:json)?(.*?)(?:```|\Z)", text, re.DOTALL | re.IGNORECASE)
        )
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        print("Warning: Could not parse as JSON")
        return {"parse_error": "Could not parse as JSON", "raw_text": raw_text}

    def _validate_answer(self, answer, options):
        """Map a free-form answer onto the canonical spelling of ``options``.

        Args:
            answer: A string (single option or comma-separated options) or a
                list/tuple of option strings, matched case-insensitively and
                ignoring surrounding whitespace.
            options: The allowed answer options.

        Returns:
            The matching options joined with ", " (order of appearance,
            duplicates removed), or "Not mentioned" when nothing matches.
        """
        if not answer:
            return "Not mentioned"

        # Lower-cased option -> original spelling; first occurrence wins.
        canonical = {}
        for option in options:
            canonical.setdefault(str(option).strip().lower(), option)

        if isinstance(answer, (list, tuple)):
            # The model sometimes returns a JSON array instead of a string.
            candidates = [str(item) for item in answer]
        else:
            answer_text = str(answer)
            whole = answer_text.strip().lower()
            if whole in canonical:
                # Full match first so an option that itself contains a comma survives.
                return canonical[whole]
            candidates = answer_text.split(',')

        valid_answers = []
        for candidate in candidates:
            match = canonical.get(candidate.strip().lower())
            if match is not None and match not in valid_answers:
                valid_answers.append(match)

        if not valid_answers:
            return "Not mentioned"
        return ", ".join(valid_answers)

In [None]:
class ReAnalysisEngine:
    """Runs a second, deeper LLM analysis after reflection requested a revision.

    The reply is parsed into a dict and its 'final_answer' is normalised
    against the allowed options into 'validated_final_answer'.
    """

    def __init__(self, client):
        # client must expose ``models.generate_content(model=..., contents=[...])``.
        self.client = client

    def deep_analysis(self, question_text, question_type, options, integrated_evidence, reasoning_result, reflection_result):
        """
        Perform a deeper analysis based on reflection results.

        Args:
            question_text: The question text
            question_type: The type of question
            options: Available answer options
            integrated_evidence: Integrated evidence
            reasoning_result: Initial reasoning result
            reflection_result: Self-reflection result

        Returns:
            The parsed deep-analysis dict with an added
            'validated_final_answer'. If the call raises, or the reply cannot
            be parsed, the initial validated answer and confidence from
            ``reasoning_result`` are carried over instead.
        """
        try:
            # Prompt construction is inside the try block so that unserialisable
            # inputs degrade to the same fallback as an API failure.
            prompt = self._create_deep_analysis_prompt(
                question_text,
                question_type,
                options,
                integrated_evidence,
                reasoning_result,
                reflection_result
            )

            response = self.client.models.generate_content(
                model=Config.GEMINI_MODEL,
                contents=[prompt]
            )

            deep_analysis = self._parse_json_response(response.text)

            if 'parse_error' in deep_analysis:
                # An unreadable reply carries no information; keep the initial
                # answer (same policy as the exception path below) rather than
                # silently replacing it with "Not mentioned".
                initial_answer = reasoning_result.get('validated_answer', 'Not mentioned')
                deep_analysis['final_answer'] = initial_answer
                deep_analysis['validated_final_answer'] = initial_answer
                deep_analysis['final_confidence'] = reasoning_result.get('confidence', 0.0)
                return deep_analysis

            deep_analysis['validated_final_answer'] = self._validate_answer(
                deep_analysis.get('final_answer', ''), options
            )
            return deep_analysis

        except Exception as e:
            print(f"Error performing deep analysis: {str(e)}")
            return {
                "deep_reasoning": f"Error: {str(e)}",
                "final_answer": reasoning_result.get('validated_answer', 'Not mentioned'),
                "validated_final_answer": reasoning_result.get('validated_answer', 'Not mentioned'),
                "final_confidence": reasoning_result.get('confidence', 0.0),
                "error": str(e)
            }

    def _create_deep_analysis_prompt(self, question_text, question_type, options, integrated_evidence, reasoning_result, reflection_result):
        """Create the prompt for deep analysis.

        Besides the full JSON dumps, the three reflection findings
        (overlooked evidence, misinterpreted evidence, reasoning gaps) are
        repeated explicitly, defaulting to 'None identified'.
        """
        options_text = ", ".join(options)
        overlooked = reflection_result.get('overlooked_evidence', 'None identified')
        misinterpreted = reflection_result.get('misinterpreted_evidence', 'None identified')
        gaps = reflection_result.get('reasoning_gaps', 'None identified')
        return f"""You are a medical expert performing a deep analysis for a dermatological question after identifying issues with initial reasoning.
Review all evidence and reasoning paths comprehensively:

QUESTION: {question_text}
QUESTION TYPE: {question_type}
OPTIONS: {options_text}

INTEGRATED EVIDENCE:
{json.dumps(integrated_evidence, indent=2)}

INITIAL REASONING:
{json.dumps(reasoning_result, indent=2)}

REFLECTION:
{json.dumps(reflection_result, indent=2)}

Your task is to:
1. Re-examine ALL available evidence with fresh eyes
2. Address the specific issues highlighted in the reflection
3. Consider each answer option systematically
4. Weigh evidence for and against each potential answer
5. Determine the most accurate answer based on comprehensive analysis

For issues identified in reflection:
- Overlooked evidence: {overlooked}
- Misinterpreted evidence: {misinterpreted}
- Reasoning gaps: {gaps}

Format your response as a JSON object with these fields:
1. "deep_reasoning": Your comprehensive analysis considering all evidence and perspectives
2. "systematic_assessment": Assessment of evidence for EACH possible answer option
3. "final_answer": Your conclusion after deep analysis
4. "final_confidence": Your confidence level after deep analysis (0.0-1.0)
5. "key_determinants": The most important factors that determined your final answer
6. "remaining_uncertainties": Any unresolved questions or limitations

Be thorough, balanced, and precise in your analysis. Consider the evidence holistically and avoid the pitfalls identified in the reflection phase.
"""

    def _parse_json_response(self, text):
        """Extract the JSON object contained in an LLM response.

        Tried in order: the whole text, the content of each Markdown code
        fence (with or without a "json" tag, closed or unterminated), and the
        span from the first "{" to the last "}".

        Returns:
            The first candidate that parses to a dict. When none does
            (including valid JSON that is not an object, since callers use
            ``.get`` on the result) a dict with the keys "parse_error" and
            "raw_text" is returned.
        """
        raw_text = text
        if not isinstance(text, str):
            # Guard against an empty/None response instead of raising TypeError.
            text = "" if text is None else str(text)

        candidates = [text]
        candidates.extend(
            match.group(1)
            for match in re.finditer(r"```(?:json)?(.*?)(?:```|\Z)", text, re.DOTALL | re.IGNORECASE)
        )
        start, end = text.find("{"), text.rfind("}")
        if 0 <= start < end:
            candidates.append(text[start:end + 1])

        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if isinstance(parsed, dict):
                return parsed

        print("Warning: Could not parse as JSON")
        return {"parse_error": "Could not parse as JSON", "raw_text": raw_text}

    def _validate_answer(self, answer, options):
        """Map a free-form answer onto the canonical spelling of ``options``.

        Args:
            answer: A string (single option or comma-separated options) or a
                list/tuple of option strings, matched case-insensitively and
                ignoring surrounding whitespace.
            options: The allowed answer options.

        Returns:
            The matching options joined with ", " (order of appearance,
            duplicates removed), or "Not mentioned" when nothing matches.
        """
        if not answer:
            return "Not mentioned"

        # Lower-cased option -> original spelling; first occurrence wins.
        canonical = {}
        for option in options:
            canonical.setdefault(str(option).strip().lower(), option)

        if isinstance(answer, (list, tuple)):
            # The model sometimes returns a JSON array instead of a string.
            candidates = [str(item) for item in answer]
        else:
            answer_text = str(answer)
            whole = answer_text.strip().lower()
            if whole in canonical:
                # Full match first so an option that itself contains a comma survives.
                return canonical[whole]
            candidates = answer_text.split(',')

        valid_answers = []
        for candidate in candidates:
            match = canonical.get(candidate.strip().lower())
            if match is not None and match not in valid_answers:
                valid_answers.append(match)

        if not valid_answers:
            return "Not mentioned"
        return ", ".join(valid_answers)

In [None]:
class AgenticDermatologyPipeline:
    """Main pipeline for agentic dermatology analysis.

    For each question of an encounter the pipeline integrates image and
    clinical evidence, obtains an initial answer, optionally runs
    self-reflection and deep re-analysis when confidence is low, and finally
    formats the answers as option indices per question-ID variant.
    """

    # Question-ID variants expected in the evaluation output. The part before
    # the dash is the base question ID used as key in the encounter results.
    EVALUATION_QIDS = (
        "CQID010-001",
        "CQID011-001", "CQID011-002", "CQID011-003", "CQID011-004", "CQID011-005", "CQID011-006",
        "CQID012-001", "CQID012-002", "CQID012-003", "CQID012-004", "CQID012-005", "CQID012-006",
        "CQID015-001",
        "CQID020-001", "CQID020-002", "CQID020-003", "CQID020-004", "CQID020-005",
        "CQID020-006", "CQID020-007", "CQID020-008", "CQID020-009",
        "CQID025-001",
        "CQID034-001",
        "CQID035-001",
        "CQID036-001",
    )

    # Marker that precedes the question text inside 'query_context'.
    QUESTION_MARKER = "MAIN QUESTION TO ANSWER:"

    def __init__(self, api_key=None):
        """Create the Gemini client and all pipeline stages.

        Args:
            api_key: API key for the client. When None, the ``API_KEY``
                environment variable is read after loading a .env file.
        """
        if api_key is None:
            load_dotenv()
            api_key = os.getenv("API_KEY")

        self.client = genai.Client(api_key=api_key)

        self.image_analyzer = ImageAnalysisService(self.client)
        self.clinical_analyzer = ClinicalContextAnalyzer(self.client)
        self.evidence_integrator = EvidenceIntegrator(self.client)
        self.reasoning_engine = ReasoningEngine(self.client)
        self.reflection_engine = SelfReflectionEngine(self.client)
        self.reanalysis_engine = ReAnalysisEngine(self.client)

    @staticmethod
    def _coerce_confidence(value, default=0.0):
        """Convert an LLM-supplied confidence to float; unusable values give ``default``."""
        try:
            confidence = float(value)
        except (TypeError, ValueError):
            # None, lists, or strings such as "high" / "80%".
            return default
        if confidence != confidence:
            # NaN compares False with everything and would skip reflection.
            return default
        return confidence

    @staticmethod
    def _as_bool(value):
        """Interpret a JSON-ish flag; the strings "true"/"yes"/"1" count as True."""
        if isinstance(value, str):
            return value.strip().lower() in ("true", "yes", "1")
        return bool(value)

    @staticmethod
    def _extract_question_text(query_context):
        """Return the first line following the question marker, or "" if the marker is absent."""
        parts = str(query_context).split(AgenticDermatologyPipeline.QUESTION_MARKER)
        if len(parts) < 2:
            return ""
        return parts[1].split("\n")[0].strip()

    def process_single_encounter(self, agentic_data, encounter_id):
        """
        Process a single encounter with all its questions using the agentic pipeline.

        Image analysis and clinical-context extraction are computed once per
        encounter; evidence integration, reasoning, and (when the initial
        confidence is below ``Config.CONFIDENCE_THRESHOLD``) reflection and
        deep analysis are run per question. The results are also written to
        ``agentic_reasoning_results_<encounter_id>.json`` in ``Config.OUTPUT_DIR``.

        Args:
            agentic_data: AgenticRAGData instance containing all encounter data
            encounter_id: The specific encounter ID to process

        Returns:
            Dictionary ``{encounter_id: {base_qid: result}}``, or None when
            the encounter has no question data.
        """
        all_pairs = agentic_data.get_all_encounter_question_pairs()
        encounter_pairs = [pair for pair in all_pairs if pair[0] == encounter_id]

        if not encounter_pairs:
            print(f"No data found for encounter {encounter_id}")
            return None

        print(f"Processing {len(encounter_pairs)} questions for encounter {encounter_id}")

        encounter_results = {encounter_id: {}}

        # Encounter-level inputs are taken from the first question that has data.
        encounter_sample = None
        for _, candidate_qid in encounter_pairs:
            encounter_sample = agentic_data.get_combined_data(encounter_id, candidate_qid)
            if encounter_sample:
                break
        if not encounter_sample:
            print(f"No data found for encounter {encounter_id}")
            return None

        # Extract image analysis once per encounter
        print(f"Computing image analysis for {encounter_id}")
        image_analysis = self.image_analyzer.analyze_images(encounter_sample['images'], encounter_id)

        # Extract clinical context once per encounter
        print(f"Extracting clinical context for {encounter_id}")
        clinical_context = self.clinical_analyzer.extract_clinical_context(
            encounter_sample['query_context'],
            encounter_id
        )

        # Every pair already belongs to encounter_id, so only the question ID is unpacked.
        for i, (_, base_qid) in enumerate(encounter_pairs):
            print(f"Processing question {i+1}/{len(encounter_pairs)}: {base_qid}")

            sample_data = agentic_data.get_combined_data(encounter_id, base_qid)
            if not sample_data:
                print(f"Warning: No data found for {encounter_id}, {base_qid}")
                continue

            # Extract question details
            question_text = self._extract_question_text(sample_data['query_context'])
            if not question_text:
                print(f"Warning: Could not extract question text for {encounter_id}, {base_qid}")
            question_type = sample_data['question_type']
            options = sample_data['options']
            model_predictions = sample_data['model_predictions']

            # Integrate evidence (reuse previously extracted clinical context)
            print(f"Integrating evidence for {encounter_id}, {base_qid}")
            integrated_evidence = self.evidence_integrator.integrate_evidence(
                image_analysis,
                clinical_context,
                question_type
            )

            # Initial reasoning
            print(f"Initial reasoning for {encounter_id}, {base_qid}")
            reasoning_result = self.reasoning_engine.apply_initial_reasoning(
                question_text,
                question_type,
                options,
                integrated_evidence,
                model_predictions
            )

            # Determine if self-reflection is needed based on confidence
            confidence = self._coerce_confidence(reasoning_result.get('confidence', 0.0))

            final_result = reasoning_result
            reflection_path = []

            # Apply self-reflection if confidence is below threshold
            if confidence < Config.CONFIDENCE_THRESHOLD:
                print(f"Confidence {confidence} below threshold. Applying self-reflection.")

                reflection_result = self.reflection_engine.apply_reflection(
                    question_text,
                    question_type,
                    options,
                    integrated_evidence,
                    reasoning_result
                )
                reflection_path.append(reflection_result)

                # Determine if re-analysis is needed based on reflection
                if self._as_bool(reflection_result.get('requires_revision', False)):
                    print("Reflection indicates revision needed. Performing deep analysis.")

                    deep_analysis = self.reanalysis_engine.deep_analysis(
                        question_text,
                        question_type,
                        options,
                        integrated_evidence,
                        reasoning_result,
                        reflection_result
                    )
                    reflection_path.append(deep_analysis)

                    final_result = {
                        "reasoning": deep_analysis.get('deep_reasoning', ''),
                        "answer": deep_analysis.get('final_answer', 'Not mentioned'),
                        "validated_answer": deep_analysis.get('validated_final_answer', 'Not mentioned'),
                        "confidence": deep_analysis.get('final_confidence', 0.0)
                    }
                else:
                    # Use original answer but with updated confidence if available.
                    # A JSON null counts as "not available".
                    revised_confidence = reflection_result.get('revised_confidence')
                    if revised_confidence is None:
                        revised_confidence = reasoning_result.get('confidence', 0.0)
                    final_result = {
                        "reasoning": reasoning_result.get('reasoning', ''),
                        "answer": reasoning_result.get('answer', 'Not mentioned'),
                        "validated_answer": reasoning_result.get('validated_answer', 'Not mentioned'),
                        "confidence": revised_confidence
                    }

            encounter_results[encounter_id][base_qid] = {
                "query_context": sample_data['query_context'],
                "options": sample_data['options'],
                "model_predictions": sample_data['model_predictions'],
                "integrated_evidence": integrated_evidence,
                "reasoning_result": reasoning_result,
                "reflection_path": reflection_path,
                "final_result": final_result,
                "final_answer": final_result.get('validated_answer', 'Not mentioned')
            }

        os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
        output_file = os.path.join(Config.OUTPUT_DIR, f"agentic_reasoning_results_{encounter_id}.json")
        with open(output_file, "w") as f:
            json.dump(encounter_results, f, indent=2)

        print(f"Processed all {len(encounter_pairs)} questions for encounter {encounter_id}")
        return encounter_results

    def format_results_for_evaluation(self, encounter_results, output_file):
        """Format results for official evaluation.

        Each encounter becomes one dict with 'encounter_id' plus one option
        index per entry of ``EVALUATION_QIDS``. Encounters that lack any of
        the required base question IDs are skipped.

        Args:
            encounter_results: ``{encounter_id: {base_qid: result}}`` as
                returned by ``process_single_encounter``; None is treated as
                an empty mapping.
            output_file: Path the formatted list is written to as JSON.

        Returns:
            The list of formatted prediction entries.
        """
        qid_variants = {}
        for qid in self.EVALUATION_QIDS:
            qid_variants.setdefault(qid.split('-')[0], []).append(qid)

        required_base_qids = set(qid_variants)

        formatted_predictions = []
        # process_single_encounter returns None when an encounter has no data.
        for encounter_id, questions in (encounter_results or {}).items():
            if not required_base_qids.issubset(questions.keys()):
                print(f"Skipping encounter {encounter_id} - missing required questions")
                continue

            pred_entry = {'encounter_id': encounter_id}

            for base_qid, question_data in questions.items():
                if base_qid not in qid_variants:
                    continue

                options = question_data['options']
                self._process_answers(
                    pred_entry,
                    base_qid,
                    question_data['final_answer'],
                    options,
                    qid_variants,
                    self._find_not_mentioned_index(options)
                )

            formatted_predictions.append(pred_entry)

        with open(output_file, 'w') as f:
            json.dump(formatted_predictions, f, indent=2)

        print(f"Formatted predictions saved to {output_file} ({len(formatted_predictions)} complete encounters)")
        return formatted_predictions

    def _find_not_mentioned_index(self, options):
        """Return the index of 'Not mentioned' in options, or the last index if absent."""
        for i, opt in enumerate(options):
            if opt.lower() == "not mentioned":
                return i
        return len(options) - 1

    def _process_answers(self, pred_entry, base_qid, final_answer, options, qid_variants, not_mentioned_index):
        """Write option indices for every variant of ``base_qid`` into ``pred_entry``.

        The answers found in ``final_answer`` (a single option or a
        comma-separated list, matched case-insensitively) are assigned to the
        variants in order. Unmatched parts and left-over variants receive
        ``not_mentioned_index``; answers beyond the number of variants are
        dropped.
        """
        variants = qid_variants[base_qid]
        answer_text = "" if final_answer is None else str(final_answer)

        # Lower-cased option -> index; first occurrence wins.
        index_by_option = {}
        for i, opt in enumerate(options):
            index_by_option.setdefault(opt.strip().lower(), i)

        whole = answer_text.strip().lower()
        if whole in index_by_option:
            # Full match first so an option that itself contains a comma is not split.
            answer_indices = [index_by_option[whole]]
        elif ',' in answer_text:
            answer_indices = [
                index_by_option.get(part.strip().lower(), not_mentioned_index)
                for part in answer_text.split(',')
            ]
        else:
            answer_indices = [not_mentioned_index]

        for position, variant in enumerate(variants):
            if position < len(answer_indices):
                pred_entry[variant] = answer_indices[position]
            else:
                pred_entry[variant] = not_mentioned_index

In [None]:
# def run_all_encounters_pipeline():
#     """Run the agentic pipeline for all available encounters."""
#     # Load model predictions and validation dataset
#     model_predictions_dict = DataLoader.load_all_model_predictions(Config.MODEL_PREDICTIONS_DIR)
#     all_models_df = pd.concat(model_predictions_dict.values(), ignore_index=True)
#     validation_df = DataLoader.load_validation_dataset(Config.VAL_DATASET_PATH)
    
#     # Create agentic data and pipeline
#     agentic_data = AgenticRAGData(all_models_df, validation_df)
#     pipeline = AgenticDermatologyPipeline()
    
#     # Get all unique encounter IDs
#     all_pairs = agentic_data.get_all_encounter_question_pairs()
#     unique_encounter_ids = sorted(list(set(pair[0] for pair in all_pairs)))
#     print(f"Found {len(unique_encounter_ids)} unique encounters to process")
    
#     # Process each encounter
#     all_encounter_results = {}
#     for i, encounter_id in enumerate(unique_encounter_ids):
#         print(f"Processing encounter {i+1}/{len(unique_encounter_ids)}: {encounter_id}...")
#         encounter_results = pipeline.process_single_encounter(agentic_data, encounter_id)
#         if encounter_results:
#             all_encounter_results.update(encounter_results)
        
#         # Save intermediate results periodically
#         if (i+1) % 5 == 0 or (i+1) == len(unique_encounter_ids):
#             timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
#             intermediate_output_file = os.path.join(
#                 Config.OUTPUT_DIR, 
#                 f"intermediate_agentic_results_{i+1}_of_{len(unique_encounter_ids)}_{timestamp}.json"
#             )
#             with open(intermediate_output_file, 'w') as f:
#                 json.dump(all_encounter_results, f, indent=2)
#             print(f"Saved intermediate results after processing {i+1} encounters")
    
#     # Format and save final predictions
#     timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
#     output_file = os.path.join(
#         Config.OUTPUT_DIR, 
#         f"data_cvqa_sys_agentic_all_{timestamp}.json"
#     )
    
#     formatted_predictions = pipeline.format_results_for_evaluation(all_encounter_results, output_file)
    
#     print(f"Processed {len(formatted_predictions)} encounters successfully")
#     return formatted_predictions

# if __name__ == "__main__":
#     formatted_predictions = run_all_encounters_pipeline()
#     print(f"Total complete encounters processed: {len(formatted_predictions)}")

In [None]:
def run_single_encounter_pipeline(encounter_id):
    """Run the agentic pipeline for a single encounter.

    Loads the model predictions and the validation dataset, processes the
    given encounter and writes the evaluation-formatted predictions to a
    timestamped ``data_cvqa_sys_agentic_<encounter_id>_<timestamp>.json``
    file in ``Config.OUTPUT_DIR``.

    Args:
        encounter_id: ID of the encounter to process.

    Returns:
        The list of formatted prediction entries; an empty list when the
        encounter produced no results.
    """
    # Load model predictions and validation dataset
    model_predictions_dict = DataLoader.load_all_model_predictions(Config.MODEL_PREDICTIONS_DIR)
    all_models_df = pd.concat(model_predictions_dict.values(), ignore_index=True)
    validation_df = DataLoader.load_validation_dataset(Config.VAL_DATASET_PATH)

    # Create agentic data and pipeline
    agentic_data = AgenticRAGData(all_models_df, validation_df)
    pipeline = AgenticDermatologyPipeline()

    # Process the encounter
    encounter_results = pipeline.process_single_encounter(agentic_data, encounter_id)
    if not encounter_results:
        # process_single_encounter returns None when the encounter has no
        # data; formatting None would fail on ``.items()``.
        print(f"No results to format for encounter {encounter_id}")
        return []

    # Format and save predictions
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = os.path.join(
        Config.OUTPUT_DIR,
        f"data_cvqa_sys_agentic_{encounter_id}_{timestamp}.json"
    )
    return pipeline.format_results_for_evaluation(encounter_results, output_file)

In [None]:
# Entry point: run the pipeline for one hard-coded encounter and report how
# many prediction entries were produced. `encounter_id` and
# `formatted_predictions` stay defined at module level after this block runs.
if __name__ == "__main__":
    encounter_id = "ENC00854"
    formatted_predictions = run_single_encounter_pipeline(encounter_id)
    print(f"Processed encounter {encounter_id} with {len(formatted_predictions)} prediction entries")

In [None]:
# Display the formatted prediction entries assigned by the entry-point block.
formatted_predictions