In [11]:
import dspy
import json
import pandas as pd
import asyncio
import aiofiles
from pathlib import Path
from typing import List, Dict, Any, Tuple
import os
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor
import time

In [5]:
# Set your API key (uncomment and add your key)
# os.environ["OPENAI_API_KEY"] = "your-api-key-here"

# Configure DSPy with OpenAI GPT-4o-mini for cost efficiency
lm = dspy.LM('gemini/gemini-2.5-pro', max_tokens=20000, temperature=1.0)
dspy.configure(lm=lm)

print("Language model configured successfully")

Language model configured successfully


In [2]:
# Define paths
EXTRACTED_DATA_PATH = "/nlp/data/karthik9/Sprint1/Dental/Data/acute_pain_mds"
GROUND_TRUTH_PATH = "/nlp/data/karthik9/Sprint1/Dental/Data/jsons/dichotomous_outcomes.json"

In [15]:
def load_ground_truth_data(gt_path: str) -> Dict[str, List[Dict]]:
    """Load ground truth data and group by filename"""
    with open(gt_path, 'r') as f:
        gt_data = json.load(f)
    
    # Group by filename
    grouped_gt = defaultdict(list)
    for record in gt_data:
        filename = record.get('filename', '')
        grouped_gt[filename].append(record)
    
    print(f"📚 Loaded ground truth data:")
    print(f"   - Total records: {len(gt_data)}")
    print(f"   - Unique files: {len(grouped_gt)}")
    print(f"   - Sample filenames: {list(grouped_gt.keys())[:5]}")
    
    return dict(grouped_gt)

def load_extracted_results(extracted_path: str) -> Dict[str, List[Dict]]:
    """Load all DSPy extracted results from _do.json files"""
    extracted_data = {}
    
    for root, dirs, files in os.walk(extracted_path):
        for file in files:
            if file.endswith('_do.json'):
                # Extract filename without _do.json
                filename = file.replace('_do.json', '')
                file_path = os.path.join(root, file)
                
                try:
                    with open(file_path, 'r') as f:
                        data = json.load(f)
                        extracted_data[filename] = data if isinstance(data, list) else [data]
                except Exception as e:
                    print(f"⚠️ Error loading {file_path}: {e}")
    
    print(f"🔍 Loaded extracted data:")
    print(f"   - Total files: {len(extracted_data)}")
    print(f"   - Sample filenames: {list(extracted_data.keys())[:6]}")
    
    return extracted_data

# %%
# Load the data
print("Loading data...")
ground_truth = load_ground_truth_data(GROUND_TRUTH_PATH)
extracted_results = load_extracted_results(EXTRACTED_DATA_PATH)

# Find common files
common_files = set(ground_truth.keys()) & set(extracted_results.keys())
print(f"\n📊 Files in both datasets: {len(common_files)}")
print(f"Sample common files: {list(common_files)[:5]}")


Loading data...
📚 Loaded ground truth data:
   - Total records: 1691
   - Unique files: 81
   - Sample filenames: ['3848_Cooper', '3846_Cooper', '569_Seymour', '3155_Gay', '2412_Kiersch']
🔍 Loaded extracted data:
   - Total files: 6
   - Sample filenames: ['1102_Qi', '1741_Mehlisch', '1789_Matthews', '2275_Kyselovic', '2518_Kellstein', '3641_Daniels']

📊 Files in both datasets: 6
Sample common files: ['1741_Mehlisch', '2275_Kyselovic', '3641_Daniels', '1102_Qi', '1789_Matthews']


In [6]:

class CompareExtractionResults(dspy.Signature):
    """Compare DSPy extracted results with ground truth for a specific study file."""
    
    filename = dspy.InputField(desc="Name of the study file being analyzed")
    ground_truth_records = dspy.InputField(desc="Ground truth records for this study as JSON string")
    extracted_records = dspy.InputField(desc="DSPy extracted records for this study as JSON string")
    
    comparison_summary = dspy.OutputField(desc="Summary comparing extracted vs ground truth: matches, missing, extra records with counts")
    extra_records_analysis = dspy.OutputField(desc="Detailed analysis of extra records that shouldn't exist - patterns, likely causes")
    missing_records_analysis = dspy.OutputField(desc="Analysis of ground truth records that were missed during extraction")
    accuracy_metrics = dspy.OutputField(desc="Precision, recall, F1-score calculations and interpretation")

class IdentifyExtraRecordPatterns(dspy.Signature):
    """Identify patterns in extra records to understand systematic over-extraction issues."""
    
    extra_records_batch = dspy.InputField(desc="Batch of extra records from multiple files as JSON")
    signature_definitions = dspy.InputField(desc="Original DSPy signatures used for extraction")
    
    common_patterns = dspy.OutputField(desc="Common patterns found in extra records across files")
    signature_issues = dspy.OutputField(desc="Specific issues with the DSPy signatures that cause over-extraction")

class AnalyzeExtraRecord(dspy.Signature):
    """Analyze why a specific extra record was extracted incorrectly."""
    
    extra_record = dspy.InputField(desc="The specific extra record that was incorrectly extracted")
    ground_truth_context = dspy.InputField(desc="Ground truth records for context of what should exist")
    filename = dspy.InputField(desc="Study filename for context")
    
    root_cause = dspy.OutputField(desc="Why this specific record was extracted when it shouldn't exist")
    signature_weakness = dspy.OutputField(desc="Which part of DSPy signature logic led to this error")
    fix_recommendation = dspy.OutputField(desc="Specific change to prevent this type of over-extraction")

class GenerateSignatureImprovements(dspy.Signature):
    """Generate improved DSPy signatures based on analysis of extraction errors."""
    
    original_signatures = dspy.InputField(desc="Original DSPy signatures that have issues")
    error_analysis_summary = dspy.InputField(desc="Summary of all extraction errors and patterns found")
    common_over_extraction_patterns = dspy.InputField(desc="Common patterns in over-extracted records")
    
    improved_signatures = dspy.OutputField(desc="Improved versions of the DSPy signatures with better constraints")
    additional_validation_rules = dspy.OutputField(desc="Validation rules to add to modules to prevent over-extraction")
    few_shot_examples = dspy.OutputField(desc="Better few-shot examples to include negative cases")

class BatchAnalyzeExtraRecords(dspy.Signature):
    """Analyze a batch of extra records for efficiency."""
    
    extra_records_batch = dspy.InputField(desc="Batch of extra records to analyze")
    ground_truth_batch = dspy.InputField(desc="Corresponding ground truth contexts")
    filenames_batch = dspy.InputField(desc="Corresponding filenames")
    
    batch_analysis = dspy.OutputField(desc="Analysis results for the entire batch with root causes and patterns")


In [None]:
class AsyncFileComparisonModule(dspy.Module):
    """Async module to compare extraction results for a single file."""
    
    def __init__(self):
        super().__init__()
        self.compare = dspy.ChainOfThought(CompareExtractionResults)
        
    async def forward(self, filename: str, ground_truth: List[Dict], extracted: List[Dict]):
        """Async compare extraction results and return analysis."""
        
        # Run in thread pool to avoid blocking
        loop = asyncio.get_event_loop()
        
        def _compare():
            gt_json = json.dumps(ground_truth, indent=2)
            extracted_json = json.dumps(extracted, indent=2)
            
            result = self.compare(
                filename=filename,
                ground_truth_records=gt_json,
                extracted_records=extracted_json
            )
            return result
        
        result = await loop.run_in_executor(None, _compare)
        
        return dspy.Prediction(
            filename=filename,
            gt_count=len(ground_truth),
            extracted_count=len(extracted),
            comparison_summary=result.comparison_summary,
            extra_records_analysis=result.extra_records_analysis,
            missing_records_analysis=result.missing_records_analysis,
            accuracy_metrics=result.accuracy_metrics
        )

class AsyncExtraRecordAnalysisModule(dspy.Module):
    """Async module to analyze extra records and find patterns with batching."""
    
    def __init__(self, batch_size: int = 10):
        super().__init__()
        self.pattern_identifier = dspy.ChainOfThought(IdentifyExtraRecordPatterns)
        self.batch_analyzer = dspy.ChainOfThought(BatchAnalyzeExtraRecords)
        self.batch_size = batch_size
        
    async def forward(self, extra_records_batch: List[Dict], signatures: str, 
                      detailed_records: List[Tuple[Dict, List[Dict], str]] = None):
        """Async analyze extra records patterns and individual cases with batching."""
        
        loop = asyncio.get_event_loop()
        
        # Analyze patterns across all extra records
        async def _analyze_patterns():
            def _pattern_analysis():
                batch_json = json.dumps(extra_records_batch, indent=2)
                return self.pattern_identifier(
                    extra_records_batch=batch_json,
                    signature_definitions=signatures
                )
            return await loop.run_in_executor(None, _pattern_analysis)
        
        # Batch analyze individual records for efficiency
        async def _analyze_individual_batches():
            if not detailed_records:
                return []
            
            analyses = []
            
            # Process records in batches
            for i in range(0, len(detailed_records), self.batch_size):
                batch = detailed_records[i:i + self.batch_size]
                
                def _batch_analysis():
                    batch_records = [record for record, _, _ in batch]
                    batch_gt = [gt for _, gt, _ in batch]
                    batch_filenames = [filename for _, _, filename in batch]
                    
                    return self.batch_analyzer(
                        extra_records_batch=json.dumps(batch_records, indent=2),
                        ground_truth_batch=json.dumps(batch_gt, indent=2),
                        filenames_batch=json.dumps(batch_filenames, indent=2)
                    )
                
                batch_result = await loop.run_in_executor(None, _batch_analysis)
                analyses.append(batch_result.batch_analysis)
            
            return analyses
        
        # Run both analyses concurrently
        pattern_analysis, individual_analyses = await asyncio.gather(
            _analyze_patterns(),
            _analyze_individual_batches()
        )
        
        return dspy.Prediction(
            common_patterns=pattern_analysis.common_patterns,
            signature_issues=pattern_analysis.signature_issues,
            individual_analyses=individual_analyses
        )

class AsyncSignatureImprovementModule(dspy.Module):
    """Async module to generate improved signatures based on error analysis."""
    
    def __init__(self):
        super().__init__()
        self.improver = dspy.ChainOfThought(GenerateSignatureImprovements)
        
    async def forward(self, original_signatures: str, error_summary: str, patterns: str):
        """Async generate improved signatures and validation rules."""
        
        loop = asyncio.get_event_loop()
        
        def _improve():
            return self.improver(
                original_signatures=original_signatures,
                error_analysis_summary=error_summary,
                common_over_extraction_patterns=patterns
            )
        
        result = await loop.run_in_executor(None, _improve)
        
        return dspy.Prediction(
            improved_signatures=result.improved_signatures,
            additional_validation_rules=result.additional_validation_rules,
            few_shot_examples=result.few_shot_examples
        )

class AsyncComprehensiveAnalysisModule(dspy.Module):
    """Main async module orchestrating the complete analysis pipeline with concurrency."""
    
    def __init__(self, max_concurrent_files: int = 6, batch_size: int = 10):
        super().__init__()
        self.file_comparator = AsyncFileComparisonModule()
        self.extra_record_analyzer = AsyncExtraRecordAnalysisModule(batch_size)
        self.signature_improver = AsyncSignatureImprovementModule()
        self.max_concurrent = max_concurrent_files
        self.semaphore = asyncio.Semaphore(max_concurrent_files)
        
    def find_extra_records(self, gt_records: List[Dict], extracted_records: List[Dict]) -> List[Dict]:
        """Find records that exist in extracted but not in ground truth."""
        
        # Create signatures for ground truth records
        gt_signatures = set()
        for record in gt_records:
            signature = (
                record.get('First_Author', ''),
                record.get('Intervention_Description', ''),
                str(record.get('Outcome_Type', '')),
                record.get('Follow_Up_Time', ''),
                record.get('Adverse_Effect_Specify', '')
            )
            gt_signatures.add(signature)
        
        # Find extracted records not in ground truth
        extra_records = []
        for extracted in extracted_records:
            print(extracted)
            signature = (
                extracted.get('First_Author', ''),
                extracted.get('Intervention_Description', ''),
                str(extracted.get('Outcome_Type', '')),
                extracted.get('Follow_Up_Time', ''),
                extracted.get('Adverse_Effect_Specify', '')
            )
            
            if signature not in gt_signatures:
                extra_records.append(extracted)
        
        return extra_records
    
    async def analyze_single_file_with_semaphore(self, filename: str, 
                                                ground_truth_data: Dict[str, List[Dict]], 
                                                extracted_data: Dict[str, List[Dict]]):
        """Analyze a single file with concurrency control."""
        
        async with self.semaphore:
            gt_records = ground_truth_data[filename]
            extracted_records = extracted_data[filename]
            
            # Find extra records
            extra_records = self.find_extra_records(gt_records, extracted_records)
            
            # Run file comparison
            file_analysis = await self.file_comparator(filename, gt_records, extracted_records)
            
            return {
                'filename': filename,
                'analysis': file_analysis,
                'extra_records_count': len(extra_records),
                'extra_records': extra_records,
                'gt_records': gt_records
            }
        
    async def forward(self, ground_truth_data: Dict[str, List[Dict]], 
                      extracted_data: Dict[str, List[Dict]], 
                      original_signatures: str,
                      max_files: int = 20):
        """Run comprehensive async analysis on the datasets."""
        
        start_time = time.time()
        
        # Find common files
        common_files = list(set(ground_truth_data.keys()) & set(extracted_data.keys()))
        analysis_files = common_files[:max_files]
        
        print(f"🚀 Starting async analysis of {len(analysis_files)} files with max {self.max_concurrent} concurrent...")
        
        # Step 1: Analyze all files concurrently
        print("📋 Processing files concurrently...")
        
        file_tasks = [
            self.analyze_single_file_with_semaphore(filename, ground_truth_data, extracted_data)
            for filename in analysis_files
        ]
        
        file_analyses = await asyncio.gather(*file_tasks)
        
        # Collect all extra records and details
        all_extra_records = []
        extra_record_details = []
        
        for file_result in file_analyses:
            extra_records = file_result['extra_records']
            all_extra_records.extend(extra_records)
            
            # Collect details for individual analysis (limit per file)
            for extra_record in extra_records[:2]:
                extra_record_details.append((
                    extra_record, 
                    file_result['gt_records'], 
                    file_result['filename']
                ))
        
        file_processing_time = time.time() - start_time
        print(f"📊 Files processed in {file_processing_time:.2f}s. Found {len(all_extra_records)} total extra records")
        
        if not all_extra_records:
            print("✅ No extra records found! Your extraction is perfect.")
            return dspy.Prediction(
                file_analyses=file_analyses,
                extra_records_analysis=dspy.Prediction(common_patterns="No extra records found", signature_issues="No issues detected", individual_analyses=[]),
                signature_improvements=dspy.Prediction(improved_signatures="No improvements needed", additional_validation_rules="No additional rules needed", few_shot_examples="Current examples are sufficient"),
                summary_stats={'total_files': len(analysis_files), 'total_extra_records': 0, 'files_with_extras': 0},
                processing_time=file_processing_time
            )
        
        # Step 2 & 3: Run extra records analysis and signature improvements concurrently
        print("🔍 Analyzing patterns and generating improvements concurrently...")
        
        # Create error summary
        error_summary = f"""
        Files analyzed: {len(analysis_files)}
        Total extra records: {len(all_extra_records)}
        Files with extra records: {len([f for f in file_analyses if f['extra_records_count'] > 0])}
        Over-extraction rate: {(len(all_extra_records) / sum(f['analysis'].extracted_count for f in file_analyses) * 100):.1f}%
        Processing time: {file_processing_time:.2f}s
        """
        
        # Run extra record analysis and signature improvement concurrently
        extra_analysis_task = self.extra_record_analyzer(
            all_extra_records, 
            original_signatures,
            extra_record_details
        )
        
        # We'll run signature improvement after getting patterns
        extra_analysis = await extra_analysis_task
        
        # Now run signature improvement with the patterns
        improvements = await self.signature_improver(
            original_signatures,
            error_summary,
            extra_analysis.common_patterns
        )
        
        total_time = time.time() - start_time
        print(f"✅ Complete analysis finished in {total_time:.2f}s")
        
        return dspy.Prediction(
            file_analyses=file_analyses,
            extra_records_analysis=extra_analysis,
            signature_improvements=improvements,
            summary_stats={
                'total_files': len(analysis_files),
                'total_extra_records': len(all_extra_records),
                'files_with_extras': len([f for f in file_analyses if f['extra_records_count'] > 0]),
                'processing_time': total_time,
                'file_processing_time': file_processing_time
            }
        )

In [16]:
async def load_ground_truth_data_async(gt_path: str) -> Dict[str, List[Dict]]:
    """Async load ground truth data and group by filename"""
    
    async with aiofiles.open(gt_path, 'r') as f:
        content = await f.read()
        gt_data = json.loads(content)
    
    grouped_gt = defaultdict(list)
    for record in gt_data:
        filename = record.get('filename', '')
        grouped_gt[filename].append(record)
    
    print(f"📚 Loaded ground truth: {len(gt_data)} records across {len(grouped_gt)} files")
    return dict(grouped_gt)

async def load_single_extracted_file(file_path: str, filename: str) -> Tuple[str, List[Dict]]:
    """Load a single extracted results file asynchronously (handles wrapped metadata)."""
    try:
        async with aiofiles.open(file_path, 'r') as f:
            content = await f.read()
            data = json.loads(content)

            # Case 1: Already a list of records
            if isinstance(data, list):
                return filename, data

            # Case 2: Dict with "extracted_records" key
            if isinstance(data, dict) and "extracted_records" in data:
                return filename, data["extracted_records"]

            # Case 3: Single dict record (wrap it)
            if isinstance(data, dict):
                return filename, [data]

            # Fallback: empty
            return filename, []
    except Exception as e:
        print(f"⚠️ Error loading {file_path}: {e}")
        return filename, []


async def load_extracted_results_async(extracted_path: str, max_concurrent: int = 10) -> Dict[str, List[Dict]]:
    """Async load all DSPy extracted results from _do.json files"""
    
    # Find all files first
    file_tasks = []
    for root, dirs, files in os.walk(extracted_path):
        for file in files:
            if file.endswith('_do.json'):
                filename = file.replace('_do.json', '')
                file_path = os.path.join(root, file)
                file_tasks.append((file_path, filename))
    
    print(f"🔍 Found {len(file_tasks)} files to load...")
    
    # Load files concurrently with semaphore
    semaphore = asyncio.Semaphore(max_concurrent)
    
    async def load_with_semaphore(file_path: str, filename: str):
        async with semaphore:
            return await load_single_extracted_file(file_path, filename)
    
    # Load all files concurrently
    load_tasks = [load_with_semaphore(file_path, filename) for file_path, filename in file_tasks]
    results = await asyncio.gather(*load_tasks)
    
    # Convert to dictionary
    extracted_data = {filename: data for filename, data in results if data}
    
    print(f"✅ Loaded extracted data: {len(extracted_data)} files")
    return extracted_data

In [17]:
import asyncio


async def run_comprehensive_analysis():
    """Main async function to run the complete analysis"""
    
    print("🚀 Starting comprehensive async DSPy analysis...")
    start_time = time.time()
    
    # Load data concurrently
    print("📥 Loading datasets concurrently...")
    
    ground_truth_task = load_ground_truth_data_async(GROUND_TRUTH_PATH)
    extracted_results_task = load_extracted_results_async(EXTRACTED_DATA_PATH, max_concurrent=10)
    
    ground_truth, extracted_results = await asyncio.gather(
        ground_truth_task, 
        extracted_results_task
    )
    
    load_time = time.time() - start_time
    print(f"✅ Data loaded in {load_time:.2f}s")
    
    # Your original signatures
    ORIGINAL_SIGNATURES = '''
class ExtractStudyMetadata(dspy.Signature):
    """Extract basic study metadata from medical research paper markdown.
    
    This extracts core identifying information about the dental pain management study.
    """
    
    markdown_content: str = dspy.InputField(desc="Full markdown content of the medical research paper")

    first_author: str = dspy.OutputField(
        desc="Last name of the first author (e.g., 'Cooper'). Extract only the surname."
    )
    
    population_code: str = dspy.OutputField(
        desc="Numeric code representing the study population type. Codes: 1=simple tooth extraction, 2=surgical tooth extraction (third molar/wisdom teeth), 3=surgical tooth extraction (other teeth), 4=pulpitis or its complications. Can be multiple codes separated by commas (e.g., '2, 3')"
    )
    


class ExtractInterventions(dspy.Signature):
    """Extract intervention details from medical research paper markdown.
    
    This extracts information about pain management interventions used in dental studies.
    Focus on medication types, dosages, and participant counts.
    """
    
    markdown_content: str = dspy.InputField(desc="Full markdown content of the medical research paper")
    
    interventions_json: str = dspy.OutputField(
        desc="""JSON string containing list of interventions. Each intervention object must have:
        - intervention_code (integer): Numeric code 1-11 where:
          1=Ibuprofen 200-400mg + Acetaminophen 500-1000mg
          2=Oxycodone 5mg or Codeine 60mg  
          3=Acetaminophen 650mg + Oxycodone 10mg
          4=Ibuprofen 200mg + Hydrocodone 5mg
          5=Hydrocodone 5mg + Acetaminophen 300-325mg
          6=Ibuprofen 400mg (fast acting or acid)
          7=Tramadol 37.5mg + Acetaminophen 325mg
          8=Acetaminophen 500-1000mg
          9=Acetaminophen 600-650mg + Codeine 60mg
          10=Naproxen 400-440mg
          11=Placebo/NA (If its not mentioned as a placebo, then it is NA)
          #12=OTHER
        - intervention_description (string): Full description with medication name and exact dose (e.g., "Ibuprofen 400mg", "Naproxen sodium 440mg")
        - n_analyzed (integer): Number of participants analyzed for this intervention group
        
        Example: [{"intervention_code": 6, "intervention_description": "Ibuprofen 400mg", "n_analyzed": 40}]"""
    )


class ExtractAllOutcomes(dspy.Signature):
    """Extract ALL outcomes from medical research paper for systematic review.
    
    This implements COMPLETE DATA CAPTURE methodology - extract every data point
    including rescue analgesia, adverse events, and other outcomes at all time points.
    Focus on dichotomous outcomes from ALL data sources: main text, figures, tables,
    and supplementary materials. Include zero-event outcomes (0/N patients).
    """
    
    markdown_content: str = dspy.InputField(desc="Full markdown content of the medical research paper including supplementary materials")
    intervention_description: str = dspy.InputField(desc="Specific intervention to extract outcomes for (e.g., 'Ibuprofen 400mg', 'Placebo')")
    
    all_outcomes_json: str = dspy.OutputField(
        desc="""JSON string containing list of ALL outcomes for the specified intervention. Each outcome object must have:
        
        MANDATORY FIELDS FOR ALL OUTCOMES:
        - outcome_type (integer): Outcome type code where:
          1=Rescue analgesia at 6 hours
          2=Rescue analgesia at 4 hours  
          4=Rescue analgesia for pulpitis population
          5=Adverse effects (nausea, vomiting, drowsiness, dizziness, headache, etc.)
          6=Other outcomes (pain relief, time to onset, etc.)
        - follow_up_time (string): Exact time point when outcome was measured (e.g., "6 hours", "24hrs", "4 hours", "7 days")
        - n_analyzed (integer): Number of participants analyzed for this specific outcome
        - n_events_number (integer): Number of patients who experienced this outcome
        - n_events_percentage (float): Percentage of patients who experienced this outcome (e.g., 17.5, 0.6, 2.4, 0.0)
        
        CONDITIONAL FIELDS:
        - adverse_effect_specify (string): Specific adverse effect name if outcome_type=5 (e.g., "Drowsiness (sleepy, tired)", "Paraesthesia oral", "Vomiting"). Use "NA" if outcome_type≠5
        - other_outcome_specify (string): Detailed description if outcome_type=6 (e.g., "Time to meaningful pain relief", "Pain intensity difference"). Use "NA" if outcome_type≠6
        - adverse_effects_all_study (string): List of all adverse effects if not reported per study arm, or "NA" if reported per arm
        
        DOCUMENTATION FIELDS:
        - extraction_notes (string): Technical documentation including data source ("From Table 2", "From Figure 5", "From Supplementary Table 3"), extraction method ("Direct from table", "Interpreted from Kaplan-Meier curve"), and population used ("Per-protocol population", "Safety population", "ITT population")
        - comments (string): Study-specific information including single vs multiple dose design, surgical techniques mentioned, methodological features, dropout rates, calculation details
        
        EXTRACTION REQUIREMENTS:
        - Extract EVERY outcome reported, including zero-event outcomes (0/N)
        - Create separate entries for each time point assessment
        - Include outcomes from ALL data sources (main text, figures, supplements)
        - Use appropriate analysis populations (efficacy vs safety)
        - Document any calculations or interpretations performed
        
        Example: [
          {"outcome_type": 1, "follow_up_time": "6 hours", "n_analyzed": 40, "n_events_number": 15, "n_events_percentage": 37.5, "adverse_effect_specify": "NA", "other_outcome_specify": "NA", "adverse_effects_all_study": "NA", "extraction_notes": "From Table 3, per-protocol population", "comments": "single dose study with overnight monitoring"},
          {"outcome_type": 5, "follow_up_time": "24 hours", "n_analyzed": 40, "n_events_number": 7, "n_events_percentage": 17.5, "adverse_effect_specify": "Drowsiness (sleepy, tired)", "other_outcome_specify": "NA", "adverse_effects_all_study": "NA", "extraction_notes": "From safety table, safety population", "comments": "mild to moderate severity"}
        ]"""
    )


class StructureComprehensiveOutcome(dspy.Signature):
    """Structure extracted data into the final comprehensive dichotomous outcome format.
    
    This combines study metadata, intervention details, and any outcome data (rescue analgesia,
    adverse events, or other outcomes) into the standardized format used for systematic review
    and meta-analysis. Each record represents one outcome for one intervention in one study.
    """
    
    study_metadata_json: str = dspy.InputField(desc="Study metadata as JSON string with first_author,  population_code")
    intervention_json: str = dspy.InputField(desc="Single intervention details as JSON string with intervention_code, intervention_description, n_analyzed")
    outcome_json: str = dspy.InputField(desc="Single outcome details as JSON string with all outcome fields including outcome_type, follow_up_time, n_events_number, etc.")
    
    structured_record_json: str = dspy.OutputField(
        desc="""Complete structured record as JSON string with exactly these fields:
        - First_Author (string): First author last name (e.g., "Cooper")
        - Population (integer): Population code (1-4)
        - Intervention_Code (integer): Intervention code (1-11)
        - Intervention_Description (string): Full intervention description with dose
        - Outcome_Type (integer): Outcome type (1=rescue analgesia 6h, 2=rescue analgesia 4h, 4=rescue analgesia pulpitis, 5=adverse effects, 6=other)
        - Outcome_Other_Specify (string): Detailed outcome description for type 6, or empty string for other types
        - Follow_Up_Time (string): Time point (e.g., "24hrs", "6 hours")
        - N_Analyzed (integer): Number of participants analyzed
        - Adverse_Effect_Specify (string): Specific adverse effect name for type 5, or empty string for other types
        - Adverse_Effects_All_Study (string): All study adverse effects if not reported per arm, or empty string
        - N_Events_Number (integer): Number of patients with this outcome
        - N_Events_Percentage (float): Percentage of patients with this outcome
        - Comments (string): Study-specific methodology, design notes, and extraction details
        
        FIELD MAPPING RULES:
        - For outcome_type 1,2,4 (rescue analgesia): Adverse_Effect_Specify="" and Outcome_Other_Specify=""
        - For outcome_type 5 (adverse effects): Outcome_Other_Specify="" and Adverse_Effect_Specify=specific adverse event name
        - For outcome_type 6 (other outcomes): Adverse_Effect_Specify="" and Outcome_Other_Specify=detailed outcome description
        - Always include extraction methodology and data source information in Comments
        - Ensure mathematical validation: (N_Events_Number/N_Analyzed)*100 = N_Events_Percentage
        - Use appropriate analysis populations (efficacy vs safety) based on outcome type
        
        Example: {"First_Author": "Cooper",  "Population": 2, "Intervention_Code": 10, "Intervention_Description": "Naproxen sodium 440mg", "Outcome_Type": 5, "Outcome_Other_Specify": "", "Follow_Up_Time": "24hrs", "N_Analyzed": 166, "Adverse_Effect_Specify": "Paraesthesia oral", "Adverse_Effects_All_Study": "", "N_Events_Number": 1, "N_Events_Percentage": 0.6, "Comments": "extracted from supplementary table 3, safety population, single dose study"}"""
    )
'''
    
    # Initialize and run the async comprehensive analysis
    analyzer = AsyncComprehensiveAnalysisModule(
        max_concurrent_files=5,  # Process 5 files concurrently
        batch_size=10           # Batch size for extra record analysis
    )
    
    # Run analysis
    results = await analyzer(
        ground_truth_data=ground_truth,
        extracted_data=extracted_results, 
        original_signatures=ORIGINAL_SIGNATURES,
        max_files=10  # Adjust based on your needs
    )
    
    return results

# %%
# Run the async analysis
print("🎬 Starting async analysis...")

# For Jupyter notebooks, you might need to handle the event loop
try:
    # If running in Jupyter, use this approach
    import nest_asyncio
    nest_asyncio.apply()
    results = asyncio.run(run_comprehensive_analysis())
except RuntimeError:
    # If event loop is already running, use this
    results = await run_comprehensive_analysis()

🎬 Starting async analysis...
🚀 Starting comprehensive async DSPy analysis...
📥 Loading datasets concurrently...
🔍 Found 6 files to load...
📚 Loaded ground truth: 1691 records across 81 files
✅ Loaded extracted data: 6 files
✅ Data loaded in 0.35s
🚀 Starting async analysis of 6 files with max 5 concurrent...
📋 Processing files concurrently...
📊 Files processed in 102.08s. Found 382 total extra records
🔍 Analyzing patterns and generating improvements concurrently...
✅ Complete analysis finished in 254.47s


In [18]:
print("🎉 ASYNC COMPREHENSIVE ANALYSIS RESULTS")
print("=" * 55)

# Performance metrics
stats = results.summary_stats
print(f"⚡ Performance Metrics:")
print(f"   Total Processing Time: {stats['processing_time']:.2f}s")
print(f"   File Processing Time: {stats['file_processing_time']:.2f}s")
print(f"   Files Analyzed: {stats['total_files']}")

# Summary statistics
print(f"\n📊 Analysis Summary:")
print(f"   Total Extra Records Found: {stats['total_extra_records']}")
print(f"   Files with Extra Records: {stats['files_with_extras']}")
if stats['total_extra_records'] > 0:
    total_extracted = sum(f['analysis'].extracted_count for f in results.file_analyses)
    print(f"   Over-extraction Rate: {(stats['total_extra_records']/total_extracted)*100:.1f}%")

# File-by-file breakdown
print(f"\n📋 FILE-BY-FILE BREAKDOWN")
print("-" * 30)

for file_analysis in results.file_analyses:
    filename = file_analysis['filename']
    analysis = file_analysis['analysis']
    extra_count = file_analysis['extra_records_count']
    
    print(f"\n📄 {filename}:")
    print(f"   Ground Truth: {analysis.gt_count}")
    print(f"   Extracted: {analysis.extracted_count}")  
    print(f"   Extra Records: {extra_count}")
    
    # Show a condensed accuracy metric
    accuracy_summary = analysis.accuracy_metrics
    if len(accuracy_summary) > 100:
        accuracy_summary = accuracy_summary[:100] + "..."
    print(f"   Metrics: {accuracy_summary}")

# Extra records pattern analysis (if any extra records exist)
if stats['total_extra_records'] > 0:
    print(f"\n🔍 EXTRA RECORDS PATTERN ANALYSIS")
    print("-" * 40)
    print(f"Common Patterns:\n{results.extra_records_analysis.common_patterns}")
    print(f"\nSignature Issues:\n{results.extra_records_analysis.signature_issues}")

    # Individual extra record analyses (show first few)
    if results.extra_records_analysis.individual_analyses:
        print(f"\n🔬 INDIVIDUAL EXTRA RECORD ANALYSIS SAMPLES")
        print("-" * 45)
        
        for i, analysis_batch in enumerate(results.extra_records_analysis.individual_analyses[:2]):
            print(f"\nBatch {i+1} Analysis:")
            # Truncate if too long
            analysis_text = str(analysis_batch)
            if len(analysis_text) > 500:
                analysis_text = analysis_text[:500] + "...[truncated]"
            print(analysis_text)

    # Signature improvements
    print(f"\n💡 SIGNATURE IMPROVEMENT RECOMMENDATIONS")
    print("-" * 45)
    print(f"Improved Signatures:\n{results.signature_improvements.improved_signatures}")
    print(f"\nAdditional Validation Rules:\n{results.signature_improvements.additional_validation_rules}")
    print(f"\nBetter Few-Shot Examples:\n{results.signature_improvements.few_shot_examples}")
else:
    print(f"\n✅ PERFECT EXTRACTION!")
    print("No extra records found - your DSPy signatures are working correctly!")

# %% [markdown]
# ## Export Async Results

# %%
async def export_results_async(results, filename: str = "dental_dspy_async_analysis.json"):
    """Async export detailed results"""
    
    export_data = {
        'analysis_timestamp': pd.Timestamp.now().isoformat(),
        'performance_metrics': {
            'total_processing_time': results.summary_stats['processing_time'],
            'file_processing_time': results.summary_stats['file_processing_time'],
            'files_processed_concurrently': True,
            'max_concurrent_files': 5
        },
        'summary_statistics': results.summary_stats,
        'signature_improvements': {
            'improved_signatures': results.signature_improvements.improved_signatures,
            'validation_rules': results.signature_improvements.additional_validation_rules,
            'few_shot_examples': results.signature_improvements.few_shot_examples
        },
        'extra_records_patterns': {
            'common_patterns': results.extra_records_analysis.common_patterns,
            'signature_issues': results.extra_records_analysis.signature_issues
        },
        'individual_analyses': results.extra_records_analysis.individual_analyses,
        'file_analyses_summary': [
            {
                'filename': f['filename'],
                'gt_count': f['analysis'].gt_count,
                'extracted_count': f['analysis'].extracted_count,
                'extra_count': f['extra_records_count']
            }
            for f in results.file_analyses
        ]
    }
    
    # Export asynchronously
    async with aiofiles.open(filename, 'w') as f:
        await f.write(json.dumps(export_data, indent=2, default=str))
    
    print(f"💾 Results exported to: {filename}")

# Export results
await export_results_async(results)

# %% [markdown]
# ## Performance Analysis and Next Steps

# %%
print(f"\n⚡ PERFORMANCE ANALYSIS")
print("=" * 30)

stats = results.summary_stats
print(f"Files Processed: {stats['total_files']}")
print(f"Total Time: {stats['processing_time']:.2f}s")
print(f"Average Time per File: {stats['file_processing_time']/stats['total_files']:.2f}s")
print(f"Concurrent Processing: ✅ Enabled")

if stats['total_extra_records'] > 0:
    print(f"\n🎯 KEY FINDINGS:")
    print(f"   - {stats['total_extra_records']} extra records need investigation")
    print(f"   - {stats['files_with_extras']} files have over-extraction issues") 
    print(f"   - Focus on improving signature constraints")
    print(f"   - Consider adding validation rules")
    
    print(f"\n📋 RECOMMENDED NEXT STEPS:")
    print("1. Review the signature improvements above")
    print("2. Implement the suggested validation rules")
    print("3. Add negative examples to few-shot prompts")
    print("4. Test improved signatures on a subset of files")
    print("5. Re-run this async analysis to measure improvement")
else:
    print(f"\n🎉 PERFECT RESULTS!")
    print("Your DSPy signatures are working perfectly!")

print(f"\n✅ Async DSPy Module Analysis Complete!")
print("Use the exported JSON for detailed implementation guidance.")

🎉 ASYNC COMPREHENSIVE ANALYSIS RESULTS
⚡ Performance Metrics:
   Total Processing Time: 254.47s
   File Processing Time: 102.08s
   Files Analyzed: 6

📊 Analysis Summary:
   Total Extra Records Found: 382
   Files with Extra Records: 6
   Over-extraction Rate: 100.0%

📋 FILE-BY-FILE BREAKDOWN
------------------------------

📄 1741_Mehlisch:
   Ground Truth: 42
   Extracted: 112
   Extra Records: 112
   Metrics: - **True Positives (TP):** 42
- **False Positives (FP):** 64
- **False Negatives (FN):** 0

- **Prec...

📄 2275_Kyselovic:
   Ground Truth: 24
   Extracted: 30
   Extra Records: 30
   Metrics: - **True Positives (TP):** 24 (records that are in both ground truth and extracted)
- **False Positi...

📄 3641_Daniels:
   Ground Truth: 24
   Extracted: 43
   Extra Records: 43
   Metrics: - **True Positives (TP)**: 24 (records that are in both ground truth and extracted)
- **False Positi...

📄 1102_Qi:
   Ground Truth: 45
   Extracted: 98
   Extra Records: 98
   Metrics: - **True Positi