Code for sampling errors from GPT-4 and Mistral outputs.

In [2]:
from evaluate_output import MetaAnalysisTaskEvaluator
import utils
from collections import defaultdict
from typing import Dict
import yaml
import json
import os
import random

ModuleNotFoundError: No module named 'yaml'

In [5]:
def categorize_outcome_type(output_file_path: str) -> (Dict, Dict):
    """
    Categorizes the errors in the outcome type task
    Categories:
        1. Model outputs in an undesirable format
        2. Model outputs binary when its continuous
        3. Model outputs continuous when its binary
        4. Model outputs unknown when the reference is known
    
    Args:
        output_file_path: Output file path of the outcome type task
    
    Returns:
        Dict: A dictionary containing the error categories and the record ids that fall into each category
    """
    
    def is_badly_formatted_output(output: str) -> bool:
        return output not in ["A", "B", "C"]
    
    evaluator = MetaAnalysisTaskEvaluator('outcome_type', output_file_path, 'metrics/outcome_type/', None)
    evaluator.run_evaluation()
    
    errors = defaultdict(list)
    # character_to_string_mapping = {"A": "binary", "B": "continuous", "C": "x"}
    for record in evaluator.data:
        
        # Clean the output in the same way from `evaluate_output.py`
        output = record['output'].replace("The answer is ", "").replace(".", "").replace("(", "").replace(")","")
        for char in output:
            if not char.isspace():
                output = char
                break
               
        # Check for badly formatted outputs
        if is_badly_formatted_output(output):
            errors["badly_formatted_output"].append(record['id'])
            
        # Check for unknown when reference is known
        elif output == "C" and record['outcome_type'] != "":
            errors["unknown_when_reference_known"].append(record['id'])
        
        # Check for binary when continuous
        elif output == "B" and record['outcome_type'] == "binary":
            errors["binary_when_continuous"].append(record['id'])
            
        # Check for continuous when binary
        elif output == "A" and record['outcome_type'] == "continuous":
            errors["continuous_when_binary"].append(record['id'])
            
    # Get the records that fall into each category
    error_records = {}
    for category, ids in errors.items():
        error_records[category] = []
        for record in evaluator.data:
            if record['id'] in ids:
                error_records[category].append(record)
            
    return errors, error_records

def categorize_outcomes(output_file_path: str, outcome_type: str) -> (Dict, Dict):
    """
    Categorizes the errors in the binary outcomes task
    Categories:
        1. Model outputs in an undesirable format
        2. Model has output but reference is unknown
        3. Model outputs unknown but reference is known
        4. Reference is and output is known but model outputs incorrect value
        
    Args:
        output_file_path: Output file path of the given task
        outcome_type: The outcome type of the task
        
    Returns:
        Dict: A dictionary containing the error categories and the record ids that fall into each category
    """
    evaluator = MetaAnalysisTaskEvaluator(outcome_type, output_file_path, 'metrics/' + outcome_type + '/', None)
    evaluator.run_evaluation()

    errors = defaultdict(list)
    
    # Define the reference keys and output keys based on the outcome type
    if outcome_type == 'binary_outcomes':
        reference_keys = ["intervention_events", "intervention_group_size", "comparator_events", "comparator_group_size"]
        output_keys = [f'{category}_output' for category in reference_keys]
    elif outcome_type == 'continuous_outcomes':
        reference_keys = ['intervention_mean', 'intervention_standard_deviation', 'intervention_group_size', 'comparator_mean', 'comparator_standard_deviation', 'comparator_group_size']
        output_keys = [f'{category}_output' for category in reference_keys]
    else:
        raise ValueError("Invalid outcome type")
    
    for record in evaluator.data:
        # Check for badly formatted outputs
        try:
            _ = yaml.safe_load(utils.clean_yaml_output(record['output']))
        except:
            errors["badly_formatted_output"].append(record['id'])
            continue
            
        # Check for output but reference is unknown
        for reference_key, output_key in zip(reference_keys, output_keys):
            if record[reference_key] == "x" and record[output_key] != "x":
                # Check if the record is already in the list
                if record['id'] not in errors["known_output_but_reference_unknown"]:
                    errors["known_output_but_reference_unknown"].append(record['id'])
                
            
            # Check for unknown output but reference is known
            if record[reference_key] != "x" and record[output_key] == "x":
                # Check if the record is already in the list
                if record['id'] not in errors["unknown_output_but_reference_known"]:
                    errors["unknown_output_but_reference_known"].append(record['id'])
                
                
            # Check for incorrect answer
            if record[reference_key] != "x" and record[output_key] != "x":
                if record[reference_key] != record[output_key]:
                    # Check if the record is already in the list
                    if record['id'] not in errors["incorrect_output"]:
                        errors["incorrect_output"].append(record['id'])
                        
    # Get the records that fall into each category
    error_records = {}
    for category, ids in errors.items():
        error_records[category] = []
        for record in evaluator.data:
            if record['id'] in ids:
                error_records[category].append(record)
                        
    return errors, error_records

In [6]:
output_files_by_model = {
    "biomistral": {
        "outcome_type": "biomistral_outcome_type_test_output.json",
        "binary_outcomes": "biomistral_binary_outcomes_test_output.json",
        "continuous_outcomes": "biomistral_continuous_outcomes_test_output.json"
    },
    "gemma": {
        "outcome_type": "gemma7B_outcome_type_test_output.json",
        "binary_outcomes": "gemma7B_binary_outcomes_test_output.json",
        "continuous_outcomes": "gemma7B_continuous_outcomes_test_output.json"
    },
    "gpt4": {
        "outcome_type": "gpt4_outcome_type_test_output.json",
        "binary_outcomes": "gpt4_binary_outcomes_test_output.json",
        "continuous_outcomes": "gpt4_continuous_outcomes_test_output.json"
    },
    "gpt35": {
        "outcome_type": "gpt35_outcome_type_test_output.json",
        "binary_outcomes": "gpt35_binary_outcomes_test_output.json",
        "continuous_outcomes": "gpt35_continuous_outcomes_test_output.json"
    },
    "mistral": {
        "outcome_type": "mistral7B_outcome_type_test_output.json",
        "binary_outcomes": "mistral7B_binary_outcomes_test_output.json",
        "continuous_outcomes": "mistral7B_continuous_outcomes_test_output.json"
    },
    "olmo": {
        "outcome_type": "olmo7B_outcome_type_test_output.json",
        "binary_outcomes": "olmo7B_binary_outcomes_test_output.json",
        "continuous_outcomes": "olmo7B_continuous_outcomes_test_output.json"
    },
    "pmc_llama": {
        "outcome_type": "pmc-llama_outcome_type_test_output.json",
        "binary_outcomes": "pmc-llama_binary_outcomes_test_output.json",
        "continuous_outcomes": "pmc-llama_continuous_outcomes_test_output.json"
    },
}

In [None]:
# Save the original standard output
original_stdout = sys.stdout 

# # Redirect standard output to a null device
sys.stdout = open(os.devnull, 'w')

outcome_type_errors_by_model = {}

for model, output_files in output_files_by_model.items():
    outcome_type_errors = {}
    _, outcome_type_errors["outcome_type"] = categorize_outcome_type(f"outputs/outcome_type/{output_files['outcome_type']}")
    _, outcome_type_errors["binary_outcomes"] = categorize_outcomes(f"outputs/binary_outcomes/{output_files['binary_outcomes']}", 'binary_outcomes')
    _, outcome_type_errors["continuous_outcomes"] = categorize_outcomes(f"outputs/continuous_outcomes/{output_files['continuous_outcomes']}", 'continuous_outcomes')
    outcome_type_errors_by_model[model] = outcome_type_errors
    
# Restore the original standard output
sys.stdout = original_stdout

In [8]:
def print_errors(errors: Dict, type: str):
    print(f"{type} Errors:")
    # print(json.dumps(errors, indent=4))
    for key, value in errors.items():
        print(f"{key}: {len(value)}")
    print()

for model, errors in outcome_type_errors_by_model.items():
    print(f"Model: {model}")
    print("-" * 50)
    print_errors(errors["outcome_type"], "Outcome Type")
    print_errors(errors["binary_outcomes"], "Binary Outcomes")
    print_errors(errors["continuous_outcomes"], "Continuous Outcomes")
    print("-" * 50)

Model: biomistral
--------------------------------------------------
Outcome Type Errors:
unknown_when_reference_known: 32
badly_formatted_output: 377
continuous_when_binary: 149
binary_when_continuous: 11

Binary Outcomes Errors:
unknown_output_but_reference_known: 165

Continuous Outcomes Errors:
unknown_output_but_reference_known: 465

--------------------------------------------------
Model: gemma
--------------------------------------------------
Outcome Type Errors:
binary_when_continuous: 13
continuous_when_binary: 207

Binary Outcomes Errors:
incorrect_output: 113
unknown_output_but_reference_known: 31
known_output_but_reference_unknown: 23
badly_formatted_output: 11

Continuous Outcomes Errors:
incorrect_output: 212
unknown_output_but_reference_known: 216
badly_formatted_output: 18
known_output_but_reference_unknown: 125

--------------------------------------------------
Model: gpt4
--------------------------------------------------
Outcome Type Errors:
unknown_when_reference

In [9]:
# Build a dictionary of the errors for each model
random.seed(10)

sampled_errors = {}
for model in ['gpt4', 'mistral']:
    outcome_errors = {}
    for error_type, errors in outcome_type_errors_by_model[model].items():
        for category, records in errors.items():
            sample_size = min(20, len(records))
            sampled_records = random.sample(records, sample_size)
            outcome_errors[error_type] = outcome_errors.get(error_type, {})
            outcome_errors[error_type][category] = sampled_records
    sampled_errors[model] = outcome_errors

In [10]:
# load all the errors from gpt4 and mistral to a json file
with open('sampled_errors.json', 'w') as f:
    json.dump(sampled_errors, f, indent=4)