In [137]:
from evaluate_output import MetaAnalysisTaskEvaluator
import utils
from collections import defaultdict
from typing import Dict
import yaml

In [138]:
def categorize_outcome_type(evaluator: MetaAnalysisTaskEvaluator) -> Dict:
    """
    Categorizes the errors in the outcome type task
    Categories:
        1. Model outputs in an undesirable format
        2. Model outputs binary when its continuous
        3. Model outputs continuous when its binary
        4. Model outputs unknown when the reference is known
    
    Args:
        evaluator: MetaAnalysisTaskEvaluator object
    
    Returns:
        Dict: A dictionary containing the error categories and the record ids that fall into each category
    """
    
    def is_badly_formatted_output(output: str) -> bool:
        return output not in ["A", "B", "C"]
    
    errors = defaultdict(list)
    character_to_string_mapping = {"A": "binary", "B": "continuous", "C": "x"}
    for record in evaluator.data:
        
        # Clean the output in the same way from `evaluate_output.py`
        output = record['output'].replace("The answer is ", "").replace(".", "").replace("(", "").replace(")","")
        for char in output:
            if not char.isspace():
                output = char
                break
                
        # Check for badly formatted outputs
        if is_badly_formatted_output(output):
            errors["badly_formatted_output"].append(record['id'])
            
        # Check for unknown when reference is known
        elif output == "C" and record['outcome_type'] != "":
            errors["unknown_when_reference_known"].append(record['id'])
        
        # Check for binary when continuous
        elif output == "B" and record['outcome_type'] == "binary":
            errors["binary_when_continuous"].append(record['id'])
            
        # Check for continuous when binary
        elif output == "A" and record['outcome_type'] == "continuous":
            errors["continuous_when_binary"].append(record['id'])
            
    return errors

def categorize_outcomes(evaluator: MetaAnalysisTaskEvaluator, outcome_type: str) -> Dict:
    """
    Categorizes the errors in the binary outcomes task
    Categories:
        1. Model outputs in an undesirable format
        2. Model has output but reference is unknown
        3. Model outputs unknown but reference is known
        4. Reference is and output is known but model outputs incorrect value
        
    Args:
        evaluator: MetaAnalysisTaskEvaluator object
        outcome_type: The outcome type of the task
        
    Returns:
        Dict: A dictionary containing the error categories and the record ids that fall into each category
    """
    
    errors = defaultdict(list)
    
    # Define the reference keys and output keys based on the outcome type
    if outcome_type == 'binary':
        reference_keys = ["intervention_events", "intervention_group_size", "comparator_events", "comparator_group_size"]
        output_keys = [f'{category}_output' for category in reference_keys]
    elif outcome_type == 'continuous':
        reference_keys = ['intervention_mean', 'intervention_standard_deviation', 'intervention_group_size', 'comparator_mean', 'comparator_standard_deviation', 'comparator_group_size']
        output_keys = [f'{category}_output' for category in reference_keys]
    else:
        raise ValueError("Invalid outcome type")
    
    for record in evaluator.data:
        # Check for badly formatted outputs
        try:
            _ = yaml.safe_load(utils.clean_yaml_output(record['output']))
        except:
            errors["badly_formatted_output"].append(record['id'])
            continue
            
        # Check for output but reference is unknown
        for reference_key, output_key in zip(reference_keys, output_keys):
            if record[reference_key] == "x" and record[output_key] != "x":
                # Check if the record is already in the list
                if record['id'] not in errors["known_output_but_reference_unknown"]:
                    errors["known_output_but_reference_unknown"].append(record['id'])
                
            
            # Check for unknown output but reference is known
            if record[reference_key] != "x" and record[output_key] == "x":
                # Check if the record is already in the list
                if record['id'] not in errors["unknown_output_but_reference_known"]:
                    errors["unknown_output_but_reference_known"].append(record['id'])
                
                
            # Check for incorrect answer
            if record[reference_key] != "x" and record[output_key] != "x":
                if record[reference_key] != record[output_key]:
                    # Check if the record is already in the list
                    if record['id'] not in errors["incorrect_output"]:
                        errors["incorrect_output"].append(record['id'])
                        
    return errors

In [139]:
evaluator = MetaAnalysisTaskEvaluator('outcome_type', 'outputs/outcome_type/olmo7B_outcome_type_test_output_20240326-12:06:30.json', 'metrics/outcome_type/', None)
evaluator.run_evaluation()

Metrics for the task:
{
    "number_of_model_unknowns": {
        "outcome_type": 5,
        "total": 5
    },
    "number_of_reference_unknowns": {
        "outcome_type": 0,
        "total": 0
    },
    "exact_match_accuracy": {
        "outcome_type": 0.2896341463414634,
        "total": 0.2896341463414634
    },
    "partial_match_accuracy": {
        "partial_match_accuracy_1": 0.2896341463414634
    },
    "outcome_type_f_score": {
        "outcome_type": {
            "f1_score_binary": 0.4239401496259351,
            "f1_score_continuous": 0.0792079207920792,
            "f1_score_unknown": 0.0
        }
    }
}


In [140]:
outcome_type_errors = categorize_outcome_type(evaluator)
print(outcome_type_errors)

defaultdict(<class 'list'>, {'continuous_when_binary': [458, 32, 557, 570, 681, 522, 346, 644, 271, 413, 202, 21, 353, 471, 571, 585, 352, 610, 8, 437, 91, 351, 23, 136, 682, 511, 249, 550, 347, 306, 77, 329, 67, 594, 255, 89, 487, 650, 447, 279, 222, 394, 534, 459, 201, 167, 220, 24, 393, 646, 331, 210, 330, 22, 310, 395, 117, 156, 363, 158, 33, 150, 404, 149, 223, 324, 281, 365, 11, 410, 658, 289, 419, 113, 412, 359, 51, 212, 134, 486, 209, 30, 662, 562, 674, 442, 180, 552, 103, 309, 676, 531, 629, 326, 656, 408, 468, 298, 578, 300, 639, 100, 643, 515, 147, 350, 399, 284, 569, 215, 295, 208, 368, 560, 485, 581, 533, 465, 165, 470, 205, 211, 197, 372, 403, 170, 536, 521, 694, 390, 428, 217, 448, 218, 148, 483, 506, 26, 441, 193, 73, 677, 524, 71, 383, 456, 204, 241, 371, 285, 675, 291, 196, 420, 356, 240, 213, 671, 54, 247, 606, 453, 75, 176, 323, 684, 161, 608, 443, 312, 78, 199, 545, 429, 564, 640, 274, 439, 95, 92, 27, 435, 679, 584, 99, 81, 195, 607, 31, 601, 427, 35, 146, 302, 35

In [141]:
evaluator = MetaAnalysisTaskEvaluator('binary_outcomes', 'outputs/binary_outcomes/olmo7B_binary_outcomes_test_output_20240403-18:44:49.json', 'metrics/binary_outcomes/', None)
evaluator.run_evaluation()

Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error parsing yaml string: intervention:
    events: 10
    group_size: 397intervention:
    events: 787
    group_size: 392
Error parsing yaml string: intervention:
    events: 3
    group_size: 18intervention:
    events: 10
    group_size: 18
comparator:
    events: 6
    group_size: 18
Error in applying zero correction: Undefined results.
Error in applying zero correction: Undefined results.
Error parsing yaml string: intervention:
    events: 9
    group_size: 399intervention:
    events: 9
    group_siz

In [142]:
binary_errors = categorize_outcomes(evaluator, 'binary')
print(binary_errors)

defaultdict(<class 'list'>, {'unknown_output_but_reference_known': [130, 491, 10, 257, 539, 62, 61, 651, 137, 654, 362, 342, 127, 63], 'badly_formatted_output': [548, 49, 568, 50, 123, 373, 14, 384, 256, 375, 132, 43, 476, 587, 508, 173, 48, 131, 370, 297, 121, 500, 547, 155, 391, 502, 299, 15, 481, 339, 251, 45, 512, 66, 492, 252, 494, 381, 376, 46, 265, 47, 507, 380, 588, 254, 85, 475, 497, 154, 16, 389, 128, 690, 369, 382, 510, 499, 264, 379, 39, 338, 358, 341, 516, 133, 479, 337, 194, 498, 42, 541, 495, 493, 378, 87, 340, 361, 474, 477, 333, 400, 40, 496, 482, 41, 259, 344, 377, 691, 625, 124, 546, 343, 13, 478, 178, 405, 641, 153, 689, 501, 250, 174, 38, 467, 538, 84, 118, 182, 253, 266, 36, 86, 642], 'incorrect_output': [622, 633, 623, 313, 543, 535, 491, 304, 652, 93, 152, 10, 537, 108, 653, 107, 166, 258, 270, 632, 627, 624, 305, 540, 109, 138, 20, 628, 509, 272, 315, 122, 139, 94, 362, 65, 626, 631, 127, 269, 542, 64, 37, 621], 'known_output_but_reference_unknown': [473, 139, 

In [143]:
evaluator = MetaAnalysisTaskEvaluator('continuous_outcomes', 'outputs/continuous_outcomes/olmo7B_continuous_outcomes_test_output_20240403-21:36:24.json', 'metrics/continuous_outcomes/', None)
evaluator.run_evaluation()

An exception occurred for calculate standardized mean difference - intervention_mean: 0, control_mean: 0, intervention_sd: 0, control_sd: 0
An exception occurred for calculate standardized mean difference - intervention_mean: 80).2, control_mean: 75).2, intervention_sd: 7.77, control_sd: 7.41
Error parsing yaml string: intervention:
     mean: 120.0
     standard_deviation: 11.7
     group_size: 60
comparator:
     mean: 88.4
     standard_deviation: 11.8
    group_size: 60
Error parsing yaml string: intervention:
     mean: 4.0
     standard_deviation: 1.1
     group_size: 28
comparator:
     mean: 11.5
     standard_deviation: 3.3
    group_size: 28
Error parsing yaml string: intervention:
    mean: 83
    standard_deviation: 5
    group_size: 10
comparator:
    mean: 74
    standard_deviation: 5
    group_size: 10intervention:
    mean: 82
    standard_deviation: 5
    group_size: 10
An exception occurred for calculate standardized mean difference - intervention_mean: 80).0, control

In [144]:
continuous_errors = categorize_outcomes(evaluator, 'continuous')
print(continuous_errors)

defaultdict(<class 'list'>, {'known_output_but_reference_unknown': [562, 322, 574, 271, 11, 471, 577, 320, 589, 472, 485, 164, 513, 564, 144, 273, 356, 571, 455, 143, 268, 685, 639, 248, 600, 104, 332, 396, 89, 563, 197, 68, 576, 468, 598, 97, 560, 561, 151, 90, 470, 644, 103, 12, 319, 321, 611, 105, 403, 575, 630, 140, 407, 515, 442, 201, 69, 533, 591, 150, 247, 590, 483, 249, 374, 565, 608, 461, 661, 148, 607, 9, 365, 511, 349, 91, 101, 106], 'badly_formatted_output': [213, 82, 663, 293, 274, 406, 635, 44, 99, 111, 216, 243, 597, 282, 634, 288, 223, 160, 78, 276, 159, 446, 211, 278, 665, 163, 172, 662, 300, 433, 204, 298, 67, 676, 167, 528, 610, 404, 156, 444, 573, 79, 129, 242, 569, 72, 295, 682, 550, 486, 336, 553, 136, 646, 402, 331, 296, 420, 545, 119, 281, 217, 244, 674, 35, 214, 280, 666, 443, 503, 636, 80, 287, 595, 602, 125, 171, 169, 601, 582, 441, 397, 371, 448, 604, 350, 695, 34, 335, 603, 566, 177, 678, 54, 449, 643, 672, 592, 354, 180, 452, 161, 645, 421, 179, 114, 205, 