In [563]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np
import json
import os
import xml.etree.ElementTree as ET 
import pandas as pd

## This file compares the teamtat annotation (Json) with Extraction performed by finetuned llama (Json)

In [564]:
label = {'passivating_molecule': 'phenethylammonium iodide', 'perovskite_composition': 'MAPbI3', 'ISOSD1': {'time': '240', 'treated_pce': '15.3', 'control_pce': '16.69', 'temperature': '25', 'humidity': '90', 'control_voc': '1.03', 'treated_voc': '1.06'}, 'electron_transport_layer': 'TiO2', 'hole_transport_layer': 'Spiro-OMeTAD'}

In [565]:
extraction = {
  "control_pce": None,
  "control_voc": None,
  "treated_pce": 15.3,
  "treated_voc": 1.06,
  "passivating_molecule": "Phenylethylammonium (PEA)",
  "perovskite_composition": "[C8H9NH3]2[(CH3NH3)2PbI3 – (n=60)]",
  "electron_transport_layer": "TiO2",
  "hole_transport_layer": "spiro-OMeTAD",
  "ISOS-L-1": None,
  "ISOS-L-2": None,
  "ISOS-T-1": None,
  "ISOS-T-2": None,
  "ISOS-LC": None,
  "ISOS-D-1": None,
  "ISOS-D-2": None
}

In [566]:
def compare_data(labeled_data, extracted_data, numerical_tolerance=0.05):
    results = {}
    total_fields = len(labeled_data)
    matched_fields = 0
    numerical_differences = []

    for key, value in labeled_data.items():
        if key in extracted_data:
            extracted_value = extracted_data[key]

            # Exact match
            if value == extracted_value:
                matched_fields += 1
                results[key] = "Match"

            # Numerical comparison
            elif isinstance(value, (int, float)) and isinstance(extracted_value, (int, float)):
                if abs(value - extracted_value) <= numerical_tolerance * abs(value):
                    matched_fields += 1
                    numerical_differences.append(abs(value - extracted_value))
                    results[key] = "Numerical Match"
                else:
                    results[key] = "Numerical Mismatch"

            # Fuzzy string match
            elif isinstance(value, str) and isinstance(extracted_value, str):
                similarity = SequenceMatcher(None, value.lower(), extracted_value.lower()).ratio()
                if similarity > 0.8:  # Threshold for similarity
                    matched_fields += 1
                    results[key] = f"Fuzzy Match ({similarity:.2f})"
                else:
                    results[key] = f"Mismatch ({similarity:.2f})"

            else:
                results[key] = "Mismatch"
        else:
            results[key] = "Missing in Extracted Data"

    accuracy = matched_fields / total_fields
    mean_absolute_error = np.mean(numerical_differences) if numerical_differences else None

    return {
        "results": results,
        "accuracy": accuracy,
        "mean_absolute_error": mean_absolute_error,
        "matched_fields": matched_fields,
        "total_fields": total_fields
    }


In [567]:
comparison_result = compare_data(label, extraction)
print(comparison_result)


{'results': {'passivating_molecule': 'Mismatch (0.78)', 'perovskite_composition': 'Mismatch (0.20)', 'ISOSD1': 'Missing in Extracted Data', 'electron_transport_layer': 'Match', 'hole_transport_layer': 'Fuzzy Match (1.00)'}, 'accuracy': 0.4, 'mean_absolute_error': None, 'matched_fields': 2, 'total_fields': 5}


## Retrieving teamtat label

In [568]:
def parse_bioc(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    data = { }
    concept_ids = set()
    relations = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            relations[node_id] = test_name

    for annotation in root.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        if node_id in relations:
            stability_test = relations[node_id]
            if stability_test not in data:
                data[stability_test] = {}
            if concept_id is None:
                concept_id = value
            data[stability_test][var_name] = concept_id
            concept_ids.add(concept_id)

        if concept_id is None:
            concept_id = value
        if concept_id in concept_ids: # duplicate annotation
            continue
        concept_ids.add(concept_id)
        if var_name not in data:
            data[var_name] = concept_id
        else:
            if isinstance(data[var_name], list):
                data[var_name].append(value)
            else:
                data[var_name] = [data[var_name], value]
    
    return data

In [569]:
def extract_papernum(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num


In [570]:
## See if we can extract the paper number correctly
bioc_dir = "../data/biocs"
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        paper_num = extract_papernum(file_path)


In [571]:
bioc_dir = "../data/biocs"
label_data = {}
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        paper_num = extract_papernum(file_path)
        row = parse_bioc(file_path)
        if len(row.keys()) > 0:
            label_data[paper_num] = row

print(label_data)   
print(f"There are {len(label_data)} annotated papers")


{'19': {'ISOSL3': {'temperature': '85', 'humidity': '50', 'time': '1560', 'treated_pce': '23.3', 'control_pce': '21', 'control_voc': '1.269', 'treated_voc': '1.271'}, 'perovskite_composition': ['Cs 0.05 MA 0.05 FA 0.9 Pb(I 0.95 Br 0.05 ) 3', 'Cs 0.05 MA 0.15 FA 0.8 PbI 3'], 'passivating_molecule': '3,4,5-trifluoroanilinium', 'hole_transport_layer': '2PACz', 'efficiency_tret': '84', 'electron_transport_layer': 'C60', 'structure_pin_nip': 'PIN'}, '84': {'structure_pin_nip': 'NIP', 'hole_transport_layer': ['Spiro-OMeTAD', 'fluorene-terminated hole-transporting material'], 'ISOST2': {'efficiency_tret': 'ISOS-T2', 'time': 'ISOS-T2', 'temperature': 'ISOS-T2'}, 'perovskite_composition': 'FAPbI 3 ) 0.95 (MAPbBr 3 ) 0.05', 'electron_transport_layer': 'mp-TIO2', 'ISOSL1': {'efficiency_tret': 'ISOS-L1', 'time': 'ISOS-L1'}, 'ISOSD1': {'efficiency_tret': 'ISOS-D1', 'time': 'ISOS-D1', 'temperature': 'ISOS-D1'}}, '56': {'treated_pce': '25.5%', 'perovskite_composition': 'FA 0.83 Cs 0.17 Pb 0.5 Sn 0.5 

### Remove journal_publication and date_published key from teamtat since xml file didn't mentioned them. 

In [572]:
def take_out_keys(json_obj, keys_to_remove):
    ##This function will be irrelevant Hopefully after we have markdown text to annotate and have publication info in the text
    if isinstance(json_obj, dict):
        # Process dictionaries
        return {
            key: take_out_keys(value, keys_to_remove)
            for key, value in json_obj.items()
            if key not in keys_to_remove
        }
    elif isinstance(json_obj, list):
        # Process lists
        return [take_out_keys(item, keys_to_remove) for item in json_obj]
    else:
        # Return the value as is for non-dict and non-list items
        return json_obj
key_remove = {'journal_publication', 'date_published'}

In [573]:
clean_label_data = take_out_keys(label_data, key_remove)
clean_label_data

{'19': {'ISOSL3': {'temperature': '85',
   'humidity': '50',
   'time': '1560',
   'treated_pce': '23.3',
   'control_pce': '21',
   'control_voc': '1.269',
   'treated_voc': '1.271'},
  'perovskite_composition': ['Cs 0.05 MA 0.05 FA 0.9 Pb(I 0.95 Br 0.05 ) 3',
   'Cs 0.05 MA 0.15 FA 0.8 PbI 3'],
  'passivating_molecule': '3,4,5-trifluoroanilinium',
  'hole_transport_layer': '2PACz',
  'efficiency_tret': '84',
  'electron_transport_layer': 'C60',
  'structure_pin_nip': 'PIN'},
 '84': {'structure_pin_nip': 'NIP',
  'hole_transport_layer': ['Spiro-OMeTAD',
   'fluorene-terminated hole-transporting material'],
  'ISOST2': {'efficiency_tret': 'ISOS-T2',
   'time': 'ISOS-T2',
   'temperature': 'ISOS-T2'},
  'perovskite_composition': 'FAPbI 3 ) 0.95 (MAPbBr 3 ) 0.05',
  'electron_transport_layer': 'mp-TIO2',
  'ISOSL1': {'efficiency_tret': 'ISOS-L1', 'time': 'ISOS-L1'},
  'ISOSD1': {'efficiency_tret': 'ISOS-D1',
   'time': 'ISOS-D1',
   'temperature': 'ISOS-D1'}},
 '56': {'treated_pce': '25.

## Retrieving LLama Prediction

In [574]:
import json

# Path to your JSON file
file_path = '../data/deepseek_5_progressive.json'

# Open and load the JSON file
with open(file_path, 'r') as file:
    llama_data = json.load(file)

# Print the data to verify
print(llama_data)
print(f"There are {len(llama_data)} papers that llama evaluated")

{'103': {'control_pce': 14.44, 'control_voc': 0.926, 'treated_pce': 23.7, 'treated_voc': 1.47, 'passivating_molecule': 'Methylenediammonium chloride (MDACl2)', 'perovskite_composition': 'FAPbI3 with 3.8 mol% MDACl2', 'electron_transport_layer': 'TiO2 (mesoporous)', 'pin_nip_structure': 'PIN', 'hole_transport_layer': 'CuPC', 'stability_tests': [{'test_name': 'ISOS-D', 'temperature': 25, 'time': 600, 'humidity': '20-30%', 'control_efficiency': 14.44, 'treatment_efficiency': 23.7}]}, '77': {'control_pce': '21.2', 'control_voc': '1.07', 'treated_pce': '24.1', 'treated_voc': '1.14', 'passivating_molecule': 'PEAMAI', 'perovskite_composition': 'FA 0.9 Cs 0.1 PbI 3', 'electron_transport_layer': 'ITO', 'pin_nip_structure': 'PIN', 'hole_transport_layer': 'PTAA', 'stability_tests': [{'test_name': 'ISOS-D', 'temperature': '55', 'time': '800', 'humidity': 'N/A', 'control_efficiency': '90', 'treatment_efficiency': '90'}, {'test_name': 'ISOS-E', 'temperature': '85', 'time': '1500', 'humidity': 'N/A',

### Clean Llama predictoin to get rid of key with None and 'Not mentioned' value

In [575]:
# Recursive function to clean the JSON
def clean_json(obj):
    if isinstance(obj, dict):
        # Recursively process dictionary
        return {
            key: clean_json(value)
            for key, value in obj.items()
            if value not in ['None', None, "Not mentioned"]
        }
    elif isinstance(obj, list):
        # Recursively process list
        return [clean_json(item) for item in obj if item not in ['None', None, "Not mentioned"]]
    else:
        # Return the value as is
        return obj



In [576]:
filtered_llama_data = clean_json(llama_data)
filtered_llama_data

{'103': {'control_pce': 14.44,
  'control_voc': 0.926,
  'treated_pce': 23.7,
  'treated_voc': 1.47,
  'passivating_molecule': 'Methylenediammonium chloride (MDACl2)',
  'perovskite_composition': 'FAPbI3 with 3.8 mol% MDACl2',
  'electron_transport_layer': 'TiO2 (mesoporous)',
  'pin_nip_structure': 'PIN',
  'hole_transport_layer': 'CuPC',
  'stability_tests': [{'test_name': 'ISOS-D',
    'temperature': 25,
    'time': 600,
    'humidity': '20-30%',
    'control_efficiency': 14.44,
    'treatment_efficiency': 23.7}]},
 '77': {'control_pce': '21.2',
  'control_voc': '1.07',
  'treated_pce': '24.1',
  'treated_voc': '1.14',
  'passivating_molecule': 'PEAMAI',
  'perovskite_composition': 'FA 0.9 Cs 0.1 PbI 3',
  'electron_transport_layer': 'ITO',
  'pin_nip_structure': 'PIN',
  'hole_transport_layer': 'PTAA',
  'stability_tests': [{'test_name': 'ISOS-D',
    'temperature': '55',
    'time': '800',
    'humidity': 'N/A',
    'control_efficiency': '90',
    'treatment_efficiency': '90'},
  

In [577]:
def drop_fake_stability(obj):
    for value in obj.values():
        if 'stability_tests' in value:
            value['stability_tests'] = [
                test for test in value['stability_tests'] if len(test.keys()) > 1
            ]
    return obj
                        

In [578]:
filtered_llama_data = drop_fake_stability(filtered_llama_data)
filtered_llama_data

{'103': {'control_pce': 14.44,
  'control_voc': 0.926,
  'treated_pce': 23.7,
  'treated_voc': 1.47,
  'passivating_molecule': 'Methylenediammonium chloride (MDACl2)',
  'perovskite_composition': 'FAPbI3 with 3.8 mol% MDACl2',
  'electron_transport_layer': 'TiO2 (mesoporous)',
  'pin_nip_structure': 'PIN',
  'hole_transport_layer': 'CuPC',
  'stability_tests': [{'test_name': 'ISOS-D',
    'temperature': 25,
    'time': 600,
    'humidity': '20-30%',
    'control_efficiency': 14.44,
    'treatment_efficiency': 23.7}]},
 '77': {'control_pce': '21.2',
  'control_voc': '1.07',
  'treated_pce': '24.1',
  'treated_voc': '1.14',
  'passivating_molecule': 'PEAMAI',
  'perovskite_composition': 'FA 0.9 Cs 0.1 PbI 3',
  'electron_transport_layer': 'ITO',
  'pin_nip_structure': 'PIN',
  'hole_transport_layer': 'PTAA',
  'stability_tests': [{'test_name': 'ISOS-D',
    'temperature': '55',
    'time': '800',
    'humidity': 'N/A',
    'control_efficiency': '90',
    'treatment_efficiency': '90'},
  

### Subsetting the LLama Prediction since we only have 54 true lables

In [579]:
# Extract the subset from json_150 where keys match with json_54
subset_llama = {key: filtered_llama_data[key] for key in label_data.keys() if key in filtered_llama_data}

# Print the result
print(len(subset_llama))


54


## First, we will evaluate one pair of papers and adjust specific formatting

In [580]:
##0th paper prediction
subset_llama

{'19': {'treated_pce': 24.09,
  'passivating_molecule': '3FPEA',
  'perovskite_composition': 'Cs0.05MA0.05FA0.9Pb(I0.95Br0.05)3',
  'electron_transport_layer': '2PACz',
  'pin_nip_structure': 'inverted',
  'hole_transport_layer': 'ITO',
  'stability_tests': [{'test_name': 'ISOS-L-3',
    'temperature': 85,
    'time': 1560,
    'humidity': '50% RH',
    'treatment_efficiency': 24.09}]},
 '84': {'control_pce': 21.6,
  'control_voc': 1.24,
  'treated_pce': 23.2,
  'treated_voc': 1.14,
  'passivating_molecule': 'DM',
  'perovskite_composition': '(FAPbI3)0.95 (MAPbBr3)0.05',
  'electron_transport_layer': 'TiO2',
  'pin_nip_structure': 'PIN',
  'hole_transport_layer': 'DM',
  'stability_tests': [{'test_name': 'thermal stability test',
    'temperature': 60,
    'time': 50,
    'humidity': 'air (~25% RH)',
    'control_efficiency': 21.6,
    'treatment_efficiency': 22.3},
   {'test_name': 'thermal stability test',
    'temperature': 70,
    'time': 50,
    'humidity': 'air (~25% RH)',
    'c

In [581]:
#The actual label
clean_label_data['0']

{'ISOSL3': {'control_pce': '24',
  'efficiency_tret': '95%',
  'time': '1200',
  'treated_pce': '26.9',
  'treated_voc': '1.18'},
 'structure_pin_nip': 'PIN',
 'passivating_molecule': '4Cl-BZS',
 'perovskite_composition': 'Cs 0.05 FA 0.85 MA 0.1 PbI 3',
 'hole_transport_layer': '2PACz and Me-4PACz',
 'electron_transport_layer': 'C60'}

#### Evaluation in parts
- numerical data 
- text data (molecule)
- stability
    - Change how to parse xml
    - Change the output of the model as ID but no specification on number


In [582]:
## We need precision and recall for EACH variable
## For each variable, calculate the F1 score - There is F1 score for each variable
## Take a weighted average ***For now, just take the average. 

Variables (number)
- control_pce
- treatment_pce
- control_voc
- treatment_voc

Variable (text)
- structure_pin_nip
- passivating_molecule
- perovskite_composition
- hole_transport_layer
- electron_transport_layer

Stability (later)

In [583]:
def check_float(value):
    for char in value:
        if (char == ".") | (char.isdigit()):
            continue
        else:
            return False
    return True


In [584]:
import re

def check_float(value):
    """Returns True if value can be converted to a float, otherwise False."""
    try:
        float(value)
        return True
    except ValueError:
        return False

def clean_numeric_value(value):
    """
    Cleans a numeric value by removing non-numeric characters except decimals.
    Ensures the output is either a float or a string that can be converted.
    """
    if value is None:
        return None
    
    value = str(value).strip()  # Convert to string and remove leading/trailing spaces
    cleaned_value = re.sub(r"[^\d.]", "", value)  # Remove all non-numeric characters except "."

    return cleaned_value if check_float(cleaned_value) else None

def numerical_comparison(id, label_annotation, extraction_annotation, numerical_tolerance=0.1):
    """
    Compares numerical values with a tolerance, handling different formatting issues.
    """
    if id not in extraction_annotation:
        return "FN"  # False Negative: Missing extracted value
    
    label_data = label_annotation.get(id)
    extract_data = extraction_annotation.get(id)

    # Clean and convert numeric values
    label_data = clean_numeric_value(label_data)
    extract_data = clean_numeric_value(extract_data)

    if label_data is None or extract_data is None:
        return "FN"  # False Negative: If either value is invalid

    label_data = float(label_data)
    extract_data = float(extract_data)

    # Apply numerical tolerance check
    if abs(label_data - extract_data) <= numerical_tolerance * abs(label_data):
        return "TP"  # True Positive: Correct numerical extraction
    else:
        return "FP"  # False Positive: Incorrect numerical extraction


In [585]:
from difflib import SequenceMatcher

def text_comparison(id, label_annotation, extraction_annotation, text_similarity_threshold=0.8):
    """Compares text values using string similarity matching."""

    # Handle special case for structure_pin_nip
    key_to_check = "pin_nip_structure" if id == "structure_pin_nip" else id

    # If the key is missing in the extracted annotation, return False Negative
    if key_to_check not in extraction_annotation:
        return "FN"

    label_data = label_annotation.get(id, "")
    extract_data = extraction_annotation.get(key_to_check, "")

    # Convert lists to strings if necessary
    if isinstance(label_data, list):
        label_data = " ".join(map(str, label_data))  # Convert list to string
    if isinstance(extract_data, list):
        extract_data = " ".join(map(str, extract_data))  # Convert list to string

    # Ensure values are strings
    if not isinstance(label_data, str) or not isinstance(extract_data, str):
        return "FP"  # If data is still not a string, return False Positive

    # Compute similarity score
    similarity = SequenceMatcher(None, label_data.lower(), extract_data.lower()).ratio()

    return "TP" if similarity > text_similarity_threshold else "FP"

        


In [586]:
def stability_comparison(id, label_annotation, extraction_annotation):
    # Ensure "stability_tests" exists and is a list
    if "stability_tests" not in extraction_annotation or not isinstance(extraction_annotation["stability_tests"], list):
        return "FN"  # No stability test data found

    # If "stability_tests" is empty, return False Negative
    if len(extraction_annotation["stability_tests"]) == 0:
        return "FN"

    for dictionary in extraction_annotation["stability_tests"]:
        # Skip if "test_name" is missing or invalid
        if "test_name" not in dictionary or not isinstance(dictionary["test_name"], str):
            continue
        
        test_name = dictionary["test_name"]
        if id[4] == test_name[-1]:  # Match last character of ID and test name
            return "TP"  # True Positive: Test correctly extracted

    return "FN"  # No match found

In [587]:
def safe_division(numerator, denominator):
    """Returns division result, or 0 if the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0

def compare_json(labeled_data, extracted_data):
    """
    Compare labeled and extracted JSON data for correctness.

    TP: Correct value extracted by LLM.
    FN: LLM didn't extract this variable.
    FP: LLM extracted a value, but it was incorrect.
    """
    
    numerical_variables = ["control_pce", "treated_pce", "control_voc", "treated_voc"]
    text_variables = ["structure_pin_nip", "passivating_molecule", "perovskite_composition", 
                      "electron_transport_layer", "hole_transport_layer"]
    
    # Initialize comparison dictionaries
    numerical_dict = {var: {"TP": 0, "FP": 0, "FN": 0} for var in numerical_variables}
    text_dict = {var: {"TP": 0, "FP": 0, "FN": 0} for var in text_variables}
    stability_dict = {
        "ISOS-D": {"TP": 0, "FP": 0, "FN": 0},
        "ISOS-L": {"TP": 0, "FP": 0, "FN": 0},
        "ISOS-T": {"TP": 0, "FP": 0, "FN": 0}
    }

    for key, label_value in labeled_data.items():
        if key not in extracted_data:
            print(f"Extraction was not performed. Paper num: {key}")
            continue
        
        extracted_value = extracted_data[key]

        for id, label in label_value.items():
            # Handle numerical values
            if id in numerical_variables:
                if isinstance(label, str) and label.replace(".", "").isdigit():
                    result = numerical_comparison(id, label_value, extracted_value)
                    numerical_dict[id][result] += 1

            # Handle text values
            elif id in text_variables:
                if isinstance(label, str):
                    result = text_comparison(id, label_value, extracted_value)
                    text_dict[id][result] += 1

            # Handle stability tests
            elif "ISOS" in id:
                result = stability_comparison(id, label_value, extracted_value)
                stability_type = f"ISOS-{id[4]}"  # Extract stability type
                if stability_type in stability_dict:
                    stability_dict[stability_type][result] += 1

    # Merge all results
    combined_dict = {**numerical_dict, **text_dict, **stability_dict}
    print("Performance for each variable in dictionary:", combined_dict)

    # Compute precision, recall, and F1-score
    variable_list, precision_list, recall_list, f1_list = [], [], [], []
    for variable, performance in combined_dict.items():
        TP, FP, FN = performance["TP"], performance["FP"], performance["FN"]
        
        precision = safe_division(TP, TP + FP)
        recall = safe_division(TP, TP + FN)
        f1 = safe_division(2 * precision * recall, precision + recall)

        variable_list.append(variable)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    return variable_list, precision_list, recall_list, f1_list

In [588]:
variables, precisions, recalls, f1s = compare_json(clean_label_data, subset_llama)

Performance for each variable in dictionary: {'control_pce': {'TP': 4, 'FP': 1, 'FN': 1}, 'treated_pce': {'TP': 9, 'FP': 0, 'FN': 0}, 'control_voc': {'TP': 5, 'FP': 1, 'FN': 2}, 'treated_voc': {'TP': 8, 'FP': 3, 'FN': 1}, 'structure_pin_nip': {'TP': 7, 'FP': 17, 'FN': 1}, 'passivating_molecule': {'TP': 6, 'FP': 13, 'FN': 1}, 'perovskite_composition': {'TP': 15, 'FP': 19, 'FN': 1}, 'electron_transport_layer': {'TP': 12, 'FP': 14, 'FN': 1}, 'hole_transport_layer': {'TP': 27, 'FP': 12, 'FN': 0}, 'ISOS-D': {'TP': 11, 'FP': 0, 'FN': 8}, 'ISOS-L': {'TP': 2, 'FP': 0, 'FN': 22}, 'ISOS-T': {'TP': 0, 'FP': 0, 'FN': 4}}


## Calculate Macro f1 score

In [589]:
def macro_f1(f1_list, weight = None):
    if weight == None:
        #If no weight given, do unweighted average of f1 score
        return sum(f1_list) / len(f1_list)
    total_f1 = 0
    for i in range(len(f1_list)):
        total_f1 += (f1_list[i] * weight[i])
    return total_f1 / sum(weight)
    


In [590]:
## The macro f1 score unweighted
macro_f1(f1s)

0.599084595959596

### This concludes the pipeline of evaluating extraction quality

# Todo:
- stability evaluation
    - Need to reiterate on the teamtat annotation

- Putting weights on F1 Score
- Using Bert!!

## Stability annotation brainstorm

- We will have as at most 5 realation (if there is an overlap, choose the stability test that offers more info)
    - Exp there is ISOS L1 and ISOS L2. If there is more iformation in IsosL2, use that as a test and compare on ISOSL that llama extracted (if any)
- We will iterate through different test recorded on annotation and compared it with suitable stability extracted on Llama