In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np
import json
import os
import xml.etree.ElementTree as ET 
import pandas as pd

## This file compares the teamtat annotation (Json) with Extraction performed by finetuned llama (Json)

In [2]:
label = {'passivating_molecule': 'phenethylammonium iodide', 'perovskite_composition': 'MAPbI3', 'ISOSD1': {'time': '240', 'treated_pce': '15.3', 'control_pce': '16.69', 'temperature': '25', 'humidity': '90', 'control_voc': '1.03', 'treated_voc': '1.06'}, 'electron_transport_layer': 'TiO2', 'hole_transport_layer': 'Spiro-OMeTAD'}

In [3]:
extraction = {
  "control_pce": None,
  "control_voc": None,
  "treated_pce": 15.3,
  "treated_voc": 1.06,
  "passivating_molecule": "Phenylethylammonium (PEA)",
  "perovskite_composition": "[C8H9NH3]2[(CH3NH3)2PbI3 – (n=60)]",
  "electron_transport_layer": "TiO2",
  "hole_transport_layer": "spiro-OMeTAD",
  "ISOS-L-1": None,
  "ISOS-L-2": None,
  "ISOS-T-1": None,
  "ISOS-T-2": None,
  "ISOS-LC": None,
  "ISOS-D-1": None,
  "ISOS-D-2": None
}

In [4]:
def compare_data(labeled_data, extracted_data, numerical_tolerance=0.05):
    results = {}
    total_fields = len(labeled_data)
    matched_fields = 0
    numerical_differences = []

    for key, value in labeled_data.items():
        if key in extracted_data:
            extracted_value = extracted_data[key]

            # Exact match
            if value == extracted_value:
                matched_fields += 1
                results[key] = "Match"

            # Numerical comparison
            elif isinstance(value, (int, float)) and isinstance(extracted_value, (int, float)):
                if abs(value - extracted_value) <= numerical_tolerance * abs(value):
                    matched_fields += 1
                    numerical_differences.append(abs(value - extracted_value))
                    results[key] = "Numerical Match"
                else:
                    results[key] = "Numerical Mismatch"

            # Fuzzy string match
            elif isinstance(value, str) and isinstance(extracted_value, str):
                similarity = SequenceMatcher(None, value.lower(), extracted_value.lower()).ratio()
                if similarity > 0.8:  # Threshold for similarity
                    matched_fields += 1
                    results[key] = f"Fuzzy Match ({similarity:.2f})"
                else:
                    results[key] = f"Mismatch ({similarity:.2f})"

            else:
                results[key] = "Mismatch"
        else:
            results[key] = "Missing in Extracted Data"

    accuracy = matched_fields / total_fields
    mean_absolute_error = np.mean(numerical_differences) if numerical_differences else None

    return {
        "results": results,
        "accuracy": accuracy,
        "mean_absolute_error": mean_absolute_error,
        "matched_fields": matched_fields,
        "total_fields": total_fields
    }


In [5]:
comparison_result = compare_data(label, extraction)
print(comparison_result)


{'results': {'passivating_molecule': 'Mismatch (0.78)', 'perovskite_composition': 'Mismatch (0.20)', 'ISOSD1': 'Missing in Extracted Data', 'electron_transport_layer': 'Match', 'hole_transport_layer': 'Fuzzy Match (1.00)'}, 'accuracy': 0.4, 'mean_absolute_error': None, 'matched_fields': 2, 'total_fields': 5}


## Retrieving teamtat label

In [3]:
def parse_bioc(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    data = { }
    concept_ids = set()
    relations = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            relations[node_id] = test_name

    for annotation in root.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        if node_id in relations:
            stability_test = relations[node_id]
            if stability_test not in data:
                data[stability_test] = {}
            if concept_id is None:
                concept_id = value
            data[stability_test][var_name] = concept_id
            concept_ids.add(concept_id)

        if concept_id is None:
            concept_id = value
        if concept_id in concept_ids: # duplicate annotation
            continue
        concept_ids.add(concept_id)
        if var_name not in data:
            data[var_name] = concept_id
        else:
            if isinstance(data[var_name], list):
                data[var_name].append(value)
            else:
                data[var_name] = [data[var_name], value]
    
    return data

In [4]:
def extract_papernum(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num


In [None]:
## See if we can extract the paper number correctly
bioc_dir = "../data/biocs"
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        paper_num = extract_papernum(file_path)


In [None]:
bioc_dir = "../data/biocs"
label_data = {}
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        paper_num = extract_papernum(file_path)
        row = parse_bioc(file_path)
        if len(row.keys()) > 0:
            label_data[paper_num] = row

print(label_data)   
print(f"There are {len(label_data)} annotated papers")


{'0': {'ISOSL3': {'control_pce': '24', 'efficiency_tret': '95%', 'time': '1200', 'treated_pce': '26.9', 'treated_voc': '1.18'}, 'structure_pin_nip': 'PIN', 'passivating_molecule': '4Cl-BZS', 'perovskite_composition': 'Cs 0.05 FA 0.85 MA 0.1 PbI 3', 'hole_transport_layer': '2PACz and Me-4PACz', 'electron_transport_layer': 'C60', 'journal_publication': ['Science', 'science'], 'date_published': '4/11/2024'}, '1': {'ISOSD2': {'treated_pce': '23.32', 'treated_voc': '1.16', 'control_pce': '21.3', 'time': '500'}, 'passivating_molecule': 'phenethylammonium iodide (PEAI)', 'electron_transport_layer': ['HC(NH 2 ) 2 -CH 3 NH 3', 'SnO 2'], 'journal_publication': 'nature', 'hole_transport_layer': 'spiro-OMeTAD', 'date_published': '1 April 2019'}, '2': {'treated_voc': '1.185', 'ISOSD1': {'treated_voc': '1.185', 'temperature': '85', 'humidity': '85', 'treated_pce': '24.35', 'time': '1056', 'control_pce': '22.39'}, 'electron_transport_layer': 'SnO2', 'hole_transport_layer': 'Spiro-OMeTAD', 'humidity':

### Remove journal_publication and date_published key from teamtat since xml file didn't mentioned them. 

In [7]:
def take_out_keys(json_obj, keys_to_remove):
    ##This function will be irrelevant Hopefully after we have markdown text to annotate and have publication info in the text
    if isinstance(json_obj, dict):
        # Process dictionaries
        return {
            key: take_out_keys(value, keys_to_remove)
            for key, value in json_obj.items()
            if key not in keys_to_remove
        }
    elif isinstance(json_obj, list):
        # Process lists
        return [take_out_keys(item, keys_to_remove) for item in json_obj]
    else:
        # Return the value as is for non-dict and non-list items
        return json_obj
key_remove = {'journal_publication', 'date_published'}

In [8]:
clean_label_data = take_out_keys(label_data, key_remove)
clean_label_data

{'0': {'ISOSL3': {'control_pce': '24',
   'efficiency_tret': '95%',
   'time': '1200',
   'treated_pce': '26.9',
   'treated_voc': '1.18'},
  'structure_pin_nip': 'PIN',
  'passivating_molecule': '4Cl-BZS',
  'perovskite_composition': 'Cs 0.05 FA 0.85 MA 0.1 PbI 3',
  'hole_transport_layer': '2PACz and Me-4PACz',
  'electron_transport_layer': 'C60'},
 '1': {'ISOSD2': {'treated_pce': '23.32',
   'treated_voc': '1.16',
   'control_pce': '21.3',
   'time': '500'},
  'passivating_molecule': 'phenethylammonium iodide (PEAI)',
  'electron_transport_layer': ['HC(NH 2 ) 2 -CH 3 NH 3', 'SnO 2'],
  'hole_transport_layer': 'spiro-OMeTAD'},
 '2': {'treated_voc': '1.185',
  'ISOSD1': {'treated_voc': '1.185',
   'temperature': '85',
   'humidity': '85',
   'treated_pce': '24.35',
   'time': '1056',
   'control_pce': '22.39'},
  'electron_transport_layer': 'SnO2',
  'hole_transport_layer': 'Spiro-OMeTAD',
  'humidity': '85',
  'perovskite_composition': '(C 4 H 9 NH 3 ) 2 PbI 4',
  'efficiency_tret': 

## Retrieving LLama Prediction

In [None]:
import json

# Path to your JSON file
file_path = '../data/finetuned_llama_output.json'

# Open and load the JSON file
with open(file_path, 'r') as file:
    llama_data = json.load(file)

# Print the data to verify
print(llama_data)
print(f"There are {len(llama_data)} papers that llama evaluated")

{'111': {'control_pce': None, 'control_voc': None, 'treated_pce': 23.9, 'treated_voc': 1.51, 'passivating_molecule': 'isopropylammonium chloride', 'perovskite_composition': 'FAPbI3', 'electron_transport_layer': None, 'hole_transport_layer': None, 'pin_nip_structure': 'PIN', 'stability_tests': [{'test_name': 'ISOS-D', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'ISOS-L', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'ISOS-T', 'temperature': 150, 'time': 30, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'ISOS-LC', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'ISOS-LT', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}]}, '56': {'control_pce': No

### Clean Llama predictoin to get rid of key with None and 'Not mentioned' value

In [10]:
# Recursive function to clean the JSON
def clean_json(obj):
    if isinstance(obj, dict):
        # Recursively process dictionary
        return {
            key: clean_json(value)
            for key, value in obj.items()
            if value not in ['None', None, "Not mentioned"]
        }
    elif isinstance(obj, list):
        # Recursively process list
        return [clean_json(item) for item in obj if item not in ['None', None, "Not mentioned"]]
    else:
        # Return the value as is
        return obj



In [11]:
filtered_llama_data = clean_json(llama_data)
filtered_llama_data

{'111': {'treated_pce': 23.9,
  'treated_voc': 1.51,
  'passivating_molecule': 'isopropylammonium chloride',
  'perovskite_composition': 'FAPbI3',
  'pin_nip_structure': 'PIN',
  'stability_tests': [{'test_name': 'ISOS-D'},
   {'test_name': 'ISOS-L'},
   {'test_name': 'ISOS-T', 'temperature': 150, 'time': 30},
   {'test_name': 'ISOS-LC'},
   {'test_name': 'ISOS-LT'}]},
 '56': {'treated_pce': 21.1,
  'treated_voc': 1.23,
  'passivating_molecule': 'LiTFSI',
  'perovskite_composition': 'FA0.83Cs0.17Pb0.5Sn0.5I3',
  'electron_transport_layer': 'PEDOT:PSS',
  'hole_transport_layer': 'C60 and BCP',
  'pin_nip_structure': 'PIN',
  'stability_tests': [{'test_name': 'ISOS-D'},
   {'test_name': 'ISOS-L'},
   {'test_name': 'ISOS-T'},
   {'test_name': 'ISOS-LC'},
   {'test_name': 'ISOS-LT'}]},
 '87': {'treated_pce': 21.52,
  'treated_voc': 1.15,
  'passivating_molecule': 'Eu 3+ -Eu 2+ ion pair',
  'perovskite_composition': 'PbX2 layer (CsI 20 mg, PbI2 530 mg, PbBr2 20 mg, PbCl2 20 mg, MAI 5 mg dis

In [12]:
def drop_fake_stability(obj):
    for value in obj.values():
        if 'stability_tests' in value:
            value['stability_tests'] = [
                test for test in value['stability_tests'] if len(test.keys()) > 1
            ]
    return obj
                        

In [13]:
filtered_llama_data = drop_fake_stability(filtered_llama_data)
filtered_llama_data

{'111': {'treated_pce': 23.9,
  'treated_voc': 1.51,
  'passivating_molecule': 'isopropylammonium chloride',
  'perovskite_composition': 'FAPbI3',
  'pin_nip_structure': 'PIN',
  'stability_tests': [{'test_name': 'ISOS-T',
    'temperature': 150,
    'time': 30}]},
 '56': {'treated_pce': 21.1,
  'treated_voc': 1.23,
  'passivating_molecule': 'LiTFSI',
  'perovskite_composition': 'FA0.83Cs0.17Pb0.5Sn0.5I3',
  'electron_transport_layer': 'PEDOT:PSS',
  'hole_transport_layer': 'C60 and BCP',
  'pin_nip_structure': 'PIN',
  'stability_tests': []},
 '87': {'treated_pce': 21.52,
  'treated_voc': 1.15,
  'passivating_molecule': 'Eu 3+ -Eu 2+ ion pair',
  'perovskite_composition': 'PbX2 layer (CsI 20 mg, PbI2 530 mg, PbBr2 20 mg, PbCl2 20 mg, MAI 5 mg dissolved in 900 μL DMF and 100 μL DMSO mixed solvent)',
  'electron_transport_layer': 'SnO2',
  'hole_transport_layer': 'polymer modified spiro-OMeTAD',
  'pin_nip_structure': 'PIN',
  'stability_tests': [{'test_name': '1 sun illumination',
    

### Subsetting the LLama Prediction since we only have 54 true lables

In [14]:
# Extract the subset from json_150 where keys match with json_54
subset_llama = {key: filtered_llama_data[key] for key in label_data.keys() if key in filtered_llama_data}

# Print the result
print(len(subset_llama))


54


## First, we will evaluate one pair of papers and adjust specific formatting

In [15]:
##0th paper prediction
subset_llama

{'0': {'control_pce': '25.7',
  'control_voc': '1.17',
  'treated_pce': '26.15',
  'treated_voc': '1.18',
  'passivating_molecule': '4-chlorobenzenesulfonate (4Cl-BZS)',
  'perovskite_composition': 'α-phase FAPbI3',
  'electron_transport_layer': 'C60',
  'hole_transport_layer': 'SAMs (self-assembled monolayers)',
  'pin_nip_structure': 'PIN',
  'stability_tests': [{'test_name': 'ISOS-D-2I',
    'temperature': 85,
    'time': 1500,
    'humidity': '50 ± 10%',
    'control_efficiency': '19.9%',
    'treatment_efficiency': '95%'},
   {'test_name': 'ISOS-L-3',
    'temperature': 65,
    'time': 1200,
    'humidity': '50 ± 10%',
    'control_efficiency': '23.2%',
    'treatment_efficiency': '95%'}]},
 '1': {'control_pce': '20.95%',
  'control_voc': '1.11',
  'treated_pce': '23.32%',
  'treated_voc': '1.18',
  'passivating_molecule': 'phenethylammonium iodide (PEAI)',
  'perovskite_composition': 'FA1-xMAxPbI3',
  'electron_transport_layer': 'SnO2',
  'hole_transport_layer': 'spiro-OMeTAD',
 

In [16]:
#The actual label
clean_label_data['0']

{'ISOSL3': {'control_pce': '24',
  'efficiency_tret': '95%',
  'time': '1200',
  'treated_pce': '26.9',
  'treated_voc': '1.18'},
 'structure_pin_nip': 'PIN',
 'passivating_molecule': '4Cl-BZS',
 'perovskite_composition': 'Cs 0.05 FA 0.85 MA 0.1 PbI 3',
 'hole_transport_layer': '2PACz and Me-4PACz',
 'electron_transport_layer': 'C60'}

#### Evaluation in parts
- numerical data 
- text data (molecule)
- stability
    - Change how to parse xml
    - Change the output of the model as ID but no specification on number


In [17]:
## We need precision and recall for EACH variable
## For each variable, calculate the F1 score - There is F1 score for each variable
## Take a weighted average ***For now, just take the average. 

Variables (number)
- control_pce
- treatment_pce
- control_voc
- treatment_voc

Variable (text)
- structure_pin_nip
- passivating_molecule
- perovskite_composition
- hole_transport_layer
- electron_transport_layer

Stability (later)

In [18]:
def check_float(value):
    for char in value:
        if (char == ".") | (char.isdigit()):
            continue
        else:
            return False
    return True


In [19]:
def numerical_comparison(id, label_annotation, extraction_annotation, numerical_tolerance = 0.1):
    ##We are given an id to make numerical annotation
    #The argument is id, and 2 json annotation from label and extraction

    if id not in extraction_annotation:
        # print(f"FN_{id}")
        return f"FN"
    else:
        label_data = label_annotation[id]
        ##This code is because of imperfect annotation on teamtat, Delete this once annotation is perfect
        #When label_data highlights the unit
        if check_float(label_data):
            label_data = float(label_data)
        else:
            label_data = float(label_data[:-1].strip())

        extract_data = str(extraction_annotation[id])
        if check_float(extract_data):
            extract_data = float(extract_data)
        else:
            extract_data = float(extract_data[:-1].strip())

        if abs(label_data - extract_data) <= numerical_tolerance * abs(label_data):
            # print(f"TP_{id}")
            return f"TP"
        else:
            # print(f"FP_{id}")
            return f"FP"

In [20]:
def text_comparison(id, label_annotation, extraction_annotation, text_similarity_threshold = 0.8):

    if id == "structure_pin_nip":
        if "pin_nip_structure" not in extraction_annotation:
            return f"FN"
        else:
            label_data = label_annotation[id]
            extract_data = extraction_annotation["pin_nip_structure"]

            similarity = SequenceMatcher(None, label_data.lower(), extract_data.lower()).ratio()
            if similarity > text_similarity_threshold:  # Threshold for similarity
                # print(f"TP_{id}")
                return f"TP"
            else:
                # print(f"FP_{id}")
                return f"FP"
    else:
        if id not in extraction_annotation:
            # print(f"FN_{id}")
            return f"FN"
        else:
            label_data = label_annotation[id]
            extract_data = extraction_annotation[id]

            similarity = SequenceMatcher(None, label_data.lower(), extract_data.lower()).ratio()
            if similarity > text_similarity_threshold:  # Threshold for similarity
                # print(f"TP_{id}")
                return f"TP"
            else:
                # print(f"FP_{id}")
                return f"FP"
        


In [29]:
def stability_comparison(id, label_annotation, extraction_annotation):
    ## Return FN if id is not in extraction_annotation
    if len(extraction_annotation['stability_tests']) == 0:
        return f"FN"
    for dictionary in extraction_annotation['stability_tests']:
        test_name = dictionary['test_name']
        if id[4] == test_name[-1]:
            return f"TP"
    return f"FN"

In [30]:
def compare_json(labeled_data, extracted_data):
    ## List of variable for TP, FP, FN
    '''Just as summary
    TP: Corrected value extracted by LLM
    FN: LLM didn't extract this variable
    FP: LLM DID extract the value, but it was wrong value
    '''
    numerical_variable = ["control_pce", "treated_pce", "control_voc", "treated_voc"]
    #First numerical comparison!!
    numerical_dict = {
    "control_pce" : {"TP": 0, "FP":0, "FN":0},
    "treated_pce" : {"TP": 0, "FP":0, "FN":0},
    "control_voc" : {"TP": 0, "FP":0, "FN":0},
    "treated_voc" : {"TP": 0, "FP":0, "FN":0}}


    text_variable = ["structure_pin_nip", "passivating_molecule", "perovskite_composition", "electron_transport_layer", "hole_transport_layer"]
    text_dict = {
    "structure_pin_nip" : {"TP": 0, "FP":0, "FN":0},
    "passivating_molecule" : {"TP": 0, "FP":0, "FN":0},
    "perovskite_composition" : {"TP": 0, "FP":0, "FN":0},
    "electron_transport_layer" : {"TP": 0, "FP":0, "FN":0},
    "hole_transport_layer" : {"TP": 0, "FP":0, "FN":0}}

    stability_dict = {
    "ISOS-D" : {"TP": 0, "FP":0, "FN":0},
    "ISOS-L" : {"TP": 0, "FP":0, "FN":0},
    "ISOS-T" : {"TP": 0, "FP":0, "FN":0}}
    # "ISOS-LC" : {"TP": 0, "FP":0, "FN":0},
    # "ISOS-LT" : {"TP": 0, "FP":0, "FN":0}}

    for key, label_value in labeled_data.items():
        if key in extracted_data:
            ##Make sure that there is extracted data associated with the anotation
            extracted_value = extracted_data[key]
            for id in label_value.keys():
                # print(id)
                ##Check if id is the numerical value
                if id in numerical_variable:
                    # print(id, label_value[id])
                    ##This is handling incorrect annotation, annotation that is not just numbers
                        # Came accross id of treated_voc: methylammonium sulfate
                        # Came accross control_pce ['4.9%', '20.5']
                    if type(label_value[id]) != str:
                        # print("value not string")
                        continue
                    if not label_value[id].replace(".", '').isdigit():
                        # print("value including units")
                        continue
                    ##Perform numerical comparison
                    result = numerical_comparison(id, label_value, extracted_value)
                    
                    # print(result)
                    numerical_dict[id][result] += 1

                # ##Check if id is text value
                elif id in text_variable:
                    if type(label_value[id]) != str:
                        # print("value not string", label_value[id])
                        continue

                    result = text_comparison(id, label_value, extracted_value)
                    # print(result)
                    text_dict[id][result] += 1

                #SUBJECT TO CHANGE AFTER Q1 
                ##Check if id is stability
                if "ISOS" in id:
                    result = stability_comparison(id, label_value, extracted_value)
                    if id[4] == "D":
                        stability_dict["ISOS-D"][result] += 1
                    elif id[4] == "L":
                        stability_dict["ISOS-L"][result] += 1
                    if id[4] == "T":
                        stability_dict["ISOS-T"][result] += 1

                    
                    
        else:
            print(f"Extraction were not performed. Paper num: {key}")

    # print("Numerical comparison dictionary",numerical_dict)
    # print("Text comparison dictionary",text_dict)

    combined_dict = numerical_dict.copy()
    combined_dict.update(text_dict)
    combined_dict.update(stability_dict)
    print("Performance for each variable in dictionary",combined_dict)
    
    # Perform recall, precision, f1 score computation
    variable_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    for variable, performance in combined_dict.items():
        precision = performance["TP"] / (performance["TP"] + performance["FP"])
        recall = performance["TP"] / (performance["TP"] + performance["FN"])
        f1 = (2 * precision * recall) / (precision + recall)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
        variable_list.append(variable)

    return variable_list, precision_list, recall_list, f1_list





In [31]:
variables, precisions, recalls, f1s = compare_json(clean_label_data, subset_llama)

Performance for each variable in dictionary {'control_pce': {'TP': 3, 'FP': 2, 'FN': 1}, 'treated_pce': {'TP': 8, 'FP': 1, 'FN': 0}, 'control_voc': {'TP': 5, 'FP': 0, 'FN': 3}, 'treated_voc': {'TP': 10, 'FP': 1, 'FN': 1}, 'structure_pin_nip': {'TP': 7, 'FP': 17, 'FN': 1}, 'passivating_molecule': {'TP': 6, 'FP': 13, 'FN': 1}, 'perovskite_composition': {'TP': 17, 'FP': 16, 'FN': 2}, 'electron_transport_layer': {'TP': 6, 'FP': 16, 'FN': 5}, 'hole_transport_layer': {'TP': 18, 'FP': 14, 'FN': 7}, 'ISOS-D': {'TP': 5, 'FP': 0, 'FN': 14}, 'ISOS-L': {'TP': 2, 'FP': 0, 'FN': 22}, 'ISOS-T': {'TP': 4, 'FP': 0, 'FN': 0}}


## Calculate Macro f1 score

In [32]:
def macro_f1(f1_list, weight = None):
    if weight == None:
        #If no weight given, do unweighted average of f1 score
        return sum(f1_list) / len(f1_list)
    total_f1 = 0
    for i in range(len(f1_list)):
        total_f1 += (f1_list[i] * weight[i])
    return total_f1 / sum(weight)
    


In [33]:
## The macro f1 score unweighted
macro_f1(f1s)

0.6170647968732333

### This concludes the pipeline of evaluating extraction quality

# Todo:
- stability evaluation
    - Need to reiterate on the teamtat annotation

- Putting weights on F1 Score
- Using Bert!!

## Stability annotation brainstorm

- We will have as at most 5 realation (if there is an overlap, choose the stability test that offers more info)
    - Exp there is ISOS L1 and ISOS L2. If there is more iformation in IsosL2, use that as a test and compare on ISOSL that llama extracted (if any)
- We will iterate through different test recorded on annotation and compared it with suitable stability extracted on Llama