# This notebook compares the Teamtat annotations with the LLM extraction output (JSON)

In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np
import json
import os
import xml.etree.ElementTree as ET 
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity

## File Preparation

In [2]:
def str_toJson(string):
    """Parse a JSON-like string from the annotation dataframe into a Python object.

    The annotation export is not always strict JSON: missing values may be
    written as the Python literal ``None`` instead of ``null``. The string is
    first parsed as-is, so JSON that is already valid is never altered; only
    when that fails is ``None`` rewritten to ``null`` and the parse retried.

    Args:
        string: The JSON text to parse. Anything that is not a ``str``
            (for example the NaN pandas uses for an empty CSV cell) is
            reported and yields ``None``.

    Returns:
        The decoded object, or ``None`` when the input cannot be decoded.
    """
    if not isinstance(string, str):
        print(f"An error occurred: expected a JSON string, got {type(string).__name__}")
        return None

    try:
        # Strict parse first: valid JSON must not be touched by the None -> null rewrite.
        return json.loads(string)
    except json.JSONDecodeError:
        pass
    except Exception as e:
        # Catch any other exceptions
        print(f"An error occurred: {e}")
        return None

    try:
        # Fallback for the annotation export, which writes None instead of null.
        return json.loads(string.replace("None", "null"))
    except json.JSONDecodeError as e:
        # The string is not valid JSON even after the rewrite
        print(f"Error decoding JSON: {e}")
        return None
    except Exception as e:
        # Catch any other exceptions
        print(f"An error occurred: {e}")
        return None

In [3]:
def include_passivating(dictionary):
    """Move a top-level ``passivating_molecule`` into every stability test.

    Some extraction records carry the passivating molecule at the top level
    instead of inside the nested ``test*`` dictionaries. The top-level entry is
    removed and copied into each nested test that does not already have one.
    The record is modified in place and also returned.
    """
    if "passivating_molecule" not in dictionary:
        return dictionary

    molecule = dictionary.pop('passivating_molecule')

    for name, test in dictionary.items():
        # Only nested test dictionaries receive the value; existing entries win.
        if name.startswith('test') and type(test) == dict:
            test.setdefault('passivating_molecule', molecule)

    return dictionary

In [4]:
## Convert all numerical data into float for both

#### WORK ON NUMERICAL DATA THAT INCLUDES RANGE "-" 
def _first_number(text):
    """Return the first unsigned decimal number found in ``text`` as a float, or None if there is none."""
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
    return float(match.group()) if match else None


def convert_numeric(dictionary):
    """Convert the string-valued numeric fields of every stability test to float.

    Only top-level keys starting with ``'test'`` whose value is a plain dict
    are visited. Within each test:

    * fields in ``numerical_key`` that hold a string are replaced by the first
      number found in that string, or ``None`` when it contains no number;
    * ``humidity`` / ``temperature`` strings are converted the same way unless
      they contain ``"-"``; those are left untouched as strings (ranges such
      as ``"40-60"`` or ``"25+-5"``).

    Values that are not strings are never modified, so calling the function
    repeatedly on the same record is harmless.

    The whole string is scanned for its first numeric token. The previous
    approach of keeping only the digits of the first four characters silently
    truncated values such as ``'21.06'`` (-> 21.0) or ``'10000'`` (-> 1000.0)
    and raised on text like ``'ca. 5'``.

    Args:
        dictionary: One annotation/extraction record; modified in place.

    Returns:
        The same ``dictionary`` object.
    """
    numerical_key = ['time', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    exception_numeric = ['humidity', 'temperature']

    for key, test in dictionary.items():
        if not (key.startswith('test') and type(test) == dict):
            continue
        for entity, value in test.items():
            if not isinstance(value, str):
                continue
            if entity in numerical_key:
                test[entity] = _first_number(value)
            elif entity in exception_numeric and "-" not in value:
                # Strings containing "-" are kept as-is (ranges are handled by the comparison step).
                test[entity] = _first_number(value)

    return dictionary

In [5]:
def convert_efficiency(dictionary):
    """Rescale percentage efficiencies in every stability test to fractions.

    For each top-level ``test*`` dictionary, ``efficiency_cont`` and
    ``efficiency_tret`` are divided by 100 when they hold a number greater
    than 1. Values that are missing, ``None``, already <= 1, or not numeric
    (e.g. an unconverted string) are left untouched instead of raising.

    Args:
        dictionary: One annotation/extraction record; modified in place.

    Returns:
        The same ``dictionary`` object.
    """
    entity_decimal = ['efficiency_cont', 'efficiency_tret']
    for key, test in dictionary.items():
        if not (key.startswith('test') and type(test) == dict):
            continue
        for entity in entity_decimal:
            value = test.get(entity)
            is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
            # Anything above 1 is assumed to be a percentage rather than a fraction.
            if is_number and value > 1:
                test[entity] = value / 100
    return dictionary


    

#### Analyzing these outputs

Annotation notes:
- The four basic variables to compare are perovskite composition, ETL, HTL, and structure.
- Stability entity: `efficiency_control` is wrong; all of its values are None, so ignore it.
- Common entities: ['stability_type', 'passivating_molecule', 'humidity', 'temperature', 'time', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc', 'efficiency_tret']
    - 'efficiency_cont' may or may not be included.

- The efficiency in the extracted data needs to be converted to a decimal, since it is SOMETIMES given as a percentage.
    - 'control_efficiency' and 'treatment_efficiency' can be ignored.


Extraction notes:
- Some extractions have a passivating molecule that is NOT included in their stability tests.

### Loading Teamtat Annotation as dataframe

In [6]:
# Preview the raw Teamtat annotation export before selecting columns
# (displayed columns: first_num, id, text, memory, output, second_num).
pd.read_csv("../data/150_papers_json_update.csv")

Unnamed: 0,first_num,id,text,memory,output,second_num
0,0,0_54,\t\t\t of 5 Downloaded from https://www.scienc...,"{""perovskite_composition"": ""Cs0.05FA0.85MA0.1P...","{""perovskite_composition"": ""Cs0.05FA0.85MA0.1P...",54
1,1,1_22,\t\t\t NAture PhotoNiCS | VOL 13 | JULY 2019 |...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra...",22
2,2,2_75,\t\t\t Nature eNerGY | VOL 6 | JANUARY 2021 | ...,"{""perovskite_composition"": ""(FAPbI3)0.95(MAPbB...","{""perovskite_composition"": ""(FAPbI3)0.95(MAPbB...",75
3,3,3_52,\t\t\t of 6 RESEARCH | REPORT Downloaded from ...,"{""perovskite_composition"": ""Cs0.05(MA0.10FA0.8...","{""perovskite_composition"": ""Cs0.05(MA0.10FA0.8...",52
4,4,4_26,"Proppe 1,2,10 , Andrew Johnston 2,10 , Sam T...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": ""(MAPbBr3)0.05(FAPb...",26
...,...,...,...,...,...,...
143,145,145_31,\t\t\t www.advancedsciencenews.com© 2018 WILEY...,"{""perovskite_composition"": ""Cs0.05(MA0.17FA0.8...","{""perovskite_composition"": ""Cs0.05(MA0.17FA0.8...",31
144,146,146_36,\t\t\t https://doi.org/10.1038/s41467-022-3420...,"{""perovskite_composition"": ""Cs0.05(MA0.05FA0.9...","{""perovskite_composition"": ""Cs0.05(MA0.05FA0.9...",36
145,147,147_41,Table 1 | PV parameters of the best-performed...,"{""perovskite_composition"": ""FAPbI 3"", ""electro...","{""perovskite_composition"": ""FAPbI 3"", ""electro...",41
146,148,148_26,"\t\t\t 15214095, 2020, 12, Downloaded from htt...","{""perovskite_composition"": ""Cs0.05FA0.85MA0.10...","{""perovskite_composition"": ""Cs0.05FA0.85MA0.10...",26


In [7]:
# Teamtat annotation: keep the id, the paper number and the annotated JSON, ordered by paper number
annotation_df = (
    pd.read_csv("../data/150_papers_json_update.csv")
    .loc[:, ["id", "first_num", "output"]]
    .sort_values(by=['first_num'])
)

In [8]:
## Parse the 'output' column from JSON-like text into Python dicts via str_toJson
## (rows that cannot be decoded become None).
annotation_df['output'] = annotation_df['output'].apply(str_toJson)
annotation_df

Unnamed: 0,id,first_num,output
0,0_54,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...
1,1_22,1,"{'perovskite_composition': None, 'electron_tra..."
2,2_75,2,{'perovskite_composition': '(FAPbI3)0.95(MAPbB...
3,3_52,3,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...
4,4_26,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...,...
143,145_31,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
144,146_36,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
145,147_41,147,"{'perovskite_composition': 'FAPbI 3', 'electro..."
146,148_26,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...


In [9]:
# Inspect one parsed annotation. The values are still strings here: numeric conversion happens in the next cell.
annotation_df['output'][7]

{'perovskite_composition': 'Cs0.05(MA)0.16(FA)0.79Pb(I0.83Br0.17 )3',
 'electron_transport_layer': 'C60',
 'hole_transport_layer': None,
 'structure_pin_nip': 'NIP',
 'test_1': {'stability_type': 'ISOSD1',
  'passivating_molecule': '4-tert-butyl-benzylammonium',
  'humidity': '85',
  'temperature': None,
  'time': '100',
  'control_pce': None,
  'treated_pce': None,
  'control_voc': '1.09',
  'treated_voc': '1.13',
  'efficiency_cont': '60',
  'efficiency_tret': '90'},
 'test_2_2': {'stability_type': 'ISOSL3',
  'passivating_molecule': 'phenylethylammonium',
  'humidity': '50',
  'temperature': '90',
  'time': '100',
  'control_pce': '8.9',
  'treated_pce': '5.94',
  'control_voc': None,
  'treated_voc': None,
  'efficiency_cont': '0',
  'efficiency_tret': '95'},
 'test_2': {'stability_type': 'ISOSL1',
  'passivating_molecule': '(ethylenedioxy)bis(ethylammonium) lead iodide',
  'humidity': '65',
  'temperature': None,
  'time': '1000',
  'control_pce': None,
  'treated_pce': '21.06',
 

In [10]:
# Cast the numeric fields of every annotated test to float
# (convert_numeric edits each record in place and returns it).
annotation_df['output'] = annotation_df['output'].apply(convert_numeric)

In [None]:
## Exporting annotation
# annotation_df.to_csv('annotation.csv', index=False)

### Loading in JSON extraction

In [13]:
## extraction performed by Llama (Daniel)
# Load the extraction JSON: one entry per paper number
with open("../data/output1.json", 'r') as f:
    extraction = json.load(f)

# One row per paper, with the paper number cast to a numeric dtype and used as sort key
extraction_df = (
    pd.DataFrame(list(extraction.items()), columns=['paper_num', 'output'])
    .assign(paper_num=lambda frame: pd.to_numeric(frame['paper_num']))
    .sort_values('paper_num')
)
extraction_df


Unnamed: 0,paper_num,output
77,0,"{'perovskite_composition': 'FAPbI3', 'electron..."
124,1,"{'perovskite_composition': 'FA1-x MAx PbI3', '..."
7,2,"{'perovskite_composition': '(BA)2PbI4', 'elect..."
34,3,{'perovskite_composition': 'Cs0.05 (MA0.10FA0....
30,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...
40,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
87,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
120,147,"{'perovskite_composition': 'FAPbI3', 'electron..."
83,148,{'perovskite_composition': 'Cs 0.05 FA 0.85 MA...


In [14]:
# Push a top-level passivating molecule down into each extracted stability test
# (include_passivating edits each record in place and returns it).
extraction_df['output'] = extraction_df['output'].apply(include_passivating)
extraction_df

Unnamed: 0,paper_num,output
77,0,"{'perovskite_composition': 'FAPbI3', 'electron..."
124,1,"{'perovskite_composition': 'FA1-x MAx PbI3', '..."
7,2,"{'perovskite_composition': '(BA)2PbI4', 'elect..."
34,3,{'perovskite_composition': 'Cs0.05 (MA0.10FA0....
30,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...
40,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
87,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
120,147,"{'perovskite_composition': 'FAPbI3', 'electron..."
83,148,{'perovskite_composition': 'Cs 0.05 FA 0.85 MA...


In [15]:
# Cast the numeric fields of the extracted tests to float.
extraction_df['output'] = extraction_df['output'].apply(convert_numeric)
# NOTE(review): annotation_df already went through convert_numeric in cell In [10]; this second pass
# only revisits values that are still strings.
annotation_df['output'] = annotation_df['output'].apply(convert_numeric)


In [16]:
# Rescale extracted efficiencies that look like percentages (> 1) to fractions.
# NOTE(review): only the extraction side is rescaled here, while the annotated efficiencies printed further
# down are still on the percentage scale (e.g. 95.0) - confirm both sides share a scale before comparing.
extraction_df['output'] = extraction_df['output'].apply(convert_efficiency)

## Merging dataframe

In [17]:
# Join annotation and extraction on the paper number (default inner join: papers found on only one side are dropped)
evaluate_df = (
    annotation_df
    .merge(extraction_df, left_on='first_num', right_on='paper_num')
    .loc[:, ["paper_num", "output_x", 'output_y']]
)
# output_x comes from the annotation frame, output_y from the extraction frame
evaluate_df.columns = ['paper_num', 'annotation', 'extracted']
evaluate_df

Unnamed: 0,paper_num,annotation,extracted
0,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...,"{'perovskite_composition': 'FAPbI3', 'electron..."
1,1,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': 'FA1-x MAx PbI3', '..."
2,2,{'perovskite_composition': '(FAPbI3)0.95(MAPbB...,"{'perovskite_composition': '(BA)2PbI4', 'elect..."
3,3,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...,{'perovskite_composition': 'Cs0.05 (MA0.10FA0....
4,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...,...
121,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
122,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
123,147,"{'perovskite_composition': 'FAPbI 3', 'electro...","{'perovskite_composition': 'FAPbI3', 'electron..."
124,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...,{'perovskite_composition': 'Cs 0.05 FA 0.85 MA...


In [18]:
# evaluate_df.to_csv('merged.csv', index=False)

In [19]:
# Eyeball every annotation/extraction pair side by side.
# NOTE(review): this prints all merged rows; consider limiting it to a few rows once the pairing looks right.
for row in evaluate_df.itertuples():
    label_value = row.annotation
    extracted_value = row.extracted

    print(label_value)
    print(extracted_value)

{'perovskite_composition': 'Cs0.05FA0.85MA0.1PbI3', 'electron_transport_layer': 'C60', 'hole_transport_layer': '2PACz', 'structure_pin_nip': 'PIN', 'test_1': {'stability_type': 'ISOSL3', 'passivating_molecule': '4-chlorobenzenesulfonate', 'humidity': 50.0, 'temperature': 65.0, 'time': 1200.0, 'control_pce': 24.0, 'treated_pce': 26.9, 'control_voc': None, 'treated_voc': 1.18, 'efficiency_cont': None, 'efficiency_tret': 95.0}, 'test_1_2': {'stability_type': 'ISOSL3', 'passivating_molecule': '4-chlorobenzenesulfonate', 'humidity': 50.0, 'temperature': 85.0, 'time': 540.0, 'control_pce': 24.0, 'treated_pce': 26.9, 'control_voc': None, 'treated_voc': None, 'efficiency_cont': None, 'efficiency_tret': 87.0}, 'test_2': {'stability_type': 'ISOSD2', 'passivating_molecule': '4-chlorobenzenesulfonate', 'humidity': None, 'temperature': 85.0, 'time': 1500.0, 'control_pce': 24.0, 'treated_pce': 26.9, 'control_voc': None, 'treated_voc': None, 'efficiency_cont': None, 'efficiency_tret': 95.0}}
{'perovs

In [20]:
# Inspect a single merged annotation record (numeric fields are floats at this point).
evaluate_df["annotation"][6]

{'perovskite_composition': '(FAPbI3)0.94(MAPbBr3)0.06',
 'electron_transport_layer': 'Tin Oxide',
 'hole_transport_layer': 'Spiro-OMeTAD',
 'structure_pin_nip': 'NIP',
 'test_3': {'stability_type': None,
  'passivating_molecule': 'pyrene based methylammonium iodide',
  'humidity': None,
  'temperature': None,
  'time': None,
  'control_pce': 20.0,
  'treated_pce': 20.9,
  'control_voc': 1.16,
  'treated_voc': 1.16,
  'efficiency_cont': None,
  'efficiency_tret': None},
 'test_2': {'stability_type': None,
  'passivating_molecule': 'pyrene based ammonium iodide',
  'humidity': None,
  'temperature': None,
  'time': None,
  'control_pce': 20.0,
  'treated_pce': 19.5,
  'control_voc': 1.16,
  'treated_voc': 1.06,
  'efficiency_cont': None,
  'efficiency_tret': None},
 'test_1': {'stability_type': 'ISOSLT',
  'passivating_molecule': 'pyrene based ethylammonium iodide',
  'humidity': None,
  'temperature': 40.0,
  'time': 2000.0,
  'control_pce': 19.3,
  'treated_pce': 22.4,
  'control_voc':

In [21]:
# Inspect a single merged extraction record to compare its keys and scales with the annotation above.
evaluate_df["extracted"][1]

{'perovskite_composition': 'FA1-x MAx PbI3',
 'electron_transport_layer': 'SnO2',
 'structure_pin_nip': 'NIP',
 'hole_transport_layer': 'spiro-OMeTAD',
 'test_1': {'stability_type': None,
  'temperature': 85,
  'time': 500,
  'humidity': None,
  'efficiency_cont': 0.191,
  'efficiency_tret': 0.2332,
  'passivating_molecule': 'phenethylammonium iodide (PEAI)',
  'control_pce': None,
  'control_voc': None,
  'treated_pce': 23.32,
  'treated_voc': 1.18},
 'test_2': {'stability_type': None,
  'temperature': None,
  'time': None,
  'humidity': None,
  'efficiency_cont': None,
  'efficiency_tret': None,
  'passivating_molecule': 'phenethylammonium iodide (PEAI)',
  'control_pce': None,
  'control_voc': None,
  'treated_pce': 23.32,
  'treated_voc': 1.18}}

## Evaluation

- We need precision and recall for EACH variable.
- For each variable, calculate the F1 score, so there is one F1 score per variable.
- Take a weighted average of the per-variable scores (for now, just take the plain average).

In [100]:
def tests_comparison(stability_annotated, label_dict, stability_extracted, extract_dict):
    """Score how closely one annotated stability test matches one extracted test.

    The two text entities (index 0 and 1 of the entity lists) are compared with
    a case-sensitive SequenceMatcher ratio; all remaining entities are gathered
    into two numeric vectors that are compared with cosine similarity.

    Side effects (both dictionaries are modified in place):
        * ``extract_dict`` gains any missing key from ``stability_entity_extracted``
          (``None`` for the two text entities, ``0`` for the numeric ones), and
          numeric entries that are ``None`` are overwritten with ``0``.
        * ``label_dict`` gains any missing numeric key with value ``0`` and
          numeric entries that are ``None`` are overwritten with ``0``.

    Args:
        stability_annotated: Not read by the body (presumably the annotated
            test's name - TODO confirm with the caller).
        label_dict: Annotated test record. The two text keys are read with
            ``[]``, so a record lacking them raises ``KeyError``.
        stability_extracted: Not read by the body (presumably the extracted
            test's name - TODO confirm with the caller).
        extract_dict: Extracted test record.

    Returns:
        A list of three entries:
        ``[stability_type match, passivating_molecule ratio, cosine similarity]``.
        The first is 1/0 (ratio above 0.9 or not) and the second the raw ratio;
        either is ``None`` when one side of that text entity is ``None``.
    """
    # The two lists are index-aligned: entry i of one is compared with entry i of the other.
    stability_entity_annotated = ['stability_type', 'passivating_molecule', 'temperature', 'time', 'humidity', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    # NOTE(review): the extracted side is looked up under 'test_name', 'control_efficiency' and
    # 'treatment_efficiency'; confirm these names match the keys actually present in the extraction records.
    stability_entity_extracted = ['test_name', 'passivating_molecule', 'temperature', 'time', 'humidity', 'control_efficiency', 'treatment_efficiency', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    
    compared_metric = []
    numeric_data_annotated = []
    numeric_data_extracted = []
    for entity_i in range(len(stability_entity_annotated)):
        if entity_i <= 1:
            # Text entities (stability type and passivating molecule).
            if stability_entity_extracted[entity_i] not in extract_dict.keys():
                extract_dict[stability_entity_extracted[entity_i]] = None

            # No comparison is possible when either side is missing.
            if (label_dict[stability_entity_annotated[entity_i]] == None) | (extract_dict[stability_entity_extracted[entity_i]] == None):
                compared_metric.append(None)
            else:
                # Text entity: compare with SequenceMatcher (case-sensitive, no normalisation)
                compared = SequenceMatcher(None, label_dict[stability_entity_annotated[entity_i]], extract_dict[stability_entity_extracted[entity_i]]).ratio()
                if entity_i == 0:
                    # The stability type is scored as a binary match (ratio above 0.9).
                    if compared > 0.9:
                        compared_metric.append(1)
                    else:
                        compared_metric.append(0)
                else:
                    # The passivating molecule keeps the raw similarity ratio.
                    compared_metric.append(compared)
        else:
            # Numeric entities: missing or None values become 0 in BOTH dictionaries (in place).
            if stability_entity_extracted[entity_i] not in extract_dict.keys():
                extract_dict[stability_entity_extracted[entity_i]] = 0
            elif extract_dict[stability_entity_extracted[entity_i]] == None:
                extract_dict[stability_entity_extracted[entity_i]] = 0

            if stability_entity_annotated[entity_i] not in label_dict.keys():
                label_dict[stability_entity_annotated[entity_i]] = 0
            elif label_dict[stability_entity_annotated[entity_i]] == None:
                label_dict[stability_entity_annotated[entity_i]] = 0

                
            numeric_data_annotated.append(label_dict[stability_entity_annotated[entity_i]])
            numeric_data_extracted.append(extract_dict[stability_entity_extracted[entity_i]])

    # Position 0 of the numeric vectors is 'temperature' (the first non-text entity).
    if isinstance(numeric_data_extracted[0], list):
        ## There was one record with two temperatures recorded as a list (probably thermal cycling);
        ## the second element is used.
        numeric_data_extracted[0] = numeric_data_extracted[0][1]

    numeric_annotated_clean = []
    numeric_extracted_clean = []
    ## Drop every position where either side is still a string (e.g. an unconverted range)
    for i in range(len(numeric_data_annotated)):
        if (type(numeric_data_annotated[i]) == str) | (type(numeric_data_extracted[i]) == str):
            continue
        else:
            numeric_annotated_clean.append(numeric_data_annotated[i])
            numeric_extracted_clean.append(numeric_data_extracted[i])

    # cosine_similarity takes 2-D input, hence the single-row wrapping; [0][0] is the scalar result.
    cos_sim = cosine_similarity([numeric_annotated_clean], [numeric_extracted_clean])
    compared_metric.append(cos_sim[0][0])
    
    return compared_metric    

In [107]:
def _text_verdict(label_text, extracted_text, threshold):
    """Return 'TP' if the case-insensitive similarity ratio exceeds ``threshold``, else 'FP'."""
    similarity = SequenceMatcher(None, label_text.lower(), extracted_text.lower()).ratio()
    return "TP" if similarity > threshold else "FP"


def _read_extracted(extracted_dict, entity):
    """Fetch ``extracted_dict[entity]``, treating a missing key, ``None`` or an empty list as 0."""
    value = extracted_dict.get(entity)
    if value is None:
        if entity in extracted_dict:
            # The original implementation normalised None to 0 in place; kept because callers are not visible here.
            extracted_dict[entity] = 0
        return 0
    if isinstance(value, list) and not value:
        return 0
    return value


def _unwrap_list(extracted_dict, entity, value):
    """Collapse a list-valued extraction to one element (the second when available) and write it back."""
    if isinstance(value, list):
        # One record stored two temperatures as a list (probably thermal cycling); the second one is used.
        value = value[1] if len(value) > 1 else value[0]
        extracted_dict[entity] = value
    return value


def _parse_bounds(label_text):
    """Parse 'value+-margin' or 'lower-upper' into ``(lower, upper)``; return None if malformed."""
    try:
        if "+" in label_text:
            centre_text, margin_text = label_text.split("+-")[:2]
            centre, margin = float(centre_text), float(margin_text)
            return centre - margin, centre + margin
        lower_text, upper_text = label_text.split("-")[:2]
        return float(lower_text), float(upper_text)
    except ValueError:
        # Raised both by float() on non-numeric text and by unpacking fewer than two parts.
        return None


def _numeric_verdict(entity, label_value, extracted_dict, tolerance):
    """Classify a numeric label against the extracted value using a relative tolerance."""
    extracted = _read_extracted(extracted_dict, entity)
    if label_value == 0:
        # Nothing annotated (0 stands for "missing"): counted as TN whatever was extracted.
        return "TN"
    if extracted == 0:
        return "FN"

    extracted = _unwrap_list(extracted_dict, entity, extracted)
    if not isinstance(label_value, (int, float)) or not isinstance(extracted, (int, float)):
        # The two sides cannot be compared numerically: count it as an incorrect extraction.
        return "FP"

    relative_error = abs(label_value - extracted) / abs(label_value)
    return "TP" if relative_error <= tolerance else "FP"


def _range_verdict(entity, label_value, extracted_dict, threshold):
    """Classify an extraction against a string label (free text, 'lower-upper' or 'value+-margin')."""
    extracted = _read_extracted(extracted_dict, entity)
    if extracted == 0:
        return "FN"

    extracted = _unwrap_list(extracted_dict, entity, extracted)
    if not isinstance(label_value, str):
        return "FP"
    if isinstance(extracted, str):
        # Label is str, extraction is str, so perform text similarity
        return _text_verdict(label_value, extracted, threshold)
    if not isinstance(extracted, (int, float)):
        return "FP"

    bounds = _parse_bounds(label_value)
    if bounds is None:
        return "FP"
    lower, upper = bounds
    return "TP" if lower <= extracted <= upper else "FP"


def entity_comparison(entity, label, extracted_dict, text_similarity_threshold = 0.75, numerical_tolerance = 0.027):
    '''
    Classify one stability-test entity of an extraction against its annotation.

    Returns one of "TP", "FP", "FN", "TN":
      - "FN": the annotation has a value but the extraction has none
              (missing key, None or 0).
      - "TN": the annotation has no value (None for text, None/0 for numbers).
      - "TP"/"FP": both sides have a value and it matches / does not match.
      NOTE(review): for the text entities, "both sides None" is reported as "FP"
      (unchanged from the previous behaviour) - confirm this is intended.

    Matching rules:
      - 'stability_type' and 'passivating_molecule' use a case-insensitive
        SequenceMatcher ratio above text_similarity_threshold. The extracted
        stability type is read from the key "test_name".
        NOTE(review): confirm "test_name" is the key the extraction really uses.
      - numeric labels use a relative error <= numerical_tolerance.
      - string labels for the remaining entities (temperature, humidity) are
        read as a range: "lower-upper" or "value+-margin", i.e.
        [value - margin, value + margin].

    The tolerance of 2.7% was what was reasonable looking at the relative difference
    treated_voc 1.18, 1.149, relative difference 0.026271186440677895

    The text similarity was set to 75% due to the structure example
    FP, NIP, n-i-p, 0.75
    This should be positive

    Side effect: as before, an extracted value of None is overwritten with 0 and
    a list-valued extraction is replaced by a single element in extracted_dict.
    '''
    text_entity = ['stability_type', 'passivating_molecule']
    numerical_entity = ['time', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']

    if entity in text_entity:
        key_to_check = "test_name" if entity == "stability_type" else entity

        # .get() so that a key missing from the extraction is a False Negative rather than a KeyError
        label_data = label.get(entity)
        extract_data = extracted_dict.get(key_to_check)

        if label_data is not None and extract_data is None:
            return "FN"
        if label_data is None and extract_data is not None:
            return "TN"

        # Convert lists to strings if necessary
        if isinstance(label_data, list):
            label_data = " ".join(map(str, label_data))
        if isinstance(extract_data, list):
            extract_data = " ".join(map(str, extract_data))

        # If data is still not a string, return False Positive
        if not isinstance(label_data, str) or not isinstance(extract_data, str):
            return "FP"

        return _text_verdict(label_data, extract_data, text_similarity_threshold)

    label_value = label.get(entity)
    if label_value is None:
        # Same convention as for the extraction: a missing numeric value is represented by 0.
        label_value = 0

    if entity in numerical_entity or isinstance(label_value, (float, int)):
        return _numeric_verdict(entity, label_value, extracted_dict, numerical_tolerance)

    return _range_verdict(entity, label_value, extracted_dict, text_similarity_threshold)


In [108]:
def safe_division(numerator, denominator):
    """Divide ``numerator`` by ``denominator``, yielding 0 when the denominator is zero."""
    if denominator != 0:
        return numerator / denominator
    return 0

In [109]:
def text_comparison(id, label_annotation, extraction_annotation, text_similarity_threshold=0.8):
    """Compare one top-level text variable using string similarity matching.

    The four basic variables compared this way are perovskite composition,
    ETL, HTL and structure.

    Args:
        id: Key of the variable to compare (e.g. 'electron_transport_layer').
        label_annotation: Annotated record.
        extraction_annotation: Extracted record. A key that is absent is
            treated like a ``None`` value instead of raising ``KeyError``.
        text_similarity_threshold: The case-insensitive SequenceMatcher ratio
            must be strictly greater than this for a match.

    Returns:
        "FN" when the annotation has a value and the extraction has none,
        "TN" when the annotation has none and the extraction has a value,
        "TP" when both are text and similar enough, otherwise "FP".
        NOTE(review): "both sides None" falls through to "FP" (unchanged from
        the previous behaviour) - confirm this is intended.
    """
    label_data = label_annotation.get(id)
    extract_data = extraction_annotation.get(id)

    # A value missing from the extraction (absent key or None) is a False Negative
    if label_data is not None and extract_data is None:
        return "FN"
    if label_data is None and extract_data is not None:
        return "TN"

    # The annotation sometimes spells out the fullerene name; the comparison uses the formula.
    if id == 'electron_transport_layer' and label_data == "buckminsterfullerene":
        label_data = 'C60'

    # Convert lists to strings if necessary
    if isinstance(label_data, list):
        label_data = " ".join(map(str, label_data))
    if isinstance(extract_data, list):
        extract_data = " ".join(map(str, extract_data))

    # If data is still not a string, return False Positive
    if not isinstance(label_data, str) or not isinstance(extract_data, str):
        return "FP"

    # Compute similarity score
    similarity = SequenceMatcher(None, label_data.lower(), extract_data.lower()).ratio()
    return "TP" if similarity > text_similarity_threshold else "FP"


In [110]:
def compare_json(df):
    """
    Tally, per variable, how the extracted JSON agrees with the annotated JSON.

    Every row of ``df`` provides an ``annotation`` dict and an ``extracted``
    dict. Annotation entries whose key contains 'test' and whose value is a
    dict are handled as stability tests; every other entry is scored with
    text_comparison.

    Outcome labels:
        TP: Correct value extracted by the LLM.
        FN: The LLM did not extract this variable.
        FP: The LLM extracted a value, but it was incorrect.
        TN: The LLM hallucinated, i.e. returned a value where none was annotated.

    Stability tests are paired as follows: each annotated test is scored
    against every extracted test with tests_comparison, the extracted test
    with the highest mean score is selected, and each annotated entity is then
    compared with that test through entity_comparison. If the extraction
    contains no test at all, every stability variable receives one FN.

    Returns:
        (combined_dict, variable_list, precision_list, recall_list, f1_list),
        where combined_dict maps each variable to its outcome counts and the
        four lists are parallel.
    """
    outcomes = ("TP", "FP", "FN", "TN")

    text_variables = ['perovskite_composition', 'electron_transport_layer', 'hole_transport_layer', 'structure_pin_nip']

    stability_entity_annotated = ['stability_type', 'temperature', 'time', 'humidity', 'passivating_molecule', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    # Not read anywhere below; appears to list the extraction-side names for
    # the annotated entities above, position by position.
    stability_entity_extracted = ['test_name', 'temperature', 'time', 'humidity', 'passivating_molecule','control_efficiency', 'treatment_efficiency', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']

    # One zeroed counter per variable.
    text_dict = {name: dict.fromkeys(outcomes, 0) for name in text_variables}
    stability_dict = {name: dict.fromkeys(outcomes, 0) for name in stability_entity_annotated}

    for record in df.itertuples():
        annotated = record.annotation
        extracted = record.extracted

        for field, annotated_value in annotated.items():
            is_stability_test = ('test' in field) and isinstance(annotated_value, dict)

            if not is_stability_test:
                # Plain text variable: a single similarity-based outcome.
                outcome = text_comparison(field, annotated, extracted)
                text_dict[field][outcome] += 1
                continue

            # Score this annotated test against every extracted test,
            # treating a missing (None) score as 0.
            candidate_scores = {
                candidate_id: [
                    0 if score is None else score
                    for score in tests_comparison(field, annotated_value, candidate_id, candidate)
                ]
                for candidate_id, candidate in extracted.items()
                if ('test' in candidate_id) and isinstance(candidate, dict)
            }

            if not candidate_scores:
                # Nothing was extracted for stability: every stability
                # variable is counted as missed.
                for counts in stability_dict.values():
                    if 'FN' in counts:
                        counts['FN'] += 1
                continue

            # Select the extracted test with the highest mean score.
            mean_scores = {candidate_id: np.mean(scores) for candidate_id, scores in candidate_scores.items()}
            best_id = max(mean_scores, key=mean_scores.get)

            # Compare entity by entity against the selected test.
            for entity in annotated_value.keys():
                if entity == 'efficiency_control':
                    continue
                entity_result = entity_comparison(entity, annotated_value, extracted[best_id])
                stability_dict[entity][entity_result] += 1

    # Text variables first, stability variables after.
    combined_dict = {}
    combined_dict.update(text_dict)
    combined_dict.update(stability_dict)

    # Precision, recall and F1 per variable, kept in parallel lists.
    variable_list, precision_list, recall_list, f1_list = [], [], [], []
    for variable, counts in combined_dict.items():
        true_pos = counts["TP"]
        false_pos = counts["FP"]
        false_neg = counts["FN"]

        precision = safe_division(true_pos, true_pos + false_pos)
        recall = safe_division(true_pos, true_pos + false_neg)
        f1 = safe_division(2 * precision * recall, precision + recall)

        variable_list.append(variable)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    return combined_dict, variable_list, precision_list, recall_list, f1_list

In [111]:
# Evaluate every row of evaluate_df. compare_json returns the per-variable
# outcome tallies followed by four parallel lists: variable names, precisions,
# recalls and F1 scores.
dict_result, variables, precisions, recalls, f1s = compare_json(evaluate_df)

In [112]:
# Per-variable TP / FP / FN / TN tallies.
dict_result

{'perovskite_composition': {'TP': 59, 'FP': 35, 'FN': 1, 'TN': 31},
 'electron_transport_layer': {'TP': 9, 'FP': 62, 'FN': 4, 'TN': 51},
 'hole_transport_layer': {'TP': 32, 'FP': 45, 'FN': 6, 'TN': 43},
 'structure_pin_nip': {'TP': 37, 'FP': 45, 'FN': 12, 'TN': 32},
 'stability_type': {'TP': 0, 'FP': 44, 'FN': 147, 'TN': 0},
 'temperature': {'TP': 58, 'FP': 31, 'FN': 20, 'TN': 82},
 'time': {'TP': 76, 'FP': 51, 'FN': 31, 'TN': 33},
 'humidity': {'TP': 33, 'FP': 14, 'FN': 32, 'TN': 112},
 'passivating_molecule': {'TP': 15, 'FP': 63, 'FN': 16, 'TN': 97},
 'efficiency_cont': {'TP': 0, 'FP': 61, 'FN': 29, 'TN': 101},
 'efficiency_tret': {'TP': 1, 'FP': 106, 'FN': 64, 'TN': 20},
 'control_pce': {'TP': 24, 'FP': 30, 'FN': 17, 'TN': 120},
 'treated_pce': {'TP': 66, 'FP': 33, 'FN': 25, 'TN': 67},
 'control_voc': {'TP': 22, 'FP': 6, 'FN': 18, 'TN': 145},
 'treated_voc': {'TP': 50, 'FP': 8, 'FN': 20, 'TN': 113}}

## Calculate the macro F1 score

In [163]:
def macro_f1(f1_list, weight = None):
    """Average per-variable F1 scores, optionally weighted.

    Args:
        f1_list: Sequence of F1 scores, one per variable.
        weight: Optional sequence of numeric weights, one per entry of
            ``f1_list`` and in the same order. When None, the plain
            (unweighted) mean is returned.

    Returns:
        The mean (or weighted mean) of the scores. Returns 0 when ``f1_list``
        is empty or the weights sum to zero, following the zero-denominator
        convention used for precision and recall in this notebook.

    Raises:
        ValueError: If ``weight`` is given but its length differs from
            ``f1_list``.
    """
    if len(f1_list) == 0:
        return 0

    # `is None` rather than `== None`, so array-like weights are accepted.
    if weight is None:
        return sum(f1_list) / len(f1_list)

    # Surplus weights would inflate the denominator without contributing to
    # the numerator, so a length mismatch is rejected outright.
    if len(weight) != len(f1_list):
        raise ValueError(
            f"weight has {len(weight)} entries but f1_list has {len(f1_list)}"
        )

    total_weight = sum(weight)
    if total_weight == 0:
        return 0

    weighted_sum = sum(score * w for score, w in zip(f1_list, weight))
    return weighted_sum / total_weight
    


In [164]:
# Variable names, in the same order as the entries of f1s.
variables

['perovskite_composition',
 'electron_transport_layer',
 'hole_transport_layer',
 'structure_pin_nip',
 'stability_type',
 'temperature',
 'time',
 'humidity',
 'passivating_molecule',
 'efficiency_cont',
 'efficiency_tret',
 'control_pce',
 'treated_pce',
 'control_voc',
 'treated_voc']

In [165]:
# Candidate per-variable weights for a weighted macro F1. Variables without an
# explicit weight are left as None.
# NOTE(review): macro_f1 multiplies each score by weight[i], so the None
# entries would have to be replaced by numbers (and the dict turned into a
# list ordered like `variables`) before these could be passed as weights.
weight_dict = dict.fromkeys(variables)
weight_dict.update({
    'treated_pce': 2,
    'efficiency_tret': 2,
    'passivating_molecule': 1.7,
})

weight_dict



{'perovskite_composition': None,
 'electron_transport_layer': None,
 'hole_transport_layer': None,
 'structure_pin_nip': None,
 'stability_type': None,
 'temperature': None,
 'time': None,
 'humidity': None,
 'passivating_molecule': 1.7,
 'efficiency_cont': None,
 'efficiency_tret': 2,
 'control_pce': None,
 'treated_pce': 2,
 'control_voc': None,
 'treated_voc': None}

In [166]:
# Per-variable F1 scores, ordered like `variables`.
f1s

[0.5954198473282443,
 0.2528735632183908,
 0.5762711864406781,
 0.5426356589147286,
 0,
 0.7005649717514124,
 0.6519823788546255,
 0.6017699115044248,
 0.2342342342342342,
 0.24242424242424246,
 0.37681159420289856,
 0.4835164835164835,
 0.6910994764397906,
 0.6769230769230768,
 0.7903225806451613]

In [167]:
# NOTE(review): this list holds 7 weights while f1s holds 15 scores. macro_f1
# reads weight[i] for every score, so it cannot be passed together with f1s
# as-is. No later cell visible here uses it.
weights = [1, 0.5, 0.5, 1, 1.5, 0.3, 0.5]

In [168]:
## Unweighted macro F1 score: the plain mean of the per-variable F1 scores
macro_f1(f1s)

0.49445661375989286

### This concludes the pipeline for evaluating extraction quality