# This file compares the teamtat annotation with Extraction performed (Json)

In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np
import json
import os
import xml.etree.ElementTree as ET 
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity

## File Preparation

In [2]:
def str_toJson(string):
    ##The json output from annotation dataframe was not in correct json format
    # We will change the None to null
    json_string = string.replace("None", "null")

    try:
        # Try to load the JSON string
        json_object = json.loads(json_string)
        return json_object
    except json.JSONDecodeError as e:
        # Catch JSONDecodeError if the string is not valid JSON
        print(f"Error decoding JSON: {e}")
        return None
    except Exception as e:
        # Catch any other exceptions
        print(f"An error occurred: {e}")
        return None

In [3]:
def include_passivating(dictionary):
    ##In extraction json, realized that some extraction has passivating molecule that is NOT included in its stability testing. 
    ## Since passivating molecule (if exist) needs to be in stability testing (nexted dictionary), we will transfer the information and spit out a cleaned dictionary. 
    if "passivating_molecule" in dictionary.keys():
        passivating = dictionary['passivating_molecule']
        del dictionary['passivating_molecule']
        
        for entity in dictionary.keys():
            if entity.startswith('test'):
                # print(i['entity'])
                if type(dictionary[entity]) == dict:
                    if 'passivating_molecule' in dictionary[entity].keys():
                        continue
                    else:
                        # print("Have to include passivating molecule in tests")
                        dictionary[entity]['passivating_molecule'] = passivating
        
    return dictionary

In [59]:
## Convert all numerical data into float for both
def convert_numeric(dictionary):
    numerical_key = ['temperature', 'time', 'humidity', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc', 'control_efficiency', 'treatment_efficiency']

    translation_table = str.maketrans('', '', 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()')
    for key in dictionary.keys():
        if (key.startswith('test')) & (type(dictionary[key]) == dict):
            for entity in dictionary[key].keys():
                if entity in numerical_key:
                    # print(dictionary[key][entity])
                    if isinstance(dictionary[key][entity], str): 
                        substitute = re.sub(r'[^0-9.]', '', dictionary[key][entity][:4])
                        if len(substitute) != 0:
                            numerical_value = float(substitute)
                            dictionary[key][entity] = numerical_value
                        else:
                            dictionary[key][entity] = None
    return dictionary

#### Analyzing these outputs

Annotation notes: 
- THE 4 basic variable that is to compare is PEROVSKITE COMPOSITION, ETL, HTL, STRUCTURE
- Stability entity: efficiency_control is wrong, All value is None, so ignore. 
- Common entity: ['stability_type', 'passivating_molecule', 'humidity', 'temperature', 'time', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc', 'efficiency_tret'] 
    - 'efficiency_cont' are included or not


Extraction notes:
- some extraction has passivating molecule that is NOT included in its stability testing. 

### Loading Teamtat Annotation as dataframe

In [60]:
#Teamtat Annotation
annotation_df = pd.read_csv("data/150_papers_json.csv")[["id", "first_num", "output"]]
annotation_df = annotation_df.sort_values(by = ['first_num'])

In [61]:
##Change the format (minor) to be converted to json
annotation_df['output'] = annotation_df['output'].apply(str_toJson)
annotation_df

Unnamed: 0,id,first_num,output
0,0_54,0,{'perovskite_composition': 'Cs 0.05 FA 0.85 MA...
1,1_22,1,"{'perovskite_composition': None, 'electron_tra..."
2,2_75,2,{'perovskite_composition': 'dibutylammonium le...
3,3_52,3,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...
4,4_26,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...,...
144,145_31,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
145,146_36,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
146,147_41,147,{'perovskite_composition': 'formamidinium lead...
147,148_26,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...


In [62]:
annotation_df['output'] = annotation_df['output'].apply(convert_numeric)

None
65
1200
24
26.9
None
1.18
95%
None
85
500
None
19.1
None
1.16
None
85
25
1620
>
24.35
None
1.185
0.98
0.586
85
85
1056
None
21.34
None
None
0.94
None
25
1620
None
24.06
None
None
0.98
None
None
2000
None
24.5
None
1.20
0.99
0.25
None
None
None
None
22.1
None
1.135
None
None
None
1344
None
18.89
None
1.16
0.97
65
None
1000
None
None
None
None
0.9
65
None
1000
None
None
None
None
0.9
None
40
2000
19.3
22.4
None
1.177
0.85
0.6
None
None
None
14.64
17.6
None
None
None
None
None
800
None
24.41
None
None
1
None
40
500
None
None
None
None
0.915
None
room temperature
24
20.2
21.8
1.11
1.16
1.02
0.98
None
None
20
None
19.8
17.5
1.31
exhibit stable V OC over time
None
None
195 h
(T80: ≈9 h
23.8
None
1.29
None
None
60
265
None
23.8
None
1.29
0.92
0.82
None
None
33
21.58
23.7
1.109
1.184
1
0.9
None
None
None
21.78
23.79
1.11
1.17
None
None
None
None
18.87
21.05
1.04
1.11
None
None
None
48
20.04
20.94
1.16
1.19
0.86
None
None
None
None
None
None
None
None
50
85
1560
21
24.09%
1.269
1.271
None


### Loading in JSON extraction

In [7]:
## extraction performed by Llama (Daniel)
# Read JSON from a file
with open("data/deepseek_newschema_OG.json", 'r') as f:
    extraction = json.load(f)

extraction_df = pd.DataFrame(list(extraction.items()), columns=['paper_num', 'output'])
extraction_df['paper_num'] = pd.to_numeric(extraction_df['paper_num'])
extraction_df = extraction_df.sort_values('paper_num')
extraction_df


Unnamed: 0,paper_num,output
77,0,"{'perovskite_composition': 'FAPbI3', 'electron..."
125,1,"{'perovskite_composition': 'FA1-x MAx PbI3', '..."
8,2,"{'perovskite_composition': '(BA)₂PbI₄', 'elect..."
32,3,{'perovskite_composition': 'Cs5(MA0.10FA0.90)P...
29,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...
4,144,{'perovskite_composition': 'Cs0.1FA0.6MA0.3Sn0...
38,145,{'perovskite_composition': 'Cs 0.05 (MA 0.17 F...
89,146,{'perovskite_composition': 'Cs 0.05 (MA 0.05 F...
120,147,"{'perovskite_composition': 'FAPbI3', 'electron..."


In [8]:
extraction_df['output'] = extraction_df['output'].apply(include_passivating)
extraction_df

Unnamed: 0,paper_num,output
77,0,"{'perovskite_composition': 'FAPbI3', 'electron..."
125,1,"{'perovskite_composition': 'FA1-x MAx PbI3', '..."
8,2,"{'perovskite_composition': '(BA)₂PbI₄', 'elect..."
32,3,{'perovskite_composition': 'Cs5(MA0.10FA0.90)P...
29,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...
4,144,{'perovskite_composition': 'Cs0.1FA0.6MA0.3Sn0...
38,145,{'perovskite_composition': 'Cs 0.05 (MA 0.17 F...
89,146,{'perovskite_composition': 'Cs 0.05 (MA 0.05 F...
120,147,"{'perovskite_composition': 'FAPbI3', 'electron..."


In [9]:
extraction_df['output'] = extraction_df['output'].apply(convert_numeric)
annotation_df['output'] = annotation_df['output'].apply(convert_numeric)


## Merging dataframe

In [10]:
evaluate_df = annotation_df.merge(extraction_df, left_on='first_num', right_on='paper_num')[["paper_num", "output_x",'output_y']]
evaluate_df.columns = ['paper_num', 'annotation', 'extracted']
evaluate_df

Unnamed: 0,paper_num,annotation,extracted
0,0,{'perovskite_composition': 'Cs 0.05 FA 0.85 MA...,"{'perovskite_composition': 'FAPbI3', 'electron..."
1,1,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': 'FA1-x MAx PbI3', '..."
2,2,{'perovskite_composition': 'dibutylammonium le...,"{'perovskite_composition': '(BA)₂PbI₄', 'elect..."
3,3,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...,{'perovskite_composition': 'Cs5(MA0.10FA0.90)P...
4,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
...,...,...,...
124,144,"{'perovskite_composition': None, 'electron_tra...",{'perovskite_composition': 'Cs0.1FA0.6MA0.3Sn0...
125,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...,{'perovskite_composition': 'Cs 0.05 (MA 0.17 F...
126,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...,{'perovskite_composition': 'Cs 0.05 (MA 0.05 F...
127,147,{'perovskite_composition': 'formamidinium lead...,"{'perovskite_composition': 'FAPbI3', 'electron..."


## Evaluation

- We need precision and recall for EACH variable
- For each variable, calculate the F1 score - There is F1 score for each variable
- Take a weighted average ***For now, just take the average.

In [39]:
def text_comparison(id, label_annotation, extraction_annotation, text_similarity_threshold=0.8):
    """Compares text values using string similarity matching.
    - THE 4 basic variable that is to compare is PEROVSKITE COMPOSITION, ETL, HTL, STRUCTURE
    """
    
    # Handle special case for structure_pin_nip
    key_to_check = "pin_nip_structure" if id == "structure_pin_nip" else id

    # If the key is missing in the extracted annotation, return False Negative
    if (label_annotation[id]!=None) & (extraction_annotation[key_to_check]==None):
        # print(f"FN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
        return "FN"
    elif (label_annotation[id]==None) & (extraction_annotation[key_to_check]!=None):
        # print(f"TN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
        return "TN"

    label_data = label_annotation.get(id, "")
    extract_data = extraction_annotation.get(key_to_check, "")

    # Convert lists to strings if necessary
    if isinstance(label_data, list):
        label_data = " ".join(map(str, label_data))  # Convert list to string
    if isinstance(extract_data, list):
        extract_data = " ".join(map(str, extract_data))  # Convert list to string

    # Ensure values are strings
    if not isinstance(label_data, str) or not isinstance(extract_data, str):
        # print(f"FP, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
        return "FP"  # If data is still not a string, return False Positive

    # Compute similarity score
    similarity = SequenceMatcher(None, label_data.lower(), extract_data.lower()).ratio()

    if similarity > text_similarity_threshold:
        print(f"TP, {label_data}, {extract_data}")
        return 'TP'
    else:
        print(f"FP, {label_data}, {extract_data}")
        return "FP"


In [40]:
def tests_comparison(stability_annotated, label_dict, stability_extracted, extract_dict):
    # print(stability_annotated, label_dict, stability_extracted, extract_dict)
    stability_entity_annotated = ['stability_type', 'passivating_molecule', 'temperature', 'time', 'humidity', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    stability_entity_extracted = ['test_name', 'passivating_molecule', 'temperature', 'time', 'humidity', 'control_efficiency', 'treatment_efficiency', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    
    compared_metric = []
    numeric_data_annotated = []
    numeric_data_extracted = []
    for entity_i in range(len(stability_entity_annotated)):
        if entity_i <= 1:
            if stability_entity_extracted[entity_i] not in extract_dict.keys():
                extract_dict[stability_entity_extracted[entity_i]] = None

            if (label_dict[stability_entity_annotated[entity_i]] == None) | (extract_dict[stability_entity_extracted[entity_i]] == None):
                compared_metric.append(None)
            else:
                ##Text entity, perform Sequence Matcher 
                compared = SequenceMatcher(None, label_dict[stability_entity_annotated[entity_i]], extract_dict[stability_entity_extracted[entity_i]]).ratio()
                # print(compared)
                if entity_i == 0:
                    if compared > 0.9:
                        compared_metric.append(1)
                    else:
                        compared_metric.append(0)
                else:
                    compared_metric.append(compared)
        else:
            if stability_entity_extracted[entity_i] not in extract_dict.keys():
                extract_dict[stability_entity_extracted[entity_i]] = 0
            elif extract_dict[stability_entity_extracted[entity_i]] == None:
                extract_dict[stability_entity_extracted[entity_i]] = 0

            if stability_entity_annotated[entity_i] not in label_dict.keys():
                label_dict[stability_entity_annotated[entity_i]] = 0
            elif label_dict[stability_entity_annotated[entity_i]] == None:
                label_dict[stability_entity_annotated[entity_i]] = 0

                
            numeric_data_annotated.append(label_dict[stability_entity_annotated[entity_i]])
            numeric_data_extracted.append(extract_dict[stability_entity_extracted[entity_i]])

    if isinstance(numeric_data_extracted[0], list):
        ##There was one column with two temperature recorded as a list (probably thermal cycling)
        numeric_data_extracted[0] = numeric_data_extracted[0][1]
    cos_sim = cosine_similarity([numeric_data_annotated], [numeric_data_extracted])
    compared_metric.append(cos_sim[0][0])
    
    return compared_metric    

In [45]:
def entity_comparison(entity, label, extracted_dict, text_similarity_threshold = 0.8, numerical_tolerance = 0.01):
    text_entity = ['stability_type', 'passivating_molecule']
    numerical_entity = ['temperature', 'time', 'humidity','efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']

    if entity in text_entity:
        key_to_check = "test_name" if entity == "stability_type" else entity

        # If the key is missing in the extracted annotation, return False Negative
        if (label[entity]!=None) & (extracted_dict[key_to_check]==None):
            # print(f"FN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
            return "FN"
        elif (label[entity]==None) & (extracted_dict[key_to_check]!=None):
            # print(f"TN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
            return "TN"

        label_data = label.get(entity, "")
        extract_data = extracted_dict.get(key_to_check, "")

        # Convert lists to strings if necessary
        if isinstance(label_data, list):
            label_data = " ".join(map(str, label_data))  # Convert list to string
        if isinstance(extract_data, list):
            extract_data = " ".join(map(str, extract_data))  # Convert list to string

        # Ensure values are strings
        if not isinstance(label_data, str) or not isinstance(extract_data, str):
            # print(f"FP, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
            return "FP"  # If data is still not a string, return False Positive

        # Compute similarity score
        similarity = SequenceMatcher(None, label_data.lower(), extract_data.lower()).ratio()

        if similarity > text_similarity_threshold:
            # print(f"TP, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
            return 'TP'
        else:
            return "FP"
    else:
        key_to_check = "control_efficiency" if entity == "efficiency_cont" else ("treatment_efficiency" if entity == "efficiency_tret" else entity)
        # print(entity, key_to_check)
        # print(f"annotated{label[entity]}")
        # print(f"extracted{extracted_dict[key_to_check]}")

        # If the key is missing in the extracted annotation, return False Negative
        if (label[entity]!=None) & ((extracted_dict[key_to_check]==None) | (key_to_check not in extracted_dict.keys())):
            # print(f"FN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
            return "FN"
        elif (label[entity]==None) & (extracted_dict[key_to_check]!=None):
            # print(f"TN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
            return "TN"

        if isinstance(extracted_dict[key_to_check], list):
            ##There was one column with two temperature recorded as a list (probably thermal cycling)
            extracted_dict[key_to_check] = extracted_dict[key_to_check][1]

        # Apply numerical tolerance check
        if abs(label[entity] - extracted_dict[key_to_check]) <= numerical_tolerance * abs(label[entity]):
            print(f"Numerical differences matched: {entity} {label[entity]}, {extracted_dict[key_to_check]}")
            return "TP"  # True Positive: Correct numerical extraction
        else:
            print(f"Numerical differences no match: {entity}, {label[entity]}, {extracted_dict[key_to_check]}")
            return "FP"  # False Positive: Incorrect numerical extraction    

In [46]:
def safe_division(numerator, denominator):
    """Returns division result, or 0 if the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0

In [47]:
def compare_json(df):
    """
    Compare labeled and extracted JSON data for correctness.

    TP: Correct value extracted by LLM.
    FN: LLM didn't extract this variable.
    FP: LLM extracted a value, but it was incorrect.
    TN: LLM halucinated and returned value that was not extracted
    """
    
    text_variables = ['perovskite_composition', 'electron_transport_layer', 'hole_transport_layer', 'structure_pin_nip']

    
    stability_entity_annotated = ['stability_type', 'temperature', 'time', 'humidity', 'passivating_molecule', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    stability_entity_extracted = ['test_name', 'temperature', 'time', 'humidity', 'passivating_molecule','control_efficiency', 'treatment_efficiency', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    
    # Initialize comparison dictionaries
    text_dict = {var: {"TP": 0, "FP": 0, "FN": 0, "TN": 0} for var in text_variables}
    stability_dict = {var: {"TP": 0, "FP": 0, "FN": 0, "TN": 0} for var in stability_entity_annotated}

    for row in df.itertuples():       
        label_value = row.annotation
        extracted_value = row.extracted

        # print(label_value)
        # print(extracted_value)

        for id, label in label_value.items():
            if ('test' in id) and (isinstance(label_value[id], dict)):
                ##Plan for stability test evaluation
                '''
                For each stability condition in annotation, 
                    Pair them with stability condition in extracted
                        With stability of annotation and extraction, use function tests_comparison that returns how similar 2 stabilities are
                    
                Once all the pair is calculated, find the stability name of extraction that was closest to annotation stability. 

                Using this dictionary, we will increment FN, FP, TN, TP for each element of the entity.
                '''
                matched = 0
                stability_match = {}
                for extract_id, extract_label in extracted_value.items():
                    if ('test' in extract_id) and (isinstance(extracted_value[extract_id], dict)):
                        matched += 1
                        match_list = tests_comparison(id, label, extract_id, extract_label)
                        match_list = [0 if item is None else item for item in match_list]
                        stability_match[extract_id] = match_list
        
                if matched == 0:
                    #No stability were extracted, we will add stability_unmatched
                        ##We need to account for if there was NO stability extracted. 
                    for key in stability_dict:
                        if 'FN' in stability_dict[key]:
                            stability_dict[key]['FN'] += 1
                else:
                    stability_match_mean = {stability: np.mean(lis) for stability, lis in stability_match.items()}
                    max_key = max(stability_match_mean, key=stability_match_mean.get)  
                    ##Now, I need to compare each entity in that found max_key and fill in that FN, dictionary.
                    for entity in label_value[id].keys():
                        if entity == 'efficiency_control':
                            continue
                        entity_result = entity_comparison(entity, label, extracted_value[max_key])
                        stability_dict[entity][entity_result] += 1  
            else:  
                result = text_comparison(id, label_value, extracted_value)
                text_dict[id][result] += 1


    # Merge all results
    combined_dict = {**text_dict, **stability_dict}
    # print("Performance for each variable in dictionary:", combined_dict)

    # Compute precision, recall, and F1-score
    variable_list, precision_list, recall_list, f1_list = [], [], [], []
    for variable, performance in combined_dict.items():
        TP, FP, FN = performance["TP"], performance["FP"], performance["FN"]
        
        precision = safe_division(TP, TP + FP)
        recall = safe_division(TP, TP + FN)
        f1 = safe_division(2 * precision * recall, precision + recall)

        variable_list.append(variable)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    return combined_dict, variable_list, precision_list, recall_list, f1_list

In [48]:
dict_result, variables, precisions, recalls, f1s = compare_json(evaluate_df)

FP, Cs 0.05 FA 0.85 MA 0.1 PbI 3, FAPbI3
TP, C60, C60
FP, 2PACz and Me-4PACz, TiO2
TP, PIN, PIN
Numerical differences matched: humidity 0, 0
Numerical differences no match: temperature, 65.0, 85
Numerical differences no match: time, 1200.0, 1500
Numerical differences no match: control_pce, 24.0, 21.4
Numerical differences no match: treated_pce, 269.0, 23.94
Numerical differences no match: control_voc, 0, 1.092
Numerical differences no match: treated_voc, 118.0, 1.149
Numerical differences no match: efficiency_tret, 95.0, 23.94
Numerical differences no match: efficiency_cont, 0, 21.4
FP, TinOxide, SnO2
FP, poly[bis(4-phenyl) (2,4,6-trimethylphenyl)amine] (PTAA), spiro-OMeTAD
Numerical differences matched: humidity 0, 0
Numerical differences matched: temperature 85.0, 85
Numerical differences matched: time 500.0, 500.0
Numerical differences no match: control_pce, 0, 20.55
Numerical differences no match: treated_pce, 191.0, 20.68
Numerical differences no match: control_voc, 0, 1.09
Numeri

In [26]:
dict_result

{'perovskite_composition': {'TP': 40, 'FP': 54, 'FN': 1, 'TN': 34},
 'electron_transport_layer': {'TP': 5, 'FP': 69, 'FN': 7, 'TN': 48},
 'hole_transport_layer': {'TP': 41, 'FP': 43, 'FN': 4, 'TN': 41},
 'structure_pin_nip': {'TP': 27, 'FP': 58, 'FN': 17, 'TN': 27},
 'stability_type': {'TP': 53, 'FP': 82, 'FN': 14, 'TN': 47},
 'temperature': {'TP': 74, 'FP': 110, 'FN': 12, 'TN': 0},
 'time': {'TP': 76, 'FP': 108, 'FN': 12, 'TN': 0},
 'humidity': {'TP': 113, 'FP': 71, 'FN': 12, 'TN': 0},
 'passivating_molecule': {'TP': 8, 'FP': 70, 'FN': 17, 'TN': 101},
 'efficiency_cont': {'TP': 19, 'FP': 165, 'FN': 12, 'TN': 0},
 'efficiency_tret': {'TP': 8, 'FP': 176, 'FN': 12, 'TN': 0},
 'control_pce': {'TP': 18, 'FP': 166, 'FN': 12, 'TN': 0},
 'treated_pce': {'TP': 10, 'FP': 174, 'FN': 12, 'TN': 0},
 'control_voc': {'TP': 34, 'FP': 150, 'FN': 12, 'TN': 0},
 'treated_voc': {'TP': 19, 'FP': 165, 'FN': 12, 'TN': 0}}

## Calculate Macro f1 score

In [18]:
def macro_f1(f1_list, weight = None):
    if weight == None:
        #If no weight given, do unweighted average of f1 score
        return sum(f1_list) / len(f1_list)
    total_f1 = 0
    for i in range(len(f1_list)):
        total_f1 += (f1_list[i] * weight[i])
    return total_f1 / sum(weight)
    


In [19]:
## The macro f1 score unweighted
macro_f1(f1s)

0.3516316245937326

### This concludes the pipeline of evaluating extraction quality