# This file compares the teamtat annotation with Extraction performed (Json)

In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np
import json
import os
import xml.etree.ElementTree as ET 
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#Evaluation schema 3
data = {
    'perovskite_composition': None,
    'electron_transport_layer': None,
    'hole_transport_layer': None,
    'structure_pin_nip': None,
    'passivating_molecule': None,
    'control_pce': None,
    'treated_pce': None,
    'control_voc': None,
    'treated_voc': None,
    'stability_type': None,
    'humidity': None,
    'temperature': None,
    'time': None,
    'efficiency_tret': None,
    'efficiency_cont': None
}

data


{'perovskite_composition': None,
 'electron_transport_layer': None,
 'hole_transport_layer': None,
 'structure_pin_nip': None,
 'passivating_molecule': None,
 'control_pce': None,
 'treated_pce': None,
 'control_voc': None,
 'treated_voc': None,
 'stability_type': None,
 'humidity': None,
 'temperature': None,
 'time': None,
 'efficiency_tret': None,
 'efficiency_cont': None}

## File Preparation

In [3]:
def str_toJson(string):
    ##The json output from annotation dataframe was not in correct json format
    # We will change the None to null
    # json_string = string.replace("None", "null")
    json_string = json.dumps(string)
    try:
        # Try to load the JSON string
        json_object = json.loads(json_string)
        return json_object
    except json.JSONDecodeError as e:
        # Catch JSONDecodeError if the string is not valid JSON
        print(f"Error decoding JSON: {e}")
        return None
    except Exception as e:
        # Catch any other exceptions
        print(f"An error occurred: {e}")
        return None

In [4]:
## Convert all numerical data into float for both

#### WORK ON NUMERICAL DATA THAT INCLUDES RANGE "-" 
def convert_numeric(dictionary):
    numerical_key = ['time', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    exception_numeric = ['humidity', 'temperature']

    translation_table = str.maketrans('', '', 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()')
    for key in dictionary.keys():
        if key in numerical_key:
            # print(dictionary[key][entity])
            if isinstance(dictionary[key], str): 
                substitute = re.sub(r'[^0-9.]', '', dictionary[key][:4])
                if len(substitute) != 0:
                    numerical_value = float(substitute)
                    dictionary[key] = numerical_value
                else:
                    dictionary[key] = None
        elif key in exception_numeric:
            if isinstance(dictionary[key], str): 
                if "-" not in dictionary[key]:
                    # print("regular_case",dictionary[key][entity])
                    substitute = re.sub(r'[^0-9.]', '', dictionary[key][:4])
                    if len(substitute) != 0:
                        numerical_value = float(substitute)
                        dictionary[key] = numerical_value
                    else:
                        dictionary[key] = None
                #


    return dictionary

In [5]:
def convert_efficiency(dictionary):
    entity_decimal = ['efficiency_cont','efficiency_tret']
    for key in dictionary.keys():
        if (key.startswith('test')) & (type(dictionary[key]) == dict):
            for entity in dictionary[key].keys():
                if (entity in entity_decimal) and (dictionary[key][entity] != None):
                    if dictionary[key][entity] == dictionary[key][entity] > 1:
                        dictionary[key][entity] = dictionary[key][entity] / 100
    return dictionary

#### Analyzing these outputs

Annotation notes: 
- THE 4 basic variable that is to compare is PEROVSKITE COMPOSITION, ETL, HTL, STRUCTURE
- Stability entity: efficiency_control is wrong, All value is None, so ignore. 
- Common entity: ['stability_type', 'passivating_molecule', 'humidity', 'temperature', 'time', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc', 'efficiency_tret'] 
    - 'efficiency_cont' are included or not

- the efficiency in extracted data need to be converted to decimals since it is in percentage SOMETIMES
    - 'control_efficiency' and 'treatment_efficiency can be ignored


Extraction notes:
- some extraction has passivating molecule that is NOT included in its stability testing. 

### Loading Teamtat Annotation as dataframe

In [6]:
with open('data/annotations_flattened.json', 'r') as f:
    json_data = json.load(f)

In [7]:
flattened_format = []
for key in json_data:
    papers = json_data[key]
    if papers is None:
        flattened_format.append({ "paper_id": key, "output": None })
        continue
    if len(papers.keys()) == 0:
        print(key)
    for passivator in papers:
        paper_data = papers[passivator]
        paper_keys = paper_data.keys()
        test_keys = [key for key in paper_keys if "test" in key]
        for test_key in test_keys:
            flattened_paper = {k: v for k, v in paper_data.items() if k not in test_keys}
            flattened_paper.update(paper_data[test_key])
            flattened_format.append({ "paper_id": int(key), "output": flattened_paper })

In [8]:
flattened_format

[{'paper_id': 0,
  'output': {'perovskite_composition': 'Cs0.05FA0.85MA0.1PbI3',
   'electron_transport_layer': 'C60',
   'hole_transport_layer': '2PACz',
   'structure_pin_nip': 'PIN',
   'passivating_molecule': '4-chlorobenzenesulfonate',
   'control_pce': '24',
   'treated_pce': '26.9',
   'control_voc': None,
   'treated_voc': '1.18',
   'stability_type': 'ISOSL3',
   'humidity': '50',
   'temperature': '65',
   'time': '1200',
   'efficiency_cont': None,
   'efficiency_tret': '95'}},
 {'paper_id': 0,
  'output': {'perovskite_composition': 'Cs0.05FA0.85MA0.1PbI3',
   'electron_transport_layer': 'C60',
   'hole_transport_layer': '2PACz',
   'structure_pin_nip': 'PIN',
   'passivating_molecule': '4-chlorobenzenesulfonate',
   'control_pce': '24',
   'treated_pce': '26.9',
   'control_voc': None,
   'treated_voc': '1.18',
   'stability_type': 'ISOSD2',
   'humidity': None,
   'temperature': '85',
   'time': '1500',
   'efficiency_cont': None,
   'efficiency_tret': '95'}},
 {'paper_id'

In [9]:
annotation_df = pd.DataFrame(flattened_format)
annotation_df.columns = ['paper_num', 'output']
annotation_df["paper_num"] = annotation_df["paper_num"].astype(int)
annotation_df = annotation_df.sort_values(by = 'paper_num')

In [10]:
annotation_df

Unnamed: 0,paper_num,output
0,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...
1,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...
2,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...
3,1,"{'perovskite_composition': None, 'electron_tra..."
180,2,{'perovskite_composition': '(FAPbI3)0.95(MAPbB...
...,...,...
174,147,"{'perovskite_composition': 'FAPbI 3', 'electro..."
173,147,"{'perovskite_composition': 'FAPbI 3', 'electro..."
176,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...
175,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...


In [11]:
#drop irrelevant rows
annotation_df = annotation_df[annotation_df["output"].isna() == False]
annotation_df["paper_num"] = pd.to_numeric(annotation_df["paper_num"])
annotation_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotation_df["paper_num"] = pd.to_numeric(annotation_df["paper_num"])


Unnamed: 0,paper_num,output
0,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...
1,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...
2,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...
3,1,"{'perovskite_composition': None, 'electron_tra..."
180,2,{'perovskite_composition': '(FAPbI3)0.95(MAPbB...
...,...,...
174,147,"{'perovskite_composition': 'FAPbI 3', 'electro..."
173,147,"{'perovskite_composition': 'FAPbI 3', 'electro..."
176,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...
175,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...


In [12]:
annotation_df['output'] = annotation_df['output'].apply(convert_numeric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  annotation_df['output'] = annotation_df['output'].apply(convert_numeric)


In [13]:
annotation_df["output"].iloc[0]

{'perovskite_composition': 'Cs0.05FA0.85MA0.1PbI3',
 'electron_transport_layer': 'C60',
 'hole_transport_layer': '2PACz',
 'structure_pin_nip': 'PIN',
 'passivating_molecule': '4-chlorobenzenesulfonate',
 'control_pce': 24.0,
 'treated_pce': 26.9,
 'control_voc': None,
 'treated_voc': 1.18,
 'stability_type': 'ISOSL3',
 'humidity': 50.0,
 'temperature': 65.0,
 'time': 1200.0,
 'efficiency_cont': None,
 'efficiency_tret': 95.0}

### Loading in JSON extraction

In [14]:
def convert_efficiency_key(dict):
    for key, item in dict.items():
        if 'test' in key:
            if 'retained_proportion_cont' in dict[key]:
                dict[key]['efficiency_cont'] = dict[key].pop('retained_proportion_cont')
            if 'retained_proportion_tret' in dict[key]:
                dict[key]['efficiency_tret'] = dict[key].pop('retained_proportion_tret')
    return dict
            

In [15]:
## extraction performed by basemodel
# Read JSON from a file
with open("data/deepseek_base_flat.json", 'r') as f:
    extraction = json.load(f)

extraction_base = pd.DataFrame(list(extraction.items()), columns=['paper_num', 'output'])
extraction_base['paper_num'] = pd.to_numeric(extraction_base['paper_num'])
extraction_base = extraction_base.sort_values('paper_num')
extraction_base['output'] = extraction_base['output'].apply(convert_numeric)
extraction_base['output'] = extraction_base['output'].apply(convert_efficiency)
extraction_base

Unnamed: 0,paper_num,output
79,0,"{'perovskite_composition': 'FAPbI3', 'electron..."
7,2,"{'perovskite_composition': '(BA)₂PbI₄', 'elect..."
36,3,{'perovskite_composition': 'Cs5(MA0.10FA0.90)P...
32,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...
11,5,"{'perovskite_composition': 'FA(MA)PbI3', 'elec..."
...,...,...
41,145,{'perovskite_composition': 'Cs 0.05 (MA 0.17 F...
89,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...
125,147,"{'perovskite_composition': 'FAPbI3', 'electron..."
84,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...


In [17]:
## extraction performed by finetuned deepseek
# Read JSON from a file
with open("data/deepseek_8bit_finetuned_flattened.json", 'r') as f:
    extraction = json.load(f)

extraction_train = pd.DataFrame(list(extraction.items()), columns=['paper_num', 'output'])
extraction_train['paper_num'] = pd.to_numeric(extraction_train['paper_num'])
extraction_train = extraction_train.sort_values('paper_num')

extraction_train['output'] = extraction_train['output'].apply(str_toJson)
extraction_train['output'] = extraction_train['output'].apply(convert_numeric)
extraction_train['output'] = extraction_train['output'].apply(convert_efficiency)
extraction_train['output'] = extraction_train['output'].apply(convert_efficiency_key)
extraction_train

AttributeError: 'str' object has no attribute 'keys'

## Merging dataframe

In [None]:
evaluate_df_base = annotation_df.merge(extraction_base, left_on='paper_id', right_on='paper_num')[["paper_num", "output_x",'output_y']]
evaluate_df_base.columns = ['paper_num', 'annotation', 'extracted']
evaluate_df_base

Unnamed: 0,paper_num,annotation,extracted
0,141,"{'perovskite_composition': None, 'electron_tra...",{'perovskite_composition': 'CsFAMA-mixed perov...
1,141,"{'perovskite_composition': None, 'electron_tra...",{'perovskite_composition': 'CsFAMA-mixed perov...
2,37,{'perovskite_composition': '(Cs0.2FA0.8Pb(I0.6...,{'perovskite_composition': 'Cs0.2FA0.8Pb(I0.6B...
3,62,"{'perovskite_composition': 'MAPbI 3', 'electro...",{'perovskite_composition': '3D-2D perovskite h...
4,62,"{'perovskite_composition': 'MAPbI 3', 'electro...",{'perovskite_composition': '3D-2D perovskite h...
...,...,...,...
112,124,{'perovskite_composition': 'Cs 0.05(FA0.98MA0....,{'perovskite_composition': 'Cs0.05(FA0.98MA0.0...
113,124,{'perovskite_composition': 'Cs 0.05(FA0.98MA0....,{'perovskite_composition': 'Cs0.05(FA0.98MA0.0...
114,27,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': 'CsPbI3-xBrx', 'ele..."
115,97,{'perovskite_composition': 'Cs 0.05 (FA0.92MA0...,{'perovskite_composition': 'Cs0.05(FA0.92MA0.0...


In [None]:
evaluate_df_train = annotation_df.merge(extraction_train, left_on='first_num', right_on='paper_num')[["paper_num", "output_x",'output_y']]
evaluate_df_train.columns = ['paper_num', 'annotation', 'extracted']
evaluate_df_train

Unnamed: 0,paper_num,annotation,extracted
0,0,{'perovskite_composition': 'Cs0.05FA0.85MA0.1P...,"{'perovskite_composition': 'FAPbI3', 'electron..."
1,1,"{'perovskite_composition': None, 'electron_tra...",{'perovskite_composition': 'FA1-x MAx PbI3 (FA...
2,2,{'perovskite_composition': '(FAPbI3)0.95(MAPbB...,{'perovskite_composition': 'FA0.85Cs0.15PbI2.8...
3,3,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...,{'perovskite_composition': 'Cs0.05(MA0.10FA0.8...
4,4,{'perovskite_composition': '(MAPbBr3)0.05(FAPb...,"{'perovskite_composition': None, 'electron_tra..."
...,...,...,...
121,145,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...,{'perovskite_composition': 'Cs0.05(MA0.17FA0.8...
122,146,{'perovskite_composition': 'Cs0.05(MA0.05FA0.9...,{'perovskite_composition': 'Cs0.05(FA0.05MA0.9...
123,147,"{'perovskite_composition': 'FAPbI 3', 'electro...","{'perovskite_composition': 'FAPbI3', 'electron..."
124,148,{'perovskite_composition': 'Cs0.05FA0.85MA0.10...,{'perovskite_composition': 'Cs0.05 FA0.85 MA0....


In [None]:
extraction_base["output"].iloc[0]

{'perovskite_composition': 'FAPbI3',
 'electron_transport_layer': 'TiO2',
 'hole_transport_layer': 'Spiro-OMeTAD',
 'structure_pin_nip': 'pin',
 'passivating_molecule': 'Cyclohexylmethylammonium iodide (CMAI)',
 'control_pce': 21.4,
 'control_voc': 1.092,
 'treated_pce': 23.77,
 'treated_voc': 1.145,
 'stability_type': 'ISOS-D-2I',
 'humidity': 50.0,
 'temperature': 85,
 'time': 1500,
 'efficiency_cont': None,
 'efficiency_tret': 95}

### Finding most similar annotated data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert JSON strings to comparable text format
evaluate_df_base['annotated_str'] = evaluate_df_base['annotation'].apply(lambda x: json.dumps(x, sort_keys=True))
evaluate_df_base['extracted_str'] = evaluate_df_base['extracted'].apply(lambda x: json.dumps(x, sort_keys=True))

# Compute similarity using TF-IDF and cosine similarity
vectorizer = TfidfVectorizer().fit_transform(evaluate_df_base[['annotated_str', 'extracted_str']].values.ravel())
tfidf_matrix = vectorizer.toarray()

evaluate_df_base['similarity'] = [cosine_similarity([tfidf_matrix[i*2]], [tfidf_matrix[i*2 + 1]])[0,0] for i in range(len(evaluate_df_base))]

# Keep the row with the highest similarity for each paper_num
df_best = evaluate_df_base.loc[evaluate_df_base.groupby('paper_num')['similarity'].idxmax(), ['paper_num', 'annotation', 'extracted']]

## Evaluation

- We need precision and recall for EACH variable
- For each variable, calculate the F1 score - There is F1 score for each variable
- Take a weighted average ***For now, just take the average.

In [None]:
def compare_range_to_num(range_str, num):
    if "+" in range_str:
        split_vals = range_str.split("+-")
        value = float(split_vals[0].strip())
        margin_error = float(split_vals[1].strip())
        range_vals = (value - margin_error, value + margin_error)
        if num >= range_vals[0] and num <= range_vals[1]:
            return 'TP'
        else:
            return "FP"
    else:
        split_vals = range_str.split("-")
        lower = float(split_vals[0])
        upper = float(split_vals[1].strip('%'))
        if num >= lower and num <= upper:
            # print(f"TP, {label_data}, {extract_data}, {similarity}")
            return 'TP'
        else:
            # print(f"FP, {label_data}, {extract_data}, {similarity}")
            return "FP"

def humidity_temperature_comparison(label, extracted, text_similarity_threshold=0.8):
    extracted = extracted if extracted else 0
    label = label if label else 0
    label_is_num = isinstance(label, (float, int))
    extracted_is_num = isinstance(extracted, (float, int))
    if label_is_num and extracted_is_num:
        label = int(label)
        extracted = int(extracted)
        # If the key is missing in the extracted annotation, return False Negative
        if label != 0 & extracted == 0:
            # print(f"FN, {label_annotation[id]}, {extraction_annotation[entity]}")
            return "FN"
        elif label == 0 & extracted != 0:
            # print(f"TN, {label_annotation[id]}, {extraction_annotation[entity]}")
            return "TN"
        elif label == 0 & extracted == 0:
            # print(f"TN, {label_annotation[id]}, {extraction_annotation[entity]}")
            return "TN" 
    elif label_is_num and extracted_is_num == False:
        return compare_range_to_num(extracted, label)
    elif label_is_num == False and extracted_is_num:
        return compare_range_to_num(label, extracted)
    else:
        similarity = SequenceMatcher(None, label.lower(), extracted.lower()).ratio()
        if similarity > text_similarity_threshold:
            # print(f"TP, {label_data}, {extract_data}, {similarity}")
            return 'TP'
        else:
            # print(f"FP, {label_data}, {extract_data}, {similarity}")
            return "FP"

In [None]:
def entity_comparison(entity, label, extracted_dict, text_similarity_threshold = 0.75, numerical_tolerance = 0.027):
    '''
    The tolerance of 2.7% was what was reasonable looking at the absolute difference
    treated_voc 1.18, 1.149, absolute difference 0.026271186440677895

    The text similarity was set to 75% due to the structure example
    FP, NIP, n-i-p, 0.75
    This should be positive
    
    '''
    numerical_entity = ['time', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    numerical_exception = ['humidity', 'temperature']

    if (label[entity]!=None) & (extracted_dict[entity]==None):
        # print(f"FN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
        return "FN"
    elif (label[entity]==None) & (extracted_dict[entity]!=None):
        # print(f"TN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
        return "TN"

    if entity in numerical_entity:
        # key_to_check = "control_efficiency" if entity == "efficiency_cont" else ("treatment_efficiency" if entity == "efficiency_tret" else entity)

        # print(f"annotated{label[entity]}")
        # print(f"extracted{extracted_dict[entity]}")
        if extracted_dict[entity] == None:
            extracted_dict[entity] = 0

        # If the key is missing in the extracted annotation, return False Negative
        if (label[entity]!=0) & ((extracted_dict[entity]==0) | (entity not in extracted_dict.keys())):
            # print(f"FN, {label_annotation[id]}, {extraction_annotation[entity]}")
            return "FN"
        elif (label[entity]==0) & (extracted_dict[entity]!=0):
            # print(f"TN, {label_annotation[id]}, {extraction_annotation[entity]}")
            return "TN"
        elif (label[entity]==0) & (extracted_dict[entity]==0):
            # print(f"TN, {label_annotation[id]}, {extraction_annotation[entity]}")
            return "TN"


        if isinstance(extracted_dict[entity], list):
            ##There was one column with two temperature recorded as a list (probably thermal cycling)
            extracted_dict[entity] = extracted_dict[entity][1]

        # Apply numerical tolerance check
        if (abs(label[entity] - extracted_dict[entity])) / (abs(label[entity]) )<= numerical_tolerance:

            # print(f"Numerical differences matched: {entity} {label[entity]}, {extracted_dict[entity]}, absolute difference {(abs(label[entity] - extracted_dict[entity])) / (abs(label[entity]) )}")
            return "TP"  # True Positive: Correct numerical extraction
        else:

            # print(f"Numerical differences no match: {entity}, {label[entity]}, {extracted_dict[entity]}, absolute difference {(abs(label[entity] - extracted_dict[entity])) / (abs(label[entity]) )}")
            return "FP"  # False Positive: Incorrect numerical extraction
    elif entity in numerical_exception:
        return humidity_temperature_comparison(label[entity], extracted_dict[entity])
    else: 
        print("edge case") #should not happen
        return "FP"
        

In [None]:
def safe_division(numerator, denominator):
    """Returns division result, or 0 if the denominator is zero."""
    return numerator / denominator if denominator != 0 else 0

In [None]:
def text_comparison(id, label_annotation, extraction_annotation, text_similarity_threshold=0.8):
    """Compares text values using string similarity matching.
    - THE 4 basic variable that is to compare is PEROVSKITE COMPOSITION, ETL, HTL, STRUCTURE
    """

    # Handle special case for structure_pin_nip
    # key_to_check = "pin_nip_structure" if id == "structure_pin_nip" else id

    # If the key is missing in the extracted annotation, return False Negative
    if (label_annotation[id]!=None) & (extraction_annotation[id]==None):
        # print(f"FN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
        return "FN"
    elif (label_annotation[id]==None) & (extraction_annotation[id]!=None):
        # print(f"TN, {label_annotation[id]}, {extraction_annotation[key_to_check]}")
        return "TN"

    label_data = label_annotation.get(id, "")
    if id == 'electron_transport_layer' and label_data == "buckminsterfullerene":
        label_data = 'C60'
    extract_data = extraction_annotation.get(id, "")

    # Convert lists to strings if necessary
    if isinstance(label_data, list):
        label_data = " ".join(map(str, label_data))  # Convert list to string
    if isinstance(extract_data, list):
        extract_data = " ".join(map(str, extract_data))  # Convert list to string

    # Ensure values are strings
    if not isinstance(label_data, str) or not isinstance(extract_data, str):
        # print(f"FP, {label_annotation[id]}, {extraction_annotation[id]}")
        return "FP"  # If data is still not a string, return False Positive

    # Compute similarity score
    similarity = SequenceMatcher(None, label_data.lower(), extract_data.lower()).ratio()

    if similarity > text_similarity_threshold:
        # print(f"TP, {label_data}, {extract_data}, {similarity}")
        return 'TP'
    else:
        # print(f"FP, {label_data}, {extract_data}, {similarity}")
        return "FP"


In [None]:
def compare_json(df):
    """
    Compare labeled and extracted JSON data for correctness.

    TP: Correct value extracted by LLM.
    FN: LLM didn't extract this variable.
    FP: LLM extracted a value, but it was incorrect.
    TN: LLM halucinated and returned value that was not extracted
    """
    
    text_variables = ['perovskite_composition', 'electron_transport_layer', 'hole_transport_layer', 'structure_pin_nip', 'passivating_molecule', 'stability_type']
    numerical_variables = ['time', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc', 'temperature', 'humidity'] 
    # stability_entity_annotated = ['stability_type', 'temperature', 'time', 'humidity', 'passivating_molecule', 'efficiency_cont', 'efficiency_tret', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    # stability_entity_extracted = ['test_name', 'temperature', 'time', 'humidity', 'passivating_molecule','control_efficiency', 'treatment_efficiency', 'control_pce', 'treated_pce', 'control_voc', 'treated_voc']
    
    # Initialize comparison dictionaries
    text_dict = {var: {"TP": 0, "FP": 0, "FN": 0, "TN": 0} for var in text_variables}
    numerical_dict = {var: {"TP": 0, "FP": 0, "FN": 0, "TN": 0} for var in numerical_variables}

    for row in df.itertuples():       
        label_value = row.annotation
        extracted_value = row.extracted

        for key, _ in label_value.items():
            if key not in extracted_value:
                extracted_value[key] = None #if model forgets to return key for null values
            
            if key in text_variables:
                result = text_comparison(key, label_value, extracted_value)
                text_dict[key][result] += 1
            else:
                print(key)
                result = entity_comparison(key, label_value, extracted_value)
                numerical_dict[key][result] += 1

    # Merge all results
    combined_dict = {**text_dict, **numerical_dict}
    # print("Performance for each variable in dictionary:", combined_dict)

    # Compute precision, recall, and F1-score
    variable_list, precision_list, recall_list, f1_list = [], [], [], []
    for variable, performance in combined_dict.items():
        TP, FP, FN = performance["TP"], performance["FP"], performance["FN"]
        
        precision = safe_division(TP, TP + FP)
        recall = safe_division(TP, TP + FN)
        f1 = safe_division(2 * precision * recall, precision + recall)

        variable_list.append(variable)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    return combined_dict, variable_list, precision_list, recall_list, f1_list

In [None]:
dict_result_base, variables_base, precisions_base, recalls_base, f1s_base = compare_json(df_best)

control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_voc
humidity
temperature
time
efficiency_cont
efficiency_tret
control_pce
treated_pce
control_voc
treated_vo

In [None]:
dict_result_train, variables_train, precisions_train, recalls_train, f1s_train = compare_json(evaluate_df_train)

NameError: name 'evaluate_df_train' is not defined

In [None]:
dict_result_base

{'perovskite_composition': {'TP': 37, 'FP': 17, 'FN': 0, 'TN': 4},
 'electron_transport_layer': {'TP': 6, 'FP': 41, 'FN': 1, 'TN': 10},
 'hole_transport_layer': {'TP': 25, 'FP': 19, 'FN': 1, 'TN': 13},
 'structure_pin_nip': {'TP': 19, 'FP': 26, 'FN': 2, 'TN': 11},
 'passivating_molecule': {'TP': 18, 'FP': 40, 'FN': 0, 'TN': 0},
 'stability_type': {'TP': 4, 'FP': 46, 'FN': 4, 'TN': 4},
 'time': {'TP': 34, 'FP': 13, 'FN': 11, 'TN': 0},
 'efficiency_cont': {'TP': 7, 'FP': 7, 'FN': 35, 'TN': 9},
 'efficiency_tret': {'TP': 24, 'FP': 10, 'FN': 22, 'TN': 2},
 'control_pce': {'TP': 24, 'FP': 15, 'FN': 10, 'TN': 9},
 'treated_pce': {'TP': 37, 'FP': 16, 'FN': 2, 'TN': 3},
 'control_voc': {'TP': 26, 'FP': 2, 'FN': 15, 'TN': 15},
 'treated_voc': {'TP': 33, 'FP': 9, 'FN': 6, 'TN': 10},
 'temperature': {'TP': 1, 'FP': 0, 'FN': 27, 'TN': 30},
 'humidity': {'TP': 3, 'FP': 1, 'FN': 13, 'TN': 41}}

In [None]:
dict_result_train

{'perovskite_composition': {'TP': 42, 'FP': 50, 'FN': 9, 'TN': 25},
 'electron_transport_layer': {'TP': 20, 'FP': 63, 'FN': 1, 'TN': 42},
 'hole_transport_layer': {'TP': 41, 'FP': 45, 'FN': 2, 'TN': 38},
 'structure_pin_nip': {'TP': 27, 'FP': 56, 'FN': 10, 'TN': 33},
 'stability_type': {'TP': 0, 'FP': 48, 'FN': 166, 'TN': 0},
 'temperature': {'TP': 52, 'FP': 48, 'FN': 0, 'TN': 114},
 'time': {'TP': 44, 'FP': 57, 'FN': 66, 'TN': 47},
 'humidity': {'TP': 30, 'FP': 16, 'FN': 29, 'TN': 139},
 'passivating_molecule': {'TP': 37, 'FP': 94, 'FN': 5, 'TN': 78},
 'efficiency_cont': {'TP': 7, 'FP': 58, 'FN': 29, 'TN': 120},
 'efficiency_tret': {'TP': 30, 'FP': 143, 'FN': 2, 'TN': 39},
 'control_pce': {'TP': 38, 'FP': 52, 'FN': 6, 'TN': 118},
 'treated_pce': {'TP': 71, 'FP': 80, 'FN': 2, 'TN': 61},
 'control_voc': {'TP': 44, 'FP': 10, 'FN': 7, 'TN': 153},
 'treated_voc': {'TP': 65, 'FP': 23, 'FN': 4, 'TN': 122}}

## Calculate Macro f1 score

In [None]:
def macro_f1(f1_list, weight = None):
    if weight == None:
        #If no weight given, do unweighted average of f1 score
        return sum(f1_list) / len(f1_list)
    total_f1 = 0
    for i in range(len(f1_list)):
        total_f1 += (f1_list[i] * weight[i])
    return total_f1 / sum(weight)
    


In [None]:
# Define column names
columns = ['Macro F1 score weight distribution', 'Base Deepseek 8bit', 'Fine-Tuned Deepseek 4 bit', 'Fine-Tuned Deepseek 8 bit', 'Llama 3 billion parameter']

# Create a DataFrame with NaN values
df_f1scores = pd.DataFrame(np.nan, index=[0, 1, 2, 3, 4, 5, 6], columns=columns)
df_f1scores

Unnamed: 0,Macro F1 score weight distribution,Base Deepseek 8bit,Fine-Tuned Deepseek 4 bit,Fine-Tuned Deepseek 8 bit,Llama 3 billion parameter
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,,,,,
5,,,,,
6,,,,,


In [None]:
f1s_train = [0]

In [None]:
## Unweighted
macro_train_0 = macro_f1(f1s_train)
macro_train_0


0.0

In [None]:
macro_base_0 = macro_f1(f1s_base)
macro_base_0

0.5283655732245532

In [None]:
## unweighted row value
unweighted = ['Macro F1 score with equal weight', macro_base_0, macro_train_0, None, None]
df_f1scores.loc[0] = unweighted


In [None]:
variables_train

NameError: name 'variables_train' is not defined

In [None]:
weights_1 = [1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 1]

In [None]:
macro_train_1 = macro_f1(f1s_train, weight = weights_1)
macro_train_1

0.0

In [None]:
macro_base_1 = macro_f1(f1s_base, weight = weights_1)
macro_base_1

0.5232433976582038

In [None]:
## Heavier weight on stability value
first_f1 = ['Heavier weight on stability', macro_base_1, macro_train_1, None, None]
df_f1scores.loc[1] = first_f1

In [None]:
weights_2 = [2, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]

In [None]:
macro_train_2 = macro_f1(f1s_train, weight = weights_2)
macro_train_2

0.0

In [None]:
macro_base_2 = macro_f1(f1s_base, weight = weights_2)
macro_base_2

0.5425467961910313

In [None]:
## Heavier weight on perovskite structure
first_f2 = ['Heavier weight on perovskite structure', macro_base_2, macro_train_2, None, None]
df_f1scores.loc[2] = first_f2

In [None]:
variables_train

NameError: name 'variables_train' is not defined

In [None]:
weights_3 = [1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2]

In [None]:
macro_train_3 = macro_f1(f1s_train, weight = weights_3)
macro_train_3

0.0

In [None]:
macro_base_3 = macro_f1(f1s_base, weight = weights_3)
macro_base_3

0.5188262775315814

In [None]:
## Heavier weight on numeric data
first_f3 = ['Heavier weight on numeric data', macro_base_3, macro_train_3, None, None]
df_f1scores.loc[3] = first_f3

In [None]:
variables_train

NameError: name 'variables_train' is not defined

In [None]:
weights_4 = [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1]

In [None]:
macro_train_4 = macro_f1(f1s_train, weight = weights_4)
macro_train_4

0.0

In [None]:
macro_base_4 = macro_f1(f1s_base, weight = weights_4)
macro_base_4

0.6320004070004069

In [None]:
## Weight to perform prediction 1
first_f4 = ['Weight to perform prediction 1', macro_base_4, macro_train_4, None, None]
df_f1scores.loc[4] = first_f4

In [None]:
variables_train

NameError: name 'variables_train' is not defined

In [None]:
weights_5 = [1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1]

In [None]:
macro_train_5 = macro_f1(f1s_train, weight = weights_5)
macro_train_5

0.0

In [None]:
macro_base_5 = macro_f1(f1s_base, weight = weights_5)
macro_base_5

0.5584317222748008

In [None]:
## Weight to perform prediction 2
first_f5 = ['Weight to perform prediction 2', macro_base_5, macro_train_5, None, None]
df_f1scores.loc[5] = first_f5

In [None]:
variables_train

NameError: name 'variables_train' is not defined

In [None]:
weights_6 = [1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0]

In [None]:
macro_train_6 = macro_f1(f1s_train, weight = weights_6)
macro_train_6

0.0

In [None]:
macro_base_6 = macro_f1(f1s_base, weight = weights_6)
macro_base_6

0.5878477089395123

In [None]:
## Weight to perform prediction 3
first_f6 = ['Weight to perform prediction 3', macro_base_6, macro_train_6, None, None]
df_f1scores.loc[6] = first_f6

In [None]:
df_f1scores

Unnamed: 0,Macro F1 score weight distribution,Base Deepseek 8bit,Fine-Tuned Deepseek 4 bit,Fine-Tuned Deepseek 8 bit,Llama 3 billion parameter
0,Macro F1 score with equal weight,0.528366,0.0,,
1,Heavier weight on stability,0.523243,0.0,,
2,Heavier weight on perovskite structure,0.542547,0.0,,
3,Heavier weight on numeric data,0.518826,0.0,,
4,Weight to perform prediction 1,0.632,0.0,,
5,Weight to perform prediction 2,0.558432,0.0,,
6,Weight to perform prediction 3,0.587848,0.0,,


### Different weight to consider
- Unweight
- Heavier weight on stability
- Heavier weight on perovskite structure
- Heavier weight on numeric data
- Weight to perform prediction 1
- Weight to perform prediction 2
- Weight to perform prediction 3