In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np
import json
import os
import xml.etree.ElementTree as ET 
import pandas as pd

## This file compares the teamtat annotation (Json) with Extraction performed by finetuned llama (Json)

In [2]:
label = {'passivating_molecule': 'phenethylammonium iodide', 'perovskite_composition': 'MAPbI3', 'ISOSD1': {'time': '240', 'treated_pce': '15.3', 'control_pce': '16.69', 'temperature': '25', 'humidity': '90', 'control_voc': '1.03', 'treated_voc': '1.06'}, 'electron_transport_layer': 'TiO2', 'hole_transport_layer': 'Spiro-OMeTAD'}

In [3]:
extraction = {
  "control_pce": None,
  "control_voc": None,
  "treated_pce": 15.3,
  "treated_voc": 1.06,
  "passivating_molecule": "Phenylethylammonium (PEA)",
  "perovskite_composition": "[C8H9NH3]2[(CH3NH3)2PbI3 – (n=60)]",
  "electron_transport_layer": "TiO2",
  "hole_transport_layer": "spiro-OMeTAD",
  "ISOS-L-1": None,
  "ISOS-L-2": None,
  "ISOS-T-1": None,
  "ISOS-T-2": None,
  "ISOS-LC": None,
  "ISOS-D-1": None,
  "ISOS-D-2": None
}

In [4]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np

def compare_data(labeled_data, extracted_data, numerical_tolerance=0.05):
    results = {}
    total_fields = len(labeled_data)
    matched_fields = 0
    numerical_differences = []

    for key, value in labeled_data.items():
        if key in extracted_data:
            extracted_value = extracted_data[key]

            # Exact match
            if value == extracted_value:
                matched_fields += 1
                results[key] = "Match"

            # Numerical comparison
            elif isinstance(value, (int, float)) and isinstance(extracted_value, (int, float)):
                if abs(value - extracted_value) <= numerical_tolerance * abs(value):
                    matched_fields += 1
                    numerical_differences.append(abs(value - extracted_value))
                    results[key] = "Numerical Match"
                else:
                    results[key] = "Numerical Mismatch"

            # Fuzzy string match
            elif isinstance(value, str) and isinstance(extracted_value, str):
                similarity = SequenceMatcher(None, value.lower(), extracted_value.lower()).ratio()
                if similarity > 0.8:  # Threshold for similarity
                    matched_fields += 1
                    results[key] = f"Fuzzy Match ({similarity:.2f})"
                else:
                    results[key] = f"Mismatch ({similarity:.2f})"

            else:
                results[key] = "Mismatch"
        else:
            results[key] = "Missing in Extracted Data"

    accuracy = matched_fields / total_fields
    mean_absolute_error = np.mean(numerical_differences) if numerical_differences else None

    return {
        "results": results,
        "accuracy": accuracy,
        "mean_absolute_error": mean_absolute_error,
        "matched_fields": matched_fields,
        "total_fields": total_fields
    }


In [5]:
comparison_result = compare_data(label, extraction)
print(comparison_result)


{'results': {'passivating_molecule': 'Mismatch (0.78)', 'perovskite_composition': 'Mismatch (0.20)', 'ISOSD1': 'Missing in Extracted Data', 'electron_transport_layer': 'Match', 'hole_transport_layer': 'Fuzzy Match (1.00)'}, 'accuracy': 0.4, 'mean_absolute_error': None, 'matched_fields': 2, 'total_fields': 5}


## Retrieving teamtat label

In [6]:
def parse_bioc(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    data = { }
    concept_ids = set()
    relations = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            relations[node_id] = test_name

    for annotation in root.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        if node_id in relations:
            stability_test = relations[node_id]
            if stability_test not in data:
                data[stability_test] = {}
            if concept_id is None:
                concept_id = value
            data[stability_test][var_name] = concept_id
            concept_ids.add(concept_id)

        if concept_id is None:
            concept_id = value
        if concept_id in concept_ids: # duplicate annotation
            continue
        concept_ids.add(concept_id)
        if var_name not in data:
            data[var_name] = concept_id
        else:
            if isinstance(data[var_name], list):
                data[var_name].append(value)
            else:
                data[var_name] = [data[var_name], value]
    
    return data

In [7]:
def extract_papernum(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num


In [None]:
## See if we can extract the paper number correctly
bioc_dir = "../../data/biocs"
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        paper_num = extract_papernum(file_path)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


In [13]:
bioc_dir = "../../data/biocs"
label_data = {}
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        paper_num = extract_papernum(file_path)
        row = parse_bioc(file_path)
        if len(row.keys()) > 0:
            label_data[paper_num] = row

print(label_data)   
print(f"There are {len(label_data)} annotated papers")


{'0': {'ISOSL3': {'control_pce': '24', 'efficiency_tret': '95%', 'time': '1200', 'treated_pce': '26.9', 'treated_voc': '1.18'}, 'structure_pin_nip': 'PIN', 'passivating_molecule': '4Cl-BZS', 'perovskite_composition': 'Cs 0.05 FA 0.85 MA 0.1 PbI 3', 'hole_transport_layer': '2PACz and Me-4PACz', 'electron_transport_layer': 'C60', 'journal_publication': ['Science', 'science'], 'date_published': '4/11/2024'}, '1': {'ISOSD2': {'treated_pce': '23.32', 'treated_voc': '1.16', 'control_pce': '21.3', 'time': '500'}, 'passivating_molecule': 'phenethylammonium iodide (PEAI)', 'electron_transport_layer': ['HC(NH 2 ) 2 -CH 3 NH 3', 'SnO 2'], 'journal_publication': 'nature', 'hole_transport_layer': 'spiro-OMeTAD', 'date_published': '1 April 2019'}, '2': {'treated_voc': '1.185', 'ISOSD1': {'treated_voc': '1.185', 'temperature': '85', 'humidity': '85', 'treated_pce': '24.35', 'time': '1056', 'control_pce': '22.39'}, 'electron_transport_layer': 'SnO2', 'hole_transport_layer': 'Spiro-OMeTAD', 'humidity':

## Retrieving LLama Prediction

In [10]:
import json

# Path to your JSON file
file_path = '../../data/finetuned_llama_output.json'

# Open and load the JSON file
with open(file_path, 'r') as file:
    llama_data = json.load(file)

# Print the data to verify
print(llama_data)
print(f"There are {len(llama_data)} papers that llama evaluated")

{'111': {'control_pce': None, 'control_voc': None, 'treated_pce': 23.9, 'treated_voc': None, 'passivating_molecule': 'isopropylammonium chloride', 'perovskite_composition': 'FAPbI3', 'electron_transport_layer': None, 'hole_transport_layer': None, 'stability_tests': [{'test_name': 'dark storage', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'light-soaking', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'thermal cycling', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'light cycling', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}, {'test_name': 'solar-thermal cycling', 'temperature': None, 'time': None, 'humidity': None, 'control_efficiency': None, 'treatment_efficiency': None}]}, '56': {

### Clean Llama predictoin to get rid of key with None and 'Not mentioned' value

In [40]:
# Recursive function to clean the JSON
def clean_json(obj):
    if isinstance(obj, dict):
        # Recursively process dictionary
        return {
            key: clean_json(value)
            for key, value in obj.items()
            if value not in ['None', None, "Not mentioned"]
        }
    elif isinstance(obj, list):
        # Recursively process list
        return [clean_json(item) for item in obj if item not in ['None', None, "Not mentioned"]]
    else:
        # Return the value as is
        return obj



In [42]:
filtered_llama_data = clean_json(llama_data)
filtered_llama_data

{'111': {'treated_pce': 23.9,
  'passivating_molecule': 'isopropylammonium chloride',
  'perovskite_composition': 'FAPbI3',
  'stability_tests': [{'test_name': 'dark storage'},
   {'test_name': 'light-soaking'},
   {'test_name': 'thermal cycling'},
   {'test_name': 'light cycling'},
   {'test_name': 'solar-thermal cycling'}]},
 '56': {'treated_pce': 21.5,
  'perovskite_composition': 'FA 0.83 Cs 0.17 Pb 0.5 Sn 0.5 I 3',
  'electron_transport_layer': 'PEDOT:PSS',
  'hole_transport_layer': 'C 60 and BCP',
  'stability_tests': [{'test_name': 'Dark storage'},
   {'test_name': 'Light-soaking'},
   {'test_name': 'Thermal cycling'},
   {'test_name': 'Light cycling'},
   {'test_name': 'Solar-thermal cycling'}]},
 '87': {'treated_pce': 21.52,
  'treated_voc': 1.15,
  'passivating_molecule': 'Eu 3+ -Eu 2+ ion pair',
  'perovskite_composition': '(FA,MA,Cs)Pb(I,Br)3(Cl)',
  'electron_transport_layer': 'SnO2',
  'hole_transport_layer': 'polymer modified spiro-OMeTAD',
  'stability_tests': [{'test_na

### Subsetting the LLama Prediction since we only have 54 true lables

In [None]:
# Extract the subset from json_150 where keys match with json_54
subset_llama = {key: filtered_llama_data[key] for key in label_data.keys() if key in filtered_llama_data}

# Print the result
print(len(subset_llama))


54


## First, we will evaluate one pair of papers and adjust specific formatting

In [44]:
##0th paper prediction
subset_llama['0']

{'control_pce': 0.215,
 'control_voc': 1.17,
 'treated_pce': 0.2377,
 'treated_voc': 1.145,
 'passivating_molecule': 'Cyclohexylmethylammonium iodide (CMAI)',
 'perovskite_composition': 'α-formamidinium lead triiodide (FAPbI3)',
 'electron_transport_layer': 'None mentioned',
 'hole_transport_layer': 'None mentioned',
 'stability_tests': [{'test_name': 'Dark storage at 85°C',
   'temperature': 85,
   'time': 1560,
   'control_efficiency': 0.239,
   'treatment_efficiency': 0.1979},
  {'test_name': 'Light-soaking'},
  {'test_name': 'Thermal cycling'},
  {'test_name': 'Light cycling'},
  {'test_name': 'Solar-thermal cycling'}]}

In [45]:
#The actual label
label_data['0']

{'ISOSL3': {'control_pce': '24',
  'efficiency_tret': '95%',
  'time': '1200',
  'treated_pce': '26.9',
  'treated_voc': '1.18'},
 'structure_pin_nip': 'PIN',
 'passivating_molecule': '4Cl-BZS',
 'perovskite_composition': 'Cs 0.05 FA 0.85 MA 0.1 PbI 3',
 'hole_transport_layer': '2PACz and Me-4PACz',
 'electron_transport_layer': 'C60',
 'journal_publication': ['Science', 'science'],
 'date_published': '4/11/2024'}

#### Evaluation in parts
- numerical data 
- text data (molecule)
- stability
    - Change how to parse xml
    - 