## This file is used to create 150_papers_json.csv for annotation scraping

In [1]:
import json
import os
import xml.etree.ElementTree as ET
import pandas as pd
import re

In [2]:
EMPTY_PAPER_DATA = {
    "perovskite_composition": None,
    "electron_transport_layer": None,
    "hole_transport_layer": None,
    "structure_pin_nip": None,
    "passivating_molecule": None,
    "control_pce": None,
    "treated_pce": None,
    "control_voc": None,
    "treated_voc": None,
}
EMPTY_STABILITY_TEST = {
    "stability_type": None,
    "humidity": None,
    "temperature": None,
    "time": None,
    "efficiency_cont": None,
    "efficiency_tret": None
}

In [3]:
def extract_papernum(root):
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    numbers = ''.join(filter(str.isdigit, paper_num))
    return numbers 

In [4]:
def get_passivators(root):
    passivators = {}
    for annotation in root.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id else value
        if var_name != "passivating_molecule" or value == None:
            continue
        
        if value in passivators:
            passivators[value].append(node_id)
        else:
            passivators[value] = [node_id]
    return passivators

In [16]:
def get_relations(root):
    relations = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        relation_id = relation.get("id")
        if 'performance' in test_name:
            continue
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            if node_id not in relations:
                relations[node_id] = [relation_id]
            else:
                relations[node_id].append(relation_id)
    return relations

In [None]:
def get_json_for_paper(root, passivators, relations):
    output = {}
    if len(passivators) == 0:
        return None
    for passivator in passivators:
        relevant_tests = []
        curr_object = EMPTY_PAPER_DATA.copy()
        passivator_nodes = passivators[passivator]
        for node_id in passivator_nodes:
            if node_id in relations:
                relevant_tests = relations[node_id]
        print(passivator) 
        print(relevant_tests)
        for annotation in root.findall(".//annotation"):
            node_id = annotation.get("id")
            var_name = annotation.find("infon[@key='type']").text
            concept_id = annotation.find("infon[@key='identifier']").text
            value = annotation.find("text").text
            value = concept_id if concept_id is not None else value
            if var_name in ["additive_molecule", "treatment_element", "control_element", "metal_contact", "perovskite_molecule"]: #irrelevant
                continue

            if var_name in ["perovskite_composition", "structure_pin_nip", "electron_transport_layer", "hole_transport_layer", "control_pce", "control_voc", "treated_pce", "treated_voc",]:
                #in top level: composition, ETL, HTL, PIN-NIP,
                curr_object[var_name] = value
            elif node_id in relations:
                test_names = relations[node_id]
                for test_name in test_names:
                    if test_name not in relevant_tests and len(passivators) > 1: #only needs to filter by relevant tests if multiple passivators
                        continue
                    elif test_name in curr_object:
                        curr_object[test_name][var_name] = value
                    else:
                        curr_object[test_name] = EMPTY_STABILITY_TEST.copy()
                        curr_object[test_name][var_name] = value  
        output[passivator] = curr_object
    return output 

In [20]:
def parse_bioc(root):
    paper_num = extract_papernum(root)
    if paper_num != "7":
        return
        print(get_passivators(root))
        print(get_relations(root))
    passivators = get_passivators(root)
    relations  = get_relations(root)
    output = get_json_for_paper(root, passivators, relations)
    print(output)
    return output


In [21]:
bioc_dir = "data/biocs"
data = {}
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        root = ET.parse(file_path).getroot()
        paper_data = parse_bioc(root)
        paper_num = extract_papernum(root)
        data[paper_num] = paper_data


phenethylammonium iodide
['R4']
4-bromophenylethylammonium
[]
4-chlorophenylethylammonium
[]
butylammonium iodide
['R2']
benzylammonium
['R3']
(ethylenedioxy)bis(ethylammonium) lead iodide
['R1']
dodecylammonium iodide
['R5', 'R6']
phenylethylammonium lead iodide
['R7', 'R8']
4-tert-butyl-benzylammonium
['R9']
{'phenethylammonium iodide': {'perovskite_composition': 'Cs0.05(MA)0.16(FA)0.79Pb(I0.83Br0.17 )3', 'electron_transport_layer': 'C60', 'hole_transport_layer': None, 'structure_pin_nip': 'NIP', 'passivating_molecule': '4-tert-butyl-benzylammonium', 'control_pce': '20.1', 'treated_pce': '23.4', 'control_voc': '1.05', 'treated_voc': '1.11', 'R4': {'stability_type': 'ISOSL3', 'humidity': None, 'temperature': None, 'time': '100', 'efficiency_cont': '0', 'efficiency_tret': '95'}}, '4-bromophenylethylammonium': {'perovskite_composition': 'Cs0.05(MA)0.16(FA)0.79Pb(I0.83Br0.17 )3', 'electron_transport_layer': 'C60', 'hole_transport_layer': None, 'structure_pin_nip': 'NIP', 'passivating_mol

In [41]:
with open('data/annotations_flattened.json', 'w') as f:
    json.dump(data, f)

# Legacy Code

In [None]:
def get_json_for_passage(passage, relations, previous_json):
    for annotation in passage.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id is not None else value
        # if var_name == "perovskite_molecule": #due to an error in some of the annotations
        #     var_name = "passivating_molecule"
        if var_name in ["additive_molecule", "treatment_element", "control_element", "metal_contact"]: #irrelevant
            continue

        if var_name in ["perovskite_composition", "structure_pin_nip", "electron_transport_layer", "hole_transport_layer" ]:
            #in top level: composition, ETL, HTL, PIN-NIP,
            previous_json[var_name] = value
        if var_name == "passivating_molecule" and node_id in relations:
            has_passivator = True
        elif node_id in relations:
            test_names = relations[node_id]
            for test_name in test_names:
                if test_name not in previous_json:
                    previous_json[test_name] = EMPTY_STABILITY_TEST.copy()
                previous_json[test_name][var_name] = value
        elif len(relations.keys()) == 0:
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value
            #in stability tests:
            #test type, passivator, PCE (control + treat), VOC (control + treat)
            #efficiency (treat, control), temp, time, humidity
        else:
            #assumes that all other possible data goes into the first stability test
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value

    return previous_json
    


In [15]:
def extract_papernum(root):
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num

In [None]:
def get_json_for_passage(passage, relations, molecules_data):
    for annotation in passage.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id is not None else value

        if var_name in ["additive_molecule", "treatment_element", "control_element", "metal_contact"]:
            continue

        if var_name == "passivating_molecule":
            if value not in molecules_data:
                molecules_data[value] = EMPTY_PAPER_DATA.copy()
            current_molecule_data = molecules_data[value]
            if node_id in relations:
                
        else:
            current_molecule_data = molecules_data.get(list(molecules_data.keys())[-1], EMPTY_PAPER_DATA.copy())

        if var_name in ["perovskite_composition", "structure_pin_nip", "electron_transport_layer", "hole_transport_layer"]:
            current_molecule_data[var_name] = value

        if node_id in relations:
            test_names = relations[node_id]
            for test_name in test_names:
                if test_name not in current_molecule_data:
                    current_molecule_data[test_name] = EMPTY_STABILITY_TEST.copy()
                current_molecule_data[test_name][var_name] = value
        elif len(relations.keys()) == 0:
            if "test_1" not in current_molecule_data:
                current_molecule_data["test_1"] = EMPTY_STABILITY_TEST.copy()
            current_molecule_data["test_1"][var_name] = value
        else:
            if "test_1" not in current_molecule_data:
                current_molecule_data["test_1"] = EMPTY_STABILITY_TEST.copy()
            current_molecule_data["test_1"][var_name] = value

    return molecules_data

def parse_bioc_into_json(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    passages = root.findall('.//passage')
    relations = {}
    test_names = set()

    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        if 'performance' in test_name:
            continue
        if test_name in test_names:
            test_name = test_name + "_2"
        test_names.add(test_name)
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            if node_id not in relations:
                relations[node_id] = [test_name]
            else:
                relations[node_id].append(test_name)

    molecules_data = {}
    for passage in passages:
        molecules_data = get_json_for_passage(passage, relations, molecules_data)

    return [molecules_data[key] for key in molecules_data]

In [59]:
def passivator_correct(file_path):
    has_passivator = False
    annotated_correctly = False
    tree = ET.parse(file_path)
    root = tree.getroot()


    relations = {}
    test_names = set()
    for relation in root.findall(".//relation"):
        node_ids = [node.get("refid") for node in relation.findall("node")] 
        test_name = relation.find("infon[@key='type']").text
        if 'performance' in test_name: #irrelevant tests
            continue
        if test_name not in test_names:
            test_names.add(test_name)
        else:
            test_name = test_name + "_2"
        for node_id in node_ids:
            ##handles node_id being in multiple relations
            if node_id not in relations:
                relations[node_id] = [test_name]
            else:
                relations[node_id].append(test_name)
    related_data = []
    passivators = set()
    for annotation in root.findall('.//annotation'):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = concept_id if concept_id is not None else value
        if var_name == "passivating_molecule":
            passivators.add(value)
            has_passivator = True
            if node_id in relations:
                annotated_correctly = True
        if var_name in ["treated_pce", "treated_voc", "control_pce", "control_voc"]:
            related_data.append(node_id)
    if has_passivator and annotated_correctly and len(passivators) > 1:

        for node_id in related_data:
            if node_id not in relations:
                print("id: " + extract_papernum(root))
                print(file_path)
                print(passivators)
                print(node_id + " has no relation")
                break
    if not has_passivator:
        annotated_correctly = True
    return annotated_correctly
    

In [60]:
bioc_dir = "data/biocs"
data = []
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        root = ET.parse(file_path).getroot()
        if not passivator_correct(file_path):
            data.append({"paper_num": extract_papernum(root)})

df = pd.DataFrame(data)

id: 8
data/biocs\250108025023546GFCDAG_v2.xml
{'pyrene based ammonium iodide', 'pyrene based methylammonium iodide', 'pyrene based ethylammonium iodide'}
21 has no relation
id: 14
data/biocs\250108025024844HHAEHG_v2.xml
{'4-chlorophenylethylammonium iodide', '2-phenylethylammonium iodide', '4-fluorophenylethylammonium iodide'}
2 has no relation
id: 39
data/biocs\250108025029594BEDIIG_v2.xml
{'2-thiophenemethylammonium bromide', '2-thiophenemethylammonium chloride', '2-thiophenemethylammonium iodide'}
12 has no relation
id: 73
data/biocs\250108025036593HBGDID_v2.xml
{'3,5-bis(trifluoromethyl) phenethylammonium iodide', 'phenethylammonium iodide'}
19 has no relation
id: 95
data/biocs\250108025040918FDBDGB_v2.xml
{'dodecylammonium iodide', 'octylammonium iodide', 'oleylammonium iodide', 'butylammonium iodide'}
12 has no relation
id: 114
data/biocs\250108025044907BDBFDI_v2.xml
{'Piperazinium Diiodide', 'piperazinium iodide'}
19 has no relation
id: 146
data/biocs\250108025051792AFFADG_v2.xm

In [39]:
df

Unnamed: 0,paper_num
0,1
1,9
2,16
3,18
4,20
5,27
6,36
7,37
8,40
9,46


In [23]:
bioc_dir = "data/biocs"
data = []
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        curr_paper_chunks = parse_bioc_into_chunks(file_path)
        data.extend(curr_paper_chunks)

df = pd.DataFrame(data)

0
True
True
True
{"perovskite_composition": "Cs0.05FA0.85MA0.1PbI3", "electron_transport_layer": "C60", "hole_transport_layer": "2PACz", "structure_pin_nip": "PIN", "test_1": {"stability_type": "ISOSL3", "passivating_molecule": "4-chlorobenzenesulfonate", "humidity": "50", "temperature": "65", "time": "1200", "control_pce": "24", "treated_pce": "26.9", "control_voc": null, "treated_voc": "1.18", "efficiency_cont": null, "efficiency_tret": "95", "structure_pin_nip": "PIN", "perovskite_composition": "Cs0.05FA0.85MA0.1PbI3", "hole_transport_layer": "2PACz", "electron_transport_layer": "C60"}, "test_1_2": {"stability_type": "ISOSL3", "passivating_molecule": null, "humidity": "50", "temperature": "85", "time": "540", "control_pce": "24", "treated_pce": "26.9", "control_voc": null, "treated_voc": null, "efficiency_cont": null, "efficiency_tret": "87"}, "test_2": {"stability_type": "ISOSD2", "passivating_molecule": null, "humidity": null, "temperature": "85", "time": "1500", "control_pce": "2

In [25]:
df

Unnamed: 0,id,text,memory,output
0,0_0,Paper #: 0\r,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
1,0_1,Interfacial engineering from material to solve...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
2,0_2,Formamidinium lead triiodide (FAPbI 3 ) has re...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
3,0_3,Introduction Hybrid organic-inorganic lead hal...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
4,0_4,Results and discussion According to the previ...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
...,...,...,...,...
5327,#:144_58,\t\t\t This journal is © The Royal Society of ...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
5328,#:144_59,\t\t\t Published on 12 April 2022. Downloaded ...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
5329,#:144_60,"\t\t\t S. D. Stranks and H. J. Snaith, Metal-h...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
5330,#:144_61,"\t\t\t 2015, 10, 391-402. 2 P. K. Nayak, S. Ma...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."


In [26]:
# df.to_csv('data/150_papers_json_perchunk.csv', index=False)

In [27]:
def get_numbers(string):
    if ":" in string:
        return string.split(":")[1]
    else:
        return string

In [28]:
df[['first_num', 'second_num']] = df['id'].str.split('_', expand=True)

# print(df['first_num'])
# Step 2: Convert 'first_num' to numeric for proper sorting
df['first_num'] = df['first_num'].apply(get_numbers)
df['first_num'] = df['first_num'].astype(int)

# Step 3: Group by 'first_num' and get the last row of each group
result = df.groupby('first_num', as_index=False).last()
result.to_csv('data/150_papers_json_update.csv', index=False)

In [32]:
result['first_num'].unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  35,  36,  37,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149], dtype=int64)

## Troubleshooting with passage 0

In [10]:
paper_0 = "data/biocs/250108025021837BGIFAF_v1.xml"
curr_paper_chunks = parse_bioc_into_chunks(paper_0)
curr_paper_chunks

[{'id': '0_0',
  'text': 'Paper #: 0\r',
  'memory': '{"perovskite_composition": null, "electron_transport_layer": null, "hole_transport_layer": null, "structure_pin_nip": null}',
  'output': '{"perovskite_composition": null, "electron_transport_layer": null, "hole_transport_layer": null, "structure_pin_nip": null}'},
 {'id': '0_1',
  'text': 'Interfacial engineering from material to solvent: A mechanistic understanding on stabilizing α-formamidinium lead triiodide perovskite photovoltaics\r',
  'memory': '{"perovskite_composition": null, "electron_transport_layer": null, "hole_transport_layer": null, "structure_pin_nip": null}',
  'output': '{"perovskite_composition": null, "electron_transport_layer": null, "hole_transport_layer": null, "structure_pin_nip": null}'},
 {'id': '0_2',
  'text': 'Formamidinium lead triiodide (FAPbI 3 ) has recently been considered as the most promising candidate to achieve highly efficient perovskite solar cells (PSCs). Excitingly, the state-of-the-art hig

In [11]:
string_0 = result['output'][0]

In [12]:
import json

In [13]:
json_string = string_0.replace("None", "null")
json_object = json.loads(json_string)
json_object

{'perovskite_composition': 'Cs0.05FA0.85MA0.1PbI3',
 'electron_transport_layer': 'C60',
 'hole_transport_layer': '2PACz',
 'structure_pin_nip': 'PIN',
 'test_1': {'stability_type': 'ISOSL3',
  'passivating_molecule': '4-chlorobenzenesulfonate',
  'humidity': '50',
  'temperature': '65',
  'time': '1200',
  'control_pce': '24',
  'treated_pce': '26.9',
  'control_voc': None,
  'treated_voc': '1.18',
  'efficiency_cont': None,
  'efficiency_tret': '95'},
 'test_1_2': {'stability_type': 'ISOSL3',
  'passivating_molecule': '4-chlorobenzenesulfonate',
  'humidity': '50',
  'temperature': '85',
  'time': '540',
  'control_pce': '24',
  'treated_pce': '26.9',
  'control_voc': None,
  'treated_voc': None,
  'efficiency_cont': None,
  'efficiency_tret': '87'},
 'test_2': {'stability_type': 'ISOSD2',
  'passivating_molecule': '4-chlorobenzenesulfonate',
  'humidity': None,
  'temperature': '85',
  'time': '1500',
  'control_pce': '24',
  'treated_pce': '26.9',
  'control_voc': None,
  'treated_v

In [15]:
df.to_csv("data/chunked_training.csv", index=False)

In [8]:
(df[df["id"].str.startswith("0_")].iloc[-1]["output"])

'{"perovskite_composition": "Cs 0.05 FA 0.85 MA 0.1 PbI 3", "electron_transport_layer": "C60", "hole_transport_layer": "2PACz and Me-4PACz", "structure_pin_nip": "PIN", "test_1": {"stability_type": "ISOSL", "passivating_molecule": "4-chlorobenzenesulfonate (4Cl-BZS)", "humidity": null, "temperature": "65", "time": "1200", "control_pce": "24", "treated_pce": "26.9", "control_voc": null, "treated_voc": "1.18", "efficiency_control": null, "efficiency_tret": "95%"}}'

In [9]:
paper_8 = df[df["id"].str.startswith("8_")]

In [10]:
paper_8.iloc[-1]["output"]

'{"perovskite_composition": null, "electron_transport_layer": "Tin Oxide", "hole_transport_layer": "Spiro-OMeTAD", "structure_pin_nip": "NIP", "test_1": {"stability_type": "ISOSLT", "passivating_molecule": "ethylammonium pyrene", "humidity": null, "temperature": "40", "time": "2000", "control_pce": "19.3", "treated_pce": "22.4", "control_voc": null, "treated_voc": "1.177", "efficiency_control": null, "efficiency_tret": "0.85", "efficiency_cont": "0.6"}}'

In [11]:
example_1 = paper_8[paper_8['text'].str.contains("Supplementary Text S3")]

In [13]:
example_1.to_csv("data/chunked_example.csv", index=False)