In [1]:
import xml.etree.ElementTree as ET 
import pandas as pd 
import os

In [2]:
def parse_bioc(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    data = { }
    concept_ids = set()
    relations = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            relations[node_id] = test_name

    for annotation in root.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        if node_id in relations:
            stability_test = relations[node_id]
            if stability_test not in data:
                data[stability_test] = {}
            if concept_id is None:
                concept_id = value
            data[stability_test][var_name] = concept_id
            concept_ids.add(concept_id)

        if concept_id is None:
            concept_id = value
        if concept_id in concept_ids: # duplicate annotation
            continue
        concept_ids.add(concept_id)
        if var_name not in data:
            data[var_name] = concept_id
        else:
            if isinstance(data[var_name], list):
                data[var_name].append(value)
            else:
                data[var_name] = [data[var_name], value]
    
    return data

In [7]:
bioc_dir = "../../data/biocs"
data = []
for filename in os.listdir(bioc_dir):
    if filename != "241114001214901GACDFC_v4.xml":
        continue
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        row = parse_bioc(file_path)
        if len(row.keys()) > 0:
            data.append(row)
    
print(data[0])
df = pd.DataFrame(data)
df.head()

{'perovskite_molecule': 'phenethylammonium iodide', 'perovskite_composition': 'MAPbI3', 'ISOSD1': {'time': '240', 'treated_pce': '15.3', 'control_pce': '16.69', 'temperature': '25', 'humidity': '90', 'control_voc': '1.03', 'treated_voc': '1.06'}, 'electron_transport_layer': 'TiO2', 'hole_transport_layer': 'Spiro-OMeTAD'}


Unnamed: 0,perovskite_molecule,perovskite_composition,ISOSD1,electron_transport_layer,hole_transport_layer
0,phenethylammonium iodide,MAPbI3,"{'time': '240', 'treated_pce': '15.3', 'contro...",TiO2,Spiro-OMeTAD


In [16]:
df[df['treated_pce_peak'].isna() == False]

Unnamed: 0,ISOSL3,structure_pin_nip,passivating_molecule,perovskite_composition,hole_transport_layer,electron_transport_layer,journal_publication,date_published,ISOSD2,treated_voc,...,ISOSL2,time,efficiency_cont,treated_pce_peak,control_voc_peak,treated_voc_peak,ISOST2,ISOST1,control_pce_average,treated_pce_average
23,,,tri-n-octylphosphine (TOPO) 26,,,,,,,,...,,,,19.22%,,,,,,
37,,PIN,,"[PCBM, choline chloride, FA 0.85 MA 0.15 Pb(I ...",,,,,,QAHs,...,,,,QUAs,,"[L-α-phosphatidylcholine, 16.1 ± 1.1%]",,,,
53,,,,,"[PEDOT:PSS, NiO]","[C60, C 60, BCP, Cu, SnO 2]",,,,,...,,600 h,,26.7,,,,,22.0,26.4%


In [17]:
df.to_csv("../../data/bioc_parsed.csv", index=False)

## Building Training Data

In [21]:
training_data = []
txt_dir = "../../data/txts"
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        tree = ET.parse(file_path)
        root = tree.getroot()
        annotations = root.findall(".//annotation")
        if len(annotations) == 0: # has not been annotated yet
            continue
        paper_number = root.find(".//passage").find("text").text
        paper_number = paper_number.split(": ")[1]
        txt_filepath = os.path.join(txt_dir, f"{paper_number}.txt")
        with open(txt_filepath, "r", encoding="utf-8") as f:
            unannotated = f.read()
        with open(file_path, "r", encoding="utf-8") as f:
            xml_text = f.read()
        row = { "paper_id": paper_number, "unannotated": unannotated, "annotated": xml_text }
        training_data.append(row)
df = pd.DataFrame(training_data)
df.to_csv("../../data/training_data.csv", index=False)
df.head()


Unnamed: 0,paper_id,unannotated,annotated
0,0,Paper #: 0\nInterfacial engineering from mater...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
1,1,Paper #: 1\nSurface passivation of perovskite ...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
2,2,Paper #: 2\nIntact 2D/3D halide junction perov...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
3,3,Paper #: 3\nDeterministic fabrication of 3D/2D...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
4,5,Paper #: 5\nFront-contact passivation through ...,"<?xml version=""1.0"" encoding=""UTF-8""?>\n<!DOCT..."
