In [2]:
import json
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [58]:
EMPTY_PAPER_DATA = {
    "perovskite_composition": None,
    "electron_transport_layer": None,
    "hole_transport_layer": None,
    "structure_pin_nip": None,
}
EMPTY_STABILITY_TEST = {
    "stability_type": None,
    "passivating_molecule": None,
    "humidity": None,
    "temperature": None,
    "time": None,
    "control_pce": None,
    "treated_pce": None,
    "control_voc": None,
    "treated_voc": None,
    "efficiency_control": None,
    "efficiency_tret": None
}

In [87]:
def get_json_for_passage(passage, relations, previous_json):
    concept_ids = set()
    for annotation in passage.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id is not None else value
        if var_name == "perovskite_molecule": #due to an error in some of the annotations
            var_name = "passivating_molecule"
        if var_name in ["additive_molecule", "treatment_element", "control_element", "metal_contact"]: #irrelevant
            continue

        if var_name in ["perovskite_composition", "structure_pin_nip", "electron_transport_layer", "hole_transport_layer" ]:
            #in top level: composition, ETL, HTL, PIN-NIP,
            previous_json[var_name] = value
        elif node_id in relations:
            test_name = relations[node_id]
            if test_name not in previous_json:
                previous_json[test_name] = EMPTY_STABILITY_TEST.copy()
            previous_json[test_name][var_name] = value
        elif len(relations.keys()) == 0:
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value
            #in stability tests:
            #test type, passivator, PCE (control + treat), VOC (control + treat)
            #efficiency (treat, control), temp, time, humidity
        else:
            #assumes that all other possible data goes into the first stability test
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value

    return previous_json
    


In [88]:
def extract_papernum(root):
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num

In [89]:
def parse_bioc_into_chunks(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    passages = root.findall('.//passage')
    data = []

    relations = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        if 'performance' in test_name: #irrelevant tests
            continue
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            relations[node_id] = test_name
    
    paper_num = extract_papernum(root)
    curr_json = EMPTY_PAPER_DATA.copy()
    for relation in root.findall('.//relation'):
        test_name = relation.find
    for i, passage in enumerate(passages):
        passage_text = passage.find('.//text').text
        row = { "id": f"{paper_num}_{i}", "text": passage_text, "memory": curr_json.copy() }
        curr_json = get_json_for_passage(passage, relations, curr_json)
        row['output'] = curr_json.copy()
        data.append(row)
    return data

In [90]:
bioc_dir = "data/biocs"
data = []
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        curr_paper_chunks = parse_bioc_into_chunks(file_path)
        data.extend(curr_paper_chunks)

df = pd.DataFrame(data)

In [91]:
df

Unnamed: 0,id,text,memory,output
0,0_0,Paper #: 0\r,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
1,0_1,Interfacial engineering from material to solve...,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
2,0_2,Formamidinium lead triiodide (FAPbI 3 ) has re...,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
3,0_3,Introduction Hybrid organic-inorganic lead hal...,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
4,0_4,Results and discussion According to the previ...,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
...,...,...,...,...
5307,75_30,Table S1 . S1 Optimization of the 2D layer th...,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
5308,75_31,Table S2 . S2 Statistics of the photovoltaics...,"{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
5309,75_32,"\t\t\t Certain commercial equipment, instrumen...","{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."
5310,75_33,"\t\t\t J. Ilavsky, Nika: software for two-dime...","{'perovskite_composition': None, 'electron_tra...","{'perovskite_composition': None, 'electron_tra..."


In [92]:
paper_8 = df[df["id"].str.startswith("8_")]

In [93]:
paper_8["Perovskite Thin" in paper_8["text"]]

KeyError: False

In [94]:
paper_8.iloc[-1]["output"]

{'perovskite_composition': None,
 'electron_transport_layer': 'Tin Oxide',
 'hole_transport_layer': 'Spiro-OMeTAD',
 'structure_pin_nip': 'NIP',
 'test_1': {'stability_type': 'ISOSLT',
  'passivating_molecule': 'ethylammonium pyrene',
  'humidity': None,
  'temperature': '40',
  'time': '2000',
  'control_pce': '19.3',
  'treated_pce': '22.4',
  'control_voc': None,
  'treated_voc': '1.177',
  'efficiency_control': None,
  'efficiency_tret': '0.85',
  'efficiency_cont': '0.6'}}

In [96]:
example_1 = paper_8[paper_8['text'].str.contains("Supplementary Text S3")]

In [98]:
example_1[""].iloc[0]

{'perovskite_composition': None,
 'electron_transport_layer': 'Tin Oxide',
 'hole_transport_layer': 'Spiro-OMeTAD',
 'structure_pin_nip': 'NIP'}

In [99]:
example_1.to_csv("data/chunked_example.csv", index=False)