## This file is used to create 150_papers_json.csv for annotation scraping

In [24]:
import json
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [25]:
EMPTY_PAPER_DATA = {
    "perovskite_composition": None,
    "electron_transport_layer": None,
    "hole_transport_layer": None,
    "structure_pin_nip": None,
}
EMPTY_STABILITY_TEST = {
    "stability_type": None,
    "passivating_molecule": None,
    "humidity": None,
    "temperature": None,
    "time": None,
    "control_pce": None,
    "treated_pce": None,
    "control_voc": None,
    "treated_voc": None,
    "efficiency_cont": None,
    "efficiency_tret": None
}

In [26]:
def get_json_for_passage(passage, relations, previous_json):
    concept_ids = set()
    for annotation in passage.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id is not None else value
        # if var_name == "perovskite_molecule": #due to an error in some of the annotations
        #     var_name = "passivating_molecule"
        if var_name in ["additive_molecule", "treatment_element", "control_element", "metal_contact"]: #irrelevant
            continue

        if var_name in ["perovskite_composition", "structure_pin_nip", "electron_transport_layer", "hole_transport_layer" ]:
            #in top level: composition, ETL, HTL, PIN-NIP,
            previous_json[var_name] = value
        elif node_id in relations:
            test_names = relations[node_id]
            for test_name in test_names:
                if test_name not in previous_json:
                    previous_json[test_name] = EMPTY_STABILITY_TEST.copy()
                previous_json[test_name][var_name] = value
        elif len(relations.keys()) == 0:
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value
            #in stability tests:
            #test type, passivator, PCE (control + treat), VOC (control + treat)
            #efficiency (treat, control), temp, time, humidity
        else:
            #assumes that all other possible data goes into the first stability test
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value

    return previous_json
    


In [27]:
def extract_papernum(root):
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num

In [28]:
def parse_bioc_into_chunks(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    passages = root.findall('.//passage')
    data = []

    relations = {}
    test_names = set()
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        if 'performance' in test_name: #irrelevant tests
            continue
        if test_name not in test_names:
            test_names.add(test_name)
        else:
            test_name = test_name + "_2"
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            if node_id not in relations:
                relations[node_id] = [test_name]
            else:
                relations[node_id].append(test_name)

    paper_num = extract_papernum(root)
    curr_json = EMPTY_PAPER_DATA.copy()
    for relation in root.findall('.//relation'):
        test_name = relation.find
    for i, passage in enumerate(passages):
        passage_text = passage.find('.//text').text
        row = { "id": f"{paper_num}_{i}", "text": passage_text, "memory": json.dumps(curr_json) }
        curr_json = get_json_for_passage(passage, relations, curr_json)
        row['output'] = json.dumps(curr_json)
        data.append(row)
    return data

In [30]:
bioc_dir = "../biocs"
data = []
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        curr_paper_chunks = parse_bioc_into_chunks(file_path)
        data.extend(curr_paper_chunks)

df = pd.DataFrame(data)

In [13]:
df

Unnamed: 0,id,text,memory,output
0,0_0,Paper #: 0\r,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
1,0_1,Interfacial engineering from material to solve...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
2,0_2,Formamidinium lead triiodide (FAPbI 3 ) has re...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
3,0_3,Introduction Hybrid organic-inorganic lead hal...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
4,0_4,Results and discussion According to the previ...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
...,...,...,...,...
5298,#:144_58,\t\t\t This journal is © The Royal Society of ...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
5299,#:144_59,\t\t\t Published on 12 April 2022. Downloaded ...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
5300,#:144_60,"\t\t\t S. D. Stranks and H. J. Snaith, Metal-h...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
5301,#:144_61,"\t\t\t 2015, 10, 391-402. 2 P. K. Nayak, S. Ma...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."


In [14]:
# df.to_csv('data/150_papers_json_perchunk.csv', index=False)

In [15]:
def get_numbers(string):
    if ":" in string:
        return string.split(":")[1]
    else:
        return string

In [16]:
df[['first_num', 'second_num']] = df['id'].str.split('_', expand=True)

# print(df['first_num'])
# Step 2: Convert 'first_num' to numeric for proper sorting
df['first_num'] = df['first_num'].apply(get_numbers)
df['first_num'] = df['first_num'].astype(int)

# Step 3: Group by 'first_num' and get the last row of each group
result = df.groupby('first_num', as_index=False).last()
result.to_csv('data/150_papers_json_update.csv', index=False)

## The 150_papers_json_update.csv is exported

In [17]:
result['first_num'].unique()

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  35,  36,  37,  39,  40,
        41,  42,  43,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,
        69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,
        82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
        95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107,
       108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
       121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133,
       134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146,
       147, 148, 149], dtype=int64)

## Troubleshooting with passage 0

In [18]:
paper_0 = "data/biocs/250108025021837BGIFAF_v1.xml"
curr_paper_chunks = parse_bioc_into_chunks(paper_0)
curr_paper_chunks

FileNotFoundError: [Errno 2] No such file or directory: 'data/biocs/250108025021837BGIFAF_v1.xml'

In [19]:
string_0 = result['output'][0]

In [20]:
import json

In [21]:
json_string = string_0.replace("None", "null")
json_object = json.loads(json_string)
json_object

{'perovskite_composition': 'Cs0.05FA0.85MA0.1PbI3',
 'electron_transport_layer': 'C60',
 'hole_transport_layer': '2PACz',
 'structure_pin_nip': 'PIN',
 'test_1': {'stability_type': 'ISOSL3',
  'passivating_molecule': '4-chlorobenzenesulfonate',
  'humidity': '50',
  'temperature': '65',
  'time': '1200',
  'control_pce': '24',
  'treated_pce': '26.9',
  'control_voc': None,
  'treated_voc': '1.18',
  'efficiency_cont': None,
  'efficiency_tret': '95'},
 'test_1_2': {'stability_type': 'ISOSL3',
  'passivating_molecule': '4-chlorobenzenesulfonate',
  'humidity': '50',
  'temperature': '85',
  'time': '540',
  'control_pce': '24',
  'treated_pce': '26.9',
  'control_voc': None,
  'treated_voc': None,
  'efficiency_cont': None,
  'efficiency_tret': '87'},
 'test_2': {'stability_type': 'ISOSD2',
  'passivating_molecule': '4-chlorobenzenesulfonate',
  'humidity': None,
  'temperature': '85',
  'time': '1500',
  'control_pce': '24',
  'treated_pce': '26.9',
  'control_voc': None,
  'treated_v

In [22]:
df.to_csv("data/chunked_training.csv", index=False)

In [23]:
(df[df["id"].str.startswith("0_")].iloc[-1]["output"])

'{"perovskite_composition": "Cs0.05FA0.85MA0.1PbI3", "electron_transport_layer": "C60", "hole_transport_layer": "2PACz", "structure_pin_nip": "PIN", "test_1": {"stability_type": "ISOSL3", "passivating_molecule": "4-chlorobenzenesulfonate", "humidity": "50", "temperature": "65", "time": "1200", "control_pce": "24", "treated_pce": "26.9", "control_voc": null, "treated_voc": "1.18", "efficiency_cont": null, "efficiency_tret": "95"}, "test_1_2": {"stability_type": "ISOSL3", "passivating_molecule": "4-chlorobenzenesulfonate", "humidity": "50", "temperature": "85", "time": "540", "control_pce": "24", "treated_pce": "26.9", "control_voc": null, "treated_voc": null, "efficiency_cont": null, "efficiency_tret": "87"}, "test_2": {"stability_type": "ISOSD2", "passivating_molecule": "4-chlorobenzenesulfonate", "humidity": null, "temperature": "85", "time": "1500", "control_pce": "24", "treated_pce": "26.9", "control_voc": null, "treated_voc": null, "efficiency_cont": null, "efficiency_tret": "95"}}

In [9]:
paper_8 = df[df["id"].str.startswith("8_")]

In [10]:
paper_8.iloc[-1]["output"]

'{"perovskite_composition": null, "electron_transport_layer": "Tin Oxide", "hole_transport_layer": "Spiro-OMeTAD", "structure_pin_nip": "NIP", "test_1": {"stability_type": "ISOSLT", "passivating_molecule": "ethylammonium pyrene", "humidity": null, "temperature": "40", "time": "2000", "control_pce": "19.3", "treated_pce": "22.4", "control_voc": null, "treated_voc": "1.177", "efficiency_control": null, "efficiency_tret": "0.85", "efficiency_cont": "0.6"}}'

In [11]:
example_1 = paper_8[paper_8['text'].str.contains("Supplementary Text S3")]

In [13]:
example_1.to_csv("data/chunked_example.csv", index=False)