In [1]:
import json
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean

### Parsing BioC XML

In [2]:
EMPTY_PAPER_DATA = {
    "perovskite_composition": None,
    "electron_transport_layer": None,
    "hole_transport_layer": None,
    "structure_pin_nip": None,
    "passivating_molecule": None,
    "control_pce": None,
    "treated_pce": None,
    "control_voc": None,
    "treated_voc": None,
}
EMPTY_STABILITY_TEST = {
    "stability_type": None,
    "humidity": None,
    "temperature": None,
    "time": None,
    "efficiency_cont": None,
    "efficiency_tret": None
}

In [3]:
def get_first_passivator(root, relations):
    passivators = {}
    for annotation in root.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id else value
        if var_name == "passivating_molecule":
            if value in passivators:
                passivators[value].append(node_id)
            else:
                passivators[value] = [node_id]
    for passivator in passivators:
        passivator_nodes = passivators[passivator]
        for node_id in passivator_nodes:
            if node_id in relations:
                return passivator, node_id
    
    #case where there are no passivators in a relation but there are passivators in the paper
    for passivator in passivators:
        #arbitrarily choose the first passivator
        return passivator, passivators[passivator][0]
    
    #case where there are no passivators in the paper
    return None, None

In [4]:
def get_num_passivators(root):
    passivators = set()
    for annotation in root.findall(".//annotation"):
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id else value
        if var_name == "passivating_molecule":
            passivators.add(value)
    return len(passivators)

In [52]:
def get_json_for_passage(passage, relations, previous_json, passivator, passivator_node_id, num_passivators):
    concept_ids = set()
    
    for annotation in passage.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        value = concept_id if concept_id is not None else value
        if var_name in ["additive_molecule", "treatment_element", "control_element", "metal_contact", "perovskite_molecule"]: #irrelevant
            continue

        related_not_nested = ["treated_pce", "control_pce", "treated_voc", "control_voc", "passivating_molecule"]
        related_nested = ["stability_type", "humidity", "temperature", "time", "efficiency_cont", "efficiency_tret"]
        if var_name in ["perovskite_composition", "structure_pin_nip", "electron_transport_layer", "hole_transport_layer" ]:
            #in top level: composition, ETL, HTL, PIN-NIP,
            previous_json[var_name] = value
        elif var_name in related_not_nested:
            if num_passivators <= 1:
                previous_json[var_name] = value
            elif node_id not in relations: #unable to infer if it belongs to the passivator
                previous_json[var_name] = None
            else:
                relevant_tests = relations[passivator_node_id]
                node_tests = relations[node_id]
                for test_name in node_tests:
                    if test_name not in relevant_tests: #only gets if it is related to the passivator
                        continue
                    previous_json[var_name] = value
        elif node_id in relations and var_name in related_nested:
            test_names = relations[node_id]
            relevant_tests = test_names
            if num_passivators > 1:
                relevant_tests = relations[passivator_node_id]
            for test_name in test_names:
                if test_name not in relevant_tests:
                    continue
                if test_name not in previous_json:
                    previous_json[test_name] = EMPTY_STABILITY_TEST.copy()
                previous_json[test_name][var_name] = value
        elif len(relations.keys()) == 0:
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value
            #in stability tests:
            #test type, passivator, PCE (control + treat), VOC (control + treat)
            #efficiency (treat, control), temp, time, humidity
        else:
            #assumes that all other possible data goes into the first stability test
            if "test_1" not in previous_json:
                previous_json["test_1"] = EMPTY_STABILITY_TEST.copy()
            previous_json["test_1"][var_name] = value

    return previous_json
    


In [53]:
def extract_papernum(root):
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num

In [54]:
def parse_bioc_into_chunks(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    passages = root.findall('.//passage')
    data = []

    relations = {}
    test_names = set()
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        if test_name in test_names:
            test_name = test_name + "_2"
        test_names.add(test_name)
        if 'performance' in test_name:
            continue
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            if node_id not in relations:
                relations[node_id] = [test_name]
            else:
                relations[node_id].append(test_name)
    num_passivators = get_num_passivators(root)
    passivator, passivator_node_id = get_first_passivator(root, relations)
    paper_num = extract_papernum(root)
    curr_json = EMPTY_PAPER_DATA.copy()
    for relation in root.findall('.//relation'):
        test_name = relation.find
    for i, passage in enumerate(passages):
        passage_text = passage.find('.//text').text
        row = { "id": f"{paper_num}_{i}", "text": passage_text, "memory": json.dumps(curr_json) }
        curr_json = get_json_for_passage(passage, relations, curr_json, passivator, passivator_node_id, num_passivators)
        row['output'] = json.dumps(curr_json)
        data.append(row)
    return data

In [55]:
bioc_dir = "data/biocs"
data = []
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        curr_paper_chunks = parse_bioc_into_chunks(file_path)
        data.extend(curr_paper_chunks)


In [56]:
df = pd.DataFrame(data)

In [57]:
df["paper_num"] = df["id"].apply(lambda x: x.split("_")[0])
df.groupby("paper_num").last()["output"].iloc[0]

'{"perovskite_composition": null, "electron_transport_layer": "C60", "hole_transport_layer": "Poly(3,4-ethylenedioxythiophene):Poly(styrene sulfonate)", "structure_pin_nip": "PIN", "passivating_molecule": "ethylenediammonium diiodide", "control_pce": "19.6", "treated_pce": "23.6", "control_voc": "0.79", "treated_voc": "0.89", "test_2": {"stability_type": "ISOSLT", "humidity": null, "temperature": null, "time": "180", "efficiency_cont": "60", "efficiency_tret": "65"}, "test_1": {"stability_type": "ISOSLT", "humidity": null, "temperature": null, "time": "200", "efficiency_cont": "60", "efficiency_tret": "80"}}'

In [59]:
df = pd.DataFrame(data)
df.to_csv("data/150_papers_json_chunked_schema2.csv")

### Filtering by similarity to prompt

In [5]:
df = pd.read_csv('data/150_papers_json_chunked_schema2.csv')

In [6]:
df

Unnamed: 0.1,Unnamed: 0,id,text,memory,output
0,0.0,134_0,Paper #: 134,,
1,,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra...",,
2,1.0,134_1,was calculated to be -6.08 and -6.06 eV. The i...,,
3,,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra...",,
4,2.0,134_2,Supplementary Note 3. The moisture stability ...,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
...,...,...,...,...,...
7047,5299.0,#:144_59,\t\t\t Published on 12 April 2022. Downloaded ...,,
7048,,"{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra...",,
7049,5300.0,#:144_60,"\t\t\t S. D. Stranks and H. J. Snaith, Metal-h...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."
7050,5301.0,#:144_61,"\t\t\t 2015, 10, 391-402. 2 P. K. Nayak, S. Ma...","{""perovskite_composition"": null, ""electron_tra...","{""perovskite_composition"": null, ""electron_tra..."


In [None]:
# df[['first_num', 'second_num']] = df['id'].str.split('_', expand=True)

# # Step 2: Convert 'first_num' to numeric for proper sorting
# df['first_num'] = df['first_num'].astype(int)

# # Step 3: Group by 'first_num' and get the last row of each group
# result = df.groupby('first_num', as_index=False).last()
# result.to_csv('data/150_papers_json.csv', index=False)

In [8]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
model = SentenceTransformer("Alibaba-NLP/GTE-Qwen2-1.5B-instruct")

Downloading shards: 100%|██████████| 2/2 [00:35<00:00, 17.66s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  8.27it/s]


In [10]:
simple_prompt = "Identify the passivating molecule tested, the corresponding PCE, VOC, and any stability test data (efficiency retained over time, temperature, test type)."
query = model.encode(simple_prompt, convert_to_tensor=True)

cosine_similarities = []
euclidean_distances = []
for index, row in df.iterrows():
    text_embeddings = model.encode(row["text"], convert_to_tensor=True)
    cosine_sim = util.pytorch_cos_sim(query, text_embeddings).item()
    euclidean_dist = euclidean(query.cpu().numpy(), text_embeddings.cpu().numpy())
    cosine_similarities.append(cosine_sim)
    euclidean_distances.append(euclidean_dist)

combined_scores = [0.7 * cos_sim - 0.3 * eucl_dist for cos_sim, eucl_dist in zip(cosine_similarities, euclidean_distances)]

threshold = np.percentile(combined_scores, 70)


In [10]:
len(df)

7052

In [11]:
relevant_chunks = []
for i, value in enumerate(combined_scores):
    if value >= threshold:
        relevant_chunks.append(df.iloc[i])

In [15]:
len(relevant_chunks)

2116

Unnamed: 0.1,Unnamed: 0,id,text,memory,output
2,1.0,134_1,was calculated to be -6.08 and -6.06 eV. The i...,,
24,19.0,134_19,Table 4 . 4 Photovoltaic parameters for PSCs ...,,
31,23.0,134_23,Table 8 . 8 Photovoltaic parameters for targe...,,
33,24.0,134_24,Device V OC (V) J SC (mA/cm 2 ) FF (%) PCE (%...,,
106,82.0,37_28,Pseudo-J-V curves Figure R1. 1 1 Figure R1. 1...,,
...,...,...,...,...,...
6965,5237.0,129_103,Table S4 . S4 Representative device performan...,,
7006,5270.0,#:144_30,Fig. S5 .Fig. S6 . S5S6 Fig. S5. ToF-SIMS dep...,,
7010,5273.0,#:144_33,Fig. S16 .Fig. S17 . S16S17 Fig. S16. ToF-SIM...,,
7021,5282.0,#:144_42,Fig. S33 .Fig. S37 . S33S37 Fig. S33. Stabili...,,


In [18]:
df = pd.DataFrame(relevant_chunks)
df = df[df["output"].isna() == False]
df.to_csv("data/chunked_training_schema2.csv", index=False)