In [1]:
import json
import os
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from scipy.spatial.distance import euclidean

  from .autonotebook import tqdm as notebook_tqdm


### Parsing BioC XML

In [2]:
# Paper-level fields. Every value starts as None and is filled in from annotations.
EMPTY_PAPER_DATA = dict.fromkeys(
    (
        "perovskite_composition",
        "electron_transport_layer",
        "hole_transport_layer",
        "structure_pin_nip",
    )
)

# Template for one stability test; a fresh copy is made for each test in a paper.
# NOTE(review): "efficiency_tret" looks like a misspelling of "efficiency_treat".
# The key is kept unchanged because it ends up in the serialized output --
# confirm against existing data before renaming.
EMPTY_STABILITY_TEST = dict.fromkeys(
    (
        "stability_type",
        "passivating_molecule",
        "humidity",
        "temperature",
        "time",
        "control_pce",
        "treated_pce",
        "control_voc",
        "treated_voc",
        "efficiency_control",
        "efficiency_tret",
    )
)

In [3]:
def get_json_for_passage(passage, relations, previous_json):
    """Fold the annotations of one BioC passage into the running paper record.

    Args:
        passage: ``<passage>`` element whose ``<annotation>`` descendants are read.
        relations: Mapping of annotation id -> stability-test name.
        previous_json: Record accumulated so far. It is updated in place.

    Returns:
        The same ``previous_json`` object, after the update.
    """
    top_level_fields = (
        "perovskite_composition",
        "structure_pin_nip",
        "electron_transport_layer",
        "hole_transport_layer",
    )
    ignored_fields = (
        "additive_molecule",
        "treatment_element",
        "control_element",
        "metal_contact",
    )

    for annotation in passage.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        if var_name == "perovskite_molecule":  # due to an error in some of the annotations
            var_name = "passivating_molecule"
        if var_name in ignored_fields:  # irrelevant
            continue

        # Prefer the identifier; fall back to the annotated text when the
        # identifier infon is absent or empty (previously an absent infon crashed).
        identifier_elem = annotation.find("infon[@key='identifier']")
        concept_id = identifier_elem.text if identifier_elem is not None else None
        text_elem = annotation.find("text")
        raw_text = text_elem.text if text_elem is not None else None
        value = concept_id if concept_id is not None else raw_text

        if var_name in top_level_fields:
            # Top level: composition, ETL, HTL, PIN-NIP.
            previous_json[var_name] = value
            continue

        # Stability-test fields: test type, passivator, PCE/VOC (control + treated),
        # efficiency, temperature, time, humidity. Annotations not linked by any
        # relation are assumed to belong to the first stability test.
        test_name = relations.get(node_id, "test_1")
        if test_name not in previous_json:
            previous_json[test_name] = EMPTY_STABILITY_TEST.copy()
        previous_json[test_name][var_name] = value

    return previous_json
    


In [4]:
def extract_papernum(root):
    """Return the paper number taken from the first ``<text>`` element.

    The number is the last whitespace-separated token of that element's text.

    Args:
        root: Root element of a parsed BioC XML document.

    Returns:
        The last token of the first ``<text>`` element under ``root``, as a string.

    Raises:
        ValueError: If there is no ``<text>`` element or it holds no tokens.
    """
    first_text = root.find(".//text")
    # Treat a missing element and an empty element the same way: no tokens.
    tokens = (first_text.text or "").split() if first_text is not None else []
    if not tokens:
        raise ValueError(
            "cannot extract paper number: first <text> element is missing or empty"
        )
    return tokens[-1]

In [5]:
def parse_bioc_into_chunks(file_path):
    """Parse one BioC XML file into a list of per-passage rows.

    Each row holds the passage text, the record accumulated *before* the passage
    ("memory") and the record *after* folding in the passage's annotations
    ("output"), both serialized as JSON strings.

    Args:
        file_path: Path to the BioC XML file.

    Returns:
        A list of dicts with keys "id", "text", "memory" and "output", one per
        ``<passage>`` element, in document order.
    """
    root = ET.parse(file_path).getroot()

    # Map every annotation id referenced by a relation to that relation's test name.
    relations = {}
    occurrences = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        if 'performance' in test_name:  # irrelevant tests
            continue
        # Repeated relation types get a numeric suffix (_2, _3, ...) so that each
        # repetition maps to its own test instead of all collapsing onto "_2".
        occurrences[test_name] = occurrences.get(test_name, 0) + 1
        count = occurrences[test_name]
        if count > 1:
            test_name = f"{test_name}_{count}"
        for node in relation.findall("node"):
            relations[node.get("refid")] = test_name

    paper_num = extract_papernum(root)
    curr_json = EMPTY_PAPER_DATA.copy()
    data = []
    for i, passage in enumerate(root.findall('.//passage')):
        text_elem = passage.find('.//text')
        passage_text = text_elem.text if text_elem is not None else None
        # "memory" must be serialized before the record is updated, because
        # get_json_for_passage updates the record in place.
        row = {"id": f"{paper_num}_{i}", "text": passage_text, "memory": json.dumps(curr_json)}
        curr_json = get_json_for_passage(passage, relations, curr_json)
        row['output'] = json.dumps(curr_json)
        data.append(row)
    return data

In [6]:
# Parse every BioC XML file in bioc_dir into one flat list of chunk rows.
bioc_dir = "data/biocs"
data = []
# os.listdir returns entries in arbitrary order; sort so that the row order is
# the same on every run and every machine.
for filename in sorted(os.listdir(bioc_dir)):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        curr_paper_chunks = parse_bioc_into_chunks(file_path)
        data.extend(curr_paper_chunks)


### Filtering by similarity to prompt

In [2]:
# Load the per-chunk dataset from CSV; the scoring cell below reads its "text" column.
# NOTE(review): this path is relative to the parent directory ("../data"), whereas the
# BioC parsing cell reads "data/biocs" -- confirm the intended working directory.
df = pd.read_csv('../data/150_papers_json_perchunk.csv')

In [None]:
# # Step 1: Split 'id' ("<paper>_<chunk>") into 'first_num' and 'second_num'
# df[['first_num', 'second_num']] = df['id'].str.split('_', expand=True)

# # Step 2: Convert 'first_num' to numeric for proper sorting
# df['first_num'] = df['first_num'].astype(int)

# # Step 3: Group by 'first_num' and get the last row of each group
# result = df.groupby('first_num', as_index=False).last()
# result.to_csv('data/150_papers_json.csv', index=False)

In [3]:
# Load the sentence-embedding model used below to embed the prompt and each chunk.
model = SentenceTransformer("Alibaba-NLP/GTE-Qwen2-1.5B-instruct")

Downloading shards: 100%|██████████| 2/2 [02:51<00:00, 85.56s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.74it/s]


In [None]:
# Weights and cut-off for relevance scoring, named so they can be tuned in one place.
COSINE_WEIGHT = 0.7
EUCLIDEAN_WEIGHT = 0.3
SCORE_PERCENTILE = 70  # chunks scoring at or above this percentile are kept downstream

simple_prompt = "Identify the passivating molecule tested, the corresponding PCE, VOC, and any stability test data (efficiency retained over time, temperature, test type)."
query = model.encode(simple_prompt, convert_to_tensor=True)
# The query is the same for every chunk, so convert it to NumPy only once.
query_vector = query.cpu().numpy()

cosine_similarities = []
euclidean_distances = []
for chunk_text in df["text"]:
    text_embeddings = model.encode(chunk_text, convert_to_tensor=True)
    cosine_similarities.append(util.pytorch_cos_sim(query, text_embeddings).item())
    euclidean_distances.append(euclidean(query_vector, text_embeddings.cpu().numpy()))

# Higher cosine similarity raises the score; larger Euclidean distance lowers it.
# NOTE(review): the two terms are on different scales (cosine similarity is bounded,
# Euclidean distance is not) -- confirm this weighting is intended.
combined_scores = [
    COSINE_WEIGHT * cos_sim - EUCLIDEAN_WEIGHT * eucl_dist
    for cos_sim, eucl_dist in zip(cosine_similarities, euclidean_distances)
]

threshold = np.percentile(combined_scores, SCORE_PERCENTILE)


NameError: name 'data' is not defined

In [8]:
# Keep the rows whose combined score reaches the percentile threshold.
relevant_chunks = [
    df.iloc[position]
    for position, score in enumerate(combined_scores)
    if score >= threshold
]

In [10]:
# Inspect the first retained chunk as a sanity check.
relevant_chunks[0]

id                                                          0_2
text          Formamidinium lead triiodide (FAPbI 3 ) has re...
memory        {"perovskite_composition": null, "electron_tra...
output        {"perovskite_composition": null, "electron_tra...
first_num                                                     0
second_num                                                    2
Name: 2, dtype: object

In [13]:
# Assemble the retained rows into a DataFrame and write them out as the training CSV.
# NOTE(review): this rebinds `df` to the filtered frame, so re-running the earlier
# filtering cell afterwards would index the filtered rows with scores that were
# computed on the full frame.
df = pd.DataFrame(relevant_chunks)
df.to_csv("../data/chunked_training.csv", index=False)