In [1]:
from sentence_transformers import SentenceTransformer, util
import os
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
txt_dir = "data/txts"
data = {}
for filename in os.listdir(txt_dir):
    if filename.endswith(".txt") == False:
        continue
    filepath = os.path.join(txt_dir, filename)
    paper_id = os.path.splitext(filename)[0]
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()
    data[paper_id] = { "full_text": text }

In [3]:
model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
simple_prompt = "What, if any, is the passivating molecule tested, and what is the corresponding PCE, VOC, and stability test data (efficency retained over time, temperature, test type)"

query = model.encode(simple_prompt, convert_to_tensor=True)

for paper_id in data.keys():
    print(paper_id)
    curr_text = data[paper_id]["full_text"]
    chunks = curr_text.split('\n')
    chunk_num = 1 #skips paper id
    new_chunks = []
    while chunk_num < len(chunks):
        chunk = chunks[chunk_num]
        chunk_num += 1
        if chunk.startswith('\t\t\t') or len(chunk.strip()) == 0:
            continue
        new_chunks.append(chunk)
    print(new_chunks[0])
    cosine_similarities = []
    for chunk in new_chunks:
        text_embeddings = model.encode(chunk, convert_to_tensor=True)
        cosine_similarities.append(util.pytorch_cos_sim(query, text_embeddings).item())

    cos_mean = np.mean(cosine_similarities)

    classified_chunks = []
    for i, value in enumerate(cosine_similarities):
        if value >= cos_mean:
            classified_chunks.append(new_chunks[i])
    data[paper_id]["filtered_text"] = '\n'.join(classified_chunks)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  8.20it/s]


115
Liquid medium annealing for fabricating durable perovskite solar cells with improved reproducibility
112
Iodine reduction for reproducible and high-performance perovskite solar cells and modules
72
Suppressed Ion Migration in Reduced-Dimensional Perovskites Improves Operating Stability
75
Dynamical Evolution of the 2D/3D Interface: A Hidden Driver behind Perovskite Solar Cell Instability
40
Damp heat-stable perovskite solar cells with tailored-dimensionality 2D/3D heterojunctions
91
Methylammonium Chloride Induces Intermediate Phase Stabilization for Efficient Perovskite Solar Cells
129
Nature Research wishes to improve the reproducibility of the work that we publish. This form is intended for publication with all accepted papers reporting the characterization of photovoltaic devices and provides structure for consistency and transparency in reporting. Some list items might not apply to an individual manuscript, but all fields must be completed for clarity. For further information 

In [4]:
data["91"]

{'full_text': 'Paper #: 91\nMethylammonium Chloride Induces Intermediate Phase Stabilization for Efficient Perovskite Solar Cells\nSkyscraper Banner Ad (160 x 600) & Large Rectangle Banner Ad (300 x 250) \n Electronic Table of Contents (eToC) \n Email Targeting Broadcast your message in the audience\'s inbox through our electronic Tables  of Contents (eToCs) . Recipients are all opt-in subscribers, so emails are all fully permission based. Opportunities are available for all primary Cell Press research journals and are delivered the day before each journal\'s cover date. Two sponsored message slots are available: • Top: 728 x 90 (before the Table of Contents) • Middle: 336 x 280 (within the body) Campaign Subscribers Average Open Rate AJHG Supplemental Information 1. AFM images Figure s1. AFM phase images of the perovskite films: (a) pristine, (b) MA-10, (c) MA-20, (d) MA-30, (e) MA-40, and (f) MA-50, respectively.\nAbout Cell Press About \n Contacts With each passing year, the scienti

In [6]:
df = pd.DataFrame.from_dict(data, orient="index")
df.reset_index(inplace=True)
df.rename(columns={'index': 'id'}, inplace=True)
df.to_csv('data/rag_filtered_150_papers.csv')