In [1]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from openai import OpenAI
import pandas as pd
import numpy as np

import json
import os
import re

from dotenv import load_dotenv
from huggingface_hub import login


# Load settings via python-dotenv before reading the token from the environment.
load_dotenv()

# Hugging Face access token taken from the HF_TOKEN environment variable
# (None when the variable is not set), then used to authenticate with the Hub.
access_token = os.environ.get("HF_TOKEN")
login(token=access_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
# Read the whole paper as UTF-8 text, then break it into one chunk per line.
with open('data/txts/0.txt', mode='r', encoding='utf-8') as paper_file:
    text = paper_file.read()

chunks = text.split('\n')

In [3]:
# The first line holds the paper id, so filtering starts at the second line.
# A line is kept only if it does not begin with three tabs and is not blank
# once surrounding whitespace is stripped.
new_chunks = [
    line
    for line in chunks[1:]
    if not line.startswith('\t\t\t') and line.strip()
]

In [4]:
# Disabled experiment, kept for reference: embed the query and every chunk with
# a SentenceTransformer model, then keep only the chunks whose cosine
# similarity to the query is at least the mean similarity over all chunks.
# Nothing in this cell runs while it stays commented out.

# model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-7B-instruct")

# simple_prompt = "What, if any, is the passivating molecule tested, and what is the corresponding PCE, VOC, and stability test data (efficency retained over time, temperature, test type)"

# query = model.encode(simple_prompt, convert_to_tensor=True)

# cosine_similarities = []
# for chunk in new_chunks:
#     text_embeddings = model.encode(chunk, convert_to_tensor=True)
#     cosine_similarities.append(util.pytorch_cos_sim(query, text_embeddings).item())

# cos_mean = np.mean(cosine_similarities)

# classified_chunks = []
# for i, value in enumerate(cosine_similarities):
#     if value >= cos_mean:
#         classified_chunks.append(new_chunks[i])

In [5]:
# System message sent with every extraction request. Each fragment already
# carries its own trailing space, so the pieces are joined with no separator.
system_prompt = "".join([
    "You are a scientific assistant and your task is to extract certain information from text. ",
    "We are in a scientific environment. You MUST be critical of the units of the variables. ",
    "Do not leave information behind. ",
    "Only extract the variables that were developed in this study. You must omit the ones extracted from the bibliography",
])

In [6]:
# User-message template for one extraction step. It is filled with str.format:
#   {chunk}  - the piece of paper text to extract from
#   {memory} - the result accumulated from the chunks processed so far
# The template asks the model to merge the new findings into that earlier result.
simple_prompt = """Extract only the variables detailed below from the provided text. Then join them with the data from previous chunks.
It is likely that a lot of this data is not present in the chunk provided. Only extract the data points that are present in the chunk.
To provide the answer with the data, do it using a schema similar to the following:

- Full name of the passivating molecule tested.
- PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment.
- PCE of the control perovskite.
- VOC of the perovskite with the passivating treatment.
- VOC of the control perovskite. 
- Stability test type
- Stability test length
- Percent efficiency retained after stability test length for perovskite with passivating treatment.
- Percent efficiency retained after stability test length for control perovskite.

Text to extract from:

{chunk}

Add the newly extracted data to the one from previous chunks that is the following:

{memory}

Never leave the information from previous chunks behind.
Begin extracting!
"""

In [7]:
# OpenAI-compatible client pointed at the DeepInfra endpoint; the API key is
# read from the DEEPINFRA_TOKEN environment variable.
client = OpenAI(
    base_url="https://api.deepinfra.com/v1/openai",
    api_key=os.getenv("DEEPINFRA_TOKEN"),
)
# Local-model alternative, currently disabled:
# model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)


In [16]:
# Positions (0-based, within new_chunks) that are never sent to the model.
# NOTE(review): index 2 was hard-coded in the original loop with no stated
# reason -- confirm whether this exclusion is still needed.
SKIPPED_CHUNK_INDICES = {2}

# Chat model used for every extraction request.
EXTRACTION_MODEL = "meta-llama/Meta-Llama-3-70B-Instruct"

# Running extraction result. Each request receives it as `memory`, and the
# model's reply becomes the new running result.
summary = ""
i = 0  # stays 0 when new_chunks is empty
for i, chunk in enumerate(new_chunks):
    if i in SKIPPED_CHUNK_INDICES:
        continue

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": simple_prompt.format(chunk=chunk, memory=summary)},
    ]
    response = client.chat.completions.create(
        model=EXTRACTION_MODEL,
        messages=messages,
        stream=False,
    )

    reply = response.choices[0].message.content
    # The SDK types `content` as optional. A missing or empty reply must not
    # replace the accumulated summary, otherwise every later prompt would be
    # given "None"/"" as its memory and the earlier extractions would be lost.
    if reply:
        summary = reply

# Leave `i` equal to the number of chunks visited, as the manual counter did.
# If a request raises, this line is not reached and `i` stays at the index of
# the chunk that failed.
i = len(new_chunks)

APIStatusError: Error code: 402 - {'detail': {'error': 'inference prohibited, please enter a payment method in https://deepinfra.com/dash/settings'}}

In [9]:
# Inspect the most recently built request payload (system message + user message).
messages

[{'role': 'system',
  'content': 'You are a scientific assistant and your task is to extract certain information from text. We are in a scientific environment. You MUST be critical of the units of the variables. Do not leave information behind. Only extract the variables that were developed in this study. You must omit the ones extracted from the bibliography'},
 {'role': 'user',
  'content': 'Extract only the variables detailed below from the provided text. Then join them with the data from previous chunks.\nIt is likely that a lot of this data is not present in the chunk provided. Only extract the data points that are present in the chunk.\nTo provide the answer with the data, do it using a schema similar to the following:\n\n- Full name of the passivating molecule tested.\n- PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment.\n- PCE of the control perovskite.\n- VOC of the perovskite with the passivating treatment.\n- VOC of the control perovskite. \n

In [17]:
# Show the running extraction result currently held in `summary`.
summary

"After carefully reviewing the provided text, I extracted the following new data points that match the specified variables:\n\n- Full name of the passivating molecule tested: Not mentioned\n- PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment: Not mentioned\n- PCE of the control perovskite: Not mentioned\n- VOC of the perovskite with the passivating treatment: Not mentioned\n- VOC of the control perovskite: Not mentioned\n- Stability test type: Not mentioned\n- Stability test length: Not mentioned\n- Percent efficiency retained after stability test length for perovskite with passivating treatment: Not mentioned\n- Percent efficiency retained after stability test length for control perovskite: Not mentioned\n\nI will add these new data points to the existing data sheet, ensuring that I don't leave any information behind. Here is the updated data sheet:\n\n- Full name of the passivating molecule tested: cyclohexylmethylammonium iodide (CMAI), 4Cl-BZS, 4F-B

In [11]:
# Show the counter left behind by the extraction loop.
i

49

### Evaluation

In [12]:
# Ground-truth table; the code below relies on its "id" and "output" columns.
training_data = pd.read_csv('data/training_data.csv')

# Pull the stored "output" text for paper id 0 as a plain string.
# Calling str() on the filtered Series would produce the Series' printed form
# (index label, "Name: output, dtype: ..." footer, long values shortened with
# "...") rather than the cell's content, so the scalar is selected explicitly.
paper_0_rows = training_data.loc[training_data["id"] == 0, "output"]
if paper_0_rows.empty:
    raise ValueError("no row with id == 0 found in data/training_data.csv")
# If several rows share id 0, the first one is used.
paper_0_json = str(paper_0_rows.iloc[0])


In [13]:
# String form of the second entry in `messages` (the user message).
str(messages[1])

"{'role': 'user', 'content': 'Extract only the variables detailed below from the provided text. Then join them with the data from previous chunks.\\nIt is likely that a lot of this data is not present in the chunk provided. Only extract the data points that are present in the chunk.\\nTo provide the answer with the data, do it using a schema similar to the following:\\n\\n- Full name of the passivating molecule tested.\\n- PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment.\\n- PCE of the control perovskite.\\n- VOC of the perovskite with the passivating treatment.\\n- VOC of the control perovskite. \\n- Stability test type\\n- Stability test length\\n- Percent efficiency retained after stability test length for perovskite with passivating treatment.\\n- Percent efficiency retained after stability test length for control perovskite.\\n\\nText to extract from:\\n\\n Table S5 . S5 Summary of operational stability of reported inverted PSCs following the ISO

In [18]:
# Collected model outputs, keyed by a short model label.
outputs = {'LLAMA-70B-IT': summary}

In [19]:
# Display the collected outputs dictionary.
outputs

{'LLAMA-70B-IT': "After carefully reviewing the provided text, I extracted the following new data points that match the specified variables:\n\n- Full name of the passivating molecule tested: Not mentioned\n- PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment: Not mentioned\n- PCE of the control perovskite: Not mentioned\n- VOC of the perovskite with the passivating treatment: Not mentioned\n- VOC of the control perovskite: Not mentioned\n- Stability test type: Not mentioned\n- Stability test length: Not mentioned\n- Percent efficiency retained after stability test length for perovskite with passivating treatment: Not mentioned\n- Percent efficiency retained after stability test length for control perovskite: Not mentioned\n\nI will add these new data points to the existing data sheet, ensuring that I don't leave any information behind. Here is the updated data sheet:\n\n- Full name of the passivating molecule tested: cyclohexylmethylammonium iodide (CMA