In [1]:
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from openai import OpenAI
import pandas as pd
import numpy as np

import json
import os
import re

from dotenv import load_dotenv
from huggingface_hub import login


load_dotenv()
access_token = os.getenv("HF_TOKEN")
login(token=access_token)

  from .autonotebook import tqdm as notebook_tqdm
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
with open('data/txts/0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chunks = text.split('\n')

In [3]:
chunk_num = 1 #skips paper id
new_chunks = []
while chunk_num < len(chunks):
    chunk = chunks[chunk_num]
    chunk_num += 1
    if chunk.startswith('\t\t\t') or len(chunk.strip()) == 0:
        continue

    new_chunks.append(chunk)

In [4]:
# model = SentenceTransformer("Alibaba-NLP/gte-Qwen2-7B-instruct")

# simple_prompt = "What, if any, is the passivating molecule tested, and what is the corresponding PCE, VOC, and stability test data (efficency retained over time, temperature, test type)"

# query = model.encode(simple_prompt, convert_to_tensor=True)

# cosine_similarities = []
# for chunk in new_chunks:
#     text_embeddings = model.encode(chunk, convert_to_tensor=True)
#     cosine_similarities.append(util.pytorch_cos_sim(query, text_embeddings).item())

# cos_mean = np.mean(cosine_similarities)

# classified_chunks = []
# for i, value in enumerate(cosine_similarities):
#     if value >= cos_mean:
#         classified_chunks.append(new_chunks[i])

In [5]:
system_prompt = (
    "You are a scientific assistant and your task is to extract certain information from text. "
    "We are in a scientific environment. You MUST be critical of the units of the variables. "
    "Do not leave information behind. "
    "Only extract the variables that were developed in this study. You must omit the ones extracted from the bibliography"
)

In [6]:
json_schema = {
    "perovskite_composition": {"type:": str},
    "electron_transport_layer": {"type": str},
    "hole_transport_layer": {"type": str},
    "structure_pin_nip": {"type": str},
    "test_1": {
        "stability_type": {"type": str},
        "passivating_molecule": {"type": str},
        "humidity": {"type": int},
        "temperature": {"type": int},
        "time": {"type": int},
        "control_pce": {"type": float},
        "treated_pce": {"type": float},
        "control_voc": {"type": float},
        "treated_voc": {"type": float},
        "efficiency_control": {"type": float},
        "efficiency_tret": {"type": float}
    }
}

In [7]:
simple_prompt = """Extract only the variables detailed below from the provided text. Then join them with the data from previous chunks.
It is likely that a lot of this data is not present in the chunk provided. Only extract the data points that are present in the chunk.
To provide the answer with the data, follow the schema outlined in the JSON provided:

{json_schema}

Some important notes about the variables:

-`passivating_molecule`: Full name of the passivating molecule tested.
-`treated_pce`: PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment.
-`control_pce`: PCE of the control perovskite.
-`treated_voc`: VOC of the perovskite with the passivating treatment.
-`control_voc`: VOC of the control perovskite. 
-`time`: Stability test length in hours
-`efficiency_tret`: Percent efficiency retained after stability test length for perovskite with passivating treatment.
-`efficiency_control`: Percent efficiency retained after stability test length for control perovskite.

Text to extract from:

{chunk}

Add the newly extracted data to the one from previous chunks that is the following:

{memory}

Never leave the information from previous chunks behind.

An example is given to you to help you better understand the task.
Example 1:

Chunk to extract from: {text1}
Data from previous chunks: {example_memory}
Answer: {answer1}
Begin extracting!
"""

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=getattr(torch, "float16"),
    bnb_4bit_use_double_quant=False
)

In [None]:
# client = OpenAI(api_key=os.getenv("DEEPINFRA_TOKEN"), base_url="https://api.deepinfra.com/v1/openai")
# client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 20000
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=None,
    top_p=None,
    do_sample=False,
    load_in_4bit=True,
    device=0
)

Downloading shards: 100%|██████████| 4/4 [06:25<00:00, 96.31s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:17<00:00,  4.36s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 23.68 GiB of which 17.50 MiB is free. Process 3194847 has 23.66 GiB memory in use. Of the allocated memory 23.46 GiB is allocated by PyTorch, and 1.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [9]:
example = pd.read_csv('data/chunked_example.csv').iloc[0]

In [10]:
example

id                                                     8_15
text       Supplementary Text S3 Additional characteriza...
memory    {'perovskite_composition': None, 'electron_tra...
output    {'perovskite_composition': None, 'electron_tra...
Name: 0, dtype: object

: 

In [None]:
summary = ""
i = 0 
for chunk in new_chunks:
    if i == 2:
        i += 1
        continue
    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        { "role": "user", "content": simple_prompt.format(
            json_schema=json_schema, 
            chunk=chunk, 
            memory=summary, 
            text1=example["text"],
            example_memory=example["memory"],
            answer1=example["output"]
            )
        }
    ]
    # response = client.chat.completions.create(
    #     model="deepseek-chat",
    #     messages=messages,
    #     stream=False,
    #     max_tokens=8192
    # )
    # summary = response.choices[0].message.content
    summary = pipe(messages, max_new_tokens=1024)[0]["generated_text"][-1]['content']
    i += 1

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [14]:
summary

"Chunk to extract from:  Improved charge extraction in inverted perovskite solar cells with dual-site-binding ligands\n\nExtracted variables:\n\n- `passivating_molecule`: Not available\n- `treated_pce`: 26.15%\n- `control_pce`: Not available\n- `treated_voc`: Not available\n- `control_voc`: Not available\n- `time`: 1200 hours\n- `efficiency_tret`: 95%\n- `efficiency_control`: Not available\n\nData from previous chunks: \n{'perovskite_composition': None, 'electron_transport_layer': 'Tin Oxide', 'hole_transport_layer': 'Spiro-OMeTAD','structure_pin_nip': 'NIP', 'test_1': {'stability_type': 'ISOSLT', 'passivating_molecule': 'ethylammonium pyrene', 'humidity': None, 'temperature': '40', 'time': '2000', 'control_pce': '19.3', 'treated_pce': '22.4', 'control_voc': None, 'treated_voc': '1.177', 'efficiency_control': None, 'efficiency_tret': '0.85', 'efficiency_cont': '0.6'}}\n\nCombined data: \n{'perovskite_composition': None, 'electron_transport_layer': 'Tin Oxide', 'hole_transport_layer': '

In [13]:
cleaned_string = summary.strip().strip('```json').strip()
json.loads(cleaned_string)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [25]:
summary

'```json\n{\n  "perovskite_composition": "Cs0.05FA0.85MA0.1PbI3",\n  "electron_transport_layer": "Tin Oxide",\n  "hole_transport_layer": "Spiro-OMeTAD",\n  "structure_pin_nip": "NIP",\n  "test_1": {\n    "stability_type": "ISOSLT",\n    "passivating_molecule": "cyclohexylmethylammonium iodide (CMAI)",\n    "humidity": "40 ± 10%",\n    "temperature": "50",\n    "time": "1050",\n    "control_pce": "19.3",\n    "treated_pce": "21.07",\n    "control_voc": null,\n    "treated_voc": "1.154",\n    "efficiency_control": "85",\n    "efficiency_tret": "95",\n    "efficiency_cont": "0.6"\n  },\n  "test_2": {\n    "stability_type": "Continuous 1 sun maximum power point operation",\n    "passivating_molecule": "dual-site-binding ligands",\n    "humidity": null,\n    "temperature": "65",\n    "time": "1200",\n    "control_pce": null,\n    "treated_pce": "26.15",\n    "control_voc": null,\n    "treated_voc": null,\n    "efficiency_control": null,\n    "efficiency_tret": "95",\n    "efficiency_cont": 

### Evaluation

In [12]:
training_data = pd.read_csv('data/training_data.csv')
paper_0_json = str(training_data[training_data["id"] == 0]["output"])


In [13]:
str(messages[1])

"{'role': 'user', 'content': 'Extract only the variables detailed below from the provided text. Then join them with the data from previous chunks.\\nIt is likely that a lot of this data is not present in the chunk provided. Only extract the data points that are present in the chunk.\\nTo provide the answer with the data, do it using a schema similar to the following:\\n\\n- Full name of the passivating molecule tested.\\n- PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment.\\n- PCE of the control perovskite.\\n- VOC of the perovskite with the passivating treatment.\\n- VOC of the control perovskite. \\n- Stability test type\\n- Stability test length\\n- Percent efficiency retained after stability test length for perovskite with passivating treatment.\\n- Percent efficiency retained after stability test length for control perovskite.\\n\\nText to extract from:\\n\\n Table S5 . S5 Summary of operational stability of reported inverted PSCs following the ISO

In [18]:
outputs = {}
outputs['LLAMA-70B-IT'] = summary

In [19]:
outputs

{'LLAMA-70B-IT': "After carefully reviewing the provided text, I extracted the following new data points that match the specified variables:\n\n- Full name of the passivating molecule tested: Not mentioned\n- PCE (Power Conversion Efficiency) of the perovskite with the passivating treatment: Not mentioned\n- PCE of the control perovskite: Not mentioned\n- VOC of the perovskite with the passivating treatment: Not mentioned\n- VOC of the control perovskite: Not mentioned\n- Stability test type: Not mentioned\n- Stability test length: Not mentioned\n- Percent efficiency retained after stability test length for perovskite with passivating treatment: Not mentioned\n- Percent efficiency retained after stability test length for control perovskite: Not mentioned\n\nI will add these new data points to the existing data sheet, ensuring that I don't leave any information behind. Here is the updated data sheet:\n\n- Full name of the passivating molecule tested: cyclohexylmethylammonium iodide (CMA