In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import os
from dotenv import load_dotenv
import json
import re
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

## Loading Model and Prompt

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # bnb_4bit_quant_type="nf4",
    # bnb_4bit_compute_dtype=getattr(torch, "float16"),
    # bnb_4bit_use_double_quant=False
)

In [None]:
# trained model path
model_path = "../models/llama-3.2-3b-it-Perovskite-PaperExtractor"
model_name = "models/DeepSeek-R1-PSC-Extractor-8B"
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
tokenizer_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.model_max_length = 70000
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=None,
    top_p=None,
    do_sample=False,
)

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:11<00:00,  5.60s/it]


In [None]:
one_shot_text = f"Intact 2D/3D halide junction perovskite solar cells via solid-phase in-plane growth Yeoun-Woo Jang 1,2,5 , Seungmin Lee 3,5 , Kyung Mun Yeom 3 , Kiwan Jeong 1 , Kwang Choi 3 , Mansoo Choi 1,2 ‚úâ and Jun Hong Noh 3,4 ‚úâ The solution process has been employed to obtain Ruddlesden-Popper two-dimensional/three-dimensional (2D/3D) halide perovskite bilayers in perovskite solar cells for improving the efficiency and chemical stability; however, the solution process has limitations in achieving thermal stability and designing a proper local electric field for efficient carrier collection due to the formation of a metastable quasi-2D perovskite. Here we grow a stable and highly crystalline 2D (C 4 H 9 NH 3 ) 2 PbI 4 film on top of a 3D film using a solvent-free solid-phase in-plane growth, which could result in an intact 2D/3D heterojunction. An enhanced built-in potential is achieved at the 2D/3D heterojunction with a thick 2D film, resulting in high photovoltage in the device. The intact 2D/3D heterojunction endow the devices with an open-circuit voltage of 1.185 V and a certified steady-state efficiency of 24.35%. The encapsulated device retained 94% of its initial efficiency after 1,056 h under the damp heat test (85 ¬∞C/85% relative humidity) and 98% after 1,620 h under full-sun illumination."
one_shot_answer =     """{{
        "perovskite_composition": null, // Chemical formula of the perovskite (string).
        "electron_transport_layer": null, // Material used as the electron transport layer (string).
        "pin_nip_structure": null, // Whether the perovskite uses a PIN or NIP structure (values: "PIN" or "NIP").
        "hole_transport_layer": null, // Material used as the hole transport layer (string).
        "passivating_molecule": null, // Name of the passivating molecule used in the test.
        "control_pce": null, // Power conversion efficiency for control perovskite (numeric).
        "control_voc": null, // Open-circuit voltage for control perovskite (numeric).
        "treated_pce": null, // Power conversion efficiency for treated perovskite (numeric).
        "treated_voc": null // Open-circuit voltage for treated perovskite (numeric).
        "stability_tests": [
            {{
                "test_name": null, // Must be one of: "ISOS-D", "ISOS-L", "ISOS-T", "ISOS-LC", "ISOS-LT".
                "temperature": null, // Temperature in Celsius (numeric or string, no units or symbols like ¬∞ or -).
                "time": null, // Duration of the test in hours (numeric).
                "humidity": null, // Relative Humidity level (string or numeric).
                "retained_proportion_cont": null, // Proportion of the PCE retained by the control perovskite after stability test duration (numeric).
                "retained_proportion_tret": null, /
            }},
        ]
}}"""

In [None]:
# prompt that specifies what to return
PREFIX = """
"You are a scientific assistant and your task is to extract certain information from text. "
"We are in a scientific environment. You MUST be critical of the units of the variables. "
"Do not leave information behind. "
"Only extract the variables that were developed in this study. You must omit the ones extracted from the bibliography"
Your task is to extract relevant scientific data from the provided text about perovskite solar cells.
    Follow these guidelines:

    1. **If passivating molecules are mentioned:**
    - If there is more than one passivating molecule tested, only return data for the champion passivator.
    - Include stability test data for each molecule if available. There may be multiple stability tests for a single molecule.

    2. **If no passivating molecules are mentioned:**
    - Provide a JSON object with any other relevant data explicitly mentioned in the text.

    **JSON Structure:**
    Ensure the output adheres to the following structure and is parseable as valid JSON:

    {{
        "perovskite_composition": null, // Chemical formula of the perovskite (string).
        "electron_transport_layer": null, // Material used as the electron transport layer (string).
        "pin_nip_structure": null, // Whether the perovskite uses a PIN or NIP structure (values: "PIN" or "NIP").
        "hole_transport_layer": null, // Material used as the hole transport layer (string).
        "test_1": {{ // Include only if stability tests are mentioned. Use unique keys for each test (e.g., test_1, test_2, etc.).
            "test_name": null, // Must be one of: "ISOS-D", "ISOS-L", "ISOS-T", "ISOS-LC", "ISOS-LT".
            "temperature": null, // Temperature in Celsius (numeric or string, no units or symbols like ¬∞ or -).
            "time": null, // Duration of the test in hours (string or numeric).
            "humidity": null, // Humidity level (string or numeric).
            "retained_proportion_cont": null, // Proportion of the PCE retained by the control perovskite after stability test (numeric).
            "retained_proportion_tret": null, // Proportion of the PCE retained by the treated perovskite after stability test (numeric).
            "passivating_molecule": null, // Name of the passivating molecule used in the test.
            "control_pce": null, // Power conversion efficiency for control perovskite (numeric).
            "control_voc": null, // Open-circuit voltage for control perovskite (numeric).
            "treated_pce": null, // Power conversion efficiency for treated perovskite (numeric).
            "treated_voc": null // Open-circuit voltage for treated perovskite (numeric).
        }}
    }}

    **Instructions:**
    - Be concise and accurate. Include only data explicitly present in the text.
    - For stability tests:
    - Infer the test type (e.g., ISOS-D, ISOS-L) based on the description if not explicitly stated.
    - Ensure all numeric values are parseable (e.g., no symbols like ¬∞ or -).
    - Use unique keys for each test (e.g., `test_1`, `test_2`, etc.).
    - If a field has no data, set it to `null`.
    - Make sure to only return a JSON object.
    - Do not create any properties that are not stated in the JSON structure provided.
    - If you cannot find a value, do not omit that property, just set it to null.
    - Make sure not to confuse the retained_proportion_cont/retained_proportion_tret variables with the control_pce/treated_pce variables. 
    The PCE values will almost never be above 30, while the proportion retained values will rarely be below 0.5. The retained proportion will not always be there, please leave these values as null if they cannot be found. DO NOT use the PCE for these values.

    An example is given to help you better understand the task:
    Example 1:

    Text to extract from: {text1}
    Answer: {answer1}
    
    Now extract from the following text:
"""
SUFFIX = """\n\n{sample}\n\n"""
def create_prompt(system, user):
    tokens = tokenizer.encode(user, max_length=60000, truncation=True) # prevents CUDA memory errors with current GPU
    truncated_user = tokenizer.decode(tokens)
    return [
    {"role": "system", "content": system},
    {"role": "user", "content": truncated_user}, ]

## Generating Extracted Data

In [6]:
contents = {}
dataset = pd.read_csv('data/rag_filtered_150_papers.csv')

In [7]:
dataset

Unnamed: 0.1,Unnamed: 0,id,full_text,filtered_text
0,0,115,Paper #: 115\nLiquid medium annealing for fabr...,Liquid medium annealing for fabricating durabl...
1,1,112,Paper #: 112\nIodine reduction for reproducibl...,Iodine reduction for reproducible and high-per...
2,2,72,Paper #: 72\nSuppressed Ion Migration in Reduc...,T oday's perovskite solar cells (PSCs) require...
3,3,75,Paper #: 75\nDynamical Evolution of the 2D/3D ...,UV-VIS Absorption The absorption spectra of t...
4,4,40,Paper #: 40\nDamp heat-stable perovskite solar...,Damp heat-stable perovskite solar cells with t...
...,...,...,...,...
144,144,92,Paper #: 92\n\nBanners \n Banners New for 2018...,Electronic Table of Contents (eToC) \n Standa...
145,145,123,Paper #: 123\nConstructing heterojunctions by ...,P erovskite solar cells (PSCs) have reached po...
146,146,31,Paper #: 31\n\nWe performed 1 H-NMR to support...,We performed 1 H-NMR to support our proposed r...
147,147,36,Paper #: 36\nUnderstanding Performance Limitin...,Understanding Performance Limiting Interfacial...


In [8]:
dataset = pd.read_csv('data/rag_filtered_150_papers.csv')
for index, row in dataset.iterrows():
    paper_id = row["id"]
    #feeds paper into model pipeline
    text = row["filtered_text"]
    instruction = create_prompt(PREFIX, text)
    json_string = pipe(instruction, max_new_tokens=1024)[0]["generated_text"][-1]['content']
    print(json_string)
    json_match = re.search(r"\{.*\}", json_string, re.DOTALL)
    if json_match:
        raw_json = json_match.group(0).strip()
    else:
        print("No JSON found")
        contents[paper_id] = None
        continue
    # Fix unquoted values (if any exist, less common)
    parseable = re.sub(r'":\s*"([^"]*?)(".*?"[^"]*?)"', r'": "\1\\\2"', raw_json)
    try:
        parsed_data = json.loads(parseable)
        contents[paper_id] = parsed_data
    except json.JSONDecodeError as e:
        print("Error creating JSON", e)
        contents[paper_id] = None 

Alright, so I'm trying to extract information from this scientific text about perovskite solar cells. The user wants me to focus on the data relevant to the study's development, not what's found in the bibliography. They also mentioned not to leave anything behind and to be concise.

First, I'll read through the text to understand the main focus. It seems the study is about improving the efficiency and stability of perovskite solar cells through a liquid medium annealing (LMA) process. They used anisole as the liquid medium, which helped in removing residual solvents and prevented moisture from entering the perovskite layer. This improved the device performance and stability.

I need to extract relevant data points. The text mentions the composition of the perovskite layers, the annealing conditions, and the resulting device performance. They also discuss stability tests and the use of finite element analysis to study heat transfer.

I should structure the output as a JSON object with 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Alright, I need to extract relevant data from the provided text about perovskite solar cells. The user has provided a detailed set of instructions on what to include and how to structure the output. Let me go through the text step by step.

First, the text discusses excitation for PL imaging measurements using a 445 nm CW laser. It mentions the intensity adjustment to match 1 sun equivalent using an integrating sphere and a spectrometer. There's also a note about calibrating the system with a halogen lamp and correcting the spectral output. The PLQY is measured using numerical integration and verified with test samples. 

Next, the text details the fabrication of different layers in the solar cells. It describes the preparation of electron and hole transport layers, including PEDOT:PSS, P3HT, and PolyTPD. It also explains the deposition of compact and mesoporous TiO2 layers, including the use of microwave plasma treatment. 

The perovskite layer is prepared using a triple-cation perovs

KeyboardInterrupt: 

In [None]:
# exporting JSON
json_path = "data/deepseek_finetuned_2.json"
with open(json_path, "w") as file:
    json.dump(contents, file, indent=4)