In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import os
from dotenv import load_dotenv
from openai import OpenAI
import json
import re
import torch
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask PyTorch to release cached GPU memory that is no longer in use before the model is loaded below.
# NOTE(review): presumably only matters when this cell is re-run in a kernel that already held a model — confirm.
torch.cuda.empty_cache()

## Loading Model and Prompt

In [3]:
# Quantization settings handed to `from_pretrained` via `quantization_config`: 8-bit loading.
# A 4-bit variant (bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16,
# bnb_4bit_use_double_quant=False) is left disabled.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [4]:
# --- Checkpoint locations -------------------------------------------------
# NOTE(review): `model_path` points at a Llama-3.2 checkpoint and is not used by the
# loading code in this cell — looks stale; confirm before removing.
model_path = "../models/llama-3.2-3b-it-Perovskite-PaperExtractor"
model_name = "models/DeepSeek-R1-PSC-Extractor-8B"
tokenizer_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# --- Tokenizer --------------------------------------------------------------
# Loaded from `tokenizer_name` (the hub id), not from the `model_name` directory.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.model_max_length = 70000  # raised limit so long paper texts are accepted

# --- Model (quantized according to `bnb_config`) ------------------------------
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

# --- Generation pipeline ------------------------------------------------------
# Sampling is switched off; temperature/top_p are explicitly passed as None.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    temperature=None,
    top_p=None,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.98s/it]


### Prompt Text

In [5]:
# Example input for a one-shot prompt: title, author list and abstract of a paper,
# kept as one single-line string (adjacent literals are concatenated by the parser).
one_shot_text = (
    "Intact 2D/3D halide junction perovskite solar cells via solid-phase in-plane growth "
    "Yeoun-Woo Jang 1,2,5 , Seungmin Lee 3,5 , Kyung Mun Yeom 3 , Kiwan Jeong 1 , Kwang Choi 3 , "
    "Mansoo Choi 1,2 ✉ and Jun Hong Noh 3,4 ✉ "
    "The solution process has been employed to obtain Ruddlesden-Popper two-dimensional/three-dimensional "
    "(2D/3D) halide perovskite bilayers in perovskite solar cells for improving the efficiency and chemical "
    "stability; however, the solution process has limitations in achieving thermal stability and designing "
    "a proper local electric field for efficient carrier collection due to the formation of a metastable "
    "quasi-2D perovskite. "
    "Here we grow a stable and highly crystalline 2D (C 4 H 9 NH 3 ) 2 PbI 4 film on top of a 3D film using "
    "a solvent-free solid-phase in-plane growth, which could result in an intact 2D/3D heterojunction. "
    "An enhanced built-in potential is achieved at the 2D/3D heterojunction with a thick 2D film, resulting "
    "in high photovoltage in the device. "
    "The intact 2D/3D heterojunction endow the devices with an open-circuit voltage of 1.185 V and a "
    "certified steady-state efficiency of 24.35%. "
    "The encapsulated device retained 94% of its initial efficiency after 1,056 h under the damp heat test "
    "(85 °C/85% relative humidity) and 98% after 1,620 h under full-sun illumination."
)
# Answer template paired with `one_shot_text`: every field is null and annotated with a
# `//` comment describing the expected value.
# Fixed here: the missing comma after the "treated_voc" entry and the truncated trailing
# comment on "retained_proportion_tret".
# NOTE(review): braces are doubled, which only renders as single braces if this string is
# passed through str.format(); no such call is visible in this notebook — confirm intended use.
one_shot_answer = """{{
        "perovskite_composition": null, // Chemical formula of the perovskite (string).
        "electron_transport_layer": null, // Material used as the electron transport layer (string).
        "pin_nip_structure": null, // Whether the perovskite uses a PIN or NIP structure (values: "PIN" or "NIP").
        "hole_transport_layer": null, // Material used as the hole transport layer (string).
        "passivating_molecule": null, // Name of the passivating molecule used in the test.
        "control_pce": null, // Power conversion efficiency for control perovskite (numeric).
        "control_voc": null, // Open-circuit voltage for control perovskite (numeric).
        "treated_pce": null, // Power conversion efficiency for treated perovskite (numeric).
        "treated_voc": null, // Open-circuit voltage for treated perovskite (numeric).
        "stability_tests": [
            {{
                "test_name": null, // Must be one of: "ISOS-D", "ISOS-L", "ISOS-T", "ISOS-LC", "ISOS-LT".
                "temperature": null, // Temperature in Celsius (numeric or string, no units or symbols like ° or -).
                "time": null, // Duration of the test in hours (numeric).
                "humidity": null, // Relative Humidity level (string or numeric).
                "retained_proportion_cont": null, // Proportion of the PCE retained by the control perovskite after stability test duration (numeric).
                "retained_proportion_tret": null, // Proportion of the PCE retained by the treated perovskite after stability test duration (numeric).
            }},
        ]
}}"""

In [None]:
# System prompt: tells the model which fields to extract and the JSON layout to return.
# It is passed verbatim as the "system" message (see `create_prompt(PREFIX, text)` in the
# extraction loop); it is not run through str.format() anywhere in the visible notebook.
# NOTE(review): because the string is never formatted, the doubled braces `{{` / `}}` in the
# schema below reach the model literally — confirm whether single braces were intended.
# NOTE(review): the schema names the fields `retained_percentage_cont/_tret`, while the closing
# instructions (and `json_schema` later in the notebook) say `retained_proportion_cont/_tret` —
# confirm which spelling the model is expected to emit before changing the text.
PREFIX = """
"You are a scientific assistant and your task is to extract certain information from text, particularly 
in the context of perovskite solar cells. Your task is to identify and extract details about passivating molecules and associated performance data mentioned in the text.
We are in a scientific environment. You MUST be critical of the units of the variables.

"Only extract the variables that were developed in this study. You must omit the ones extracted from the bibliography"
Your task is to extract relevant scientific data from the provided text about perovskite solar cells.
    Follow these guidelines:

    1. **If passivating molecules are mentioned:**
    - If there is more than one passivating molecule tested, only return data for the champion passivator.
    - Include stability test data for each molecule if available. There may be multiple stability tests for a single molecule.

    2. **If no passivating molecules are mentioned:**
    - Provide a JSON object with any other relevant data explicitly mentioned in the text.

    **JSON Structure:**
    - DO NOT change the names of any of the property names. It is imperative that these are exactly as they as stated in the schema below.
    - Ensure the output adheres to the following structure and is parseable as valid JSON:

    {{
        "perovskite_composition": null, // Chemical formula of the perovskite (string).
        "electron_transport_layer": null, // Material used as the electron transport layer (string).
        "pin_nip_structure": null, // Whether the perovskite uses a PIN or NIP structure (values: "PIN" or "NIP").
        "hole_transport_layer": null, // Material used as the hole transport layer (string).
        "test_1": {{ // Include only if stability tests are mentioned. Use unique keys for each test (e.g., test_1, test_2, etc.).
            "test_name": null, // Must be one of: "ISOS-D", "ISOS-L", "ISOS-T", "ISOS-LC", "ISOS-LT".
            "temperature": null, // Temperature in Celsius (numeric or string, no units or symbols like ° or -).
            "time": null, // Duration of the test in hours (string or numeric).
            "humidity": null, // Humidity level (string or numeric).
            "retained_percentage_cont": null, // Percentage of the PCE retained by the control perovskite after stability test (numeric) (values should be between 30-100).
            "retained_percentage_tret": null, // Percentage of the PCE retained by the treated perovskite after stability test (numeric) (values should be between 30-100).
            "passivating_molecule": null, // Name of the passivating molecule used in the test.
            "control_pce": null, // Power conversion efficiency for control perovskite (numeric) (values should be between 10-30).
            "control_voc": null, // Open-circuit voltage for control perovskite (numeric).
            "treated_pce": null, // Power conversion efficiency for treated perovskite (numeric) (values should be between 10-30).
            "treated_voc": null // Open-circuit voltage for treated perovskite (numeric).
        }}
    }}

    **Instructions:**
    - Be concise and accurate. Include only data explicitly present in the text.
    - For stability tests:
    - Infer the test type (e.g., ISOS-D, ISOS-L) based on the description if not explicitly stated.
    - Ensure all numeric values are parseable (e.g., no symbols like ° or -).
    - Use unique keys for each test (e.g., `test_1`, `test_2`, etc.).
    - If a field has no data, set it to `null`.
    - Make sure to only return a JSON object.
    - Do not create any properties that are not stated in the JSON structure provided.
    - If you cannot find a value, do not omit that property, just set it to null.
    - Make sure not to confuse the retained_proportion_cont/retained_proportion_tret variables with the control_pce/treated_pce variables. 
    The PCE values will almost never be above 30, while the proportion retained values will rarely be below 0.5. The retained proportion will not always be there, please leave these values as null if they cannot be found. DO NOT use the PCE for these values.

    Now extract from the following text:
"""
# Template with a `{sample}` placeholder surrounded by blank lines.
# NOTE(review): not referenced by any code visible in this notebook — confirm whether it is still needed.
SUFFIX = """\n\n{sample}\n\n"""
def create_prompt(system, user, max_user_tokens=60000, text_tokenizer=None):
    """Build the chat messages for one paper, truncating the paper text to a token budget.

    Args:
        system: Text used as the content of the "system" message.
        user: Paper text; it is truncated to at most ``max_user_tokens`` tokens.
        max_user_tokens: Token budget for ``user`` (default 60000, which prevents CUDA
            memory errors with the current GPU).
        text_tokenizer: Tokenizer used for the truncation round-trip. Defaults to the
            notebook-level ``tokenizer``.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts (system, then user).
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    # Encode without special tokens and skip them on decode: otherwise the tokens the
    # tokenizer adds (e.g. a begin-of-sequence marker) are decoded back into the user text
    # as literal markers and end up in the middle of the chat prompt.
    token_ids = tok.encode(
        user,
        max_length=max_user_tokens,
        truncation=True,
        add_special_tokens=False,
    )
    truncated_user = tok.decode(token_ids, skip_special_tokens=True)
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": truncated_user},
    ]

## Ensuring JSON Format

In [3]:
# Read a local .env file (if one exists) so DEEPSEEK_API_KEY can be supplied without
# exporting it in the shell; variables already set in the environment are kept.
load_dotenv()
# Fail early with a clear message: with api_key=None the OpenAI client would fall back to
# OPENAI_API_KEY, which must not be sent to the DeepSeek endpoint.
if not os.getenv("DEEPSEEK_API_KEY"):
    raise RuntimeError("DEEPSEEK_API_KEY is not set; export it or add it to a .env file.")
# OpenAI-compatible client pointed at the DeepSeek API.
client = OpenAI(api_key=os.getenv("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")

In [4]:
# Target schema that `ensure_json_format` interpolates into its system prompt.
# Values are Python type objects, so the rendered prompt shows them as e.g. <class 'str'>.
# Fixed here: the first entry used the key "type:" (stray colon) instead of "type".
json_schema = {
    "perovskite_composition": {"type": str},
    "electron_transport_layer": {"type": str},
    "hole_transport_layer": {"type": str},
    "structure_pin_nip": {"type": str},
    # One nested object per stability test; further tests use keys test_2, test_3, ...
    "test_1": {
        "stability_type": {"type": str},
        "passivating_molecule": {"type": str},
        "humidity": {"type": int},
        "temperature": {"type": int},
        "time": {"type": int},
        "control_pce": {"type": float},
        "treated_pce": {"type": float},
        "control_voc": {"type": float},
        "treated_voc": {"type": float},
        "retained_proportion_cont": {"type": float},
        "retained_proportion_tret": {"type": float}
    }
}
# System prompt for the JSON-repair step. `{json_schema}` is a str.format placeholder that
# `ensure_json_format` fills with the `json_schema` dict; any other literal braces added to
# this text would need to be doubled.
ensure_format_system = """
You are a helpful data quality assistant.
Your task is to make sure that the JSON provided follows the following schema:

{json_schema}

**Instructions**
- Note that there may be multiple tests, where each test has the key (test_1, test_2, etc.)
The JSON provided to you may not follow this exact format. Rename the keys if necessary to ensure that it follows the schema previously outlined.
- The JSON provided may be missing some keys. In the case a key is missing, put it in the JSON with its value as null.

Begin formatting!
"""


In [5]:
def ensure_json_format(paper_json, model="deepseek-reasoner", max_tokens=1024):
    """Ask the chat API behind ``client`` to reshape ``paper_json`` to match ``json_schema``.

    Args:
        paper_json: Extracted data, either as a (possibly malformed) JSON string or as an
            already-parsed object such as a dict.
        model: Chat model name sent with the request.
        max_tokens: Value forwarded as the request's ``max_tokens``.

    Returns:
        The message content of the first choice as a string ("" if the API returned no
        content). The text is returned unparsed; callers extract and parse the JSON.
    """
    # Strings are forwarded untouched. Other objects are serialised as real JSON so the
    # model sees double quotes and null instead of a Python repr ('...' / None).
    if isinstance(paper_json, str):
        user_content = paper_json
    else:
        user_content = json.dumps(paper_json, default=str)

    messages = [
        {
            "role": "system",
            "content": ensure_format_system.format(json_schema=json_schema),
        },
        {
            "role": "user",
            "content": user_content,
        },
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=False,
        max_tokens=max_tokens,
    )
    output = response.choices[0].message.content
    # Normalise a missing content field so callers can always run string operations on it.
    return output if output is not None else ""

In [6]:
# Smoke test of the JSON-repair step on one previously extracted paper (key "66").
with open('data/deepseek_finetuned.json', 'r') as f:
    test_json = json.load(f)
test_string = test_json["66"]
return_val = ensure_json_format(test_string)
# Take the span from the first "{" to the last "}" of the reply.
json_match = re.search(r"\{.*\}", return_val, re.DOTALL)
if json_match is None:
    # Fail here instead of leaving `raw_json` undefined (or stale from an earlier run)
    # for the following cell.
    raise ValueError("No JSON object found in the formatter response")
raw_json = json_match.group(0).strip()

In [8]:
# Parse the JSON text extracted in the previous cell and display the resulting dict.
json.loads(raw_json)

{'perovskite_composition': 'Cs0.15FA0.85PbI2.55Br0.45',
 'electron_transport_layer': 'SnO2',
 'hole_transport_layer': 'ITO',
 'structure_pin_nip': 'PIN',
 'test_1': {'stability_type': None,
  'passivating_molecule': None,
  'humidity': '30 ± 3 %',
  'temperature': '21 ± 2 °C',
  'time': '1566 h',
  'control_pce': '24.5',
  'treated_pce': '24.5',
  'control_voc': '1.15',
  'treated_voc': '1.15',
  'retained_proportion_cont': None,
  'retained_proportion_tret': '95.5'},
 'test_2': {'stability_type': None,
  'passivating_molecule': None,
  'humidity': None,
  'temperature': 'outdoor conditions',
  'time': '10 months',
  'control_pce': None,
  'treated_pce': None,
  'control_voc': None,
  'treated_voc': None,
  'retained_proportion_cont': None,
  'retained_proportion_tret': '95.5'}}

## Generating Extracted Data

In [7]:
# Input table of papers, plus an empty accumulator that the extraction loop fills with
# one entry per paper id. Re-running this cell resets the accumulator.
dataset = pd.read_csv("data/rag_filtered_150_papers.csv")
contents = dict()

In [8]:
# Display the loaded papers table (pandas elides the middle rows in the rendered output).
dataset

Unnamed: 0.1,Unnamed: 0,id,full_text,filtered_text
0,0,115,Paper #: 115\nLiquid medium annealing for fabr...,Liquid medium annealing for fabricating durabl...
1,1,112,Paper #: 112\nIodine reduction for reproducibl...,Iodine reduction for reproducible and high-per...
2,2,72,Paper #: 72\nSuppressed Ion Migration in Reduc...,T oday's perovskite solar cells (PSCs) require...
3,3,75,Paper #: 75\nDynamical Evolution of the 2D/3D ...,UV-VIS Absorption The absorption spectra of t...
4,4,40,Paper #: 40\nDamp heat-stable perovskite solar...,Damp heat-stable perovskite solar cells with t...
...,...,...,...,...
144,144,92,Paper #: 92\n\nBanners \n Banners New for 2018...,Electronic Table of Contents (eToC) \n Standa...
145,145,123,Paper #: 123\nConstructing heterojunctions by ...,P erovskite solar cells (PSCs) have reached po...
146,146,31,Paper #: 31\n\nWe performed 1 H-NMR to support...,We performed 1 H-NMR to support our proposed r...
147,147,36,Paper #: 36\nUnderstanding Performance Limitin...,Understanding Performance Limiting Interfacial...


In [None]:
def _extract_json_object(text):
    """Return the span from the first "{" to the last "}" in ``text`` (stripped), or None."""
    found = re.search(r"\{.*\}", text, re.DOTALL)
    return found.group(0).strip() if found else None


# Run the extraction model on every paper, repair the JSON with `ensure_json_format`, and
# store the parsed result (or None on failure) in `contents` under the paper id.
dataset = pd.read_csv('data/rag_filtered_150_papers.csv')
for index, row in dataset.iterrows():
    paper_id = row["id"]
    # feeds paper into model pipeline
    text = row["filtered_text"]
    instruction = create_prompt(PREFIX, text)
    # The pipeline returns the whole chat; the last message is the model's reply.
    json_string = pipe(instruction, max_new_tokens=2048)[0]["generated_text"][-1]['content']
    print(json_string)
    # The reply contains free-form reasoning closed by "</think>" before the answer; search
    # only the text after the last such tag so braces in the reasoning are not picked up.
    # If the tag is absent, the whole reply is searched.
    answer_text = json_string.rsplit("</think>", 1)[-1]
    raw_json = _extract_json_object(answer_text)
    if raw_json is None:
        print("No JSON found")
        contents[paper_id] = None
        continue
    # Escape a double quote nested inside a string value (pattern kept unchanged).
    parseable = re.sub(r'":\s*"([^"]*?)(".*?"[^"]*?)"', r'": "\1\\\2"', raw_json)
    try:
        formatted = ensure_json_format(parseable)
        # The formatter reply may wrap the object in prose or code fences, so extract the
        # JSON object first (same approach as the smoke-test cell) instead of parsing raw.
        formatted_json = _extract_json_object(formatted or "")
        if formatted_json is None:
            print("No JSON found in formatter response")
            contents[paper_id] = None
            continue
        parsed_data = json.loads(formatted_json)
        contents[paper_id] = parsed_data
    except json.JSONDecodeError as e:
        print("Error creating JSON", e)
        contents[paper_id] = None

Okay, so I need to extract relevant data from the provided text about perovskite solar cells. The user has given a detailed protocol for extracting this information, so I should follow that closely. Let me start by reading through the text carefully to identify the key points.

First, the text discusses the fabrication of perovskite solar cells using a liquid medium annealing (LMA) process. They used anisole as the liquid medium, which is a good solvent for organic compounds and helps in removing residual solvent and moisture from the perovskite film. The process involves preheating the liquid medium, annealing the perovskite film, and then removing the residual liquid. This method is shown to improve the performance and stability of the solar cells compared to traditional annealing.

The text also provides details about the composition of the perovskite films, including the use of cesium (Cs) and bromine (Br) in the perovskite layer, which is important for improving the efficiency and

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Alright, I need to extract relevant data from the provided text about perovskite solar cells. The user has given specific instructions on how to structure the JSON output, so I'll follow those closely.

First, I'll read through the text to identify any relevant information. The text discusses the preparation and characterization of perovskite solar cells, including details about the materials used, device structures, and stability tests.

Looking for passivating molecules, I see that the text mentions the use of PFN-P2. It also provides details about the stability tests conducted, including the type of test (ISOS-L) and the conditions (100°C, 1000 hours). The control and treated devices show good stability, with the treated device retaining 90% of its initial PCE.

Since there are no other relevant data points mentioned, I'll structure the JSON accordingly, ensuring all provided information is included and no unnecessary data is added.
</think>

```json
{
  "perovskite_composition": "C

In [None]:
# Write the collected per-paper results in `contents` to disk as indented JSON.
json_path = "data/deepseek_finetuned_2.json"
with open(json_path, "w") as out_file:
    out_file.write(json.dumps(contents, indent=4))