In [1]:
import json
import os
import re

import torch
from dotenv import load_dotenv
from litellm import completion
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask PyTorch to release cached, currently unused GPU memory before the model
# is loaded in the next section.
torch.cuda.empty_cache()

## Loading Model and Prompt

In [3]:
# Checkpoints this notebook can be pointed at.
FINETUNED_MODEL_PATH = "../models/llama-3.2-3b-it-Perovskite-PaperExtractor"  # trained model path
BASE_MODEL_PATH = "meta-llama/Meta-Llama-3-8B-Instruct"

# Both paths used to be assigned to `model_path` one after the other, so the
# second (base) checkpoint silently won and the first assignment was dead code.
# The effective choice is kept, but made explicit; switch to
# FINETUNED_MODEL_PATH to run the trained extractor instead.
model_path = BASE_MODEL_PATH

# Passing `load_in_4bit=True` straight to from_pretrained is deprecated;
# 4-bit loading is requested through a BitsAndBytesConfig instead.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# NOTE(review): this raises the tokenizer's own length limit to 60000 tokens;
# confirm the selected checkpoint's context window really supports the
# 23000-token prompts built by create_prompt.
tokenizer.model_max_length = 60000

# do_sample=False selects greedy decoding; temperature/top_p are explicitly
# unset so no sampling parameters are passed alongside it.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    temperature=None,
    top_p=None,
    do_sample=False,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Downloading shards: 100%|██████████| 4/4 [06:23<00:00, 95.97s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.29s/it]


In [4]:
# prompt that specifies what to return
# PREFIX is the system message handed to create_prompt for every paper: it
# lists the fields to extract and the JSON layout the model is asked to emit.
# NOTE(review): the text is left untouched because it is runtime input to the
# model, but it has inconsistencies worth confirming with the prompt author:
#   - the ```json fence opened before the stability-test example is never closed;
#   - `pin_nip_structure` is described in the field list but is missing from
#     the "exact format" template;
#   - the `stability_tests` example ends with a trailing comma, which is not
#     valid JSON.
PREFIX = """
You are a helpful scientific assistant. Your task is to extract relevant scientific data from the provided text about perovskite solar cells and passivating molecules. If the data is not available in the text, return null for the respective fields. Output the information in JSON format with the following fields:
- `control_pce`: Power conversion efficiency for control perovskite (numeric).
- `control_voc`: Open-circuit voltage for control perovskite (numeric).
- `treated_pce`: Best Power conversion efficiency for treated perovskite (numeric).
- `treated_voc`: Best Open-circuit voltage for treated perovskite (numeric).
- `passivating_molecule`: Name of the champion passivating molecule tested.
- `perovskite_composition`: Chemical formula of the perovskite (string).
- `electron_transport_layer`: Material used as the electron transport layer (string).
- `pin_nip_structure`: Whether the perovskite used a PIN or NIP structure (values: PIN or NIP)
- `hole_transport_layer`: Material used as the hole transport layer (string).
- Stability tests: Include any stability tests mentioned. Stability tests can be done in dark storage (ISOS-D), light-soaking (ISOS-L), thermal cycling (ISOS-T), light cycling (ISOS-LC), and solar-thermal cycling (ISOS-LT). If none of these types are tested, do not include a JSON object for them. Note that these test names are typically not mentioned directly, and you will have to infer them.
Make sure that all numeric variables are proper javascript numbers. If not, return them as a string.
For each test, the value should follow this format:
```json
{
  "test_name": null, (**make sure this value is only one of the following possible values**: ISOS-D, ISOS-L, ISOS-T, ISOS-LC, ISOS-LT)
  "temperature": null (numeric - only return the number in degrees celsius),
  "time": null,
  "humidity": null (string),
  "control_efficiency": null,
  "treatment_efficiency": null
}

The JSON structure must follow this exact format:
{
  "control_pce": null,
  "control_voc": null,
  "treated_pce": null,
  "treated_voc": null,
  "passivating_molecule": null (make sure this value is parseable by JSON - i.e. there are no quotation marks within the string itself),
  "perovskite_composition": null,
  "electron_transport_layer": null (make sure this value is parseable by JSON - i.e. there are no characters that would disrupt parsing within the string itself. Do not need to give the full name),
  "hole_transport_layer": null,
  "stability_tests": [
    {
      "test_name": null (**make sure this value is only one of the following possible values**: ISOS-D, ISOS-L, ISOS-T, ISOS-LC, ISOS-LT),
      "temperature": null (**make sure that this value is either a number or a string - cannot have a - or °**. Do not include unit, make sure it is in celsius. Value must be parseable, i.e. a string or a number.),
      "time": null,
      "humidity": null,
      "control_efficiency": null,
      "treatment_efficiency": null
    },
  ]
}
Be concise and accurate. Include only information explicitly present in the text.
Don't return ranges for any values, as this will cause the JSON to not parse correctly. If a range is presented, return the range as a string. This is any value that has a "-" in it.
Do not include the "%" sign for any value, this will cause the JSON to parse incorrectly. Either do not include it or return a string - specifically for PCE and effiicency variables.
Do not include the degree symbol for any value, this will cause the JSON to parse incorrectly.
If a value is not a string or number, i.e. "85 C", make sure to put quotes around it so that JSON is able to parse it correctly. Make sure every value is a valid string or number.
**make sure no unparseable JSON is returned as values for any of these properties - this means that all strings should have quotation marks around them**
Only return JSON. The text is below:
"""
# Template that wraps a paper's text in blank lines. create_prompt does not
# reference it.
SUFFIX = """\n\n{sample}\n\n"""


def create_prompt(system, user, max_user_tokens=23000):
    """Build the chat messages for one paper, truncating the paper text.

    Args:
        system: Content of the system message (the extraction instructions).
        user: Full text of the paper; only its first ``max_user_tokens``
            tokens are kept.
        max_user_tokens: Token budget for the user message. The default of
            23000 prevents CUDA memory errors with the current GPU.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts (system, then
        user) in the format accepted by the text-generation pipeline.
    """
    # add_special_tokens=False keeps the tokenizer from prepending its BOS
    # marker here. Otherwise decode() writes that marker into the user text as
    # a literal string, and the chat template later adds a second one.
    token_ids = tokenizer.encode(
        user,
        max_length=max_user_tokens,
        truncation=True,
        add_special_tokens=False,
    )
    truncated_user = tokenizer.decode(token_ids)
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": truncated_user},
    ]

## Generating Extracted Data

In [5]:
# Accumulator for the extraction results: paper id -> parsed JSON payload.
contents = dict()

In [6]:
txt_dir = "../data/txts"


def _parse_model_json(raw_json):
    """Parse the JSON text cut out of a model reply.

    The text is parsed as-is first. Only when that fails is a best-effort
    repair applied: a regex that backslash-escapes a stray quote inside a
    string value. Running the repair unconditionally is unsafe, because it
    also matches across neighbouring string values on one line and turns
    valid JSON such as {"a": "x", "b": "y"} into invalid JSON.

    Raises:
        json.JSONDecodeError: if the text is still unparseable after repair.
    """
    try:
        return json.loads(raw_json)
    except json.JSONDecodeError:
        repaired = re.sub(r'":\s*"([^"]*?)(".*?"[^"]*?)"', r'": "\1\\\2"', raw_json)
        return json.loads(repaired)


# Sorted so papers are processed in a reproducible order; os.listdir order is
# arbitrary.
for filename in sorted(os.listdir(txt_dir)):
    if not filename.endswith(".txt"):
        continue
    paper_id = os.path.splitext(filename)[0]
    # Papers already present in `contents` are skipped, so the cell can be
    # re-run to retry only the ones that failed.
    if paper_id in contents:
        continue
    print(paper_id)

    # Read the paper and close the file before the (slow) generation step.
    with open(os.path.join(txt_dir, filename), "r", encoding="utf-8") as file:
        text = file.read()

    # feeds paper into model pipeline
    instruction = create_prompt(PREFIX, text)
    # The pipeline returns the whole chat; the last message is the model reply.
    json_string = pipe(instruction, max_new_tokens=1024)[0]["generated_text"][-1]["content"]
    print(json_string)

    # Take everything from the first "{" to the last "}" of the reply.
    json_match = re.search(r"\{.*\}", json_string, re.DOTALL)
    if json_match is None:
        print("No JSON found")
        continue
    raw_json = json_match.group(0).strip()

    try:
        contents[paper_id] = _parse_model_json(raw_json)
    except json.JSONDecodeError as e:
        print("Error creating JSON", e)
        print(raw_json)
        

115




null
["control_pce", 23.15, null, null, "MAPbI3", "Cs0.05(FA0.95MA0.05)0.95Pb(I0.95Br0.05)3", "SnO2", "Spiro-OMeTAD/PTAA", "PIN", null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null
null


In [7]:
# Write every successfully parsed extraction to disk as indented JSON.
json_path = "../data/finetuned_llama_output_1epoch.json"
with open(json_path, "w") as out_file:
    serialized = json.dumps(contents, indent=4)
    out_file.write(serialized)