In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from huggingface_hub import login
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os
from sklearn.model_selection import train_test_split
import bitsandbytes as bnb
from trl import SFTTrainer, setup_chat_format
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_bioc(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    data = { }
    concept_ids = set()
    relations = {}
    for relation in root.findall(".//relation"):
        test_name = relation.find("infon[@key='type']").text
        node_ids = [node.get("refid") for node in relation.findall("node")]
        for node_id in node_ids:
            relations[node_id] = test_name

    for annotation in root.findall(".//annotation"):
        node_id = annotation.get("id")
        var_name = annotation.find("infon[@key='type']").text
        concept_id = annotation.find("infon[@key='identifier']").text
        value = annotation.find("text").text
        if node_id in relations:
            stability_test = relations[node_id]
            if stability_test not in data:
                data[stability_test] = {}
            if concept_id is None:
                concept_id = value
            data[stability_test][var_name] = concept_id
            concept_ids.add(concept_id)

        if concept_id is None:
            concept_id = value
        if concept_id in concept_ids: # duplicate annotation
            continue
        concept_ids.add(concept_id)
        if var_name not in data:
            data[var_name] = concept_id
        else:
            if isinstance(data[var_name], list):
                data[var_name].append(value)
            else:
                data[var_name] = [data[var_name], value]
    
    return data

### Creating training set

In [3]:
def extract_papernum(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    first_text = root.find(".//text")
    full_text = first_text.text
    
    ##We want to extract article number from this format
    #Method: split by spaces and extract the last element in the list
    text_list = full_text.split()
    paper_num = text_list[-1]
    return paper_num

In [4]:
bioc_dir = "data/biocs"
training_data = []
for filename in os.listdir(bioc_dir):
    if filename.endswith(".xml"):
        file_path = os.path.join(bioc_dir, filename)
        paper_num = int(extract_papernum(file_path))
        if not (0 <= paper_num <= 19 or 40 <= paper_num <= 104):
            continue
        parsed = parse_bioc(file_path)
        if len(parsed.keys()) > 0:
            txt_file  = open(f"data/txts/{paper_num}.txt", "r", encoding="utf-8")
            text = txt_file.read()
            txt_file.close()
            row = { "id": paper_num, "unannotated": text, "output": parsed }
            training_data.append(row)
        
training_df = pd.DataFrame(training_data)

In [5]:
training_df.to_csv('data/training_data.csv', index=False)

## Training Model

In [2]:
load_dotenv()
access_token = os.getenv("HF_TOKEN")
login(token=access_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [3]:
model_id = "meta-llama/Llama-3.1-8B-Instruct"
# model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

In [4]:
# QLoRA configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",  # fp4 or nf4
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)  # Define the tokenizer
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Where the "pad_token" is placed

# Model config
model = AutoModelForCausalLM.from_pretrained(
    model_id,  # Model that we are going to fine-tune
    quantization_config=bnb_config,  # QLoRA config defined above
    device_map="auto",  # Where the model is trained, set device_map="auto" loads a model onto available GPUs first.
)

Downloading shards: 100%|██████████| 4/4 [06:24<00:00, 96.12s/it] 


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [9]:
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [10]:
INSTRUCTION = """
You are a helpful scientific assistant. Your task is to extract relevant scientific data from the provided text about perovskite solar cells and passivating molecules. If the data is not available in the text, return null for the respective fields. Output the information in JSON format with the following fields:
- `control_pce`: Power conversion efficiency for control perovskite (numeric).
- `control_voc`: Open-circuit voltage for control perovskite (numeric).
- `treated_pce`: Best Power conversion efficiency for treated perovskite (numeric).
- `treated_voc`: Best Open-circuit voltage for treated perovskite (numeric).
- `passivating_molecule`: Name of the champion passivating molecule tested.
- `perovskite_composition`: Chemical formula of the perovskite (string).
- `electron_transport_layer`: Material used as the electron transport layer (string).
- `pin_nip_structure`: Whether the perovskite used a PIN or NIP structure (values: PIN or NIP)
- `hole_transport_layer`: Material used as the hole transport layer (string).
- Stability tests: Include any stability tests mentioned. Stability tests can be done in dark storage (ISOS-D), light-soaking (ISOS-L), thermal cycling (ISOS-T), light cycling (ISOS-LC), and solar-thermal cycling (ISOS-LT). If none of these types are tested, do not include a JSON object for them. Note that these test names are typically not mentioned directly, and you will have to infer them.
Make sure that all numeric variables are proper javascript numbers. If not, return them as a string.
For each test, the value should follow this format:
```json
{
  "test_name": null, (**make sure this value is only one of the following possible values**: ISOS-D, ISOS-L, ISOS-T, ISOS-LC, ISOS-LT)
  "temperature": null (numeric - only return the number in degrees celsius),
  "time": null,
  "humidity": null (string),
  "control_efficiency": null,
  "treatment_efficiency": null
}

The JSON structure must follow this exact format:
{
  "control_pce": null,
  "control_voc": null,
  "treated_pce": null,
  "treated_voc": null,
  "passivating_molecule": null (make sure this value is parseable by JSON - i.e. there are no quotation marks within the string itself),
  "perovskite_composition": null,
  "electron_transport_layer": null (make sure this value is parseable by JSON - i.e. there are no characters that would disrupt parsing within the string itself. Do not need to give the full name),
  "hole_transport_layer": null,
  "stability_tests": [
    {
      "test_name": null (**make sure this value is only one of the following possible values**: ISOS-D, ISOS-L, ISOS-T, ISOS-LC, ISOS-LT),
      "temperature": null (**make sure that this value is either a number or a string - cannot have a - or °**. Do not include unit, make sure it is in celsius. Value must be parseable, i.e. a string or a number.),
      "time": null,
      "humidity": null,
      "control_efficiency": null,
      "treatment_efficiency": null
    },
  ]
}
Be concise and accurate. Include only information explicitly present in the text.
Don't return ranges for any values, as this will cause the JSON to not parse correctly. If a range is presented, return the range as a string. This is any value that has a "-" in it.
Do not include the "%" sign for any value, this will cause the JSON to parse incorrectly. Either do not include it or return a string - specifically for PCE and effiicency variables.
Do not include the degree symbol for any value, this will cause the JSON to parse incorrectly.
If a value is not a string or number, i.e. "85 C", make sure to put quotes around it so that JSON is able to parse it correctly. Make sure every value is a valid string or number.
**make sure no unparseable JSON is returned as values for any of these properties - this means that all strings should have quotation marks around them**
Only return JSON. The text is below:
"""
SUFFIX = """\n\n{sample}\n\n"""

In [11]:
def format_chat_template(row):
    row_json = [{"role": "system", "content": INSTRUCTION },
                {"role": "user", "content": row["unannotated"]},
                {"role": "assistant", "content": row["output"]}]
    new_row = { "text": tokenizer.apply_chat_template(row_json, tokenize=False) }
    return new_row

In [12]:
dataset = pd.read_csv("data/training_data.csv")
formatted_data = []
for index, row in dataset.iterrows():
    formatted_data.append(format_chat_template(row))
formatted_data = pd.DataFrame(formatted_data)
formatted_data = Dataset.from_pandas(formatted_data)

In [13]:
dataset = formatted_data.train_test_split(test_size=0.2)

In [14]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [15]:
training_arguments = TrainingArguments(
    learning_rate=6e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    fp16=False,
    bf16=False,  # bf16 to True with an A100, False otherwise
    logging_steps=1,  # Logging is done every step.
    evaluation_strategy="steps",
    eval_steps=0.01,
    max_grad_norm=0.3,
    warmup_steps=100,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    output_dir="./results/",
    save_strategy="no",
    report_to="none"
)



In [16]:
trainer = SFTTrainer(
    model=model,  # Model to fine-tune
    max_seq_length=2048,  # Max number of tokens of the completion
    args=training_arguments,  # Training arguments to use
    train_dataset=dataset["train"],  # Set of the dataset used for the training
    eval_dataset=dataset["test"],  # Set of the dataset used for the evaluations
    peft_config=peft_config,  # Configuration and PEFT method to use
    processing_class=tokenizer,  # Tokenizer used
    packing=False,
);


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 58/58 [00:00<00:00, 267.46 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 329.72 examples/s]


In [17]:
trainer.train()

Step,Training Loss,Validation Loss
1,2.2272,2.290785
2,2.4102,2.290628
3,2.2605,2.290314
4,2.3143,2.28984
5,2.3095,2.289205
6,2.6181,2.288409
7,2.4338,2.287453
8,2.4251,2.286329
9,2.2911,2.285032
10,2.2152,2.283557


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=29, training_loss=2.2721291328298636, metrics={'train_runtime': 657.3382, 'train_samples_per_second': 0.088, 'train_steps_per_second': 0.044, 'total_flos': 2012202085122048.0, 'train_loss': 2.2721291328298636, 'epoch': 1.0})

In [19]:
model_path = '../models/'
new_model = "llama-3.2-3b-it-Perovskite-PaperExtractor"
trainer.model.save_pretrained(model_path + new_model)
trainer.tokenizer.save_pretrained(model_path + new_model)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('../models/llama-3.2-3b-it-Perovskite-PaperExtractor/tokenizer_config.json',
 '../models/llama-3.2-3b-it-Perovskite-PaperExtractor/special_tokens_map.json',
 '../models/llama-3.2-3b-it-Perovskite-PaperExtractor/tokenizer.json')