In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from huggingface_hub import login
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os
from sklearn.model_selection import train_test_split
import bitsandbytes as bnb
from trl import SFTTrainer, setup_chat_format
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def parse_bioc(file_path):
    """Parse a BioC XML file into a dict of annotated values.

    Every ``<annotation>`` is keyed by the text of its ``infon[@key='type']``
    child. Its value is the text of ``infon[@key='identifier']``, falling back
    to the annotation's own ``<text>`` when the identifier is missing or empty.

    Annotations whose ``id`` is referenced by a ``<relation>`` node are grouped
    in a nested dict under that relation's type. All other annotations are
    stored at the top level; a type that occurs more than once with different
    values is collected into a list.

    Args:
        file_path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        dict mapping annotation types to a value or list of values, and
        relation types to ``{annotation type: value}`` dicts.
    """
    root = ET.parse(file_path).getroot()

    def _child_text(element, path):
        """Return the text of the first child matching ``path``, or None."""
        child = element.find(path)
        return child.text if child is not None else None

    # Map every annotation id referenced by a relation to that relation's type.
    relations = {}
    for relation in root.findall(".//relation"):
        test_name = _child_text(relation, "infon[@key='type']")
        if test_name is None:
            continue  # a relation without a type cannot be used as a key
        for node in relation.findall("node"):
            relations[node.get("refid")] = test_name

    data = {}
    concept_ids = set()
    for annotation in root.findall(".//annotation"):
        var_name = _child_text(annotation, "infon[@key='type']")
        if var_name is None:
            continue  # nothing to key this annotation by

        # Prefer the identifier; fall back to the annotated text itself.
        concept_id = _child_text(annotation, "infon[@key='identifier']")
        if concept_id is None:
            concept_id = _child_text(annotation, "text")
        if concept_id is None:
            continue  # neither identifier nor text: nothing to record

        node_id = annotation.get("id")
        if node_id in relations:
            # Relation members live under the relation's type and also count
            # as "seen" so later top-level duplicates are skipped.
            data.setdefault(relations[node_id], {})[var_name] = concept_id
            concept_ids.add(concept_id)
            continue

        if concept_id in concept_ids:  # duplicate annotation
            continue
        concept_ids.add(concept_id)

        # Store the same resolved value for every occurrence so that lists
        # are consistent with the first entry.
        if var_name not in data:
            data[var_name] = concept_id
        elif isinstance(data[var_name], list):
            data[var_name].append(concept_id)
        else:
            data[var_name] = [data[var_name], concept_id]

    return data

### Creating training set

In [3]:
def extract_papernum(file_path):
    """Return the paper number embedded in a BioC XML file.

    The number is taken to be the last whitespace-separated token of the
    first ``<text>`` element found anywhere in the document. It is returned
    as a string; callers convert it as needed.
    """
    document_root = ET.parse(file_path).getroot()
    # First <text> node in document order holds the heading with the number.
    heading = document_root.find(".//text").text
    # Tokenise on any whitespace and keep the trailing token.
    return heading.split()[-1]

In [4]:
# Build one training row per annotated paper: the raw paper text plus the
# values parsed from its BioC annotation file.
bioc_dir = "data/biocs"
training_data = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore any later split) stable between runs.
for filename in sorted(os.listdir(bioc_dir)):
    if not filename.endswith(".xml"):
        continue
    file_path = os.path.join(bioc_dir, filename)
    paper_num = int(extract_papernum(file_path))
    # Keep only papers numbered 0-19 and 40-104.
    # NOTE(review): presumably the other ids are held out — confirm.
    if not (0 <= paper_num <= 19 or 40 <= paper_num <= 104):
        continue
    parsed = parse_bioc(file_path)
    if not parsed:
        continue  # no annotations extracted, nothing to train on
    # Context manager closes the file even if reading fails.
    with open(f"data/txts/{paper_num}.txt", "r", encoding="utf-8") as txt_file:
        text = txt_file.read()
    row = { "id": paper_num, "unannotated": text, "output": parsed }
    training_data.append(row)

training_df = pd.DataFrame(training_data)

In [5]:
# Persist the assembled training rows to CSV, omitting the DataFrame index.
# NOTE(review): the "output" column holds dicts, which presumably end up as
# their string representation in the CSV — confirm downstream readers expect that.
training_df.to_csv('data/training_data.csv', index=False)

In [6]:
# Read variables from a local .env file, then authenticate against the
# Hugging Face Hub with the HF_TOKEN value (None when the variable is unset).
load_dotenv()
access_token = os.environ.get("HF_TOKEN")
login(token=access_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
# Hub identifier of the base checkpoint; used below to load both the
# tokenizer and the model that is fine-tuned.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

In [8]:
# QLoRA quantisation settings, collected in one mapping and handed to
# BitsAndBytesConfig; the result is passed to the model loader below.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",  # the alternative value is "fp4"
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [9]:
# Tokenizer for the base checkpoint. Padding goes on the left and reuses the
# end-of-sequence token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Base model to fine-tune, loaded with the QLoRA quantisation config defined
# above. device_map="auto" lets the loader place the model on available GPUs
# first.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

Downloading shards: 100%|██████████| 2/2 [02:32<00:00, 76.47s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.07s/it]


In [10]:
# Text-generation pipeline wrapping the already-loaded model and tokenizer.
# NOTE(review): `pipe` is not referenced in any of the cells shown here —
# confirm it is still needed.
# NOTE(review): torch_dtype and device_map are passed alongside a model that
# was already loaded with its own quantization_config and device_map; they
# presumably have no effect in that case — confirm against the installed
# transformers version.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [11]:
# System prompt placed at the start of every chat-formatted training example
# (format_chat_template uses it as the "system" message).
# NOTE(review): the prompt text contains the typo "scientifc" and asks for
# "bioc XML" output, while the training targets are built from the dict
# returned by parse_bioc — confirm the intended output format before editing
# this runtime string, since any inference code must use the same prompt.
INSTRUCTION = """
You are a helpful scientific assistant. Your task is to annotate the following scientifc paper and outputting a bioc XML format with annotations for the following features:
- `control`: Data for the control group.
  - `PCE`: Power conversion efficiency (numeric).
  - `VOC`: Open-circuit voltage (numeric).
- `treatment`: An array of treatments, where each treatment includes:
  - `PCE`: Power conversion efficiency (numeric).
  - `VOC`: Open-circuit voltage (numeric).
  - `passivating_molecule`: Name of the passivating molecule tested.
- `perovskite_composition`: Chemical formula of the perovskite (string).
- `electron_transport_layer`: Material used as the electron transport layer (string).
- `hole_transport_layer`: Material used as the hole transport layer (string).
- `stability_tests`: An array of stability tests, where each test includes:
  - `test_name`: Name of the stability test (string).
  - `temperature`: Test temperature in degrees Celsius (numeric).
  - `time`: Test duration in hours (numeric).
  - `humidity`: Test humidity in percentage (numeric).
  - `control_efficiency`: Control PCE after the test (numeric).
  - `treatment_efficiency`: Treatment PCE after the test (array of numerics if multiple treatments).


Be concise and accurate. Include only information explicitly present in the text.
"""
# Template with a {sample} placeholder surrounded by blank lines.
# NOTE(review): SUFFIX is not referenced in the cells shown here — confirm it
# is still used elsewhere.
SUFFIX = """\n\n{sample}\n\n"""

In [12]:
def format_chat_template(row):
    """Render one training row as a single chat-formatted string.

    The conversation consists of the shared INSTRUCTION as the system
    message, the row's "unannotated" field as the user message and its
    "output" field as the assistant message. The tokenizer's chat template
    is applied without tokenising.

    Returns:
        dict with a single "text" key holding the rendered conversation.
    """
    conversation = [
        {"role": "system", "content": INSTRUCTION },
        {"role": "user", "content": row["unannotated"]},
        {"role": "assistant", "content": row["output"]},
    ]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return { "text": rendered }

In [14]:
# Reload the saved training rows and render each one through the chat
# template, then wrap the rendered strings in a Hugging Face Dataset.
dataset = pd.read_csv("data/training_data.csv")
chat_rows = []
for index, row in dataset.iterrows():
    chat_rows.append(format_chat_template(row))
formatted_data = Dataset.from_pandas(pd.DataFrame(chat_rows))

In [15]:
# Hold out 20% of the examples for evaluation. The fixed seed makes the split
# reproducible across kernel restarts, so losses are comparable between runs.
dataset = formatted_data.train_test_split(test_size=0.2, seed=42)

In [16]:
# LoRA adapter settings, collected in one mapping and handed to LoraConfig;
# the resulting config is passed to the trainer below.
lora_options = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
peft_config = LoraConfig(**lora_options)

In [17]:
# Hyper-parameters for the fine-tuning run.
training_arguments = TrainingArguments(
    learning_rate=6e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=10,
    fp16=False,
    bf16=False,  # bf16 to True with an A100, False otherwise
    logging_steps=1,  # Logging is done every step.
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    eval_steps=0.01,  # a float below 1 is a fraction of the total training steps
    max_grad_norm=0.3,
    # Only warmup_steps is set: when it is given together with warmup_ratio,
    # warmup_steps takes precedence, so the former warmup_ratio=0.03 had no
    # effect and was dropped.
    # NOTE(review): 100 warmup steps is about a third of the 290 total steps
    # in the logged run — confirm that this is intended.
    warmup_steps=100,
    group_by_length=True,
    lr_scheduler_type="cosine",
    output_dir="./results/",
    save_strategy="no",
    report_to="none",
)



In [18]:
# Build the supervised fine-tuning trainer from the quantised model, the
# train/test splits, the LoRA config and the tokenizer.
# NOTE(review): the captured output reports "Deprecated positional argument(s)
# used in SFTTrainer, please use the SFTConfig to set these arguments instead";
# max_seq_length and packing presumably belong on an SFTConfig — confirm
# against the installed TRL version.
trainer = SFTTrainer(
    model=model,  # Model to fine-tune
    max_seq_length=2048,  # NOTE(review): presumably caps the whole tokenized example (prompt + answer), not only the completion — confirm; long papers may be truncated before the assistant answer
    args=training_arguments,  # Training arguments to use
    train_dataset=dataset["train"],  # Set of the dataset used for the training
    eval_dataset=dataset["test"],  # Set of the dataset used for the evaluations
    peft_config=peft_config,  # Configuration and PEFT method to use
    processing_class=tokenizer,  # Tokenizer used
    packing=False,
);


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 58/58 [00:00<00:00, 329.57 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 339.48 examples/s]


In [19]:
# Run fine-tuning with the trainer built above. The returned TrainOutput
# (global step, training loss, runtime metrics) is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
3,2.7038,2.144622
6,2.0794,2.143416
9,2.3421,2.141279
12,2.1623,2.138126
15,2.4658,2.133922
18,2.1082,2.128542
21,2.3964,2.121726
24,1.9412,2.113344
27,2.0319,2.10347
30,1.9582,2.091906


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=290, training_loss=1.7839054609167164, metrics={'train_runtime': 3141.3348, 'train_samples_per_second': 0.185, 'train_steps_per_second': 0.092, 'total_flos': 2.012202085122048e+16, 'train_loss': 1.7839054609167164, 'epoch': 10.0})

In [20]:
# Save the fine-tuned model and its tokenizer side by side in one directory.
model_path = '../models/'
new_model = "llama-3.2-3b-it-Perovskite-PaperExtractor"
save_dir = os.path.join(model_path, new_model)
trainer.model.save_pretrained(save_dir)
# `Trainer.tokenizer` is deprecated; `processing_class` is the tokenizer that
# was passed to the trainer.
trainer.processing_class.save_pretrained(save_dir)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('../models/llama-3.2-3b-it-Perovskite-PaperExtractor/tokenizer_config.json',
 '../models/llama-3.2-3b-it-Perovskite-PaperExtractor/special_tokens_map.json',
 '../models/llama-3.2-3b-it-Perovskite-PaperExtractor/tokenizer.json')