In [1]:
import pandas as pd
import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
from huggingface_hub import login
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os
from sklearn.model_selection import train_test_split
import bitsandbytes as bnb
from trl import SFTTrainer, setup_chat_format
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [2]:
# Load variables from a local .env file into the process environment; HF_TOKEN
# is read from the environment in the next cell.
load_dotenv()

True

In [3]:
# Authenticate with the Hugging Face Hub using the token exposed through the
# HF_TOKEN environment variable (None if the variable is not set).
access_token = os.environ.get("HF_TOKEN")
login(token=access_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
# Hub identifier of the base checkpoint; used below to load both the tokenizer
# and the model that is fine-tuned.
model_id = "meta-llama/Llama-3.2-3B-Instruct"


In [5]:
# 4-bit quantisation settings (QLoRA) passed to `from_pretrained` when the
# base model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",  # the alternative value is "fp4"
    bnb_4bit_use_double_quant=True,
)

In [6]:
# Tokenizer for the base checkpoint. The end-of-sequence token is reused as
# the padding token, and padding is inserted on the left of each sequence.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token

# Base model to fine-tune, loaded with the quantisation config defined above.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # automatic device placement (available GPUs first)
    quantization_config=bnb_config,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.90s/it]


In [7]:
# Text-generation pipeline wrapping the already-loaded model and tokenizer.
# NOTE(review): torch_dtype here is float16 while bnb_config uses bfloat16 as
# its compute dtype — confirm which one is intended. `pipe` is not referenced
# again in the cells shown here.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [8]:
# System prompt placed in front of every training example. It lists the
# fields the model must annotate in the paper.
# NOTE: the wording was corrected ("scientifc" -> "scientific",
# "and outputting a" -> "and output it in"). Adapters trained with the old
# wording should be prompted with the text they were trained on.
INSTRUCTION = """
You are a helpful scientific assistant. Your task is to annotate the following scientific paper and output it in bioc XML format with annotations for the following features:
- `control`: Data for the control group.
  - `PCE`: Power conversion efficiency (numeric).
  - `VOC`: Open-circuit voltage (numeric).
- `treatment`: An array of treatments, where each treatment includes:
  - `PCE`: Power conversion efficiency (numeric).
  - `VOC`: Open-circuit voltage (numeric).
  - `passivating_molecule`: Name of the passivating molecule tested.
- `perovskite_composition`: Chemical formula of the perovskite (string).
- `electron_transport_layer`: Material used as the electron transport layer (string).
- `hole_transport_layer`: Material used as the hole transport layer (string).
- `stability_tests`: An array of stability tests, where each test includes:
  - `test_name`: Name of the stability test (string).
  - `temperature`: Test temperature in degrees Celsius (numeric).
  - `time`: Test duration in hours (numeric).
  - `humidity`: Test humidity in percentage (numeric).
  - `control_efficiency`: Control PCE after the test (numeric).
  - `treatment_efficiency`: Treatment PCE after the test (array of numerics if multiple treatments).


Be concise and accurate. Include only information explicitly present in the text.
"""
# Template with a `{sample}` placeholder; not referenced by the cells shown here.
SUFFIX = """\n\n{sample}\n\n"""

In [9]:
def format_chat_template(row):
    """Render one training example as a single chat-formatted string.

    The conversation has three turns: a system turn holding INSTRUCTION, a
    user turn holding ``row["unannotated"]`` and an assistant turn holding
    ``row["annotated"]``.

    Returns a dict with the rendered (untokenized) conversation under the
    key "text".
    """
    system_turn = {"role": "system", "content": INSTRUCTION}
    user_turn = {"role": "user", "content": row["unannotated"]}
    assistant_turn = {"role": "assistant", "content": row["annotated"]}
    rendered = tokenizer.apply_chat_template(
        [system_turn, user_turn, assistant_turn], tokenize=False
    )
    return {"text": rendered}

In [None]:
# Read the training CSV and render every row into a chat-formatted "text"
# record, then wrap the records in a `datasets.Dataset`.
dataset = pd.read_csv("../data/training_data.csv")
chat_rows = [format_chat_template(record) for _, record in dataset.iterrows()]
formatted_data = Dataset.from_pandas(pd.DataFrame(chat_rows))

In [11]:
# Hold out 20% of the examples for evaluation. The seed is fixed so that the
# same train/test partition is produced on every run; without it the shuffle
# (and therefore the reported validation loss) changes from run to run.
# `dataset` is rebound here to the split result with "train" and "test" parts.
dataset = formatted_data.train_test_split(test_size=0.2, seed=42)

In [12]:
# LoRA adapter settings handed to the trainer as `peft_config`.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [13]:
# Directory name (appended to `model_path`) under which the fine-tuned adapter
# and tokenizer are saved at the end of the notebook.
new_model = "llama-3.2-3b-it-Perovskite-PaperExtractor"

In [14]:
# Hyper-parameters for the fine-tuning run.
training_arguments = TrainingArguments(
    learning_rate=6e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,  # effective batch size of 2 per device
    optim="paged_adamw_32bit",
    num_train_epochs=10,
    fp16=False,
    bf16=False,  # bf16 to True with an A100, False otherwise
    logging_steps=1,  # Logging is done every step.
    eval_strategy="steps",  # current name of the deprecated `evaluation_strategy`
    eval_steps=0.01,  # a float below 1 is read as a fraction of the total training steps
    max_grad_norm=0.3,
    # `warmup_ratio=0.03` used to be passed as well, but `warmup_steps`
    # overrides it, so it had no effect and was removed. The schedule is
    # unchanged.
    warmup_steps=100,
    group_by_length=True,
    lr_scheduler_type="cosine",
    output_dir="./results/",
    save_strategy="no",  # no intermediate checkpoints; the model is saved explicitly later
    report_to="none"
)



In [15]:
# Supervised fine-tuning trainer: wraps the quantised base model with the LoRA
# config and trains on the chat-formatted "text" records.
# NOTE(review): the output below warns that `max_seq_length` / `packing` should
# be set through SFTConfig instead of being passed here.
trainer = SFTTrainer(
    model=model,  # Model to fine-tune
    max_seq_length=2048,  # Token-length limit per training sequence — NOTE(review): confirm full papers plus annotations fit in 2048 tokens
    args=training_arguments,  # Training arguments to use
    train_dataset=dataset["train"],  # Set of the dataset used for the training
    eval_dataset=dataset["test"],  # Set of the dataset used for the evaluations
    peft_config=peft_config,  # Configuration and PEFT method to use
    processing_class=tokenizer,  # Tokenizer used
    packing=False,
);


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 43/43 [00:00<00:00, 127.89 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 119.06 examples/s]
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
# Run fine-tuning; the training/validation loss table and the final
# TrainOutput are shown in the cell output.
trainer.train()

Step,Training Loss,Validation Loss
3,2.0473,2.235904
6,2.0385,2.234686
9,2.345,2.232537
12,2.6313,2.22944
15,2.3456,2.2253
18,2.0486,2.219966
21,2.3906,2.213364
24,2.216,2.205416
27,1.8896,2.195843
30,2.292,2.184574


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=210, training_loss=1.8752219495319185, metrics={'train_runtime': 1988.4171, 'train_samples_per_second': 0.216, 'train_steps_per_second': 0.106, 'total_flos': 1.457111854743552e+16, 'train_loss': 1.8752219495319185, 'epoch': 9.767441860465116})

In [2]:
# Parent directory for saved models; the trailing slash matters because the
# model name is appended by plain string concatenation.
model_path = "../models/"

In [None]:
# Write the trained model held by the trainer to <model_path><new_model>.
trainer.model.save_pretrained(f"{model_path}{new_model}")

In [None]:
# Save the tokenizer next to the model weights. `Trainer.tokenizer` is
# deprecated in favour of `Trainer.processing_class`, which is the same
# keyword used when the trainer was constructed.
trainer.processing_class.save_pretrained(model_path + new_model)

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('llama-3.2-3b-it-Perovskite-PaperExtractor/tokenizer_config.json',
 'llama-3.2-3b-it-Perovskite-PaperExtractor/special_tokens_map.json',
 'llama-3.2-3b-it-Perovskite-PaperExtractor/tokenizer.json')