In [16]:
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
import os
from dotenv import load_dotenv
from huggingface_hub import login
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): HF_TOKEN, read further down via os.getenv, is presumably supplied this way — confirm.
load_dotenv()

True

In [17]:
# Switch off Weights & Biases integration for this kernel session.
os.environ.update({"WANDB_DISABLED": "true"})

### Getting Data Ready

In [18]:
def create_prompt_formats(sample):
    """Assemble the full training prompt for one record and store it in ``sample["text"]``.

    The prompt consists of, in order: an intro blurb, the extraction
    instructions, the source passage (``sample["unannotated"]``, skipped when
    it is falsy), the response marker followed by ``sample["output"]``, and a
    closing end marker. Sections are separated by a blank line.

    Args:
        sample: Mapping with ``"unannotated"`` and ``"output"`` entries.

    Returns:
        The same ``sample`` object, mutated to carry the new ``"text"`` entry.
    """
    intro = "You are a helpful scientific assistant, specializing in data extraction. Below is an instruction that describes the task. Write a response that appropriately completes the request."
    instruction = """### Instruct: 
    Your task is to extract relevant scientific data from the provided text about perovskite solar cells. The aim is to create a JSON object for each surface passivating molecule tested in the text, if there are any.
    If there are no passivating molecules mentioned, just provide a JSON object with any relevant data mentioned in the text. Be concise and accurate. Include only information explicitly present in the text.
    All relevant data should coincide with the following JSON structure. Ensure that the output is parseable into a JSON object:
    {
    - `control_pce`: Power conversion efficiency for control perovskite (numeric).
    - `control_voc`: Open-circuit voltage for control perovskite (numeric).
    - `treated_pce`: Best Power conversion efficiency for treated perovskite (numeric).
    - `treated_voc`: Best Open-circuit voltage for treated perovskite (numeric).
    - `passivating_molecule`: Name of the champion passivating molecule tested.
    - `perovskite_composition`: Chemical formula of the perovskite (string).
    - `electron_transport_layer`: Material used as the electron transport layer (string).
    - `pin_nip_structure`: Whether the perovskite used a PIN or NIP structure (values: PIN or NIP)
    - `hole_transport_layer`: Material used as the hole transport layer (string).
    - `stability_tests`: Include any stability tests mentioned. Stability tests can be done in dark storage (ISOS-D), light-soaking (ISOS-L), thermal cycling (ISOS-T), light cycling (ISOS-LC), and solar-thermal cycling (ISOS-LT). If none of these types are tested, do not include a JSON object for them. Note that these test names are typically not mentioned directly, and you will have to infer them. [
        {
        "test_name": null (**make sure this value is only one of the following possible values**: ISOS-D, ISOS-L, ISOS-T, ISOS-LC, ISOS-LT),
        "temperature": null (**make sure that this value is either a number or a string - cannot have a - or °**. Do not include unit, make sure it is in celsius. Value must be parseable, i.e. a string or a number.),
        "time": null,
        "humidity": null,
        "control_efficiency": null,
        "treatment_efficiency": null
        },
    ]
    }
    """
    response_marker = "### Output:"
    end_marker = "### End"

    source_text = sample["unannotated"]
    sections = (
        "\n" + intro,
        instruction,
        # A missing/empty passage contributes no section at all.
        f"{source_text}" if source_text else None,
        f"{response_marker}\n{sample['output']}",
        end_marker,
    )

    # filter(None, ...) drops the falsy placeholders before joining.
    sample["text"] = "\n\n".join(filter(None, sections))
    return sample


In [25]:
# Load the annotated examples and hold out 20% of them for evaluation.
# The split is seeded so that train/eval membership is identical after a
# kernel restart; without a seed every run evaluates on a different subset
# and eval losses from separate runs are not comparable.
df = pd.read_csv('data/training_data.csv')
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)


### Fine-tuning

In [26]:
# Authenticate against the Hugging Face Hub with the token taken from the
# HF_TOKEN environment variable (None when the variable is unset).
access_token = os.environ.get("HF_TOKEN")
login(token=access_token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [27]:
# Quantisation settings for loading the base model: 4-bit NF4 weights,
# float16 compute dtype, no double quantisation.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

In [28]:
# Load the quantised base model. The device map pins the whole model
# (the "" root key) to device index 0.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.98s/it]


In [29]:
# Tokenizer for the base model: left padding, BOS/EOS insertion requested,
# slow (non-fast) implementation requested.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
# The checkpoint's pad token is replaced with EOS so batches can be padded.
# NOTE(review): the language-modeling collator used for training masks pad
# positions out of the loss; with pad == eos that presumably also masks the
# real end-of-sequence token — confirm, and consider a dedicated pad token.
tokenizer.pad_token = tokenizer.eos_token

In [30]:
from functools import partial

def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``"text"`` entry of ``batch`` with truncation enabled.

    Args:
        batch: Mapping holding the prompts under the ``"text"`` key.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=..., truncation=True)``.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, dataset):
    """Turn raw records into shuffled, tokenized training examples.

    Steps, in order:
      1. ``create_prompt_formats`` adds the assembled prompt as a ``"text"`` column.
      2. ``preprocess_batch`` tokenizes that column in batches; the
         ``"unannotated"`` and ``"output"`` columns are dropped in the same pass.
      3. The result is shuffled (no seed is passed).

    Args:
        tokenizer: Tokenizer forwarded to ``preprocess_batch``.
        max_length: Truncation length forwarded to ``preprocess_batch``.
        dataset: Dataset exposing ``map`` and ``shuffle`` with ``"unannotated"``
            and ``"output"`` columns.

    Returns:
        The mapped and shuffled dataset.
    """
    tokenize = partial(preprocess_batch, tokenizer=tokenizer, max_length=max_length)

    return (
        dataset
        .map(create_prompt_formats)
        .map(tokenize, batched=True, remove_columns=["unannotated", "output"])
        .shuffle()
    )

In [33]:
# Truncation limit (in tokens) handed to the tokenizer for every example.
# NOTE(review): a limit this large presumably never truncates anything, and the
# training run in this notebook ends in a CUDA out-of-memory error. Lowering it
# would cut tokens from the END of each prompt (where the expected output
# sits), so prefer filtering over-long examples or a larger GPU — confirm
# actual sequence lengths first.
MAX_LENGTH = 60_000

train_dataset, eval_dataset = (
    preprocess_dataset(tokenizer, MAX_LENGTH, dataset[split])
    for split in ("train", "test")
)

Map: 100%|██████████| 58/58 [00:00<00:00, 1321.52 examples/s]
Map: 100%|██████████| 58/58 [00:00<00:00, 133.82 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 1680.50 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 67.90 examples/s]


In [34]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# LoRA adapter configuration.
# Adapters are attached to the four attention projections of each decoder
# layer. The list previously ended with 'dense', which is not a module name in
# the Llama architecture (its attention output projection is 'o_proj'), so that
# entry silently matched nothing and the output projection was left unadapted.
config = LoraConfig(
    r=32,  # Rank of the low-rank update matrices
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'o_proj',
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

# Prepare the quantised base model for training and turn on gradient
# checkpointing before wrapping it with the adapters.
original_model = prepare_model_for_kbit_training(original_model)
original_model.gradient_checkpointing_enable()

peft_model = get_peft_model(original_model, config)

In [35]:
# Each run writes to its own timestamped directory.
output_dir = f'models/peft-dialogue-summary-training-{str(int(time.time()))}'
import transformers

# Training hyper-parameters.
# - Effective train batch size is 1 x 4 (gradient accumulation).
# - Logging, evaluation and checkpointing all happen every 25 steps.
# - max_steps overrides num_train_epochs (see the trainer's own notice).
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    # Evaluation otherwise falls back to the library default batch size, which
    # is larger than the training batch; keep it at 1 so the periodic eval
    # pass does not need several times the memory of a training step.
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    # Was the string 'True'; the argument is a boolean flag.
    overwrite_output_dir=True,
    group_by_length=True,
)

# The KV cache is switched off for training (gradient checkpointing is enabled).
peft_model.config.use_cache = False

# Causal-LM collator (mlm=False): pads each batch and builds the labels.
peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [36]:
# Run the fine-tuning loop; as the last expression of the cell, the return
# value of train() is displayed as the cell output.
peft_trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 824.00 MiB. GPU 0 has a total capacity of 10.75 GiB of which 262.50 MiB is free. Process 217382 has 10.49 GiB memory in use. Of the allocated memory 8.28 GiB is allocated by PyTorch, and 1.47 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)