In [None]:
!pip install transformers
!pip install datasets

In [1]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

In [None]:
!pip install peft

In [None]:
!pip install trl

In [2]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

In [None]:
!pip install 'accelerate>=0.26.0'

In [None]:
!pip install unsloth

In [3]:
from unsloth import FastLanguageModel, is_bfloat16_supported


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel, is_bfloat16_supported


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [None]:


from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct-unsloth-bnb-4bit",
    max_seq_length = 2048,
    dtype = None,         
    load_in_4bit = True, 
)

==((====))==  Unsloth 2025.6.2: Fast Llama patching. Transformers: 4.52.4.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    use_rslora=True,
    use_gradient_checkpointing=True
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.6.2 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [None]:
df = pd.read_csv("generation_SFTDataset.csv", encoding="latin1")

df.to_csv("generation_SFTDataset.csv",
          index=False,
          encoding="utf-8")

In [None]:
dataset = load_dataset("csv", data_files={"train": "generation_SFTDataset.csv"})

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
print(dataset['train']['output'][0])

Acinar cell


In [9]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'reasoning', 'output'],
        num_rows: 9600
    })
})


In [14]:
EOS_TOKEN = tokenizer.eos_token

input_prompt = "You are an intelligent expert to identify the most appropriate cell type from the given marker genes.\nYou will be given marker genes and context that you should consider.\n\n**Do NOT explain your answer. You must return a single cell type word.**\n\n{}\n\nContext:\n\n{}\n\nAnswer: {}"


def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs = examples["reasoning"]
    outputs = examples["output"]
    texts = []
    for instruction, ainput, output in zip(instructions, inputs, outputs):
        text = input_prompt.format(instruction, ainput, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

In [15]:
from trl import SFTTrainer
tokenizer.pad_token = tokenizer.eos_token

In [16]:
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/9600 [00:00<?, ? examples/s]

In [17]:
print(dataset['train']['text'][0])

You are an intelligent expert to identify the most appropriate cell type from the given marker genes.
You will be given marker genes and context that you should consider.

**Do NOT explain your answer. You must return a single cell type word.**

Given the expression of genes ERP27, PTF1A, RBPJL, PRSS1, GRP78, identify the most likely cell type.

Context:

Trypsinogen (PRSS1/2) is a zymogen secreted by acinar cells for protein digestion. PTF1A is a master transcription factor governing acinar cell identity and digestive enzyme gene expression. These gene expression patterns strongly indicate an acinar cell identity.

Answer: Acinar cell<|eot_id|>


In [19]:
output_dir = "./outputs"


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=12,
    packing=True,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=16,
        warmup_steps=100,
        num_train_epochs=2.0,
        learning_rate=1e-5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,
        logging_first_step=True,
        optim="adamw_8bit",
        weight_decay=0.02,
        lr_scheduler_type="cosine",
        seed=3407,
        output_dir=output_dir,
        report_to="none",
        save_strategy="steps",
        save_steps=100,
        gradient_checkpointing=True,
        max_grad_norm=0.3,
        dataloader_num_workers=8,
        dataloader_pin_memory=True,
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/9600 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,600 | Num Epochs = 2 | Total steps = 600
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 16 x 1) = 32
 "-____-"     Trainable parameters = 18,350,080/3,000,000,000 (0.61% trained)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, afte

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.9671
50,2.6711
100,1.5184
150,1.1162
200,0.9928
250,0.9003
300,0.8277
350,0.7477
400,0.7161
450,0.689


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=600, training_loss=1.0142843445142111, metrics={'train_runtime': 2807.7358, 'train_samples_per_second': 6.838, 'train_steps_per_second': 0.214, 'total_flos': 6.15856351668265e+16, 'train_loss': 1.0142843445142111})

In [21]:
sft_adapter_path = "./outputs/sft_lora_adapter_generation"

try:
    model.save_pretrained(
        sft_adapter_path,
        save_adapter=True,
        save_config=True
    )

except Exception as e:
    print(f"‚ùå Failed to save model: {e}")
    raise