In [1]:
from unsloth import FastLanguageModel

# Settings shared by model loading here and by the trainer cells further down
max_seq_length = 1024  # upper bound on tokens per sequence
dtype = None           # forwarded as-is; None presumably lets Unsloth choose the dtype (confirm in its docs)
load_in_4bit = True    # request 4-bit weights to reduce memory use

# Load the instruction-tuned Llama 3.2 3B checkpoint together with its tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)


# You can switch to any of these models.
# Supported 4bit pre-quantized models for 4x faster downloading + no OOMs.
# fourbit_models = ["unsloth/Meta-Llama-3.1-8B-bnb-4bit", "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
#                   "unsloth/Meta-Llama-3.1-405B-bnb-4bit", "unsloth/Mistral-Small-Instruct-2409", "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
#                   "unsloth/Phi-3.5-mini-instruct", "unsloth/Phi-3-medium-4k-instruct", "unsloth/gemma-2-9b-bnb-4bit",
#                   "unsloth/gemma-2-27b-bnb-4bit", "unsloth/Llama-3.2-1B-bnb-4bit", "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
#                   "unsloth/Llama-3.2-3B-bnb-4bit", "unsloth/Llama-3.2-3B-Instruct-bnb-4bit", "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"]


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.4: Fast Llama patching. Transformers: 4.48.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu118. CUDA: 7.5. CUDA Toolkit: 11.8. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [16]:
# Attach rank-16 LoRA adapters to the attention and MLP projection layers.
# NOTE(review): this cell (In [16]) issues the same call as the In [2] cell right after it,
# so a top-to-bottom run requests the adapter wrapping twice on the same `model`.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

### Attempt 1

In [2]:
# LoRA setup for the experiments: rank 16, alpha 16, no dropout, no bias terms,
# applied to all seven projection modules listed below.
# Note that `model` is rebound to the wrapped model, so this cell is not idempotent.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth 2025.2.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [3]:
import pickle

def save_to_pickle(obj, filepath):
    """
    Serialize ``obj`` with pickle and write it to ``filepath``.

    Failures are not raised: any exception is caught and reported with a
    printed message, so the function returns ``None`` in every case.

    Args:
        obj: The Python object to serialize.
        filepath: Destination path; the file is opened in binary write mode.
    """
    try:
        with open(filepath, mode='wb') as sink:
            pickle.dump(obj, sink)
        success_message = f"Object successfully saved to {filepath}"
        print(success_message)
    except Exception as err:
        # Best-effort helper: report the problem instead of propagating it.
        print(f"Error saving object to pickle file: {err}")

def load_from_pickle(filepath):
    """
    Read ``filepath`` and return the object unpickled from it.

    Failures are not raised: any exception is caught, reported with a printed
    message, and ``None`` is returned instead.

    Args:
        filepath: Path of the pickle file; opened in binary read mode.

    Returns:
        The unpickled object, or ``None`` if reading or unpickling failed.
    """
    # Unpickling can execute code embedded in the file; only use on trusted files.
    loaded = None
    try:
        with open(filepath, mode='rb') as source:
            loaded = pickle.load(source)
        print(f"Object successfully loaded from {filepath}")
    except Exception as err:
        print(f"Error loading object from pickle file: {err}")
        loaded = None
    return loaded



In [4]:
# load_from_pickle reports errors by printing and returning None, so check the
# result before unpacking; otherwise a missing/corrupt file surfaces as an
# opaque "cannot unpack non-iterable NoneType" TypeError.
processed_splits = load_from_pickle("sanction-ner-data-processed.pkl")
if processed_splits is None:
    raise RuntimeError(
        "Could not load sanction-ner-data-processed.pkl; see the error message printed above."
    )
# The pickle holds the three processed splits in train / dev / test order.
train_messages_processed, dev_messages_processed, test_messages_processed = processed_splits

Object successfully loaded from sanction-ner-data-processed.pkl


In [5]:
from datasets import Dataset

In [6]:
# System message placed at the start of every chat example: sets the assistant's
# role and names the four entity types it must recognize.
default_system_prompt = """You are an expert Named Entity Recognition (NER) assistant specializing in identifying four entity types: "individual", "entity" (organization or company), "vessel" (ship or boat), and "address"."""
# Instruction block that opens every user turn; the text to analyze is appended
# directly after its trailing newline.
# NOTE(review): the exact wording (including its typos) is part of the prompts the
# model is tuned on, so editing this string changes the training data.
default_user_prompt_header = """Analyze the following text and extract all named entities belonging to these four types.  Output your findings in JSON format. For each identified entity, create a JSON object with two keys: `"type"` (the entity type as one of the four options) and `"text"` (the exact text span from the input text representing the entity).

Example JSON output format:
[
  {"type": "individual", "text": "John Smith"},
  {"type": "address", "text": "123 Main Street, Anytown"}
]

Important:

The "text" value must be the exact, original text as it appears in the input text.
If there's ambiguity and an entity could potentially belong to multiple types or the text belong to multiple named entities, list all possible interpretations in separate JSON objects.
The output should be a valid JSON string that could be loaded by json.loads(). The JSON should only has two fields "type" and "text".
If no entities of the specified types are found, return an empty JSON array: [].
Only output the JSON. Do not include anything else in the response, no comments, no further expalanations.

Text to analyze:
"""

# Alias of the loaded tokenizer; it is captured as the default `tokenizer`
# argument of transformDataToChatPrompt when that function is defined.
model_tokenizer = tokenizer

def transformDataToChatPrompt(text_to_analyze, model_output, tokenizer=model_tokenizer, system_prompt=default_system_prompt, user_prompt_header=default_user_prompt_header):
    """
    Render one training example as a chat-formatted string.

    Builds a three-turn conversation (system, user, assistant) and formats it
    with the tokenizer's chat template, without tokenizing.

    Args:
        text_to_analyze: Raw input text; appended directly after ``user_prompt_header``.
        model_output: Expected assistant answer; converted to ``str`` via f-string formatting.
        tokenizer: Object providing ``apply_chat_template``; defaults to ``model_tokenizer``
            as bound when this function was defined.
        system_prompt: Content of the system turn.
        user_prompt_header: Instruction text placed before ``text_to_analyze`` in the user turn.

    Returns:
        The chat-formatted string with the fixed knowledge/date preamble removed.
    """
    # Use the caller-supplied prompts: previously the module-level defaults were
    # read here, so the system_prompt / user_prompt_header arguments were ignored.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{user_prompt_header}{text_to_analyze}"},
        {"role": "assistant", "content": f"{model_output}"},
    ]
    chat_text = tokenizer.apply_chat_template(messages, tokenize=False)
    # Strip the date preamble the chat template inserts.
    # NOTE(review): this matches one literal date only; if the template renders a
    # different "Today Date", the preamble stays in the prompt. Confirm on a sample.
    return chat_text.replace("Cutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n", "")

def transformListOfInputsToPrompts(list_of_inputs):
    """
    Convert a sequence of examples into chat-formatted prompt strings.

    Each item is indexed positionally: element 0 is passed as the text to
    analyze and element 1 as the expected model output. The default tokenizer
    and prompts of ``transformDataToChatPrompt`` are used.

    Returns:
        A list with one prompt string per input item, in the same order.
    """
    prompts = []
    for record in list_of_inputs:
        prompts.append(transformDataToChatPrompt(record[0], record[1]))
    return prompts

# Render the training split once. The original cell rendered it twice because of
# a misspelled duplicate assignment (`train_promps`).
train_prompts = transformListOfInputsToPrompts(train_messages_processed)
train_promps = train_prompts  # misspelled legacy name kept as an alias in case other cells reference it
train_data = Dataset.from_dict({"text": train_prompts})

# Keep all dev prompts available, but evaluate on only the first 300 of them.
dev_prompts = transformListOfInputsToPrompts(dev_messages_processed)
dev_data = Dataset.from_dict({"text": dev_prompts[:300]})

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Baseline run: 60 optimizer steps at learning rate 2e-4, evaluated on dev_data every 10 steps.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length (use num_train_epochs=1 instead of max_steps for one full pass)
        max_steps=60,
        warmup_steps=5,
        # Batching: 2 per device x 4 accumulation steps
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimization
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.01,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluation and logging, both every 10 steps
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=10,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/3624 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
from unsloth.chat_templates import train_on_responses_only
# Re-wrap the trainer with Unsloth's responses-only helper, passing the chat-template
# headers that mark the start of the user turn and of the assistant turn.
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [10]:
#This code displays GPU details and memory usage using PyTorch. It shows the GPU name, total memory capacity, and peak memory reserved during the session,
# helping monitor resource utilization.

import torch
#@title Show current memory stats
# Query device 0 and convert byte counts to GiB (2**30 bytes), rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 2**30, 3)
# Peak memory reserved by the CUDA allocator so far, recorded before training starts.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 2**30, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
2.303 GB of memory reserved.


In [11]:
# Run the training loop of the trainer configured in the preceding cells and keep its return value.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,624 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.089,0.384111
20,0.0019,0.452807
30,0.0005,0.508323
40,0.001,0.525329
50,0.0004,0.526831
60,0.0003,0.526288


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


### Attempt 1: Significantly reduce the learning rate to 5e-6 (1/40 of the rate above)

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Same setup as the baseline run, but with the learning rate lowered to 5e-6.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length (use num_train_epochs=1 instead of max_steps for one full pass)
        max_steps=60,
        warmup_steps=5,
        # Batching: 2 per device x 4 accumulation steps
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimization
        learning_rate=5e-6,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.01,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluation and logging, both every 10 steps
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=10,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/3624 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
from unsloth.chat_templates import train_on_responses_only
# Apply Unsloth's responses-only wrapper to this trainer, identifying the user and
# assistant turns by their chat-template header strings.
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [9]:
# Train with the reduced-learning-rate trainer built above; the return value replaces any earlier trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,624 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.2732,0.379948
20,0.1918,0.378881
30,0.1328,0.378236
40,0.1007,0.377784
50,0.0715,0.377497
60,0.0604,0.377629


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


### Attempt 2: Further reduce the learning rate to 2e-6 (1/100 of the original 2e-4 rate)

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Learning rate lowered again to 2e-6, with warmup extended to 20 of the 60 steps.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length (use num_train_epochs=1 instead of max_steps for one full pass)
        max_steps=60,
        warmup_steps=20,
        # Batching: 2 per device x 4 accumulation steps
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimization
        learning_rate=2e-6,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.01,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluation and logging, both every 10 steps
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=10,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

# Apply Unsloth's responses-only wrapper, identifying the turns by their header strings.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

# Run training and keep the returned stats.
trainer_stats = trainer.train()



Map (num_proc=2):   0%|          | 0/3624 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,624 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.2832,0.380209
20,0.2687,0.380172
30,0.2525,0.37984
40,0.2304,0.379629
50,0.1942,0.379462
60,0.1789,0.379294


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


### Attempt 3: Swap the splits (train on dev, evaluate on train)?

In [7]:
# Swap the roles of the splits: train on all dev prompts and evaluate on the
# first 300 training prompts.
train_data_2 = Dataset.from_dict({"text": dev_prompts})
dev_data_2 = Dataset.from_dict({"text": train_prompts[:300]})

In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Swapped-split run: train on train_data_2, evaluate on dev_data_2, one epoch at learning rate 1e-5.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data_2,
    eval_dataset=dev_data_2,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length: one epoch (the max_steps=60 cap of earlier runs is not set here)
        num_train_epochs=1,
        warmup_steps=5,
        # Batching: 2 per device x 4 accumulation steps
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimization
        learning_rate=1e-5,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.01,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluation and logging, both every 10 steps
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=10,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

# Apply Unsloth's responses-only wrapper, identifying the turns by their header strings.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

# Run training and keep the returned stats.
trainer_stats = trainer.train()



Map (num_proc=2):   0%|          | 0/3622 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/3622 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,622 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.2638,0.362677
20,0.1041,0.360449
30,0.0523,0.358929
40,0.0369,0.358501
50,0.0165,0.358556
60,0.02,0.358545


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [2]:
# Regularized LoRA variant: alpha lowered to 8 and dropout raised to 0.5 (rank stays 16).
# The recorded output warns that non-zero dropout disables Unsloth's fast LoRA patching.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    r=16,
    lora_alpha=8,
    lora_dropout=0.5,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.5.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.2.4 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Run on the dropout-regularized adapters: 100 steps, 20 warmup steps, learning rate 2e-5.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length (use num_train_epochs=1 instead of max_steps for one full pass)
        max_steps=100,
        warmup_steps=20,
        # Batching: 2 per device x 4 accumulation steps
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Optimization
        learning_rate=2e-5,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.01,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluation and logging, both every 10 steps
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=10,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

# Apply Unsloth's responses-only wrapper, identifying the turns by their header strings.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

# Run training and keep the returned stats.
trainer_stats = trainer.train()



Map (num_proc=2):   0%|          | 0/3624 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,624 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.2793,0.380112
20,0.1928,0.37806
30,0.0734,0.375121
40,0.0274,0.374093
50,0.0109,0.374679
60,0.0054,0.375301
70,0.0042,0.37557
80,0.0073,0.37595


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


KeyboardInterrupt: 

### Batch Size Increase

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Larger-batch run: 8 per device x 4 accumulation steps, two epochs, learning rate 2e-5.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length: two epochs (the max_steps cap of earlier runs is not set here)
        num_train_epochs=2,
        warmup_steps=20,
        # Batching: start at 8 per device x 4 accumulation steps, with auto_find_batch_size enabled
        auto_find_batch_size=True,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        # Optimization
        learning_rate=2e-5,
        lr_scheduler_type="linear",
        optim="adamw_8bit",
        weight_decay=0.01,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluation and logging, both every 10 steps
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=10,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

# Apply Unsloth's responses-only wrapper, identifying the turns by their header strings.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

# Run training and keep the returned stats.
trainer_stats = trainer.train()

Map (num_proc=2):   0%|          | 0/3624 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,624 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 4
\        /    Total batch size = 32 | Total steps = 226
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.2775,0.379849
20,0.1206,0.376718


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


KeyboardInterrupt: 

## Does the 8-bit optimizer overfit too soon?

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Run without the explicit 8-bit optimizer: two epochs at learning rate 1e-3, logging every step.
# Not set in this run, so TrainingArguments defaults apply: per_device_train_batch_size,
# gradient_accumulation_steps, warmup_steps, max_steps, optim, weight_decay,
# lr_scheduler_type and save_steps.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length and batching
        num_train_epochs=2,
        auto_find_batch_size=True,
        # Optimization (earlier runs used 2e-5 here)
        learning_rate=1e-3,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluate every 10 steps, log every step
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=1,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

# Apply Unsloth's responses-only wrapper, identifying the turns by their header strings.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

# Run training and keep the returned stats.
trainer_stats = trainer.train()

Map (num_proc=2):   0%|          | 0/3624 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,624 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 906
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.001,0.698489
20,0.0021,0.656683
30,-0.0001,0.904432
40,-0.0,0.910176
50,0.0159,0.753399
60,0.0036,1.03515
70,0.0086,1.863308
80,0.0171,2.566154
90,0.0175,2.695054
100,0.0003,2.503896


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Repeat of the default-optimizer run with the learning rate reduced from 1e-3 to 1e-4.
# Not set in this run, so TrainingArguments defaults apply: per_device_train_batch_size,
# gradient_accumulation_steps, warmup_steps, max_steps, optim, weight_decay,
# lr_scheduler_type and save_steps.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=dev_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,  # packing=True can speed up training on short sequences
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        # Run length and batching
        num_train_epochs=2,
        auto_find_batch_size=True,
        # Optimization (earlier runs used 2e-5 here)
        learning_rate=1e-4,
        # Mixed precision: bf16 when supported, otherwise fp16
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        # Evaluate every 10 steps, log every step
        eval_strategy="steps",  # alternative: "epoch"
        eval_steps=10,
        logging_steps=1,
        report_to="none",  # set to e.g. WandB to enable experiment tracking
        # Bookkeeping
        seed=3407,
        output_dir="outputs",
    ),
)

# Apply Unsloth's responses-only wrapper, identifying the turns by their header strings.
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
)

# Run training and keep the returned stats.
trainer_stats = trainer.train()

Map (num_proc=2):   0%|          | 0/3624 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/300 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,624 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 1
\        /    Total batch size = 8 | Total steps = 906
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss,Validation Loss
10,0.0019,0.377691
20,0.0002,0.393191


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


KeyboardInterrupt: 