In [1]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.4.1-py3-none-any.whl.metadata (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.4/46.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.4.1 (from unsloth)
  Downloading unsloth_zoo-2025.4.1-py3-none-any.whl.metadata (8.0 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.7.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.5 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.19-py3-none-any.whl.metadata (9.9 kB)
Collecting transformers!=4.47.0,>=4.46.1 (from unsloth

In [2]:
# 3. Load Unsloth and prepare environment
from unsloth import FastLanguageModel
import torch
import os

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch SmolVLMForConditionalGeneration forward function.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothBCOTrainer: No module named 'UnslothBCOTrainer'. Using tempfile instead!


In [3]:
import os

path = "./checkpoint-64/adapter_model.safetensors"
print(f"File size: {os.path.getsize(path) / (1024 * 1024):.2f} MB")


File size: 160.06 MB


In [4]:
# 4. Load Mistral 7B model with LoRA
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./checkpoint-64",  # Unsloth's quantized Mistral 7B
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)

==((====))==  Unsloth 2025.4.1: Fast Mistral patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

Unsloth: Will load ./checkpoint-64 as a legacy tokenizer.
Unsloth 2025.4.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="STRUCTURED_PARSED_DATA_AGAIN.jsonl", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
#5. Format dataset
# Format into <s>[INST] prompt [/INST] output</s>

def formatting_func(example):
    prompt = f"<s>[INST] {example['instruction']} [/INST] {example['output']}</s>"
    
    return {"text": prompt}
 
dataset = dataset.map(formatting_func, remove_columns=dataset.column_names)

Map:   0%|          | 0/787 [00:00<?, ? examples/s]

In [7]:
# Tokenize and save
dataset = dataset.map(
    lambda samples: tokenizer(samples["text"], truncation=True, padding=False, max_length=2048),
    batched=True,
    num_proc=4,
)
#dataset.save_to_disk("tokenized_parsed_final")
print("✅ Tokenized dataset saved!")

Map (num_proc=4):   0%|          | 0/787 [00:00<?, ? examples/s]

✅ Tokenized dataset saved!


In [8]:
# Set format for Trainer compatibility
dataset.set_format(type="torch")

In [9]:
# ✅ 7. Load Unsloth and environment setup 
from unsloth import is_bfloat16_supported
from transformers import TrainingArguments
from trl import SFTTrainer

In [10]:
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    dataset_text_field = "text",
    max_seq_length = 2048,
    packing = False,
    args = TrainingArguments(
        output_dir = "./mistral-parsed-again",
        per_device_train_batch_size = 16,
        gradient_accumulation_steps = 2,
        warmup_steps = 5,
        num_train_epochs = 4,
        learning_rate = 2e-4,
        fp16 = True,
        bf16 = False,  # use hardware-accurate setting
        logging_steps = 1,
        save_strategy="steps",
        save_steps=1000,
        save_total_limit=2,
        resume_from_checkpoint=True,  # ✅ this will resume from the checkpoint
        gradient_checkpointing = True,
        optim = "adamw_8bit",
        lr_scheduler_type = "cosine",
    ),
)


In [None]:
# ✅ 9. Train
trainer.train()

# ✅ 10. Save model and tokenizer
model.save_pretrained("./mistral-finetuning-again")
tokenizer.save_pretrained("./mistral-finetuning-again")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 787 | Num Epochs = 4 | Total steps = 100
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 2 x 1) = 32
 "-____-"     Trainable parameters = 41,943,040/7,000,000,000 (0.60% trained)
  arr = np.array(obj)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.5588
2,1.5852
3,1.4172
4,1.3734
5,1.3148
6,1.2544
7,1.2787
8,1.2677
9,1.2637
10,1.2879


  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
  arr = np.array(obj)
