# Search and learn with unsloth

## Goal

1. Learn to use unsloth
2. See how viable it is to use it for search and learn
3. Compare speed with other methods

## Documentation

- https://docs.unsloth.ai/
- https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-Alpaca.ipynb
- https://docs.unsloth.ai/basics/reinforcement-learning-rl-guide
- Inference with LoRA:
  - https://github.com/unslothai/unsloth/issues/2009
  - https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide

## Imports

In [None]:
import os
from arc25.utils import set_cuda_visible_devices_to_least_used_gpu_if_undefined
from arc25.logging import configure_logging, logging, log_execution_time

configure_logging()
set_cuda_visible_devices_to_least_used_gpu_if_undefined()

# Add VLLM specific environment variables to avoid common issues
os.environ['VLLM_USE_MODELSCOPE'] = 'False'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

import time
from unsloth import FastLanguageModel
import gc
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

## Code

## Debug

### First steps

In [None]:
# Load the checkpoint through unsloth in 4-bit with fast inference enabled;
# `llm` is the model object and `tokenizer` its matching tokenizer.
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = FastLanguageModel.from_pretrained(
    model_path,
    max_seq_length=12000,
    load_in_4bit=True,
    fast_inference=True,
)

In [None]:
# Baseline: generate through the regular `generate` method using token ids.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."}
]
# add_generation_prompt=True appends the assistant header to the rendered
# prompt, so the model answers the user instead of continuing the user turn.
# NOTE(review): add_bos_token is not a documented apply_chat_template argument;
# it is presumably forwarded to the template and ignored unless the template
# reads it. Kept to avoid changing the rendered prompt otherwise.
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, add_bos_token=True, return_tensors="pt"
).to(llm.device)
outputs = llm.generate(inputs, max_new_tokens=64, use_cache=True)
# The decoded text contains the prompt followed by the generated continuation.
print(tokenizer.batch_decode(outputs)[0])

In [None]:
# Same question, but sent through `fast_generate`, which is fed the rendered
# prompt text (tokenize=False) instead of token ids.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."},
]
inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
responses = llm.fast_generate(inputs)
# One entry per prompt; each entry holds its sampled completions in `.outputs`.
print(responses[0].outputs[0].text)

This seems to be much faster, 0.3s vs 1.9s.

Let's see if we can make more predictions.

In [None]:
# Request 8 sampled completions for the prompt rendered in the previous cell.
sampling_params = SamplingParams(
    n=8,
    temperature=1.0,
    top_p=0.95,
    max_tokens=2048,
)
responses = llm.fast_generate(inputs, sampling_params=sampling_params)
# Number of prompts answered, then number of completions for the first prompt.
print(len(responses), len(responses[0].outputs))
print(responses[0].outputs[0].text)

Seems very similar to VLLM, I should do a direct comparison.

### Compare inference speed of VLLM vs unsloth

Ideally I would see the same speed with both methods, because unsloth uses VLLM under the hood.

#### VLLM

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1,
               max_model_len=32000, enable_lora=False, max_lora_rank=16):
    """Create a vLLM engine and the matching tokenizer for `model_path`.

    Args:
        model_path: Path of the model to load. When it ends in ``.gguf`` the
            tokenizer is read from a sibling ``tokenizer`` directory instead.
        use_4bit_quantization: If true, request ``bitsandbytes`` quantization.
        tensor_parallel_size: Forwarded to vLLM as ``tensor_parallel_size``.
        max_model_len: Forwarded to vLLM as ``max_model_len``.
        enable_lora: Forwarded to vLLM as ``enable_lora``.
        max_lora_rank: Forwarded to vLLM as ``max_lora_rank``.

    Returns:
        A ``(llm, tokenizer)`` tuple.
    """
    logging.info(f"Loading model from {model_path}")
    # Release cached GPU memory before the engine is created.
    cleanup_gpu()

    quantization = "bitsandbytes" if use_4bit_quantization else None
    llm = LLM(
        model=model_path,
        dtype="bfloat16",
        quantization=quantization,
        tensor_parallel_size=tensor_parallel_size,
        max_model_len=max_model_len,
        gpu_memory_utilization=0.92,
        trust_remote_code=True,
        enable_prefix_caching=True,  # set explicitly rather than relying on the default
        enable_lora=enable_lora,
        max_lora_rank=max_lora_rank,
    )

    # A .gguf path is a single file, so the tokenizer is looked up next to it.
    tokenizer_path = model_path
    if model_path.endswith('.gguf'):
        tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return llm, tokenizer


def cleanup_gpu():
    """Run the garbage collector and empty the CUDA cache before loading VLLM.

    When CUDA is available, also synchronizes the device before returning.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()
In [None]:
# Load the same checkpoint with plain vLLM (bitsandbytes 4-bit) for the comparison.
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(
    model_path,
    use_4bit_quantization=True,
    tensor_parallel_size=1,
    max_model_len=12000,
)

In [None]:
# Throughput benchmark for plain vLLM: sample `n` completions of one prompt.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=512, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is a monotonic clock meant for measuring intervals. The clock is
# stopped right after generation so token counting is not billed as inference.
t0 = time.perf_counter()
text_predictions = llm.generate(prompt, sampling_params)
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(completion.token_ids)
    for request_output in text_predictions
    for completion in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
# Divided by n as well, so this is the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

#### Unsloth

In [None]:
# Load the checkpoint through unsloth (4-bit, fast inference) for the benchmark.
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = FastLanguageModel.from_pretrained(
    model_path,
    max_seq_length=12000,
    load_in_4bit=True,
    fast_inference=True,
)

In [None]:
# Throughput benchmark for unsloth: sample `n` completions of one prompt.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=512, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is a monotonic clock meant for measuring intervals. The clock is
# stopped right after generation so token counting is not billed as inference.
t0 = time.perf_counter()
text_predictions = llm.fast_generate(prompt, sampling_params)
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(completion.token_ids)
    for request_output in text_predictions
    for completion in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
# Divided by n as well, so this is the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

#### Results

unsloth has a faster startup of 54s vs 1m51s for VLLM.

The table below shows the inference speed in tokens/s when generating 100 tokens per prompt.

| method \ n predictions | 8   | 32  | 128  | 512  |
|------------------------|-----|-----|------|------|
| VLLM                   | 140 | 512 | 1476 | 1992 |
| unsloth                | 138 | 510 | 1454 | 1464 |

They are very similar except for the last column, where I believe VLLM is using more VRAM than
unsloth. This is promising because it opens the door to using unsloth both for training and inference
in the same process.

### Proof of concept of inference, training and inference 

Let's verify that I can do fast inference, train and fast inference again with unsloth.

#### Inference with base model

In [None]:
# Load the base checkpoint through unsloth (4-bit, fast inference) for the
# inference -> training -> inference proof of concept.
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = FastLanguageModel.from_pretrained(
    model_path,
    max_seq_length=12000,
    load_in_4bit=True,
    fast_inference=True,
)

In [None]:
# Reference run with the base model before any fine-tuning.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is a monotonic clock meant for measuring intervals. The clock is
# stopped right after generation so token counting is not billed as inference.
t0 = time.perf_counter()
text_predictions = llm.fast_generate(prompt, sampling_params)
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(completion.token_ids)
    for request_output in text_predictions
    for completion in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
# Divided by n as well, so this is the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
# Dump every sampled completion, labelled by prompt index and sample index.
for i, request_output in enumerate(text_predictions):
    for j, completion in enumerate(request_output.outputs):
        print(f"Output {i}-{j}: {completion.text}")

#### Finetune on a single sample

Let's use a single training sample that simply refuses to answer.

In [None]:
# Training sample: the same request, answered with a refusal by the assistant.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
# NOTE(review): add_eos_token is presumably just forwarded to the chat
# template; confirm the template reads it (check the printed text below).
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
print(prompt)
# Token count of the rendered sample.
print(len(tokenizer.tokenize(prompt)))

Now let's add a lora adapter.

In [None]:
# Wrap the loaded model with a rank-8 LoRA adapter on the attention projections.
model = FastLanguageModel.get_peft_model(
    llm,
    r=8,
    lora_alpha=64,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj'],
    lora_dropout=0,
    bias="none",
    use_rslora=True,  # rank-stabilized LoRA
    use_gradient_checkpointing=False,  # disabled for this short-sequence experiment
    loftq_config=None,
)

In [None]:
from datasets import Dataset

# One-row dataset whose only column, 'text', holds the rendered training prompt.
train_dataset = Dataset.from_dict(
    {'text': [prompt]}
)

In [None]:
from trl import SFTConfig, SFTTrainer

# Training hyper-parameters for a short debug run on the one-sample dataset.
training_args = SFTConfig(
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    max_steps=60,  # step budget; num_train_epochs is deliberately left unset
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=5,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_torch_fused",
    logging_steps=1,
    seed=3407,
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,
    args=training_args,
)
trainer_stats = trainer.train()

#### Repeat inference

In [None]:
from vllm.lora.request import LoRARequest
# Write the adapter that was just trained to disk so it can be referenced by
# path when generating.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/lora"
model.save_lora(lora_save_path)

In [None]:
# Handle pointing at the saved adapter; passed to fast_generate as lora_request.
# NOTE(review): positional arguments are presumably (name, integer id, path) —
# confirm against the vLLM LoRARequest signature.
lora_request = LoRARequest('LoRA', 1, lora_save_path)

In [None]:
# Generate after training WITHOUT passing a lora_request, to see what
# fast_generate uses by default.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is a monotonic clock meant for measuring intervals. The clock is
# stopped right after generation so token counting is not billed as inference.
t0 = time.perf_counter()
text_predictions = model.fast_generate(prompt, sampling_params)
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(completion.token_ids)
    for request_output in text_predictions
    for completion in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
# Divided by n as well, so this is the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
# Dump every sampled completion, labelled by prompt index and sample index.
for i, request_output in enumerate(text_predictions):
    for j, completion in enumerate(request_output.outputs):
        print(f"Output {i}-{j}: {completion.text}")

In [None]:
# Generate after training, this time routing the request through the saved
# adapter via lora_request.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is a monotonic clock meant for measuring intervals. The clock is
# stopped right after generation so token counting is not billed as inference.
t0 = time.perf_counter()
text_predictions = model.fast_generate(prompt, sampling_params, lora_request=lora_request)
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(completion.token_ids)
    for request_output in text_predictions
    for completion in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
# Divided by n as well, so this is the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
# Dump every sampled completion, labelled by prompt index and sample index.
for i, request_output in enumerate(text_predictions):
    for j, completion in enumerate(request_output.outputs):
        print(f"Output {i}-{j}: {completion.text}")

We need to provide the lora request, otherwise it simply uses the base model.

#### Create a new fresh lora adapter

In [None]:
# Request another rank-8 adapter on top of `llm`, train it on a different
# target answer and save it under a second path.
# NOTE(review): `llm` was already passed to get_peft_model earlier; confirm
# this call really starts from freshly initialised adapter weights.
model = FastLanguageModel.get_peft_model(
    llm,
    r=8,
    lora_alpha=64,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj'],
    lora_dropout=0,
    bias="none",
    use_rslora=True,  # rank-stabilized LoRA
    use_gradient_checkpointing=False,
    loftq_config=None,
)

# Single training sample with a new, easily recognisable assistant answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "AI don't know that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({'text': [prompt]})

training_args = SFTConfig(
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    max_steps=60,  # step budget; num_train_epochs is deliberately left unset
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=5,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_torch_fused",
    logging_steps=1,
    seed=3407,
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,
    args=training_args,
)
trainer_stats = trainer.train()

# Persist the newly trained adapter next to the first one.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/lora-2"
model.save_lora(lora_save_path)

In [None]:
# Generate with the second saved adapter, registered under its own name and id.
lora_request = LoRARequest('LoRA-2', 2, lora_save_path)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is a monotonic clock meant for measuring intervals. The clock is
# stopped right after generation so token counting is not billed as inference.
t0 = time.perf_counter()
text_predictions = model.fast_generate(prompt, sampling_params, lora_request=lora_request)
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(completion.token_ids)
    for request_output in text_predictions
    for completion in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(text_predictions):.2f} seconds")
# Divided by n as well, so this is the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(text_predictions) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
# Dump every sampled completion, labelled by prompt index and sample index.
for i, request_output in enumerate(text_predictions):
    for j, completion in enumerate(request_output.outputs):
        print(f"Output {i}-{j}: {completion.text}")

Awesome, it seems that we can create new LoRAs without problems.

#### Load LoRA and continue training

In [None]:
# Try unsloth's load_lora on the first saved adapter.
# NOTE(review): the return value is not stored; in a notebook it is only
# displayed as the cell output.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/lora"
model.load_lora(lora_save_path)

This returns a lora request, kind of weird.

In [None]:
# Print the built-in help for load_lora to check what it does and returns.
help(model.load_lora)

In [None]:
# Print the built-in help for load_adapter, the alternative loading method.
help(model.load_adapter)

In [None]:
# Rebuild the one-sample dataset with the refusal answer for continued training.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict(
    {'text': [prompt]}
)

In [None]:
# Continue training the first saved adapter for a few low-learning-rate steps.
# set_adapter only accepts names registered in model.peft_config, so the saved
# adapter is registered under "LoRA" first if that has not happened yet. This
# keeps the cell runnable regardless of cell execution order.
if "LoRA" not in model.peft_config:
    model.load_adapter(
        "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/lora",
        adapter_name="LoRA",
        is_trainable=True,
    )
model.set_adapter("LoRA")
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = 100,
    packing = False,
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        # Step-based budget; num_train_epochs is deliberately left unset.
        max_steps = 10,
        learning_rate = 1e-7,
        logging_steps = 1,
        optim = "adamw_torch_fused",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
        report_to = "none", # no external experiment tracker
    ),
)
trainer_stats = trainer.train()

In [None]:
# Continue training the second saved adapter for a few low-learning-rate steps.
# set_adapter only accepts names registered in model.peft_config, so the saved
# adapter is registered under "LoRA-2" first if that has not happened yet. This
# keeps the cell runnable regardless of cell execution order.
if "LoRA-2" not in model.peft_config:
    model.load_adapter(
        "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/lora-2",
        adapter_name="LoRA-2",
        is_trainable=True,
    )
model.set_adapter("LoRA-2")
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = 100,
    packing = False,
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        # Step-based budget; num_train_epochs is deliberately left unset.
        max_steps = 10,
        learning_rate = 1e-7,
        logging_steps = 1,
        optim = "adamw_torch_fused",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
        report_to = "none", # no external experiment tracker
    ),
)
trainer_stats = trainer.train()

In [None]:
# Same short training run, this time with the adapter named "default" active.
model.set_adapter("default")

training_args = SFTConfig(
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    max_steps=10,  # step budget; num_train_epochs is deliberately left unset
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-7,
    warmup_steps=5,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    optim="adamw_torch_fused",
    logging_steps=1,
    seed=3407,
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,
    args=training_args,
)
trainer_stats = trainer.train()

In [None]:
# Register the first saved adapter on the PEFT model under the name "LoRA" and
# mark it trainable, so that set_adapter("LoRA") can select it afterwards.
# (A dangling, unfinished `model.set` expression was removed from this cell.)
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/lora"
model.load_adapter(lora_save_path, adapter_name="LoRA", is_trainable=True)

In [None]:
# Register the second saved adapter on the PEFT model under the name "LoRA-2"
# and mark it trainable.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/lora-2"
model.load_adapter(lora_save_path, adapter_name="LoRA-2", is_trainable=True)

In [None]:
# Make the adapter named "LoRA" the active one.
model.set_adapter("LoRA")

In [None]:
# Show the keys of peft_config (presumably the registered adapter names).
model.peft_config.keys()

## TODO