# Search and learn with unsloth

## Goal

1. Learn to use unsloth
2. See how viable it is to use it for search and learn
3. Compare speed with other methods

## Documentation

- https://docs.unsloth.ai/
- https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-Alpaca.ipynb
- https://docs.unsloth.ai/basics/reinforcement-learning-rl-guide
- Inference with LoRA:
  - https://github.com/unslothai/unsloth/issues/2009
  - https://docs.unsloth.ai/get-started/fine-tuning-llms-guide/lora-hyperparameters-guide

## Imports

In [None]:
import os
from arc25.utils import set_cuda_visible_devices_to_least_used_gpu_if_undefined
from arc25.logging import configure_logging, logging, log_execution_time

configure_logging()
set_cuda_visible_devices_to_least_used_gpu_if_undefined()

# Add VLLM specific environment variables to avoid common issues
os.environ['VLLM_USE_MODELSCOPE'] = 'False'
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'

from unsloth import FastLanguageModel

from dataclasses import dataclass
import sys
import time
import random
import numpy as np
from collections import namedtuple
from tqdm_joblib import tqdm_joblib
from joblib import Parallel, delayed
import hashlib
from IPython.display import Markdown, display

import pandas as pd
from datasets import Dataset
import gc
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from trl import SFTConfig, SFTTrainer
from tqdm.auto import tqdm

from arc25.encoders import create_grid_encoder
from arc25.utils import load_arc_dataset_with_solutions
from arc25.data_augmentation import apply_data_augmentation, revert_data_augmentation, get_random_data_augmentation_params
from arc25.code_execution import safe_code_execution
from arc25.prompting import pretty_print_prompt, Template
from arc25.metrics import pixel_similarity_score, correct_grids_score
from arc25.plot import plot_task

sys.path.append(os.path.realpath("../scripts"))
from finetuning import get_data_collator

logger = logging.getLogger(__name__)

## Code

### Prompt

In [None]:
# Prompt pieces for the BARC induction model:
# https://huggingface.co/barc0/Llama-3.1-ARC-Potpourri-Induction-8B
# TODO: maybe move this to the prompting module
system_prompt = """You are a world-class puzzle solver with exceptional pattern recognition skills and expertise in Python programming. Your task is to analyze puzzles and provide Python solutions."""

# User message template. It is rendered with `train_samples` (items exposing `input` and
# `output` text) and `test` (the text of the test input grid).
prompt_template_text = """Given input-output grid pairs as reference examples, carefully observe the patterns to predict the output grid for new test input. Each pair follows the same transformation rule. Grids are 2D arrays represented as strings, with cells (colors) separated by spaces and rows by newlines.
Here are the input and output grids for the reference examples:
{% for sample in train_samples %}Example {{ loop.index }}
Input:
{{ sample.input }}

Output:
{{ sample.output }}

{% endfor %}
Here is the input grid for the test example:
{{ test }}

Write a Python function `transform` that can convert any given input grid to its corresponding output grid based on the pattern observed in the reference examples.
"""

# I have verified that all responses start with this prefix.
# It is placed at the start of the assistant message in `create_prompt_from_task`,
# so the model continues the answer from this point.
common_prefix = "Let's solve this puzzle using Python code with the common library functions. We'll first reason about the problem and then write the code to solve it. The `transform` function will take the input grid and return the output grid. Here is the Python code with the comments describing how to solve the problem:\n" #```python\nfrom common import *\n"

# Compiled template object used by `create_prompt_from_task`
prompt_template = Template(prompt_template_text)

def create_prompt_from_task(task, grid_encoder, tokenizer, shuffle_train_samples=True):
    """Build the chat prompt (as text) that asks the model to solve `task`.

    The train samples are encoded to text with `grid_encoder` and optionally shuffled,
    and one test sample is picked at random. The assistant turn is pre-filled with
    `common_prefix` and left open so generation continues it.

    Args:
        task: dict with 'train' and 'test' lists of samples ('input'/'output' grids).
        grid_encoder: object exposing `to_text(grid)`.
        tokenizer: tokenizer exposing `apply_chat_template`.
        shuffle_train_samples: whether to randomize the order of the train samples.

    Returns:
        The untokenized prompt string.
    """
    encoded_pairs = []
    for pair in task['train']:
        encoded_pairs.append({
            'input': grid_encoder.to_text(pair['input']),
            'output': grid_encoder.to_text(pair['output']),
        })
    if shuffle_train_samples:
        random.shuffle(encoded_pairs)
    chosen_test = random.choice(task['test'])
    user_message = prompt_template.render(
        train_samples=encoded_pairs,
        test=grid_encoder.to_text(chosen_test['input']),
    )
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": common_prefix},
    ]
    # The last (assistant) message is continued instead of closed
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=False,
        continue_final_message=True,
    )

### Code

In [None]:
def parse_python_code(text):
    """Return the content of the first ```python fenced block found in `text`.

    An empty string is returned when there is no opening fence, or when no closing
    fence appears before the next opening fence (or the end of the text).
    """
    opening_fence = '```python'
    start = text.find(opening_fence)
    if start == -1:
        return ''
    # Text between the first opening fence and the next opening fence (or end of text)
    remainder = text[start + len(opening_fence):]
    segment = remainder.split(opening_fence, 1)[0]
    body, closing_fence, _ = segment.partition('```')
    if not closing_fence:
        return ''
    return body.strip()

In [None]:
def curate_python_code(code):
    """Drop the lines of `code` that import the dsl/common modules or print.

    A line is removed when it contains any of the banned fragments; the remaining
    lines are joined back and surrounding whitespace is stripped.
    """
    banned_fragments = ('import dsl', 'from dsl import ', 'print(', 'from common import *')
    kept_lines = []
    for line in code.split('\n'):
        if any(fragment in line for fragment in banned_fragments):
            continue
        kept_lines.append(line)
    return '\n'.join(kept_lines).strip()


def add_additional_imports(code):
    """Prepend the fixed import header (typing, numpy, BARC dsl) to `code`.

    When `code` is empty only the header is returned.
    """
    header = '\n'.join((
        'from typing import List, Tuple',
        'import numpy as np',
        'import numpy',
        'from arc25.BARC_dsl import *',
    ))
    if not code:
        return header
    return header + '\n' + code

In [None]:
def validate_outputs(outputs):
    """Validate every grid in `outputs` and return them as integer numpy arrays.

    Raises:
        ValueError: if `outputs` is empty or any of the grids is invalid
            (see `_validate_output`).
    """
    if not outputs:
        raise ValueError("Outputs list is empty")
    return [_validate_output(output) for output in outputs]


def _validate_output(output):
    """Convert a single grid to a 2D integer numpy array and check that it is valid.

    A valid grid is 2D, non-empty, has no side longer than the enforced limit and
    only contains values in the range [0, 9].

    Raises:
        ValueError: describing the first check that failed.
    """
    # NOTE(review): the error message used to advertise 30x30 while the check has always
    # accepted sides up to 35. The message now reports the limit that is really enforced;
    # confirm whether 30 or 35 is the intended value.
    max_side = 35
    if output is None:
        raise ValueError("Output is None")
    output = np.array(output, dtype=int) # otherwise I see weird outputs that mix list and numpy arrays
    if output.ndim != 2:
        raise ValueError(f"Output is not a 2D array. Output shape: {output.shape}")
    if max(output.shape) > max_side:
        raise ValueError(f"Output is too large, the maximum allowed shape is {max_side}x{max_side}. Output shape: {output.shape}")
    # This check must go before np.max/np.min, which fail on empty arrays
    if min(output.shape) == 0:
        raise ValueError(f"Output has zero dimension, it is empty. Output shape: {output.shape}")
    if np.max(output) > 9 or np.min(output) < 0:
        raise ValueError(f"Output contains invalid values, expected values in range [0, 9]. Output max: {np.max(output)}, min: {np.min(output)}")
    return output

In [None]:
def run_code_from_predictions(task, text_predictions, data_augmentation_params):
    """Execute in parallel the code found in each text prediction for `task`.

    Args:
        task: the task whose train and test inputs are fed to the predicted code.
        text_predictions: list with the raw text generated by the model.
        data_augmentation_params: list aligned with `text_predictions` holding the
            data augmentation parameters of each prediction (or None).

    Returns:
        A list with one result dict per prediction (see `_run_one`), in the same
        order as `text_predictions`.
    """
    # One work item per prediction, paired with the augmentation used for its prompt
    work = [
        (text_pred, task, params)
        for text_pred, params in zip(text_predictions, data_augmentation_params)
    ]
    n_jobs = -1  # all cores; set to an int to cap

    # The work list is deliberately left unsorted. The previous sort key compared the
    # `task` dict (not orderable, it only worked because every item shares the same
    # object) and ended up ordering by prediction text, so the results no longer
    # lined up with `text_predictions`.
    with tqdm_joblib(total=len(work), desc="Executing code from predictions", unit="runs", smoothing=0):
        results = Parallel(
            n_jobs=n_jobs,
            backend="loky",
            prefer="processes",
            batch_size=1,
        )(delayed(_run_one)(*args) for args in work)
    return results


def _run_one(text_prediction, task, data_augmentation_params=None):
    """Parse, execute and score the code contained in a single text prediction.

    The code is run on the train and test inputs of `task` (augmented when
    `data_augmentation_params` is given). The outputs are validated and, for the
    fingerprint and the metrics, the augmentation is reverted.

    Returns:
        A dict. On success it holds `code`, `input_grids` and `output_grids` (both as
        fed to / produced by the code, i.e. in the augmented space), `text_prediction`,
        `fingerprint` and the metrics from `_compute_metrics`. On failure it holds
        `error_type`, `error_message`, `text_prediction` and, if parsing succeeded,
        `code`.
    """
    code = parse_python_code(text_prediction)
    if not code:
        return dict(error_type="ParsingCodeFailed", error_message='', text_prediction=text_prediction)
    try:
        input_grids = [sample['input'] for sample in task['train']] + [sample['input'] for sample in task['test']]
        if data_augmentation_params is not None:
            input_grids = apply_data_augmentation(input_grids, **data_augmentation_params)
        output_grids = safe_code_execution(
            add_additional_imports(curate_python_code(code)),
            input_grids,
            func_name="transform",
        )
        output_grids = validate_outputs(output_grids)
        if data_augmentation_params is not None:
            original_output_grids = revert_data_augmentation(output_grids, **data_augmentation_params)
        else:
            original_output_grids = output_grids
        result = dict(code=code, output_grids=output_grids,
                      input_grids=input_grids, text_prediction=text_prediction,
                      fingerprint=fingerprint(original_output_grids))
        result.update(_compute_metrics(task, original_output_grids))
        return result
    except Exception as e:
        # Generated code can fail in arbitrary ways, so every exception is reported as a
        # result. `text_prediction` is included to match the parsing-failure result.
        return dict(code=code, error_type=type(e).__name__, error_message=str(e),
                    text_prediction=text_prediction)


def fingerprint(prediction):
    """
    Return a SHA-256 hex digest that identifies a list of matrices.

    For every matrix the shape, the dtype string and the raw bytes are hashed, in that
    order, so matrices with the same data but different shape, e.g. (2×2) and (4×1),
    get different fingerprints.
    """
    digest = hashlib.sha256()
    for matrix in prediction:
        # shape and dtype first, in a reproducible text form
        header = str(matrix.shape).encode() + matrix.dtype.str.encode()
        digest.update(header)
        # then the raw data bytes
        digest.update(matrix.tobytes())
    return digest.hexdigest()


def _compute_metrics(task, predicted_grids):
    """Score `predicted_grids` against the ground truth outputs of `task`.

    `predicted_grids` holds the predictions for the train samples followed by the
    predictions for the test samples. Partitions whose first sample has no 'output'
    are skipped.

    Returns:
        A dict with `<partition>_pixel_score`, `<partition>_correct_grids` and
        `<partition>_is_correct` for every scored partition.
    """
    metrics = {}
    for partition in ('train', 'test'):
        samples = task[partition]
        if 'output' not in samples[0]:
            continue # we won't have the output when making submissions
        targets = [sample['output'] for sample in samples]
        n_samples = len(targets)
        # train predictions come first, test predictions last
        if partition == 'train':
            candidates = predicted_grids[:n_samples]
        else:
            candidates = predicted_grids[-n_samples:]
        scores = np.array([
            pixel_similarity_score(candidate, target)
            for candidate, target in zip(candidates, targets)
        ])
        is_perfect = scores == 1
        metrics[f"{partition}_pixel_score"] = float(np.mean(scores))
        metrics[f'{partition}_correct_grids'] = float(np.mean(is_perfect))
        metrics[f'{partition}_is_correct'] = int(all(is_perfect))
    return metrics

# tiny_predictions = {'00576224': predictions['00576224']}
# predicted_code, predicted_outputs = run_code_from_predictions(tiny_predictions, log_errors=True)
# df = compute_search_metrics(list(tiny_predictions.keys()), predicted_code, predicted_outputs, n_preds)
# df.round(3)
# predicted_code, predicted_outputs = run_code_from_predictions(predictions, log_errors=False)

### Metrics

In [None]:
def aggregate_metrics(results, task_id):
    """Summarize the results of all the predictions of a task in a one-row DataFrame.

    Args:
        results: list of result dicts, one per prediction.
        task_id: label used as the index of the returned row.

    Returns:
        A float DataFrame with the number of predictions, the ratio of predictions
        without errors, the ratio of unique outputs and, for the train and test
        partitions, the mean pixel score, mean correct grids, pass rate and whether
        any prediction was correct. Missing metrics count as 0.
    """
    n_preds = len(results)
    n_valid = sum(1 for result in results if 'error_type' not in result)
    distinct_outputs = {result['fingerprint'] for result in results if 'fingerprint' in result}
    row = {
        'n_preds': n_preds,
        'valid code': n_valid / n_preds,
        'unique outputs': len(distinct_outputs) / n_preds,
    }
    for partition in ('train', 'test'):
        solved = [result.get(f'{partition}_is_correct', 0) for result in results]
        pixel_scores = [result.get(f'{partition}_pixel_score', 0) for result in results]
        correct_grids = [result.get(f'{partition}_correct_grids', 0) for result in results]
        row[f'{partition}_pixel_score'] = np.mean(pixel_scores)
        row[f'{partition}_correct_grids'] = np.mean(correct_grids)
        row[f'{partition}_pass_rate'] = sum(solved) / n_preds
        row[f'{partition}_is_correct'] = int(any(solved))
    return pd.DataFrame([row], index=[task_id]).astype(float)

### Hindsight relabeling

In [None]:
def create_hindsight_relabeled_tasks(results, task):
    """Create one new task per successful result, using its outputs as ground truth.

    The inputs and outputs stored in each result are paired up; the first
    `len(task['train'])` pairs become the train samples and the rest the test samples.
    Results without 'output_grids' (failed predictions) are skipped.

    Returns:
        A list of dicts with 'train', 'test' and 'text_prediction' keys.
    """
    # TODO: strategies to avoid repetitions
    # TODO: sort the tasks
    def _pair_up(grids_in, grids_out):
        return [{'input': grid_in, 'output': grid_out} for grid_in, grid_out in zip(grids_in, grids_out)]

    n_train = len(task['train'])
    relabeled_tasks = []
    for result in results:
        if 'output_grids' not in result:
            continue
        grids_in, grids_out = result['input_grids'], result['output_grids']
        relabeled_tasks.append({
            'train': _pair_up(grids_in[:n_train], grids_out[:n_train]),
            'test': _pair_up(grids_in[n_train:], grids_out[n_train:]),
            'text_prediction': result['text_prediction'],
        })
    return relabeled_tasks

In [None]:
def create_training_prompts(relabeled_tasks, grid_encoder, tokenizer):
    """Build one training text per relabeled task.

    Each text is the prompt created for the task (with shuffled train samples)
    followed by the stored text prediction and the tokenizer end-of-sequence token.
    """
    return [
        create_prompt_from_task(
            relabeled, grid_encoder=grid_encoder, tokenizer=tokenizer, shuffle_train_samples=True)
        + relabeled['text_prediction'] + tokenizer.eos_token
        for relabeled in relabeled_tasks
    ]

## Experiments

In [None]:
@dataclass
class cfg:
    # Experiment configuration. The cells below read the defaults straight from the
    # class (e.g. `cfg.model_path`); no instance is created in the cells shown here.
    # base model
    model_path: str = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
    load_in_4bit: bool = True
    max_seq_length: int = 10000
    grid_encoder: str = 'ColorNameEncoder()'  # resolved by `create_grid_encoder`
    gpu_memory_utilization: float = 0.90
    # dataset
    dataset_path: str = "/mnt/hdd0/Kaggle/arc25/data/arc-prize-2024/arc-agi_training_challenges.json"
    max_epochs: int = 1  # predict/train rounds per task; there is no training after the last round
    use_data_augmentation: bool = True
    inference_batch_size: int = 1  # generations sampled per prompt (SamplingParams `n`)
    predictions_per_epoch: int = 8  # total generations per task and epoch
    training_batch_size: int = 1  # NOTE(review): not read by the cells shown here

# Each prompt yields `inference_batch_size` generations, so the total must divide evenly
assert cfg.predictions_per_epoch % cfg.inference_batch_size == 0

In [None]:
# Load the base model and its tokenizer with unsloth using the settings from `cfg`.
# `fast_inference=True` is presumably what enables `llm.fast_generate` (used in the
# search loop below) — confirm against the unsloth docs.
llm, tokenizer = FastLanguageModel.from_pretrained(
    cfg.model_path, load_in_4bit=cfg.load_in_4bit, max_seq_length=cfg.max_seq_length,
    fast_inference=True, gpu_memory_utilization=cfg.gpu_memory_utilization)

In [None]:
# Load the tasks; `dataset` is indexed by task id and `task_ids` fixes the iteration order
dataset = load_arc_dataset_with_solutions(cfg.dataset_path)
task_ids = list(dataset.keys())
print(f"Loaded {len(dataset)} tasks from {cfg.dataset_path}")

In [None]:
# Encoder used to convert the grids to text when creating the prompts
grid_encoder = create_grid_encoder(cfg.grid_encoder)

In [None]:
# Wrap the base model with a LoRA adapter. This single adapter is the one trained inside
# the search loop below.
model = FastLanguageModel.get_peft_model(
    llm,
    r = 8, # LoRA rank
    target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj'], # only these projection modules get adapters
    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # According to the unsloth template, "unsloth" uses 30% less VRAM and fits 2x larger batch sizes
    use_gradient_checkpointing = False, # True or "unsloth" for very long context
    use_rslora = True,  # rank stabilized LoRA
    loftq_config = None, # LoftQ is not used
)
# Show the registered adapters and the one that is active
print(model.peft_config.keys())
print(model.active_adapter)

In [None]:
# Search and learn loop: for every task sample predictions, run the predicted code,
# compute metrics and (when there are more epochs left) finetune on hindsight relabeled tasks.
all_results = dict()
for task_id in tqdm(task_ids[::4], desc="Tasks", unit="task"):
    # TODO: reset the LoRA adapter for every task (e.g. `model.unload()` followed by a new
    # `FastLanguageModel.get_peft_model(...)` call with the same parameters as above).
    # Right now any training done for one task carries over to the following ones.

    # `task` is always the original task. Augmented versions are only used to build the
    # prompts; the code execution applies the augmentation of each prediction to the
    # original inputs and the metrics are computed against the original outputs.
    # Previously `task` was overwritten with the last augmented task, so the inputs
    # were augmented twice and the metrics used the augmented ground truth.
    task = dataset[task_id]
    lora_request = None
    sampling_params = SamplingParams(n=cfg.inference_batch_size, temperature=1.0, top_p=0.95, max_tokens=2048) # TODO: move parameters to cfg
    for epoch in range(1, cfg.max_epochs + 1):
        # the following code makes predictions, should be moved to a function
        prompts, data_augmentation_params = [], []
        for _ in range(cfg.predictions_per_epoch // cfg.inference_batch_size):
            if cfg.use_data_augmentation:
                params = get_random_data_augmentation_params()
                augmented_task = apply_data_augmentation(task, **params)
            else:
                params = None
                augmented_task = task
            # every generation of this prompt shares the same augmentation
            data_augmentation_params.extend([params] * cfg.inference_batch_size)
            prompt = create_prompt_from_task(
                augmented_task, grid_encoder=grid_encoder, tokenizer=tokenizer, shuffle_train_samples=True)
            prompts.append(prompt)

        t0 = time.time()
        generations = llm.fast_generate(prompts, sampling_params, lora_request=lora_request)
        inference_time = time.time() - t0
        total_tokens = sum(sum(len(output.token_ids) for output in generation.outputs) for generation in generations)
        logger.info(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second (Inference time: {inference_time:.1f} seconds)")

        # flatten the generations, the order matches `data_augmentation_params`
        text_predictions = []
        for generation in generations:
            for output in generation.outputs:
                text_predictions.append(output.text)
        assert len(text_predictions) == cfg.predictions_per_epoch

        # run code and compute metrics
        results = run_code_from_predictions(task, text_predictions, data_augmentation_params)
        display(aggregate_metrics(results, task_id).round(3))
        all_results[task_id] = results
        # TODO: stop criteria
        if epoch == cfg.max_epochs:
            break
        # TODO: prepare data for training
        relabeled_tasks = create_hindsight_relabeled_tasks(results, task)
        training_prompts = create_training_prompts(relabeled_tasks, grid_encoder, tokenizer)
        train_dataset = Dataset.from_dict({'text': training_prompts})
        # TODO: train model
        trainer = SFTTrainer(
            model = model,
            tokenizer = tokenizer,
            train_dataset = train_dataset,
            dataset_text_field = "text",
            max_seq_length = 8192,
            packing = False, # Can make training 5x faster for short sequences.
            data_collator=get_data_collator(tokenizer),
            args = SFTConfig(
                per_device_train_batch_size = 1,
                gradient_accumulation_steps = 1,
                warmup_ratio=0.1,
                num_train_epochs=1,
                save_strategy='no',
                learning_rate = 1e-5,
                logging_steps = 1,
                optim = "adamw_torch_fused",
                weight_decay = 0.01,
                lr_scheduler_type = 'constant_with_warmup',
                # seed = 3407,
                output_dir = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-04-debug-unsloth",
                report_to = "none", # Use this for WandB etc
            ),
        )
        trainer_stats = trainer.train()
        # The adapter is saved and loaded back to get the lora request for the next
        # round of predictions; without a lora request the base model is used
        model.save_lora("/mnt/hdd0/Kaggle/arc25/trainings/2025-09-04-debug-unsloth/lora")
        lora_request = model.load_lora("/mnt/hdd0/Kaggle/arc25/trainings/2025-09-04-debug-unsloth/lora")
# TODO: select best predictions and prepare submission

In [None]:
# Gather the metrics of all the tasks in a single table with one row per task
dfs = []
for task_id, results in all_results.items():
    df = aggregate_metrics(results, task_id)
    dfs.append(df)
final_df = pd.concat(dfs)
# A task is solved when some prediction passes the train samples and some prediction
# passes the test samples (both columns are aggregated over predictions with `any`,
# so it is not necessarily the same prediction)
final_df['is_correct'] = final_df['test_is_correct']*final_df['train_is_correct']
# Extra row with the mean of every column over the tasks
final_df.loc['MEAN'] = final_df.mean()
final_df

```
8 preds
20 tasks, 9 min, 15% correct
100 tasks, 49 min, 13% correct
```

In [None]:
raise

In [None]:
pretty_print_prompt(training_prompts[0])

In [None]:

relabeled_tasks = create_hindsight_relabeled_tasks(results, task)

In [None]:
from arc25.plot import plot_task

In [None]:
[result.get('train_is_correct', None) for result in results]

In [None]:
[result.get('train_pixel_score', None) for result in results]

In [None]:
plot_task(relabeled_tasks[0])

In [None]:
raise

## Debug

### First steps

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = FastLanguageModel.from_pretrained(model_path, load_in_4bit=True, max_seq_length=12000, fast_inference=True)

In [None]:
# Baseline generation with the regular `generate` method
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."}
]
# `add_generation_prompt=True` appends the assistant header, so the model answers the user
# instead of continuing the conversation text (and the prompt matches the `fast_generate`
# cell below). The previous `add_bos_token=True` is not an `apply_chat_template` argument,
# it was only forwarded to the template as an extra variable.
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(llm.device)
outputs = llm.generate(inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

In [None]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Tell me a joke."}
]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)
responses = llm.fast_generate(inputs)
print(responses[0].outputs[0].text)

This seems to be much faster, 0.3s vs 1.9s.

Let's see if we can make more predictions.

In [None]:
sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=2048)
responses = llm.fast_generate(inputs, sampling_params=sampling_params)
print(len(responses), len(responses[0].outputs))
print(responses[0].outputs[0].text)

Seems very similar to VLLM, I should do a direct comparison.

### Compare inference speed of VLLM vs unsloth

Ideally I would see the same speed with both methods, because unsloth uses VLLM under the hood.

#### VLLM

In [None]:
@log_execution_time
def load_model(model_path, use_4bit_quantization=False, tensor_parallel_size=1,
               max_model_len=32000, enable_lora=False, max_lora_rank=16):
    """Load a model with VLLM together with its tokenizer.

    Args:
        model_path: path to the model. For `.gguf` files the tokenizer is loaded from a
            `tokenizer` folder placed next to the file.
        use_4bit_quantization: if True the model is loaded with bitsandbytes quantization.
        tensor_parallel_size: forwarded to VLLM.
        max_model_len: forwarded to VLLM.
        enable_lora: forwarded to VLLM.
        max_lora_rank: forwarded to VLLM.

    Returns:
        Tuple `(llm, tokenizer)`.
    """
    logging.info(f"Loading model from {model_path}")
    cleanup_gpu()
    engine_kwargs = dict(
        model=model_path,
        gpu_memory_utilization=0.92,
        trust_remote_code=True,
        dtype="bfloat16",
        tensor_parallel_size=tensor_parallel_size,
        quantization="bitsandbytes" if use_4bit_quantization else None,
        enable_prefix_caching=True, # Seems that it is true by default, but let's be explicit
        max_model_len=max_model_len,
        enable_lora=enable_lora,
        max_lora_rank=max_lora_rank,
    )
    llm = LLM(**engine_kwargs)
    tokenizer_path = model_path
    if model_path.endswith('.gguf'):
        tokenizer_path = os.path.join(os.path.dirname(model_path), 'tokenizer')
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    return llm, tokenizer


def cleanup_gpu():
    """Free as much GPU memory as possible before loading VLLM.

    Runs the garbage collector, empties the torch CUDA cache and, when CUDA is
    available, waits for the pending GPU work to finish.
    """
    gc.collect()
    torch.cuda.empty_cache()
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = load_model(model_path, use_4bit_quantization=True, tensor_parallel_size=1, max_model_len=12000)

In [None]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=512, temperature=1.0, top_p=0.95, max_tokens=100)
t0 = time.time()
generations = llm.generate(prompt, sampling_params)
total_tokens = sum(sum(len(_output.token_ids) for _output in output.outputs) for output in generations)
inference_time = time.time() - t0
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

#### Unsloth

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = FastLanguageModel.from_pretrained(model_path, load_in_4bit=True, max_seq_length=12000, fast_inference=True)

In [None]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=512, temperature=1.0, top_p=0.95, max_tokens=100)
t0 = time.time()
generations = llm.fast_generate(prompt, sampling_params)
total_tokens = sum(sum(len(_output.token_ids) for _output in output.outputs) for output in generations)
inference_time = time.time() - t0
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")

#### Results

unsloth has a faster startup of 54s vs 1m51s for VLLM.

The table below shows the inference speed in tokens/s when generating 100 tokens per prompt.

| method \ n predictions | 8   | 32  | 128  | 512  |
|------------------------|-----|-----|------|------|
| VLLM                   | 140 | 512 | 1476 | 1992 |
| unsloth                | 138 | 510 | 1454 | 1464 |

They are very similar except for the last column, where I believe VLLM is using more VRAM than
unsloth. This is promising because it opens the door to using unsloth both for training and inference
in the same process.

### Proof of concept of inference, training and inference 

Let's verify that I can do fast inference, train and fast inference again with unsloth.

#### Inference with base model

In [None]:
model_path = "/home/gbarbadillo/models/Llama-3.1-ARC-Potpourri-Induction-8B"
llm, tokenizer = FastLanguageModel.from_pretrained(model_path, load_in_4bit=True, max_seq_length=12000, fast_inference=True)

In [None]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
t0 = time.time()
generations = llm.fast_generate(prompt, sampling_params)
total_tokens = sum(sum(len(_output.token_ids) for _output in output.outputs) for output in generations)
inference_time = time.time() - t0
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
for i, output in enumerate(generations):
    for j, out in enumerate(output.outputs):
        print(f"Output {i}-{j}: {out.text}")

#### Finetune on a single sample

Let's use a single training sample that simply refuses to answer.

Now let's add a lora adapter.

In [None]:
model = FastLanguageModel.get_peft_model(
    llm,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ['k_proj', 'q_proj', 'v_proj', 'o_proj'],
    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = False, # True or "unsloth" for very long context
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
print(model.peft_config.keys())
print(model.active_adapter)

In [None]:
from datasets import Dataset
# Single training conversation where the assistant refuses to answer
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."}
]
# NOTE(review): `add_eos_token` is not a documented `apply_chat_template` argument, it is
# presumably just forwarded to the chat template — check in the printed prompt that the
# text ends with the expected end-of-turn token.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_eos_token=True,
)
print(prompt)
# Number of tokens of the sample, it has to fit in the `max_seq_length` used for training
print(len(tokenizer.tokenize(prompt)))
train_dataset = Dataset.from_dict({'text': [prompt]})

In [None]:
from trl import SFTConfig, SFTTrainer
# Finetune the LoRA adapter on the single refusal sample. The dataset has one sample and the
# batch size is 1, so the 60 steps repeat that same sample.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = 100,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 1e-5,
        logging_steps = 1,
        optim = "adamw_torch_fused",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
        report_to = "none", # Use this for WandB etc
    ),
)
trainer_stats = trainer.train()

In [None]:
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal"
model.save_lora(lora_save_path)

#### Repeat inference

In [None]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
t0 = time.time()
generations = model.fast_generate(prompt, sampling_params)
total_tokens = sum(sum(len(_output.token_ids) for _output in output.outputs) for output in generations)
inference_time = time.time() - t0
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
for i, output in enumerate(generations):
    for j, out in enumerate(output.outputs):
        print(f"Output {i}-{j}: {out.text}")

In [None]:
# Same generation as before, but with an explicitly constructed vLLM
# LoRARequest pointing at the saved "refusal" adapter.
from vllm.lora.request import LoRARequest
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal"
lora_request = LoRARequest('LoRA', 1, lora_save_path)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is monotonic, so the interval is not affected by clock adjustments.
t0 = time.perf_counter()
generations = model.fast_generate(prompt, sampling_params, lora_request=lora_request)
# Stop the clock right after generation so token counting is not timed.
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(sample.token_ids)
    for request_output in generations
    for sample in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
# Dividing by sampling_params.n gives the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
for i, output in enumerate(generations):
    for j, out in enumerate(output.outputs):
        print(f"Output {i}-{j}: {out.text}")

We need to provide the lora request, otherwise it simply uses the base model.

We can also generate the lora request with the `load_lora` method from the model.

In [None]:
# Obtain the LoRA request from the model's own `load_lora` helper instead of
# building a LoRARequest by hand, then repeat the same generation.
lora_request = model.load_lora(lora_save_path)
print(lora_request)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is monotonic, so the interval is not affected by clock adjustments.
t0 = time.perf_counter()
generations = model.fast_generate(prompt, sampling_params, lora_request=lora_request)
# Stop the clock right after generation so token counting is not timed.
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(sample.token_ids)
    for request_output in generations
    for sample in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
# Dividing by sampling_params.n gives the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
for i, output in enumerate(generations):
    for j, out in enumerate(output.outputs):
        print(f"Output {i}-{j}: {out.text}")

#### Create a new fresh lora adapter

In [None]:
# Discard the current adapter wrapper, then attach a brand-new LoRA adapter to `llm`.
# NOTE(review): presumably `unload()` strips the LoRA layers in place — confirm in the PEFT docs.
model.unload()

# Adapter hyper-parameters, gathered in one place for readability.
lora_settings = dict(
    r=8,  # LoRA rank
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj'],  # attention projections only
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=False,
    use_rslora=True,  # rank-stabilized LoRA
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(llm, **lora_settings)

# Show which adapters are registered and which one is active.
print(model.peft_config.keys())
print(model.active_adapter)

Check that the model is untrained.

In [None]:
# One-sample SFT dataset whose target is the refusal answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Only 10 steps at a learning rate of 1e-7 — presumably a probe whose logged loss
# reflects the current adapter state rather than meaningful new learning.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=10,
    learning_rate=1e-7,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

In [None]:
# One-sample SFT dataset whose target is the "maybe tomorrow" answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Yes, but maybe tomorrow."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Real training run: 60 steps at a learning rate of 1e-5.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=60,
    learning_rate=1e-5,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

Awesome, it seems that we can create new LoRAs without any problem.

In [None]:
# Save the freshly trained adapter under its own "tomorrow" directory, next to
# the previously saved "refusal" one.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/tomorrow"
model.save_lora(lora_save_path)

#### Inference with multiple loras

In [None]:
# Generate with the saved "refusal" adapter selected through a LoRA request.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal"
lora_request = model.load_lora(lora_save_path)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is monotonic, so the interval is not affected by clock adjustments.
t0 = time.perf_counter()
generations = model.fast_generate(prompt, sampling_params, lora_request=lora_request)
# Stop the clock right after generation so token counting is not timed.
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(sample.token_ids)
    for request_output in generations
    for sample in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
# Dividing by sampling_params.n gives the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
for i, output in enumerate(generations):
    for j, out in enumerate(output.outputs):
        print(f"Output {i}-{j}: {out.text}")

In [None]:
# Generate with the saved "tomorrow" adapter selected through a LoRA request.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/tomorrow"
lora_request = model.load_lora(lora_save_path)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is monotonic, so the interval is not affected by clock adjustments.
t0 = time.perf_counter()
generations = model.fast_generate(prompt, sampling_params, lora_request=lora_request)
# Stop the clock right after generation so token counting is not timed.
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(sample.token_ids)
    for request_output in generations
    for sample in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
# Dividing by sampling_params.n gives the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
for i, output in enumerate(generations):
    for j, out in enumerate(output.outputs):
        print(f"Output {i}-{j}: {out.text}")

#### Load LoRA and continue training

In [None]:
# One-sample SFT dataset whose target is the refusal answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Short probe run (10 steps, learning rate 1e-7); presumably the logged loss
# indicates whether the model being trained already knows this answer.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=10,
    learning_rate=1e-7,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

In [None]:
# One-sample SFT dataset whose target is the "maybe tomorrow" answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Yes, but maybe tomorrow."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Short probe run (10 steps, learning rate 1e-7); presumably the logged loss
# indicates whether the model being trained already knows this answer.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=10,
    learning_rate=1e-7,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

In [None]:
# Start over: drop the current adapter wrapper and attach a new LoRA adapter to `llm`.
# NOTE(review): presumably `unload()` strips the LoRA layers in place — confirm in the PEFT docs.
model.unload()

# Adapter hyper-parameters, gathered in one place for readability.
lora_settings = dict(
    r=8,  # LoRA rank
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj'],  # attention projections only
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=False,
    use_rslora=True,  # rank-stabilized LoRA
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(llm, **lora_settings)

# Show which adapters are registered and which one is active.
print(model.peft_config.keys())
print(model.active_adapter)


In [None]:
# One-sample SFT dataset whose target is the "maybe tomorrow" answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Yes, but maybe tomorrow."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Short probe run on the newly created adapter: 10 steps at a learning rate of 1e-7.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=10,
    learning_rate=1e-7,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

In [None]:
# One-sample SFT dataset whose target is the refusal answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Short probe run on the newly created adapter: 10 steps at a learning rate of 1e-7.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=10,
    learning_rate=1e-7,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

Let's verify that calling `load_lora` on the saved adapter has no effect on the model being trained.

In [None]:
# Call `load_lora` on the saved "refusal" adapter (return value intentionally
# ignored) and then run the probe training to see whether the loss changes.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal"
model.load_lora(lora_save_path)

# One-sample SFT dataset whose target is the refusal answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Short probe run: 10 steps at a learning rate of 1e-7.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=10,
    learning_rate=1e-7,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

Now let's try using the adapters.

In [None]:
# Inspect the adapter registry before loading anything: names of the
# configured adapters, then the currently active one.
print(model.peft_config.keys())
print(model.active_adapter)

In [None]:
# Register the saved "refusal" weights as a second, trainable adapter and
# switch to it, printing the adapter state before and after the switch.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal"
adapter_name = "refusal"
model.load_adapter(lora_save_path, adapter_name=adapter_name, is_trainable=True)
print(model.peft_config.keys())
print(model.active_adapter)  # active adapter before switching
model.set_adapter(adapter_name)
print(model.active_adapter)  # active adapter after switching

In [None]:
# One-sample SFT dataset whose target is the refusal answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Short probe run (10 steps, learning rate 1e-7) with the currently active adapter.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=10,
    learning_rate=1e-7,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

Awesome, this seems to be working! Let's train for longer.

In [None]:
# One-sample SFT dataset whose target is the refusal answer.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."},
    {"role": "assistant", "content": "Sorry, I can't help with that."},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_eos_token=True)
train_dataset = Dataset.from_dict({"text": [prompt]})

# Longer run than the probes: 50 steps at a learning rate of 1e-5.
training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    max_steps=50,
    learning_rate=1e-5,
    logging_steps=1,  # log the loss at every step
    optim="adamw_torch_fused",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth",
    report_to="none",  # no external experiment tracker
)
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=100,
    packing=False,  # packing is off
    args=training_args,
)
trainer_stats = trainer.train()

In [None]:
# Save the continued-training result to a new "refusal-v2" directory so the
# original "refusal" adapter is not overwritten.
# NOTE(review): no adapter name is passed; with more than one adapter loaded it
# is unclear which one `save_lora` writes — verify.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal-v2"
model.save_lora(lora_save_path)

In [None]:
# Generate with the adapter stored at `lora_save_path` (set in the previous
# cell) to check what was actually saved.
lora_request = model.load_lora(lora_save_path)
print(lora_request)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a chapter about the history of AI."}
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False,
)

sampling_params = SamplingParams(n=8, temperature=1.0, top_p=0.95, max_tokens=100)
# perf_counter is monotonic, so the interval is not affected by clock adjustments.
t0 = time.perf_counter()
generations = model.fast_generate(prompt, sampling_params, lora_request=lora_request)
# Stop the clock right after generation so token counting is not timed.
inference_time = time.perf_counter() - t0
total_tokens = sum(
    len(sample.token_ids)
    for request_output in generations
    for sample in request_output.outputs
)
print(f"Total tokens generated: {total_tokens}")
print(f"Time taken: {inference_time:.2f} seconds")
print(f"Average time per task: {inference_time / len(generations):.2f} seconds")
# Dividing by sampling_params.n gives the mean length of a single sampled completion.
print(f"Average tokens per task: {total_tokens / len(generations) / sampling_params.n:.2f} tokens")
print(f"Average tokens per second: {total_tokens / inference_time:.2f} tokens/second")
for i, output in enumerate(generations):
    for j, out in enumerate(output.outputs):
        print(f"Output {i}-{j}: {out.text}")

This is weird, it seems to have saved the untrained adapter.

#### Try another approach

In [None]:
# Reset once more: drop the current adapter wrapper and attach a new LoRA adapter to `llm`.
# NOTE(review): presumably `unload()` strips the LoRA layers in place — confirm in the PEFT docs.
model.unload()

# Adapter hyper-parameters, gathered in one place for readability.
lora_settings = dict(
    r=8,  # LoRA rank
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj'],  # attention projections only
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=False,
    use_rslora=True,  # rank-stabilized LoRA
    loftq_config=None,
)
model = FastLanguageModel.get_peft_model(llm, **lora_settings)

# Show which adapters are registered and which one is active.
print(model.peft_config.keys())
print(model.active_adapter)

In [None]:
# Exploratory: try to read the saved adapter weights directly.
# NOTE(review): `torch.load` reads pickle-based checkpoints; a `.safetensors`
# file most likely needs a safetensors loader instead — confirm before relying on this.
# NOTE(review): weights_only=False permits arbitrary pickled objects to be
# deserialized; only use it on files you trust.
torch.load("/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal/adapter_model.safetensors", weights_only=False)

I could use the checkpoints and avoid doing weird tricks.

In [None]:
# Exploratory: bare attribute access, nothing is called or saved here.
model.save

In [None]:
# Exploratory: try to push the saved adapter tensors straight into the model.
# strict=False makes `load_state_dict` tolerate missing/unexpected keys.
# NOTE(review): `torch.load` on a `.safetensors` file most likely fails (see the
# safetensors format); also verify that the saved key names match the model's.
model.load_state_dict(torch.load("/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal/adapter_model.safetensors", weights_only=False), strict=False)

In [None]:
# Print the documentation of `from_pretrained` to see which arguments it takes.
help(model.from_pretrained)

In [None]:
# Shell command: list (with sizes) the adapter weight file of every saved LoRA directory.
!ls -lah /mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/*/adapter_model.safetensors

In [None]:
# Print the documentation of `load_state_dict` to check its parameters.
help(model.load_state_dict)

In [None]:
# Exploratory call with no arguments.
# NOTE(review): presumably raises because the required arguments are missing —
# supply the base model and adapter path before reusing this cell.
model.from_pretrained()

In [None]:
# Retry saving to "refusal-v2", this time passing 'refusal' as a second argument.
# NOTE(review): presumably the second positional argument selects the adapter to
# save — TODO confirm against the `save_lora` signature.
lora_save_path = "/mnt/hdd0/Kaggle/arc25/trainings/2025-09-03-debug-unsloth/refusal-v2"
model.save_lora(lora_save_path, 'refusal')

## TODO

- [x] Maybe I should integrate evaluation and hindsight relabeling into the same code execution step, because it is done in parallel
- [x] Check if training prompts are being created correctly or if I'm missing a space or something else. I find the high loss I see when I start training (2.3, 5) weird. They are created correctly, but I'm not using the data collator
- [x] Add data collator. Now the training loss is much lower
- [ ] Run a test without any learning to compare with my benchmarks
- [ ] Compare the code to the training script to see if there are relevant differences
- [ ] Tokenization seems to be slow at the beginning of the training
- [ ] Make the code robust to code execution errors
- [ ] Doing the first inference run for all the tasks could be more efficient, because throughput increases with the number of predictions. Also, it might make sense to do a wider exploration at first and then focus on the best results.