## “The goal of this practical is very clear: minimum steps, maximum clarity.”

In [None]:
!pip uninstall -y axolotl peft transformers accelerate datasets trl optimum cut-cross-entropy flash-attn
!pip install --no-build-isolation git+https://github.com/OpenAccess-AI-Collective/axolotl.git
!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1
!pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"

In [None]:
# Default training dataset; a later cell may replace this with a local file path.
dataset_id = "winglian/pirate-ultrachat-10k"

# Empty placeholder mapping.
# NOTE(review): presumably meant to hold files uploaded through Colab — it is
# not read anywhere in the visible cells; confirm it is still needed.
uploaded = dict()

In [None]:
import os

# Path of a dataset file inside Google Drive. Leave empty to skip mounting
# Drive and keep the dataset_id chosen earlier.
GOOGLE_DRIVE_PATH = ""
if GOOGLE_DRIVE_PATH:
    # Imported lazily so the cell also runs outside Colab when no path is set.
    from google.colab import drive

    GOOGLE_DRIVE_MNT = "/content/drive/"
    drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)

    # Strip leading slashes so os.path.join does not discard the mount point.
    tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip("/"))
    if os.path.isfile(tmp_path):
        # Train on the Drive file instead of the default dataset.
        dataset_id = tmp_path
    else:
        raise ValueError(f"File {tmp_path} does not exist")

In [None]:
import os

# Set AXOLOTL_DO_NOT_TRACK=1 for this process before axolotl is imported.
# NOTE(review): presumably opts out of axolotl usage telemetry — confirm.
os.environ.update({"AXOLOTL_DO_NOT_TRACK": "1"})

In [None]:
import os

# Configure the PyTorch CUDA caching allocator through the environment; this
# runs before any of the training libraries are imported further down.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

In [None]:
from axolotl.cli.config import load_cfg
from axolotl.utils.dict import DictDefault

In [None]:
# Training configuration for a QLoRA fine-tune, passed to axolotl's load_cfg below.
config = DictDefault(
    # --- base model and adapter ---
    base_model="Qwen/Qwen2.5-3B-Instruct",
    load_in_4bit=True,
    adapter="qlora",
    lora_r=32,
    lora_alpha=64,
    # Attach LoRA adapters to every attention and MLP projection.
    lora_target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "down_proj",
        "up_proj",
    ],
    lora_qkv_kernel=False,
    lora_o_kernel=False,
    lora_mlp_kernel=False,
    embeddings_skip_upcast=True,
    xformers_attention=True,
    plugins=[],
    sample_packing=False,
    # --- optimisation ---
    learning_rate=0.00019,
    sequence_len=1024,
    # One sample per forward pass, accumulated over 8 passes per optimizer step.
    micro_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={
        "use_reentrant": False,
    },
    optimizer="paged_adamw_8bit",
    lr_scheduler="cosine",
    warmup_steps=5,
    # Half precision via fp16 rather than bf16.
    # NOTE(review): presumably chosen for GPUs without bf16 support — confirm.
    fp16=True,
    bf16=False,
    max_grad_norm=0.1,
    num_epochs=1,
    saves_per_epoch=2,
    logging_steps=1,
    output_dir="./outputs/qwen-sft-pirate-rrr",
    # NOTE(review): the base model is Qwen2.5 but the chat template is "qwen3";
    # the inference cell passes enable_thinking=False to match. Confirm this
    # pairing is intentional.
    chat_template="qwen3",
    datasets=[
        {
            "path": dataset_id,
            "type": "chat_template",
            "split": "train",
            "eot_tokens": ["<|im_end|>"],
        }
    ],
    # Data is loaded in the main process (no worker processes). A prefetch
    # factor is only valid together with num_workers > 0, so none is set here;
    # add dataloader_prefetch_factor back only if dataloader_num_workers is raised.
    dataloader_num_workers=0,
    dataloader_pin_memory=True,
)
# Turn the raw dictionary into the cfg object used by the cells that follow.
cfg = load_cfg(config)

In [None]:
from axolotl.utils import set_pytorch_cuda_alloc_conf
# Call axolotl's helper for the PyTorch CUDA allocator configuration.
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is also set by hand in an earlier cell;
# confirm whether both are needed.
set_pytorch_cuda_alloc_conf()

In [None]:
from axolotl.common.datasets import load_datasets
# Load the dataset(s) described by cfg; the result is handed to train() later.
dataset_meta = load_datasets(cfg=cfg)

In [None]:
from axolotl.train import train
# Override max_steps on the loaded config before training starts.
# NOTE(review): presumably caps the run at 25 steps so the demo finishes quickly.
cfg.max_steps = 25
# Run training; the model and tokenizer returned here are reused for the
# generation test in the next cell.
model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)

In [None]:
from transformers import TextStreamer

# Single-turn conversation used to sanity-check the fine-tuned model.
messages = [
    {"role": "user", "content": "Explain the Pythagorean theorem to me."},
]

# Render the conversation to a prompt string (not token ids) that ends with
# the generation prompt for the assistant turn.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,
)

# Tokenize the prompt and move the tensors to the GPU.
model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Stream the generated text as it is produced, without echoing the prompt.
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Sampling settings for the generation call.
sampling_kwargs = dict(
    max_new_tokens=192,
    temperature=1.0,
    top_p=0.8,
    top_k=32,
)

outputs = model.generate(**model_inputs, streamer=streamer, **sampling_kwargs)

In [None]:
# Show the saved checkpoints in the output_dir
# (the path matches output_dir in the training config above).
!ls -lh "./outputs/qwen-sft-pirate-rrr"

In [None]:
from huggingface_hub import notebook_login

# remove the partial epoch checkpoints
!rm -rf "./outputs/qwen-sft-pirate-rrr/checkpoint-*"

# HF Notebook login widget
notebook_login()

# upload the LoRA adapter for your model to HF, remember to update the username/model-name below
!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B "./outputs/qwen-sft-pirate-rrr"