In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import torch
import os

# ========================
# 1️⃣ Paths and model
# ========================
BASE_MODEL = "Qwen/Qwen2-0.5B"  # path to original model
OUTPUT_DIR = "./qwen-sft-local-test"  # local folder for checkpoints

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ========================
# 2️⃣ Load tokenizer & model
# ========================
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,  # mixed precision
).cuda()

# ========================
# 3️⃣ Prepare your dataset
# ========================
# Example format: [{"input": "Question: ...\nAnswer:", "output": " your answer"}]
data = [
    {"input": "What is lung cancer?\nAnswer:", "output": " Lung cancer is ..."},
    {"input": "Is smoking harmful?\nAnswer:", "output": " Yes, smoking is ..."}
]

def tokenize_fn(example):
    # Concatenate input and output as one sequence
    full_text = example["input"] + example["output"]
    tokens = tokenizer(full_text, truncation=True, max_length=1024)
    return tokens

dataset = Dataset.from_list(data)
dataset = dataset.map(tokenize_fn, batched=False)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# ========================
# 4️⃣ Data collator
# ========================
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM
)

# ========================
# 5️⃣ Training arguments
# ========================
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,  # small batch for stability
    gradient_accumulation_steps=4,  # effective batch size = 4
    learning_rate=1e-5,             # start small
    max_steps=500,                   # short debug run
    fp16=True,
    save_steps=100,
    save_total_limit=3,
    logging_steps=10,
    report_to=None,                  # no wandb/other reporting
    remove_unused_columns=False,
    gradient_checkpointing=True,     # save memory
    max_grad_norm=1.0,               # gradient clipping
)

# ========================
# 6️⃣ Trainer
# ========================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ========================
# 7️⃣ Start training
# ========================
trainer.train()


2025-10-04 14:02:17.684395: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-04 14:02:17.684418: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-04 14:02:17.685200: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-04 14:02:17.689284: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
`torch_dtype` is deprecated! Use `dtype` instead!


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer = Trainer(
[codecarbon INFO @ 14:02:22] [setup] RAM Tracking...
[codecarbon INFO @ 14:02:22] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 14:02:23] CPU Model on constant consumption mode: AMD Ryzen 7 5800X 8-Core Processor
[codecarbon INFO @ 14:02:23] [setup] GPU Tracking...
[codecarbon INFO @ 14:02:23] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 14:02:23] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: cpu_load
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 14:02:23] >>> Tracker's metadata:
[codecarbon INFO @ 14:02:23]   Platform system: Linux-5.15.0-153-generic-x86_64-with-glibc2.35
[codecarbon INFO @ 14:02:23]   Python version: 3.10.18
[codecarbon INFO @ 14:02:23]   CodeCarbon version: 3.0.5
[codecarbon INFO @ 14:02:23]   Available RAM

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,0.9893
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


[codecarbon INFO @ 14:02:44] Energy consumed for RAM : 0.000086 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 14:02:44] Delta energy consumed for CPU with cpu_load : 0.000045 kWh, power : 10.525426069200002 W
[codecarbon INFO @ 14:02:44] Energy consumed for All CPU : 0.000045 kWh
[codecarbon INFO @ 14:02:44] Energy consumed for all GPUs : 0.001335 kWh. Total GPU Power : 300.2258932925965 W
[codecarbon INFO @ 14:02:44] 0.001466 kWh of electricity used since the beginning.
[codecarbon INFO @ 14:02:59] Energy consumed for RAM : 0.000167 kWh. RAM Power : 20.0 W
[codecarbon INFO @ 14:02:59] Delta energy consumed for CPU with cpu_load : 0.000042 kWh, power : 10.529583154125001 W
[codecarbon INFO @ 14:02:59] Energy consumed for All CPU : 0.000088 kWh
[codecarbon INFO @ 14:02:59] Energy consumed for all GPUs : 0.002556 kWh. Total GPU Power : 293.14588434984665 W
[codecarbon INFO @ 14:02:59] 0.002810 kWh of electricity used since the beginning.
[codecarbon INFO @ 14:03:14] Energy consumed for RAM 

TrainOutput(global_step=500, training_loss=0.019785152435302735, metrics={'train_runtime': 148.3454, 'train_samples_per_second': 26.964, 'train_steps_per_second': 3.371, 'total_flos': 23621275392000.0, 'train_loss': 0.019785152435302735, 'epoch': 500.0})