### Step 1 : Installations 

In [1]:
%%capture
# Install the notebook's dependencies. The %%capture cell magic above hides this cell's output.
import os, re
# Detect Colab by looking for "COLAB_" anywhere in the concatenated environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the leading run of digits and dots in the torch version string (any suffix is not matched).
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    # Pick the xformers pin based on the installed torch version.
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # The first and last installs in this branch pass --no-deps, so pip does not resolve their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The installs below run in both branches, after the unsloth install above.
!pip install torchao==0.14.0 fbgemm-gpu-genai==1.3.0
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2
!pip install wandb

### Step 2 : Imports

In [2]:
import os
import re
import torch
import wandb
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from unsloth.chat_templates import standardize_data_formats

from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from unsloth.chat_templates import train_on_responses_only
from transformers import TextStreamer

from torchao.quantization import quantize_
from torchao.quantization.qat import QATConfig

from huggingface_hub import HfApi, create_repo

from torchao.quantization import Int4WeightOnlyConfig
from transformers import AutoModelForCausalLM, AutoTokenizer,TorchAoConfig

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


TMA benchmarks will be running without grid constant TMA descriptor.


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
Switching to PyTorch attention since your Xformers is broken.

/usr/local/lib/python3.10/dist-packages/flash_attn_2_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN3c104cuda9SetDeviceEi
🦥 Unsloth Zoo will now patch everything to make training faster!


### Step 3 : Experiment configs

In [3]:
QUANTIZATION_TYPE = "QAT"          # Options: "PTQ" or "QAT"

# Fail fast on an unsupported value. Later cells branch on this constant, and
# a typo (e.g. "ptq") would otherwise only surface as an error after training.
if QUANTIZATION_TYPE not in ("PTQ", "QAT"):
    raise ValueError(
        f'QUANTIZATION_TYPE must be "PTQ" or "QAT", got {QUANTIZATION_TYPE!r}'
    )

# Weights & Biases project and run name; the run name is also reused by later
# cells as the local output folder and the Hub repository suffix.
WANDB_PROJECT_NAME = "QuantizationTraining"
RUN_NAME     = "Qwen3_4B_" + QUANTIZATION_TYPE
wandb.init(project=WANDB_PROJECT_NAME, name=RUN_NAME)

[34m[1mwandb[0m: Currently logged in as: [33mjaytonde05[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


### Step 4 : Load the Model and Tokenizer

In [4]:
# Loading options for the base checkpoint: 4-bit and 8-bit loading are both
# disabled, and full fine-tuning is off.
base_model_options = dict(
    model_name = "unsloth/Qwen3-4B-Instruct-2507",
    max_seq_length = 2048,
    load_in_4bit = False,
    load_in_8bit = False,
    full_finetuning = False,
)
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# Re-wrap the tokenizer with the "qwen3-instruct" chat template.
tokenizer = get_chat_template(tokenizer, chat_template = "qwen3-instruct")

==((====))==  Unsloth 2025.11.2: Fast Qwen3 patching. Transformers: 4.55.4.
   \\   /|    NVIDIA H100 80GB HBM3. Num GPUs = 1. Max memory: 79.097 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 9.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Display the loaded model's module tree as the cell output.
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560, padding_idx=151654)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3

### Step 5 : Add LoRA Adapters

In [6]:
# Projection layers that receive LoRA adapters (attention and MLP projections).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Only request the int4 QAT scheme when this run is a QAT experiment.
if QUANTIZATION_TYPE == "QAT":
    selected_qat_scheme = "int4"
else:
    selected_qat_scheme = None

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = lora_target_modules,
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    qat_scheme = selected_qat_scheme,
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.11.2 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


Unsloth: Applying QAT to mitigate quantization degradation


In [7]:
# Sanity check: report once if any submodule's class name contains
# "FakeQuantized". `any` stops at the first match.
qat_active = any(
    "FakeQuantized" in submodule.__class__.__name__
    for submodule in model.modules()
)
if qat_active:
    print("QAT is applied!")

QAT is applied!


### Step 6 : Load training dataset

In [8]:
# Load the "train" split of the mlabonne/FineTome-100k dataset.
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

In [9]:
# Normalise the conversation format, then print one row as a spot check.
dataset = standardize_data_formats(dataset)
sample_row = dataset[100]
print(f"one sample : {sample_row}")

one sample : {'conversations': [{'content': 'What is the modulus operator in programming and how can I use it to calculate the modulus of two given numbers?', 'role': 'user'}, {'content': 'In programming, the modulus operator is represented by the \'%\' symbol. It calculates the remainder when one number is divided by another. To calculate the modulus of two given numbers, you can use the modulus operator in the following way:\n\n```python\n# Calculate the modulus\nModulus = a % b\n\nprint("Modulus of the given numbers is: ", Modulus)\n```\n\nIn this code snippet, the variables \'a\' and \'b\' represent the two given numbers for which you want to calculate the modulus. By using the modulus operator \'%\', we calculate the remainder when \'a\' is divided by \'b\'. The result is then stored in the variable \'Modulus\'. Finally, the modulus value is printed using the \'print\' statement.\n\nFor example, if \'a\' is 10 and \'b\' is 4, the modulus calculation would be 10 % 4, which equals 2

### Step 7 : Apply chat template

In [10]:
def formatting_prompts_func(examples):
    """Render every conversation in a batch to a single chat-formatted string.

    Args:
        examples: A batch mapping whose "conversations" entry holds one
            conversation per row.

    Returns:
        A dict with one key, "text", holding the rendered strings in the same
        order as the input conversations.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False returns text rather than token ids; no generation
        # prompt is appended.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize = False,
                add_generation_prompt = False,
            )
        )
    return {"text": rendered}

# Apply the formatter batch-wise; its returned "text" entry becomes a dataset column.
dataset = dataset.map(formatting_prompts_func, batched = True)

# Show the rendered text of one row as the cell output.
dataset[100]['text']

### Step 9 : Train the model

In [11]:
# Training hyperparameters. With a per-device batch of 4 and 4 accumulation
# steps, each optimizer step covers 16 examples per device.
# To cap the run for a quick test, set max_steps (e.g. 30) instead of relying
# on num_train_epochs.
training_args = SFTConfig(
    dataset_text_field = "text",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,
    learning_rate = 2e-5,
    logging_steps = 50,
    optim = "adamw_8bit",
    weight_decay = 0.001,
    lr_scheduler_type = "linear",
    seed = 3407,
    report_to = "wandb",
)

# Supervised fine-tuning trainer over the chat-formatted dataset; no eval set.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = training_args,
)

In [12]:
# Wrap the trainer so that, presumably, only assistant responses contribute to
# the loss; the two strings are the markers that open user and assistant turns.
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)

# Decode one tokenized training row to inspect the full input text.
tokenizer.decode(trainer.train_dataset[100]["input_ids"])

In [13]:
# Inspect which tokens are trained on for one row: label positions equal to
# -100 are swapped for the pad token id so the sequence can be decoded, then
# every pad token in the decoded text is replaced with a space.
label_ids = trainer.train_dataset[100]["labels"]
decodable_ids = [
    tokenizer.pad_token_id if token_id == -100 else token_id
    for token_id in label_ids
]
decoded_labels = tokenizer.decode(decodable_ids)
decoded_labels.replace(tokenizer.pad_token, " ")

'                              In programming, the modulus operator is represented by the \'%\' symbol. It calculates the remainder when one number is divided by another. To calculate the modulus of two given numbers, you can use the modulus operator in the following way:\n\n```python\n# Calculate the modulus\nModulus = a % b\n\nprint("Modulus of the given numbers is: ", Modulus)\n```\n\nIn this code snippet, the variables \'a\' and \'b\' represent the two given numbers for which you want to calculate the modulus. By using the modulus operator \'%\', we calculate the remainder when \'a\' is divided by \'b\'. The result is then stored in the variable \'Modulus\'. Finally, the modulus value is printed using the \'print\' statement.\n\nFor example, if \'a\' is 10 and \'b\' is 4, the modulus calculation would be 10 % 4, which equals 2. Therefore, the output of the above code would be:\n\n```\nModulus of the given numbers is: 2\n```\n\nThis means that the modulus of 10 and 4 is 2.<|im_end|>

In [None]:
# Run training and keep the returned value for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 6,250
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 33,030,144 of 4,055,498,240 (0.81% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
50,0.9755
100,0.7114
150,0.7049
200,0.6568
250,0.6796


### Step 10 : Conversion step for QAT / PTQ

In [None]:
if QUANTIZATION_TYPE == "PTQ":
    # PTQ path: merge the LoRA adapters into the base weights, save the merged
    # (not yet quantized) baseline locally, and upload it to the Hub.

    # NOTE: the misspelled name `basline_name` is kept on purpose because the
    # next cell reads it when reloading the baseline for quantization.
    basline_name = RUN_NAME.replace("PTQ", "baseline")
    baseline_dir = f"./{basline_name}"

    merged_model = trainer.model.merge_and_unload()
    merged_model.save_pretrained(baseline_dir)
    tokenizer.save_pretrained(baseline_dir)

    # Read the access token from the environment instead of hard-coding it.
    # When HF_TOKEN is unset this is None, and huggingface_hub falls back to
    # the locally stored login.
    hf_token = os.environ.get("HF_TOKEN")

    api = HfApi()

    # Must be a plain string: a trailing comma here would turn it into a tuple
    # and break both Hub calls below.
    repo_id = f"jaytonde05/{basline_name}"

    create_repo(
        repo_id=repo_id,
        repo_type="model",
        token=hf_token,
        exist_ok=True  # Won't raise error if repo already exists
    )

    api.upload_folder(
        folder_path=baseline_dir,
        repo_id=repo_id,
        repo_type="model", # specify this is a model repository
        token=hf_token,
    )

In [None]:
if QUANTIZATION_TYPE == "QAT":
    # QAT path: run the "convert" step on the trained model in place.
    qat_convert_config = QATConfig(step = "convert")
    quantize_(model, qat_convert_config)
else:
    # PTQ path: reload the saved baseline with an int4 weight-only torchao
    # quantization config. `basline_name` is defined by the PTQ cell above.
    int4_weight_only = Int4WeightOnlyConfig(
        group_size=128,
        int4_packing_format="tile_packed_to_4d",
        int4_choose_qparams_algorithm="hqq",
    )
    quantization_config = TorchAoConfig(quant_type=int4_weight_only)
    model = AutoModelForCausalLM.from_pretrained(
        basline_name,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
    )
    tokenizer = AutoTokenizer.from_pretrained(basline_name)

### Step 11 : Sample Inference

In [None]:
# Single-turn prompt used as a smoke test of the quantized model.
messages = [
    {"role" : "user", "content" : "Continue the sequence: 1, 1, 2, 3, 5, 8,"}
]

# Render the prompt as text; the generation prompt must be added so the model
# continues with an assistant turn.
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True,
)

model_inputs = tokenizer(text, return_tensors = "pt").to("cuda")
token_streamer = TextStreamer(tokenizer, skip_prompt = True)

# Stream the generated tokens to the cell output; the return value is discarded.
_ = model.generate(
    **model_inputs,
    max_new_tokens = 1000, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = token_streamer,
)

### Step 12 : Save the model to upload on huggingface hub

In [None]:
if QUANTIZATION_TYPE != "QAT":
    # PTQ path: plain save of the model and tokenizer into ./RUN_NAME.
    output_dir = f"./{RUN_NAME}"
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
else:
    # QAT path: save through the torchao-specific helper, passing the base
    # config read from the model's `_torchao_config` attribute.
    qat_base_config = model._torchao_config.base_config
    model.save_pretrained_torchao(
        RUN_NAME,
        tokenizer,
        torchao_config = qat_base_config,
    )

### Step 13 : Pushing the model on HuggingFace Hub

In [None]:
# Upload the saved model folder to the Hugging Face Hub.
# HfApi and create_repo come from the imports cell at the top of the notebook.

# Read the access token from the environment instead of hard-coding it. When
# HF_TOKEN is unset this is None, and huggingface_hub falls back to the locally
# stored login.
hf_token = os.environ.get("HF_TOKEN")

api = HfApi()

repo_id = f"jaytonde05/{RUN_NAME}"

create_repo(
    repo_id=repo_id,
    repo_type="model",
    token=hf_token,
    exist_ok=True  # Won't raise error if repo already exists
)

# RUN_NAME is also the local folder the save step wrote to.
api.upload_folder(
    folder_path=RUN_NAME,
    repo_id=repo_id,
    repo_type="model",
    token=hf_token,
)