In [1]:
# Shell out to nvidia-smi to show the GPU model, driver/CUDA version and current memory usage.
!nvidia-smi

Thu Sep  4 05:56:50 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.247.01             Driver Version: 535.247.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:01:00.0  On |                  Off |
|  0%   46C    P8              24W / 450W |    882MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
from unsloth import FastLanguageModel

# Loader settings. Both names are module-level so other cells can read them.
dtype = None  # None asks the loader to auto-detect the dtype
max_seq_length = 1024  # sequence-length cap passed to the loader

# Checkpoints published under https://huggingface.co/unsloth, kept here for
# reference only: this list is not passed to the loader call below.
fourbit_models = [
    # bitsandbytes 4bit quantized checkpoints
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
    # MXFP4 format checkpoints
    "unsloth/gpt-oss-20b",
    "unsloth/gpt-oss-120b",
]

# Load the 20B MXFP4 checkpoint together with its tokenizer.
# For gated models, additionally pass token="hf_...".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gpt-oss-20b",
    max_seq_length=max_seq_length,
    dtype=dtype,
    full_finetuning=False,  # full finetuning is not requested
    load_in_4bit=False,  # 4bit quantized loading is not requested
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.9: Fast Gpt_Oss patching. Transformers: 4.55.4.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.642 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gpt_Oss does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
# Display the module tree of the loaded model (rendered via its repr).
model

GptOssForCausalLM(
  (model): GptOssModel(
    (embed_tokens): Embedding(201088, 2880, padding_idx=200017)
    (layers): ModuleList(
      (0-23): 24 x GptOssDecoderLayer(
        (self_attn): GptOssAttention(
          (q_proj): Linear(in_features=2880, out_features=4096, bias=True)
          (k_proj): Linear(in_features=2880, out_features=512, bias=True)
          (v_proj): Linear(in_features=2880, out_features=512, bias=True)
          (o_proj): Linear(in_features=4096, out_features=2880, bias=True)
        )
        (mlp): GptOssMLP(
          (router): GptOssTopKRouter()
          (experts): Mxfp4GptOssExperts()
        )
        (input_layernorm): GptOssRMSNorm((2880,), eps=1e-05)
        (post_attention_layernorm): GptOssRMSNorm((2880,), eps=1e-05)
      )
    )
    (norm): GptOssRMSNorm((2880,), eps=1e-05)
    (rotary_emb): GptOssRotaryEmbedding()
  )
  (lm_head): Linear(in_features=2880, out_features=201088, bias=False)
)

In [4]:
# Display the configuration object attached to the loaded model.
model.config

GptOssConfig {
  "architectures": [
    "GptOssForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "eos_token_id": 200002,
  "experts_per_token": 4,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2880,
  "initial_context_length": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 2880,
  "layer_types": [
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "full_attention"
  ],
  "max_position_embeddings": 131072,
  "model_type": "gpt_oss",
  "num_attention_head

In [None]:

        # Prints gate_up_proj's element count divided by 2880 and then by 32.
        # NOTE(review): the enclosing loop that binds `module` is not present in this cell.
        # 2880 and 32 presumably stand for hidden_size and the number of experts — confirm.
        print(module.gate_up_proj.numel()/ 2880/32)

2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0
2880.0


In [None]:
# Pass 1: list every tensor exposed through state_dict() (name, shape, dtype,
# element count) and total their element counts.
parameters_state_dict = model.state_dict()
for key, value in parameters_state_dict.items():
    print(key, value.size(), value.dtype, value.numel())
num_parameters = sum(value.numel() for value in parameters_state_dict.values())

# Pass 2: the MoE expert projections are stored in a special way for mxfp4,
# so they are read directly off the expert modules instead.
# NOTE(review): the `// 8` divisor applied to these tensors is not explained
# in this cell — confirm it matches how numel() is reported for the packed
# MXFP4 weights before trusting the final total.
for name, module in model.named_modules():
    if "experts" not in name:
        continue  # only modules whose name contains "experts" are inspected

    if hasattr(module, "gate_up_proj"):
        print(
            f"{name}.gate_up_proj: {module.gate_up_proj.size()}, {module.gate_up_proj.dtype}, {module.gate_up_proj.numel()}"
        )
        num_parameters += module.gate_up_proj.numel() // 8

    if hasattr(module, "down_proj"):
        print(
            f"{name}.down_proj: {module.down_proj.size()}, {module.down_proj.dtype}, {module.down_proj.numel()}"
        )
        num_parameters += module.down_proj.numel() // 8

print(f"Number of parameters: {num_parameters}")

model.embed_tokens.weight torch.Size([201088, 2880]) torch.bfloat16 579133440
model.layers.0.self_attn.sinks torch.Size([64]) torch.bfloat16 64
model.layers.0.self_attn.q_proj.weight torch.Size([4096, 2880]) torch.bfloat16 11796480
model.layers.0.self_attn.q_proj.bias torch.Size([4096]) torch.bfloat16 4096
model.layers.0.self_attn.k_proj.weight torch.Size([512, 2880]) torch.bfloat16 1474560
model.layers.0.self_attn.k_proj.bias torch.Size([512]) torch.bfloat16 512
model.layers.0.self_attn.v_proj.weight torch.Size([512, 2880]) torch.bfloat16 1474560
model.layers.0.self_attn.v_proj.bias torch.Size([512]) torch.bfloat16 512
model.layers.0.self_attn.o_proj.weight torch.Size([2880, 4096]) torch.bfloat16 11796480
model.layers.0.self_attn.o_proj.bias torch.Size([2880]) torch.bfloat16 2880
model.layers.0.mlp.router.weight torch.Size([32, 2880]) torch.bfloat16 92160
model.layers.0.mlp.router.bias torch.Size([32]) torch.bfloat16 32
model.layers.0.mlp.experts.gate_up_proj_bias torch.Size([32, 5760