## Install prerequisites 

In [1]:
!pip install llmcompressor

Collecting llmcompressor
  Downloading llmcompressor-0.5.1-py3-none-any.whl.metadata (6.9 kB)
Collecting loguru (from llmcompressor)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting numpy<2.0,>=1.17.0 (from llmcompressor)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting tqdm>=4.0.0 (from llmcompressor)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting transformers<5.0,>4.0 (from llmcompressor)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting datasets (from llmcompressor)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate!=1.1.0,>=0.20.3 (from llmcompressor)
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Collecting pynvml (from llmcompressor)
  Downloading pynvml-12.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting compressed-tensors==0.9.4 (from llmcompressor)
  Downloading compressed_tensors-0.9.4

### Parameters

In [22]:
# Put your PATH or huggingface model here
model_path=""
ouput_model_path=""

### FP8 Weight and Activation Quantization

In [2]:
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from llmcompressor import oneshot
import torch

In [6]:
recipe = QuantizationModifier(
  targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [7]:
config = AutoConfig.from_pretrained(model_path)
config

MistralConfig {
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 1024000,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4",
  "use_cache": true,
  "vocab_size": 131072
}

In [8]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True
)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [9]:
oneshot(model=model, recipe=recipe)

2025-06-10T09:59:14.186208+0000 | reset | INFO - Compression lifecycle reset
2025-06-10T09:59:14.189558+0000 | from_modifiers | INFO - Creating recipe from modifiers


manager stage: Modifiers initialized


2025-06-10T09:59:20.445172+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers


manager stage: Modifiers finalized


2025-06-10T09:59:20.446877+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(131072, 5120)
    (layers): ModuleList(
      (0-39): 40 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=5120, out_features=4096, bias=False)
          (k_proj): Linear(in_features=5120, out_features=1024, bias=False)
          (v_proj): Linear(in_features=5120, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=5120, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=5120, out_features=14336, bias=False)
          (up_proj): Linear(in_features=5120, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=5120, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((5120,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((5120,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((5120,), eps=1e-

In [11]:
model.save_pretrained(ouput_model_path)

2025-06-10T10:02:09.642640+0000 | save_pretrained_wrapper | INFO - Fetching state_dict - this may take some time
2025-06-10T10:03:13.530900+0000 | save_pretrained_wrapper | INFO - Fetching compressor
2025-06-10T10:03:13.534855+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Quantized Compression: 100%|██████████| 923/923 [01:02<00:00, 14.80it/s]

2025-06-10T10:04:15.923326+0000 | save_pretrained_wrapper | INFO - Saving compressed model to disk





In [14]:
tokenizer.save_pretrained(ouput_model_path)

('/workspace/NemoMix-Unleashed-12B-FP8/tokenizer_config.json',
 '/workspace/NemoMix-Unleashed-12B-FP8/special_tokens_map.json',
 '/workspace/NemoMix-Unleashed-12B-FP8/tokenizer.json')