In [1]:
# !pip install -qU transformers  --quiet

In [2]:
# from transformers import modeling_utils
# if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None:
#     modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none","colwise",'rowwise']

In [3]:
from datasets import load_dataset
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def process_and_tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to fp8 with per-tensor scales
#   * quantize the activations to fp8 with per-tensor scales
#   * quantize the kv cache to fp8 with per-tensor scales
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

logger.info(
    "Running sample generation. ",
    "Note: Inference with the quantized kv_cache is not supported. ",
    "Please use vLLM for inference with the quantized kv_cache.",
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-06-17T07:10:58.888265+0000 | reset | INFO - Compression lifecycle reset
2025-06-17T07:10:59.317169+0000 | _calibrate | INFO - Running QuantizationModifier calibration with 512 samples...


100%|██████████| 512/512 [09:02<00:00,  1.06s/it]
manager stage: Modifiers initialized


2025-06-17T07:20:01.499123+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers


manager stage: Modifiers finalized


2025-06-17T07:20:01.500051+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers
2025-06-17T07:20:01.517008+0000 | <module> | INFO - Running sample generation. 


In [None]:
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.





<|begin_of_text|>Hello my name is Karen and I am a professional photographer based in the beautiful county of Kent. I have been a photographer for over 10 years and have a passion for capturing special moments in people's lives.
I have a relaxed and friendly approach to my work, which helps to put my clients at ease and allows me to capture their true personalities and emotions. I am always looking for new and creative ways to capture the perfect shot, and I am constantly updating my skills and knowledge to stay ahead of the game.

I


2025-06-17T07:20:58.095490+0000 | save_pretrained_wrapper | INFO - Fetching state_dict - this may take some time
2025-06-17T07:21:18.163962+0000 | save_pretrained_wrapper | INFO - Fetching compressor
2025-06-17T07:21:18.164647+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Quantized Compression: 100%|██████████| 1251/1251 [00:10<00:00, 117.72it/s]

2025-06-17T07:21:28.799025+0000 | save_pretrained_wrapper | INFO - Saving compressed model to disk



