## GPTQModel Pipeline

### Install Dependencies

In [None]:
# install GPTQModel pre-requisites
# NOTE: the GPTQModel install step in this notebook uses `--no-build-isolation`,
# so the packages its build needs must already be present in the kernel's
# environment; `datasets` is imported later to load the WikiText2 data.
%pip install torch datasets wheel

Note: you may need to restart the kernel to use updated packages.


### Install GPTQModel

In [None]:
# clone GPTQModel repo
!git clone --depth 1 --branch v0.9.9 https://github.com/ModelCloud/GPTQModel.git

# compile and install GPTQModel
!cd GPTQModel && pip install --no-build-isolation .

fatal: destination path 'GPTQModel' already exists and is not an empty directory.
Processing /home/jlaw/dev/gptqmodel-pipeline/notebooks/GPTQModel
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: gptqmodel
  Building wheel for gptqmodel (setup.py) ... [?25ldone
[?25h  Created wheel for gptqmodel: filename=gptqmodel-0.9.9+cu121-cp311-cp311-linux_x86_64.whl size=9430897 sha256=87d92afbce45fdf0aff2fe9360d06fbee2c6da5e0bacac4102e8afd6e75f8157
  Stored in directory: /tmp/pip-ephem-wheel-cache-yp6y10v_/wheels/30/a6/d2/65b8183a53eac7ce46a2d535bd54140689d0dae2848bd95f40
Successfully built gptqmodel
Installing collected packages: gptqmodel
  Attempting uninstall: gptqmodel
    Found existing installation: gptqmodel 0.9.9+cu121
    Uninstalling gptqmodel-0.9.9+cu121:
      Successfully uninstalled gptqmodel-0.9.9+cu121
Successfully installed gptqmodel-0.9.9+cu121


### Simple GPTQ Quantization

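This section quantizes `microsoft/Phi-3-mini-128k-instruct` to 4-bit GPTQ, using the WikiText2 training split as calibration data, and then reports the quantized model's average perplexity.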

In [None]:
import torch
import logging
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer
from datasets import load_dataset


# model id passed to `AutoTokenizer.from_pretrained` and `GPTQModel.from_pretrained`
pretrained_model_id = "microsoft/Phi-3-mini-128k-instruct"
# location the quantized model is saved to and later loaded back from
quantized_model_id = "Phi-3-mini-128k-instruct-4bit-128g"


def get_wikitext2(tokenizer, nsamples, seqlen):
    """Tokenize the first ``nsamples`` sufficiently long WikiText2 training rows.

    Rows of the ``wikitext-2-raw-v1`` train split are kept only when their
    ``text`` field is at least ``seqlen`` long. The length check is ``len()``
    on the raw string, so it counts characters rather than tokens.

    Args:
        tokenizer: Callable applied to each selected row's text.
        nsamples: Number of rows taken from the start of the filtered split.
        seqlen: Minimum number of characters a row's text must contain.

    Returns:
        A list holding one tokenizer result per selected row, in dataset order.
    """

    def is_long_enough(row):
        return len(row["text"]) >= seqlen

    corpus = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    long_rows = corpus.filter(is_long_enough)
    chosen_rows = long_rows.select(range(nsamples))

    encoded = []
    for row in chosen_rows:
        encoded.append(tokenizer(row["text"]))
    return encoded


@torch.no_grad()
def calculate_avg_ppl(model, tokenizer):
    """Return the arithmetic mean of the perplexity values computed for ``model``.

    A ``gptqmodel.utils.Perplexity`` evaluator is built over the ``text``
    column of the ``wikitext-2-raw-v1`` train split and run with a context
    size of 512 and a batch size of 128. Runs with gradient tracking disabled.

    Args:
        model: Model handed to the ``Perplexity`` evaluator.
        tokenizer: Tokenizer handed to the ``Perplexity`` evaluator.

    Returns:
        ``sum(values) / len(values)`` over whatever ``Perplexity.calculate``
        returns.
    """
    from gptqmodel.utils import Perplexity

    wikitext_train = {
        "dataset_path": "wikitext",
        "dataset_name": "wikitext-2-raw-v1",
        "split": "train",
        "text_column": "text",
    }
    evaluator = Perplexity(model=model, tokenizer=tokenizer, **wikitext_train)

    # n_ctx is the context size, n_batch is the batch size
    ppl_values = evaluator.calculate(n_ctx=512, n_batch=128)

    # average ppl
    return sum(ppl_values) / len(ppl_values)


def main():
    """Quantize the pretrained model with GPTQ, save it, reload it and evaluate it.

    Steps, in order:
      1. Tokenize a WikiText2 calibration set (``nsamples=512``, ``seqlen=1024``).
      2. Load ``pretrained_model_id`` and quantize it to 4-bit, group size 128.
      3. Save the result once, as safetensors, under ``quantized_model_id``.
      4. Reload the saved model on ``cuda:0``, print one sample generation and
         the average perplexity from ``calculate_avg_ppl``.

    Returns:
        None. Results are reported through ``print``.
    """
    # one place to change the device used for the reloaded model and its inputs
    device = "cuda:0"

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=True)

    print("Loading WikiText2 training data...")
    train_dataset = get_wikitext2(tokenizer, nsamples=512, seqlen=1024)
    print("Completed loading of WikiText2 training data!")

    quantize_config = QuantizeConfig(
        # quantize model to 4-bit
        bits=4,
        # 128 offer good balance between inference speed and quantization quality
        group_size=128,  # it is recommended to set the value to 128
        # increase damp if NaN is encountered during `.quantize()` and/or increase calibration dataset size
        damp_percent=0.01,
        desc_act=True,
        static_groups=False,
        sym=True,
        true_sequential=True,
        lm_head=False,
        # marlin is vLLM's preferred GPTQ quantization method, which is included in "gptq"
        quant_method="gptq",
    )

    # load un-quantized model, the model will always be force loaded into cpu
    model = GPTQModel.from_pretrained(pretrained_model_id, quantize_config)

    print("Beginning quantization...")
    # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask"
    # with value under torch.LongTensor type.
    model.quantize(train_dataset)
    print("Quantization complete!")

    print("Saving quantized model...")
    # save the quantized model a single time, explicitly as safetensors;
    # a second save into the same directory would only rewrite the checkpoint
    model.save_quantized(quantized_model_id, use_safetensors=True)
    print("Saving quantized model complete!")

    # load quantized model, currently only support cpu or single gpu
    model = GPTQModel.from_quantized(quantized_model_id, device=device)

    # inference with model.generate: the prompt tensors must be on the same
    # device as the reloaded model
    prompt = tokenizer("What is the capital of Jamaica?", return_tensors="pt").to(device)
    generated_ids = model.generate(**prompt)[0]
    print(tokenizer.decode(generated_ids))

    avg_ppl = calculate_avg_ppl(model, tokenizer)
    print(f"Quantized Model {quantized_model_id} avg PPL is {avg_ppl}")

# Configure logging for the GPTQModel run: records at INFO level and above,
# formatted with timestamp, level and logger name.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

# run the quantize / save / reload / evaluate pipeline
main()

  from .autonotebook import tqdm as notebook_tqdm
  @custom_fwd
  @custom_bwd


Loading WikiText2 training data...
Completed loading of WikiText2 training data!


Downloading shards: 100%|██████████| 2/2 [02:02<00:00, 61.01s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 14.49it/s]


Beginning quantization...


Quantizing layer 1 of 32:   0%|          | 0/32 [00:00<?, ?it/s]You are not running the flash-attention implementation, expect numerical differences.
Quantizing self_attn.qkv_proj in layer 1 of 32:   0%|          | 0/32 [00:04<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.qkv_proj', 'avg_loss': '0.1069', 'time': '0.4765'}
Quantizing self_attn.o_proj in layer 1 of 32:   0%|          | 0/32 [00:09<?, ?it/s]  INFO - {'layer': 1, 'module': 'self_attn.o_proj', 'avg_loss': '0.0002', 'time': '0.3293'}
Quantizing mlp.gate_up_proj in layer 1 of 32:   0%|          | 0/32 [00:14<?, ?it/s]INFO - {'layer': 1, 'module': 'mlp.gate_up_proj', 'avg_loss': '0.0554', 'time': '0.4137'}
Quantizing mlp.down_proj in layer 1 of 32:   0%|          | 0/32 [00:27<?, ?it/s]   INFO - {'layer': 1, 'module': 'mlp.down_proj', 'avg_loss': '0.0011', 'time': '1.1158'}
Quantizing self_attn.qkv_proj in layer 2 of 32:   3%|▎         | 1/32 [00:35<16:23, 31.71s/it]INFO - {'layer': 2, 'module': 'self_attn.qkv_proj', 'avg_