## GPTQModel Pipeline

### Install GPTQModel

In [None]:
# clone GPTQModel repo
!git clone --depth 1 --branch v0.9.9 https://github.com/ModelCloud/GPTQModel.git

# compile and install GPTQModel
!cd GPTQModel && pip install --no-build-isolation .

### Simple GPTQ Quantization

Using the VMware Open Instruct dataset as calibration data and NousResearch's Hermes 2 Pro Mistral 7B fine-tune as the model to quantize.

In [48]:
import torch
import logging
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer
from datasets import load_dataset


# Hugging Face id of the full-precision model that gets quantized
pretrained_model_id = "NousResearch/Hermes-2-Pro-Mistral-7B"
# local output directory for the quantized model: source id plus a bits/group-size suffix
quantized_model_id = f"{pretrained_model_id}-4bit-32g"


def get_open_instruct(tokenizer, nsamples, seqlen):
    """Build a calibration set from the VMware/open-instruct training split.

    Rows whose ``response`` text is shorter than ``seqlen`` are dropped, and
    the first ``nsamples`` remaining responses are tokenized.

    Args:
        tokenizer: Callable applied to each response string; whatever it
            returns is collected unchanged.
        nsamples: Maximum number of calibration examples to return.
        seqlen: Minimum response length. Note that this is compared against
            the number of *characters* in the raw text, not the token count.

    Returns:
        A list with one tokenizer output per selected response. It holds
        fewer than ``nsamples`` entries when the filtered dataset is smaller
        than requested.
    """
    traindata = load_dataset("VMware/open-instruct", "default", split="train").filter(
        lambda x: len(x["response"]) >= seqlen
    )

    # Selecting indices past the end of the filtered dataset raises, so clamp
    # the request to what is actually available and tell the user about it.
    available = len(traindata)
    if available < nsamples:
        logging.getLogger(__name__).warning(
            "Requested %d calibration samples but only %d responses have length >= %d; using %d.",
            nsamples,
            available,
            seqlen,
            available,
        )
    sample_count = min(nsamples, available)

    return [tokenizer(example["response"]) for example in traindata.select(range(sample_count))]


def main():
    """Quantize the pretrained model with GPTQ, save it, and run a smoke test.

    Steps:
        1. Load the tokenizer for ``pretrained_model_id``.
        2. Build the calibration set via ``get_open_instruct``.
        3. Quantize the model with a 4-bit, group-size-32 ``QuantizeConfig``.
        4. Save the result once, as sharded safetensors, to ``quantized_model_id``.
        5. Reload the saved model on the GPU and print the answer to one prompt.

    Reads the module-level ``pretrained_model_id`` and ``quantized_model_id``.
    Returns nothing; progress and the generated text are printed.
    """
    # single place to change the GPU used for reloading the model and for the prompt tensors
    device = "cuda:0"

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=True)

    print("Loading Open Instruct training data...")
    train_dataset = get_open_instruct(tokenizer, nsamples=512, seqlen=1024)
    print("Completed loading of Open Instruct training data!")

    quantize_config = QuantizeConfig(
        # quantize model to 4-bit
        bits=4,
        # 128 offer good balance between inference speed and quantization quality
        # 32 will increase vRAM usage but increase inferencing quality
        group_size=32,
        # increase damp if NaN is encountered during `.quantize()` and/or increase calibration dataset size
        damp_percent=0.005,
        desc_act=True,
        static_groups=False,
        sym=True,
        true_sequential=True,
        lm_head=False,
        # marlin is vLLM's preferred GPTQ quantization method, which is included in "gptq"
        quant_method="gptq",
    )

    # load un-quantized model, the model will always be force loaded into cpu
    model = GPTQModel.from_pretrained(pretrained_model_id, quantize_config)

    print("Beginning quantization...")
    # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask"
    # with value under torch.LongTensor type.
    model.quantize(train_dataset)
    print("Quantization complete!")

    print("Saving quantized model...")
    # Save exactly once, as sharded safetensors. Saving a second time into the
    # same directory with different sharding settings is redundant and may
    # leave two sets of weight files behind for the later upload.
    model.save_quantized(quantized_model_id, use_safetensors=True, max_shard_size="4Gb")
    print("Saving quantized model complete!")

    # load quantized model, currently only support cpu or single gpu
    model = GPTQModel.from_quantized(quantized_model_id, device=device)

    # inference with model.generate: tokenize the prompt, move it to the same
    # device as the model, generate, and decode the first returned sequence
    prompt_inputs = tokenizer("What is the capital of Jamaica?", return_tensors="pt").to(device)
    generated_ids = model.generate(**prompt_inputs)[0]
    print(tokenizer.decode(generated_ids))

# Configure the root logger (timestamped, INFO and above) before the pipeline
# starts, so log records emitted during quantization are shown.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

# run the quantization pipeline defined above
main()

Loading Open Instruct training data...
Completed loading of Open Instruct training data!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Beginning quantization...


Quantizing self_attn.k_proj in layer 1 of 32:   0%|          | 0/32 [00:06<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.k_proj', 'avg_loss': '0.0241', 'time': '1.6384'}
Quantizing self_attn.v_proj in layer 1 of 32:   0%|          | 0/32 [00:08<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.v_proj', 'avg_loss': '0.0011', 'time': '1.6030'}
Quantizing self_attn.q_proj in layer 1 of 32:   0%|          | 0/32 [00:09<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.q_proj', 'avg_loss': '0.0933', 'time': '1.6508'}
Quantizing self_attn.o_proj in layer 1 of 32:   0%|          | 0/32 [00:15<?, ?it/s]INFO - {'layer': 1, 'module': 'self_attn.o_proj', 'avg_loss': '0.0000', 'time': '1.6612'}
Quantizing mlp.up_proj in layer 1 of 32:   0%|          | 0/32 [00:22<?, ?it/s]     INFO - {'layer': 1, 'module': 'mlp.up_proj', 'avg_loss': '0.0931', 'time': '1.8252'}
Quantizing mlp.gate_proj in layer 1 of 32:   0%|          | 0/32 [00:23<?, ?it/s]INFO - {'layer': 1, 'module': 'mlp.gate_proj', 'avg_lo

Quantization complete!
Saving quantized model...


Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using split_torch_state_dict_into_shards from huggingface_hub library


Saving quantized model complete!


INFO - Compatibility: converting `checkpoint_format` from `gptq` to `gptq_v2`.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


<s> What is the capital of Jamaica?

The capital of Jamaica is Kingston.


### Upload Quantized Model to Hugging Face

Log in to Hugging Face with a personal access token (PAT), then overwrite the existing model repository with the newly quantized model's files.

In [55]:
from huggingface_hub import HfApi, notebook_login

# prompt for a Hugging Face access token inside the notebook
notebook_login()

api = HfApi()

# Upload the directory the quantized model was saved to. `quantized_model_id`
# is the same (relative) path passed to `model.save_quantized(...)`, so the
# upload follows the save location instead of a hard-coded absolute path.
api.upload_folder(
    folder_path=quantized_model_id,
    repo_id="justinthelaw/Hermes-2-Pro-Mistral-7B-4bit-32g",
    repo_type="model",
)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/4.57G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/justinthelaw/Hermes-2-Pro-Mistral-7B-4bit-32g/commit/50c326ebdbd58aae6140826646eaaf5a8d124fb3', commit_message='Upload folder using huggingface_hub', commit_description='', oid='50c326ebdbd58aae6140826646eaaf5a8d124fb3', pr_url=None, pr_revision=None, pr_num=None)