## GPTQModel Pipeline

### Install GPTQModel

In [1]:
# clone the pinned GPTQModel release; skipped when the checkout already exists so the cell can be re-run
![ -d GPTQModel ] || git clone --depth 1 --branch v0.9.9 https://github.com/ModelCloud/GPTQModel.git

# compile and install GPTQModel; %pip targets the environment of the running kernel
%pip install --no-build-isolation ./GPTQModel

Cloning into 'GPTQModel'...
remote: Enumerating objects: 210, done.[K
remote: Counting objects: 100% (210/210), done.[K
remote: Compressing objects: 100% (176/176), done.[K
Receiving objects: 100% (210/210), 200.83 KiB | 10.57 MiB/s, done.
remote: Total 210 (delta 35), reused 90 (delta 28), pack-reused 0[K
Resolving deltas: 100% (35/35), done.
Note: switching to '519fbe3ef02335c58e3aa8e9353f8346a8780b91'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

Processing /content/GPTQModel
  Preparing metadata (setup.py) ... [?2

### Simple GPTQ Quantization

Using the VMware/open-instruct dataset for calibration and NousResearch/Hermes-2-Pro-Mistral-7B as the base model.

In [None]:
import torch
import logging
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer
from datasets import load_dataset


# Hugging Face id of the un-quantized base model
pretrained_model_id = "NousResearch/Hermes-2-Pro-Mistral-7B"
# location the quantized model is saved to: the base id plus a bits/group-size suffix
quantized_model_id = f"{pretrained_model_id}-4bit-32g"


def get_open_instruct(tokenizer, nsamples, seqlen):
    """Build a calibration set from the VMware/open-instruct training split.

    Rows are kept only when ``len()`` of their "response" field is at least
    ``seqlen``. Note that this is the length of the raw field (characters,
    assuming the field is a string), not a token count, and no truncation is
    applied when tokenizing.

    Args:
        tokenizer: Callable applied to each kept response; its return value is
            collected unchanged.
        nsamples: Maximum number of calibration examples to return.
        seqlen: Minimum ``len()`` of a response for its row to be kept.

    Returns:
        A list with one tokenizer output per selected row, in dataset order.
        It holds fewer than ``nsamples`` entries when the filtered dataset is
        smaller than requested.
    """
    traindata = load_dataset("VMware/open-instruct", "default", split="train").filter(
        lambda x: len(x["response"]) >= seqlen
    )

    # Selecting past the end of the filtered dataset would fail, so cap the
    # request at what is actually available and tell the user about it.
    available = len(traindata)
    if available < nsamples:
        logging.getLogger(__name__).warning(
            "Requested %d calibration samples but only %d responses have length >= %d",
            nsamples,
            available,
            seqlen,
        )
    sample_count = min(nsamples, available)

    return [tokenizer(example["response"]) for example in traindata.select(range(sample_count))]


def main():
    """Quantize the base model with GPTQ, save it, reload it and run one prompt.

    The steps are: load the tokenizer for ``pretrained_model_id``, build the
    calibration set, quantize the model, write the quantized weights and the
    tokenizer files to ``quantized_model_id``, then reload the quantized model
    on a single GPU and print the completion for a sample question.
    """
    # device used both for the reloaded quantized model and for the prompt tensors
    device = "cuda:0"

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=True)

    print("Loading Open Instruct training data...")
    train_dataset = get_open_instruct(tokenizer, nsamples=512, seqlen=1024)
    print("Completed loading of Open Instruct training data!")

    quantize_config = QuantizeConfig(
        # quantize model to 4-bit
        bits=4,
        # 128 offer good balance between inference speed and quantization quality
        # 32 will increase vRAM usage but increase inferencing quality
        group_size=32,
        # increase damp if NaN is encountered during `.quantize()` and/or increase calibration dataset size
        damp_percent=0.005,
        desc_act=True,
        static_groups=False,
        sym=True,
        true_sequential=True,
        lm_head=False,
        # marlin is vLLM's preferred GPTQ quantization method, which is included in "gptq"
        quant_method="gptq",
    )

    # load un-quantized model, the model will always be force loaded into cpu
    model = GPTQModel.from_pretrained(pretrained_model_id, quantize_config)

    print("Beginning quantization...")
    # quantize model, the calibration_dataset should be list of dict whose keys can only be "input_ids" and "attention_mask"
    # with value under torch.LongTensor type.
    model.quantize(train_dataset)
    print("Quantization complete!")

    print("Saving quantized model...")
    # write the quantized weights once, explicitly in safetensors format
    model.save_quantized(quantized_model_id, use_safetensors=True)
    # store the tokenizer files next to the weights so the output directory can be
    # published and loaded on its own
    tokenizer.save_pretrained(quantized_model_id)
    print("Saving quantized model complete!")

    # load quantized model, currently only support cpu or single gpu
    model = GPTQModel.from_quantized(quantized_model_id, device=device)

    # inference with model.generate: tokenize the prompt, move it to the model's
    # device, and decode the first returned sequence
    prompt_inputs = tokenizer("What is the capital of Jamaica?", return_tensors="pt").to(device)
    print(tokenizer.decode(model.generate(**prompt_inputs)[0]))

# configure root logging (timestamped, INFO and above) before running so that
# GPTQModel's log messages are displayed
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
)

# run the quantization pipeline end to end
main()

In [12]:
# install Git LFS and enable its git hooks; the model is pushed with git in a later cell
# (apt-get is used because `apt` does not offer a stable command-line interface for scripted use)
!apt-get install -y git-lfs
!git lfs install


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Git LFS initialized.


In [13]:
# authenticate with the Hugging Face Hub; this displays an interactive login widget in the notebook

from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# author identity written to the global git config; replace with your own before running
GIT_USER_EMAIL = "justin.law@defenseunicorns.com"
GIT_USER_NAME = "justinthelaw"

# new repositories start on `main`, the branch name pushed later in the notebook
!git config --global init.defaultBranch main
!git config --global user.email "{GIT_USER_EMAIL}"
!git config --global user.name "{GIT_USER_NAME}"

In [None]:
# publish the saved quantized model directory to a Hugging Face model repo with git
# NOTE(review): assumes the remote repo already exists and accepts this history — confirm before pushing
LOCAL_REPO_DIR = "/content/NousResearch/Hermes-2-Pro-Mistral-7B-4bit-32g"
HF_REMOTE_URL = "https://huggingface.co/justinthelaw/Hermes-2-Pro-Mistral-7B-4bit-32g"

# start from a clean repository so the cell can be re-run
!cd {LOCAL_REPO_DIR} && rm -rf .git
!cd {LOCAL_REPO_DIR} && git init
# weight files must go through Git LFS; without tracking they are committed as ordinary blobs
!cd {LOCAL_REPO_DIR} && git lfs track "*.safetensors" "*.bin"
!cd {LOCAL_REPO_DIR} && git add .
!cd {LOCAL_REPO_DIR} && git commit -m "initial commit"
!cd {LOCAL_REPO_DIR} && git remote add origin {HF_REMOTE_URL}
!cd {LOCAL_REPO_DIR} && git push -u origin main


Initialized empty Git repository in /content/NousResearch/Hermes-2-Pro-Mistral-7B-4bit-32g/.git/
