In [None]:
# Remove any preinstalled transformers, torch and datasets (-y skips the confirmation prompt).
# NOTE(review): transformers is uninstalled here but is not part of the install line below,
# while the last cell does `from transformers import AutoTokenizer`. Presumably it is pulled
# back in as a dependency when GPTQModel is installed in the next cell — confirm.
!pip uninstall transformers torch datasets -y

# Install the GPTQModel prerequisites (torch and datasets) before GPTQModel itself is built.
!pip install torch datasets

Found existing installation: transformers 4.43.3
Uninstalling transformers-4.43.3:
  Successfully uninstalled transformers-4.43.3
Found existing installation: torch 2.4.0
Uninstalling torch-2.4.0:
  Successfully uninstalled torch-2.4.0
Found existing installation: datasets 2.20.0
Uninstalling datasets-2.20.0:
  Successfully uninstalled datasets-2.20.0
Collecting torch
  Using cached torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Using cached torch-2.4.0-cp310-cp310-manylinux1_x86_64.whl (797.2 MB)
Using cached datasets-2.20.0-py3-none-any.whl (547 kB)
Installing collected packages: torch, datasets
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
auto-gptq 0.7.1 requires transformers>=4.31.0, which is not installed.
auto-round 0.2 requires transformers, which

In [None]:
# Clone the GPTQModel sources into ./GPTQModel.
# NOTE: git clone refuses to write into an existing, non-empty directory, so re-running this
# cell on the same runtime reports an error for this line (the install line below still runs).
!git clone https://github.com/ModelCloud/GPTQModel.git

# Compile and install GPTQModel from the cloned checkout.
# --no-build-isolation makes pip build against the packages already installed in this
# environment instead of a fresh isolated build environment; -vvv prints the verbose build log.
# Optional extras (vllm, sglang, bitblas) can be requested in brackets, for example:
#   pip install -vvv --no-build-isolation .[vllm,sglang,bitblas]
!cd GPTQModel && pip install -vvv --no-build-isolation .

Cloning into 'GPTQModel'...
remote: Enumerating objects: 7359, done.
remote: Counting objects: 100% (1531/1531), done.
remote: Compressing objects: 100% (680/680), done.
remote: Total 7359 (delta 978), reused 937 (delta 849), pack-reused 5828
Receiving objects: 100% (7359/7359), 8.96 MiB | 12.88 MiB/s, done.
Resolving deltas: 100% (5078/5078), done.
Using pip 24.1.2 from /usr/local/lib/python3.10/dist-packages/pip (python 3.10)
Non-user install because site-packages writeable
Created temporary directory: /tmp/pip-build-tracker-9247o59h
Initialized build tracking at /tmp/pip-build-tracker-9247o59h
Created build tracker: /tmp/pip-build-tracker-9247o59h
Entered build tracker: /tmp/pip-build-tracker-9247o59h
Created temporary directory: /tmp/pip-install-e6a13lbu
Created temporary directory: /tmp/pip-ephem-wheel-cache-f7wyzpcf
Processing /content/GPTQModel
  Added file:///content/GPTQModel to build tracker '/tmp/pip-build-tracker-9247o59h'
  Running setup.py (path:/content/GPTQM

In [None]:
import torch
import torch.nn as nn
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer
from datasets import load_dataset

# Model id passed to AutoTokenizer.from_pretrained() and GPTQModel.from_pretrained() in main().
pretrained_model_id = "microsoft/Phi-3-mini-128k-instruct"
# Name passed to save_quantized() and later to from_quantized() in main().
quantized_model_id = "Phi-3-mini-128k-instruct-4bit-128g"

def get_wikitext2(tokenizer, nsamples, seqlen):
    """Build a calibration set from the WikiText-2 (raw) training split.

    Rows whose ``text`` has fewer than ``seqlen`` characters are dropped; the
    check uses ``len`` of the raw string, not a token count. The first
    ``nsamples`` remaining rows are then tokenized.

    Args:
        tokenizer: Callable applied to the text of each selected row.
        nsamples: Number of rows taken from the start of the filtered split.
        seqlen: Minimum character length a row's text must have to be kept.

    Returns:
        A list holding one ``tokenizer(text)`` result per selected row.
    """

    def _long_enough(row):
        # Character-length threshold on the raw text.
        return len(row["text"]) >= seqlen

    corpus = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    selected_rows = corpus.filter(_long_enough).select(range(nsamples))

    encoded = []
    for row in selected_rows:
        encoded.append(tokenizer(row["text"]))
    return encoded


@torch.no_grad()
def calculate_avg_ppl(model, tokenizer):
    """Return the arithmetic mean of the perplexity values for ``model``.

    A ``gptqmodel.utils.Perplexity`` evaluator is built over the "text" column
    of the "train" split of wikitext / wikitext-2-raw-v1, and its
    ``calculate(n_ctx=512, n_batch=512)`` result is averaged. The whole call
    runs under ``torch.no_grad()``.

    Args:
        model: Model object handed to ``Perplexity``.
        tokenizer: Tokenizer object handed to ``Perplexity``.

    Returns:
        ``sum(values) / len(values)`` over the values returned by ``calculate``.
    """
    # Imported inside the function, so gptqmodel.utils is only loaded when this is called.
    from gptqmodel.utils import Perplexity

    evaluator = Perplexity(
        model=model,
        tokenizer=tokenizer,
        dataset_path="wikitext",
        dataset_name="wikitext-2-raw-v1",
        split="train",
        text_column="text",
    )

    scores = evaluator.calculate(n_ctx=512, n_batch=512)

    # Plain mean of the returned perplexity values.
    return sum(scores) / len(scores)

def main():
    """Quantize the pretrained model, save it, reload it, and print a quick check.

    Steps, in order: load the tokenizer, build the WikiText-2 calibration set,
    quantize with a 4-bit / group-size-128 config, save the result under
    ``quantized_model_id``, reload it on "cuda:0", print one generated answer,
    and print the average perplexity from ``calculate_avg_ppl``.
    """
    device = "cuda:0"

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=True)

    # 256 calibration rows, each with at least 1024 characters of text (see get_wikitext2).
    calibration_set = get_wikitext2(tokenizer, nsamples=256, seqlen=1024)

    # bits=4: quantize the model to 4-bit; group_size=128 is the recommended value.
    gptq_settings = QuantizeConfig(bits=4, group_size=128)

    # Load the un-quantized model; the model will always be force loaded into cpu.
    model = GPTQModel.from_pretrained(pretrained_model_id, gptq_settings)

    # Quantize. The calibration dataset should be a list of dicts whose keys can only be
    # "input_ids" and "attention_mask", with values of torch.LongTensor type.
    model.quantize(calibration_set)

    # Save the quantized model, then save it again using safetensors.
    # NOTE(review): both calls write to the same target; confirm whether the first is still needed.
    model.save_quantized(quantized_model_id)
    model.save_quantized(quantized_model_id, use_safetensors=True)

    # Reload the quantized model (currently only cpu or a single gpu is supported).
    # The name `model` is rebound on purpose so this function no longer references the
    # pre-quantization object.
    model = GPTQModel.from_quantized(quantized_model_id, device=device)

    # Inference with model.generate: encode the prompt on the same device, generate, decode.
    prompt = tokenizer("What is the capital of Jamaica?", return_tensors="pt").to(device)
    generated = model.generate(**prompt)
    print(tokenizer.decode(generated[0]))

    print(f"Quantized Model {quantized_model_id} avg PPL is {calculate_avg_ppl(model, tokenizer)}")


if __name__ == "__main__":
    import logging

    # Timestamped INFO-level logging, configured before main() starts.
    log_options = {
        "format": "%(asctime)s %(levelname)s [%(name)s] %(message)s",
        "level": logging.INFO,
        "datefmt": "%Y-%m-%d %H:%M:%S",
    }
    logging.basicConfig(**log_options)

    main()

ModuleNotFoundError: No module named 'gptqmodel'