In [1]:
# Print the installed CUDA compiler (nvcc) version.
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
!pip install auto-gptq --no-build-isolation --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ # bcs cuda 11.8

Looking in indexes: https://pypi.org/simple, https://huggingface.github.io/autogptq-index/whl/cu118/
Collecting auto-gptq
  Downloading https://huggingface.github.io/autogptq-index/whl/cu118/auto-gptq/auto_gptq-0.7.1%2Bcu118-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting accelerate>=0.26.0 (from auto-gptq)
  Downloading accelerate-0.29.0-py3-none-any.whl.metadata (18 kB)
Collecting datasets (from auto-gptq)
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting sentencepiece (from auto-gptq)
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.1.0-py3-none-any.whl.metadata (3.0 kB)
Coll

In [8]:
# Quantize a causal language model to 4-bit GPTQ, using random windows of the
# WikiText-2 training split as calibration data, then save the quantized weights.
# Based off the auto-gptq repo example.

import logging
import random

import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TextGenerationPipeline
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# --- Configuration -----------------------------------------------------------
pretrained_model_dir = ""  # TODO: model to quantize (passed to from_pretrained)
quantized_model_dir = ""  # TODO: output directory for the quantized model

SEED = 42  # seeds the calibration sampling so the run is reproducible
N_CALIBRATION_SAMPLES = 128  # number of calibration windows drawn from the corpus
CALIBRATION_SEQ_LEN = 2048  # length, in tokens, of each calibration window

logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

# Fail fast with a clear message instead of an opaque error after the dataset download.
if not pretrained_model_dir or not quantized_model_dir:
    raise ValueError("Set pretrained_model_dir and quantized_model_dir before running this cell.")

# --- Calibration data --------------------------------------------------------
traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Try use_fast=False first; if that fails for any reason, retry with use_fast=True.
try:
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=False)
except Exception:
    logging.getLogger(__name__).warning(
        "Loading the tokenizer with use_fast=False failed; retrying with use_fast=True."
    )
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)

# Tokenize the whole split as a single sequence; windows are sliced from it below.
trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")

n_tokens = trainenc.input_ids.shape[1]
if n_tokens <= CALIBRATION_SEQ_LEN:
    raise ValueError(
        f"Calibration corpus has {n_tokens} tokens; more than {CALIBRATION_SEQ_LEN} are required."
    )

random.seed(SEED)
np.random.seed(SEED)
torch.random.manual_seed(SEED)

traindataset = []
for _ in range(N_CALIBRATION_SAMPLES):
    # random.randint is inclusive on both ends, hence the extra -1 on the upper bound.
    start = random.randint(0, n_tokens - CALIBRATION_SEQ_LEN - 1)
    inp = trainenc.input_ids[:, start : start + CALIBRATION_SEQ_LEN]
    traindataset.append({"input_ids": inp, "attention_mask": torch.ones_like(inp)})

# --- Quantization ------------------------------------------------------------
quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize model to 4-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # False can significantly speed up inference, at the cost of slightly worse perplexity
)

# load un-quantized model, by default, the model will always be loaded into CPU memory
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

# quantize model, the examples should be list of dict whose keys can only be "input_ids" and "attention_mask"
model.quantize(traindataset)

# save quantized model using safetensors
model.save_quantized(quantized_model_dir, use_safetensors=True)

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

INFO - Start quantizing layer 1/32
2024-04-05 14:56:33 INFO [auto_gptq.modeling._base] Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
2024-04-05 14:56:36 INFO [auto_gptq.modeling._base] Quantizing self_attn.k_proj in layer 1/32...
2024-04-05 14:56:38 INFO [auto_gptq.quantization.gptq] duration: 1.7417263984680176
2024-04-05 14:56:38 INFO [auto_gptq.quantization.gptq] avg loss: 5.905703544616699
INFO - Quantizing self_attn.v_proj in layer 1/32...
2024-04-05 14:56:38 INFO [auto_gptq.modeling._base] Quantizing self_attn.v_proj in layer 1/32...
2024-04-05 14:56:39 INFO [auto_gptq.quantization.gptq] duration: 1.265934705734253
2024-04-05 14:56:39 INFO [auto_gptq.quantization.gptq] avg loss: 0.33390700817108154
INFO - Quantizing self_attn.q_proj in layer 1/32...
2024-04-05 14:56:39 INFO [auto_gptq.modeling._base] Quantizing self_attn.q_proj in layer 1/32...
2024-04-05 14:56:41 INFO [auto_gptq.quantization.gptq] duration: 1.2588515281677246
2024-04-05 14:56:41

In [10]:
# Save the tokenizer files into the same directory as the quantized model.
# The call is the last expression of the cell, so its return value is displayed as the cell output.
tokenizer.save_pretrained(quantized_model_dir)

('merged_15_20mar_4bit/tokenizer_config.json',
 'merged_15_20mar_4bit/special_tokens_map.json',
 'merged_15_20mar_4bit/tokenizer.json')

In [None]:
# Upload the contents of the quantized model directory to a Hugging Face Hub repository.
import os

from huggingface_hub import upload_folder

upload_folder(
    repo_id="",  # TODO: target Hub repository id
    folder_path=quantized_model_dir,
    # Never hardcode the access token in the notebook; read it from the environment.
    # If HF_TOKEN is unset this passes None, and huggingface_hub then uses the locally
    # stored login token (NOTE(review): confirm for the installed huggingface_hub version).
    token=os.environ.get("HF_TOKEN"),
)

gptq_model-4bit-128g.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

In [8]:
# Show the current GPU status (driver version, memory usage, utilization).
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Mon Mar 18 23:44:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.146.02             Driver Version: 535.146.02   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        On  | 00000000:A1:00.0 Off |                  N/A |
| 30%   38C    P8              32W / 350W |  14580MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    