In [1]:
# https://ai.plainenglish.io/lets-quantize-llama3-12c261c28129

# Import

In [1]:
from datasets import load_dataset
from matplotlib import pyplot as plt
import torch
import transformers
from transformers import (
    AutoModelForCausalLM, 
    BitsAndBytesConfig, 
    AutoTokenizer,
    TrainingArguments
)
from peft import (
    LoraConfig, 
    get_peft_model,
    TaskType
    )
from trl import SFTTrainer
from optimum.gptq import GPTQQuantizer


  from .autonotebook import tqdm as notebook_tqdm
2024-06-30 12:24:32.242313: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-30 12:24:32.266965: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Loading Llama3 and quantizing it with GPTQ

In [2]:
# Identifier of the dataset handed to the quantizer.
dataset_id = 'wikitext2'

# 4-bit GPTQ quantizer working on sequences of 2048 tokens.
quantizer = GPTQQuantizer(
    bits=4,
    dataset=dataset_id,
    model_seqlen=2048,
)
# Tag the quantizer object with its method name explicitly.
# NOTE(review): presumably read by downstream tooling — confirm it is still required.
quantizer.quant_method = 'gptq'


In [3]:
# Base checkpoint to quantize and its matching tokenizer.
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the unquantized weights in fp16.
# The previous call also passed `config=quantizer`. `config` is meant to be a
# transformers PretrainedConfig, and a GPTQQuantizer is not one, so the argument
# did not configure quantization; it has been dropped. Quantization is performed
# explicitly by `quantize_model` below.
plan_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Run GPTQ over the loaded model. This is the slow step: the recorded run took
# about 37 minutes for the 32 decoder blocks.
gptq_model = quantizer.quantize_model(plan_model, tokenizer)

# Show the module tree to confirm the projection layers became QuantLinear.
print(gptq_model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
Downloading readme: 100%|██████████| 10.5k/10.5k [00:00<00:00, 11.8MB/s]
Downloading data: 100%|██████████| 733k/733k [00:00<00:00, 952kB/s]
Downloading data: 100%|██████████| 6.36M/6.36M [00:00<00:00, 7.48MB/s]
Downloading data: 100%|██████████| 657k/657k [00:00<00:00, 1.13MB/s]
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 132314.92 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 1059752.79 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 1496828.31 examples/s]
Quantizing model.layers blocks : 100%|██████████| 32/32 [36:57<00:00, 69.30s/it]
Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (rotary_emb): LlamaRotaryEmbedding()
          (k_proj): QuantLinear()
          (o_proj): QuantLinear()
          (q_proj): QuantLinear()
          (v_proj): QuantLinear()
        )
        (mlp): LlamaMLP(
          (act_fn): SiLU()
          (down_proj): QuantLinear()
          (gate_proj): QuantLinear()
          (up_proj): QuantLinear()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
)


# Save quantized model

In [7]:
import os

# Directory intended to hold the exported model.
path = "/output/q_llama3_8b"

# The original guard was inverted: it called os.makedirs() only when the
# directory already existed (raising FileExistsError) and never created a
# missing one. exist_ok=True creates the directory when absent and is a no-op
# when it is already there, so the cell can be re-run safely.
os.makedirs(path, exist_ok=True)

# NOTE(review): the checkpoint is written to the relative directory
# 'qllama3_8b', not to `path` above. The target is left unchanged so anything
# that later reads 'qllama3_8b' keeps working — confirm which location is
# intended and make the two agree.
quantizer.save(gptq_model, 'qllama3_8b')