In [1]:
%%capture
!pip install -r requirements.txt

In [2]:
# Checkpoint identifier reused by the model- and tokenizer-loading cells below.
model_id = "princeton-nlp/Sheared-LLaMA-2.7B"

## Full Precision

In [3]:
from transformers import AutoModelForCausalLM
# Baseline load: no torch_dtype is requested here
# (the next cell reports the resulting dtype as torch.float32).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Report the parameter dtype and the model's memory footprint (bytes -> GiB).
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    sep="\n",
)

torch.float32
GPU memory: 10.19 GB


In [5]:
# Show the module tree of the full-precision model (plain Linear projections).
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm(

In [6]:
# clean up
import gc

import torch

# `del` only removes the notebook's name binding. Run a garbage-collection pass
# so the model is reclaimed even if it is still held by reference cycles, and
# only then ask PyTorch to release its cached CUDA memory.
del model
gc.collect()
torch.cuda.empty_cache()

## Half Precision

In [7]:
# Same checkpoint, but with the weights requested in bfloat16.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Report dtype and footprint of the bfloat16 model (bytes -> GiB).
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    sep="\n",
)

torch.bfloat16
GPU memory: 5.09 GB


In [9]:
# Show the module tree of the bfloat16 model.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm(

In [10]:
import gc

# Drop the bfloat16 model. gc.collect() reclaims it even if reference cycles
# keep it alive, so empty_cache() can actually release the memory.
del model
gc.collect()
torch.cuda.empty_cache()

## INT8 Quantization

### Using transformers parameters

In [11]:
# INT8 via the plain `load_in_8bit` keyword of from_pretrained
# (the printed model below shows Linear8bitLt projection layers).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    load_in_8bit=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [12]:
# Report dtype and footprint of the 8-bit model (bytes -> GiB).
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    sep="\n",
)

torch.bfloat16
GPU memory: 2.73 GB


In [13]:
# Show the module tree; the projection layers are now Linear8bitLt.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear8bitLt(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear8bitLt(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm

In [14]:
import gc

# Drop the 8-bit model. gc.collect() reclaims it even if reference cycles
# keep it alive, so empty_cache() can actually release the memory.
del model
gc.collect()
torch.cuda.empty_cache()

Model `dtype` shows `torch.bfloat16`: the quantized weights are stored in **8-bit**, while the computation happens in `torch.bfloat16`.

### Using bitsandbytes quantization config

In [15]:
from transformers import BitsAndBytesConfig

In [16]:
# INT8 again, this time configured through an explicit BitsAndBytesConfig.
# NOTE(review): llm_int8_threshold=200.0 looks far from the library default;
# confirm this value is intended.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=200.0,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=quantization_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Report dtype and footprint of the config-driven 8-bit model (bytes -> GiB).
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    sep="\n",
)

torch.bfloat16
GPU memory: 2.73 GB


In [18]:
# Show the module tree of the config-driven 8-bit model (Linear8bitLt layers).
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear8bitLt(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear8bitLt(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear8bitLt(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm

In [19]:
import gc

# Drop the 8-bit model. gc.collect() reclaims it even if reference cycles
# keep it alive, so empty_cache() can actually release the memory.
del model
gc.collect()
torch.cuda.empty_cache()

## 4bit Model Quantization

### Using transformers parameters

In [20]:
# 4-bit via the plain `load_in_4bit` keyword of from_pretrained
# (the printed model below shows Linear4bit projection layers).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
    load_in_4bit=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
# Report dtype and footprint of the 4-bit model (bytes -> GiB).
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    sep="\n",
)

torch.bfloat16
GPU memory: 1.55 GB


Model `dtype` shows `torch.bfloat16`: the quantized weights are stored in **4-bit**, while the computation happens in `torch.bfloat16`.

In [22]:
# Show the module tree; the projection layers are now Linear4bit.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
   

In [23]:
import gc

# Drop the 4-bit model. gc.collect() reclaims it even if reference cycles
# keep it alive, so empty_cache() can actually release the memory.
del model
gc.collect()
torch.cuda.empty_cache()

### Using bitsandbytes quantization config

#### QLoRA NF4

In [24]:
# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# NOTE(review): unlike the earlier quantized loads, no torch_dtype is passed
# here; the recorded model.dtype for this load is torch.float16. Confirm
# whether torch_dtype=torch.bfloat16 was intended.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [25]:
# Report dtype and footprint of the NF4 model, both in GiB and in raw bytes.
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    f"GPU memory: {model.get_memory_footprint():.2f} B",
    sep="\n",
)

torch.float16
GPU memory: 1.55 GB
GPU memory: 1663906816.00 B


In [26]:
# Show the module tree of the NF4 model (Linear4bit projection layers).
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
   

In [27]:
import gc

# Drop the NF4 model. gc.collect() reclaims it even if reference cycles
# keep it alive, so empty_cache() can actually release the memory.
del model
gc.collect()
torch.cuda.empty_cache()

#### QLoRA NF4-double-quantization (Nested Quantization)

In [28]:
# NF4 with bnb_4bit_use_double_quant=True (nested quantization) and bfloat16
# as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# NOTE(review): no torch_dtype is passed here; the recorded model.dtype for
# this load is torch.float16. Confirm whether bfloat16 was intended.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [29]:
# Report dtype and footprint of the double-quantized model (GiB and raw bytes).
# NOTE(review): the recorded byte count equals the plain NF4 run;
# get_memory_footprint() may not reflect the double-quantization savings - confirm.
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    f"GPU memory: {model.get_memory_footprint():.2f} B",
    sep="\n",
)

torch.float16
GPU memory: 1.55 GB
GPU memory: 1663906816.00 B


In [30]:
# Show the module tree of the double-quantized NF4 model.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2560, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (o_proj): Linear4bit(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=6912, bias=False)
          (down_proj): Linear4bit(in_features=6912, out_features=2560, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
   

In [31]:
import gc

# Drop the double-quantized model. gc.collect() reclaims it even if reference
# cycles keep it alive, so empty_cache() can actually release the memory.
del model
gc.collect()
torch.cuda.empty_cache()

## GPTQ

In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
import torch

In [33]:
# Tokenizer for the same checkpoint; it is handed to GPTQConfig in the next
# cell and saved next to the quantized model later on.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)

In [34]:
# GPTQ settings: 4-bit weights, group size 128, "c4" as the quantization
# dataset, act-order (desc_act) turned off, and the tokenizer loaded above.
quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="c4",
    desc_act=False,
    tokenizer=tokenizer,
)

In [35]:
# Load the checkpoint with the GPTQ config attached; the quantization itself
# runs during this call (see the "Quantizing model.layers blocks" progress).
quant_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    trust_remote_code=True,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


Quantizing model.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

This process can take a few hours to produce a quantized model. Hence, it is recommended to save the model locally or push it to the Hugging Face Hub for later use.

In [36]:
# Save the quantized model to disk.
save_folder = "./models/quantized-shearedllama-2.7b"

# The model was quantized with device_map="auto"; the Transformers GPTQ guide
# asks for the whole model to be moved to a single device before saving.
quant_model.to("cpu")
quant_model.save_pretrained(save_folder, safe_serialization=True)

In [37]:
# Save the tokenizer into the same folder as the quantized model; the cell
# output lists the files that were written.
tokenizer.save_pretrained(save_folder)

('./models/quantized-shearedllama-2.7b/tokenizer_config.json',
 './models/quantized-shearedllama-2.7b/special_tokens_map.json',
 './models/quantized-shearedllama-2.7b/tokenizer.model',
 './models/quantized-shearedllama-2.7b/added_tokens.json',
 './models/quantized-shearedllama-2.7b/tokenizer.json')

In [38]:
import gc

# Drop the freshly quantized model. gc.collect() reclaims it even if reference
# cycles keep it alive, so empty_cache() can actually release the memory.
del quant_model
gc.collect()
torch.cuda.empty_cache()

### Loading the Quantized Model

In [None]:
# Reload the GPTQ model that was saved to disk earlier in this notebook.
model_path = "./models/quantized-shearedllama-2.7b/"

# Only loading-time attributes (such as disable_exllama) are taken from this
# config; the saved quantization settings are kept (see the warning below).
# NOTE(review): confirm `disable_exllama` is still the accepted argument name
# for the installed transformers version.
gptq_config = GPTQConfig(bits=4, disable_exllama=False)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    quantization_config=gptq_config,
)

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. disable_exllama, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


In [None]:
# Report dtype and footprint of the reloaded GPTQ model (bytes -> GiB).
print(
    model.dtype,
    f"GPU memory: {model.get_memory_footprint() / 1024**3:.2f} GB",
    sep="\n",
)

In [None]:
# Show the module tree of the reloaded GPTQ model.
print(model)

In [None]:
import gc

# Drop the reloaded GPTQ model. gc.collect() reclaims it even if reference
# cycles keep it alive, so empty_cache() can actually release the memory.
del model
gc.collect()
torch.cuda.empty_cache()

## Important References
* HuggingFace Blogs
    1. https://huggingface.co/blog/hf-bitsandbytes-integration
    2. https://huggingface.co/blog/4bit-transformers-bitsandbytes
    3. https://huggingface.co/blog/gptq-integration
    4. https://huggingface.co/blog/merve/quantization
    5. https://huggingface.co/blog/overview-quantization-transformers
    6. https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/quantization
* Research Papers
    1. [LLM.int8](https://arxiv.org/abs/2208.07339)
    2. [QLoRA](https://arxiv.org/abs/2305.14314)
    3. [GPTQ](https://arxiv.org/pdf/2210.17323.pdf)
* GitHub Repos
    1. [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
    2. [auto-gptq](https://github.com/PanQiWei/AutoGPTQ)