## Quantizing

In [None]:
!pip install transformers torch bitsandbytes accelerate

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Hugging Face Hub id of the checkpoint loaded by every cell in this notebook.
MODEL="tiiuae/falcon-7b"

# Tokenizer for that checkpoint; `generate` reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def _bnb_bit_width(q_config):
    """Return 4 or 8 when ``q_config`` enables 4-/8-bit loading, else ``None``.

    ``q_config`` may be an object exposing ``load_in_4bit`` / ``load_in_8bit``
    attributes or a plain dict carrying the same keys.
    """
    if isinstance(q_config, dict):
        read = q_config.get
    else:
        def read(name):
            return getattr(q_config, name, None)

    if read("load_in_4bit"):
        return 4
    if read("load_in_8bit"):
        return 8
    return None


def show_model_size(model):
    """Print the bit width, parameter count and memory footprint of ``model``.

    The bit width is taken from ``model.config.quantization_config`` when one
    is present (4-bit or 8-bit loading); otherwise it is derived from the
    byte size of ``model.config.torch_dtype``. When neither source yields a
    width, the header reads "unknown-precision" instead of a misleading
    "0-bit".

    Args:
        model: Object exposing ``config``, ``num_parameters()`` and
            ``get_memory_footprint()`` (bytes).

    Returns:
        None. The report is written to stdout.
    """
    config = model.config
    q_config = getattr(config, "quantization_config", None)

    if q_config is not None:
        bits = _bnb_bit_width(q_config)
    else:
        # `itemsize` is the per-element size in bytes; it may be missing
        # (no dtype recorded, or a dtype object without that attribute).
        dtype = getattr(config, "torch_dtype", None)
        itemsize = getattr(dtype, "itemsize", None)
        bits = itemsize * 8 if itemsize is not None else None

    label = f"{bits}-bit" if bits else "unknown-precision"

    # Count parameters once; the value is reused for the FP32 estimate.
    n_params = model.num_parameters()
    gbs = model.get_memory_footprint() / 1e9

    print(f"----- {label} Model -----")
    print(f"Number of parameters: {n_params:,}")
    # 4 bytes per parameter is the size the weights would have in FP32.
    print(f"Memory footprint if FP32: {n_params*4/1e9:.2f} GB")
    print(f"Memory footprint: {gbs:.2f} GB")

def generate(prompt):
    """Sample a continuation of ``prompt`` and return it as decoded text.

    Uses the notebook-level ``tokenizer`` and ``model`` globals. The prompt is
    tokenized to PyTorch tensors and moved to the model's device, up to 100
    new tokens are sampled, and the first returned sequence is decoded with
    special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # The EOS id serves both as the stop token and as the padding token.
    eos_id = tokenizer.eos_token_id
    sequences = model.generate(
        **inputs,
        eos_token_id=eos_id,
        pad_token_id=eos_id,
        do_sample=True,
        max_new_tokens=100,
    )

    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Sample prompt taken from the Falcon-7B model card; reused unchanged for
# every precision so the generations can be compared.
prompt = "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:"
# Bare expression: the notebook shows the prompt string as the cell output.
prompt

### Use 16-bit

In [None]:
# Baseline: load the checkpoint with bfloat16 weights (no quantization config).
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, device_map="auto")

In [None]:
# Print bit width, parameter count and memory footprint of the bfloat16 model.
show_model_size(model)

In [None]:
# Sample a continuation of the prompt with the bfloat16 model and show it.
result = generate(prompt)
print(result)

### Use 8-bit

In [None]:
import gc

# Drop the previous model and reclaim its memory before loading the next one.
# `del` only removes the name, so also run a garbage-collection pass (frees
# objects kept alive by reference cycles) and release PyTorch's cached CUDA
# blocks; `empty_cache()` is a no-op when CUDA has not been initialized.
del model
gc.collect()
torch.cuda.empty_cache()

# Ask bitsandbytes to load the weights in 8-bit.
config = BitsAndBytesConfig(
    load_in_8bit=True,
)

model = AutoModelForCausalLM.from_pretrained(MODEL, quantization_config=config, device_map="auto")

In [None]:
# Print bit width, parameter count and memory footprint of the 8-bit model.
show_model_size(model)

In [None]:
# Sample a continuation of the same prompt with the 8-bit model and show it.
result = generate(prompt)
print(result)

### Use 4-bit

In [None]:
import gc

# Drop the previous model and reclaim its memory before loading the next one.
# `del` only removes the name, so also run a garbage-collection pass (frees
# objects kept alive by reference cycles) and release PyTorch's cached CUDA
# blocks; `empty_cache()` is a no-op when CUDA has not been initialized.
del model
gc.collect()
torch.cuda.empty_cache()

# 4-bit loading with the NF4 quantization type, double quantization enabled,
# and float16 as the compute dtype.
config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(MODEL, quantization_config=config, device_map="auto")

In [None]:
# Print bit width, parameter count and memory footprint of the 4-bit model.
show_model_size(model)

In [None]:
# Sample a continuation of the same prompt with the 4-bit model and show it.
result = generate(prompt)
print(result)