In [1]:
pip install -U bitsandbytes



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

### A Note for Mac Users
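The quantized model below is loaded with `device_map="auto"`, so its device is detected automatically from your hardware.
If you have an Apple Silicon Mac (M1/M2/M3), it will try to use the 'mps' device.
Otherwise, it will fall back to the CPU.
The quantization itself will happen on the CPU if no NVIDIA GPU is found.
The full-precision model is always placed on the CPU explicitly.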

In [3]:
# Checkpoint used throughout this demo: a small model that loads and runs quickly.
model_id: str = "distilgpt2"

In [4]:
# 1. Tokenizer: a single instance is shared by the full-precision and the
#    quantized model, since both are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# 2. Baseline: the FULL PRECISION (FP32) model, kept on the CPU.
print("Loading full precision model (distilgpt2)...")
model_fp32 = AutoModelForCausalLM.from_pretrained(model_id)

# `.cpu()` hands back the module itself, so as the cell's last expression it
# also displays the model's architecture summary.
model_fp32.cpu()

Loading full precision model (distilgpt2)...


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
# 3. Load the QUANTIZED (4-bit) model
# The `bitsandbytes` library handles this through a simple configuration object
# that is passed to `from_pretrained`.

print("\nLoading 4-bit quantized model (distilgpt2)...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # "nf4" selects the 4-bit NormalFloat quantization data type; it is a
    # property of the quantization scheme, not a CPU-specific setting.
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    # Same dtype as the string "float16", spelled as the torch dtype itself.
    bnb_4bit_compute_dtype=torch.float16,
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    # Device placement is chosen automatically, so code that feeds this model
    # must not assume it lives on the CPU.
    device_map="auto",
)



Loading 4-bit quantized model (distilgpt2)...


In [11]:
# Compare the memory footprint of the two models.
# The FP32 model is queried first, then the 4-bit one.
fp32_mem, q4_mem = (
    model.get_memory_footprint() for model in (model_fp32, model_4bit)
)

# One print call, one line per figure; sizes are divided by 1e6 to show them as MB.
print(
    f"\nFull Precision Model Memory: {fp32_mem / 1e6:.2f} MB",
    f"4-bit Quantized Model Memory: {q4_mem / 1e6:.2f} MB",
    f"Memory saved: {100 * (1 - q4_mem / fp32_mem):.2f}%",
    sep="\n",
)


Full Precision Model Memory: 333.94 MB
4-bit Quantized Model Memory: 106.42 MB
Memory saved: 68.13%


In [12]:
# --- Let's test the output ---
# Both models continue the same prompt with greedy decoding (no sampling), so
# the two outputs can be compared directly.
prompt = "Quantization in deep learning is"

# The full model was placed on the CPU, so its inputs go to the CPU as well.
inputs = tokenizer(prompt, return_tensors="pt").to("cpu")

# Generate text with the full model.
# `pad_token_id` is passed explicitly: the tokenizer has no pad token, so
# `generate` would otherwise fall back to the EOS id and print a notice on
# every call. Passing it yields the same tokens without the notice.
output_fp32 = model_fp32.generate(
    **inputs,
    max_new_tokens=20,
    pad_token_id=tokenizer.eos_token_id,
)

print("\n--- Full Model Output ---")
print(tokenizer.decode(output_fp32[0], skip_special_tokens=True))

# For the quantized model: it was loaded with device_map="auto", so it may not
# be on the CPU. The inputs must live on the same device as the model.
inputs_4bit = tokenizer(prompt, return_tensors="pt").to(model_4bit.device)

# Generate text with the quantized model
output_4bit = model_4bit.generate(
    **inputs_4bit,
    max_new_tokens=20,
    pad_token_id=tokenizer.eos_token_id,
)

print("\n--- Quantized Model Output ---")
print(tokenizer.decode(output_4bit[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



--- Full Model Output ---
Quantization in deep learning is a very important topic in the field of artificial intelligence.










--- Quantized Model Output ---
Quantization in deep learning is a new approach to learning. It is a new approach to learning. It is a new approach to


In [13]:
# Add sampling to avoid repetition in quantized output.
# Sampling is random, so the generator is seeded first; re-running this cell
# in the same environment then gives the same text.
torch.manual_seed(42)

output_4bit = model_4bit.generate(
    **inputs_4bit,
    max_new_tokens=20,
    do_sample=True,    # sample from the distribution instead of greedy decoding
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    # Passed explicitly so `generate` does not have to fall back to the EOS id
    # (and print a notice about it).
    pad_token_id=tokenizer.eos_token_id,
)

print("\n--- Quantized Model Output ---")
print(tokenizer.decode(output_4bit[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



--- Quantized Model Output ---
Quantization in deep learning is a common area in the field of deep learning.










