In [1]:
import torch

# Check if CUDA (GPU) is available
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    # Get current GPU device
    current_device = torch.cuda.current_device()
    
    # Print device name
    print(f"Current GPU: {torch.cuda.get_device_name(current_device)}")
    
    # Print device properties
    print(f"Device properties: {torch.cuda.get_device_properties(current_device)}")
    
    # Check current memory usage
    print(f"Memory allocated: {torch.cuda.memory_allocated(current_device) / 1024**2:.2f} MB")
    print(f"Memory cached: {torch.cuda.memory_reserved(current_device) / 1024**2:.2f} MB")

CUDA available: True
Current GPU: NVIDIA GeForce RTX 2060
Device properties: _CudaDeviceProperties(name='NVIDIA GeForce RTX 2060', major=7, minor=5, total_memory=6143MB, multi_processor_count=30, uuid=fc813019-6407-5b8d-aac1-92a154ea3b32, L2_cache_size=3MB)
Memory allocated: 0.00 MB
Memory cached: 0.00 MB


In [2]:
model_name = 'google/flan-t5-small'

In [37]:
import sentencepiece as sm
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name)

In [38]:
input_text = "I want to live "

input_ids = tokenizer(input_text, return_tensors='pt').input_ids

output = model.generate(input_ids)

print(tokenizer.decode(output[0]))

<pad> a little bit</s>


In [39]:
from optimum.quanto import quantize, freeze, qint8

In [41]:
def named_module_tensors(module, recurse=False):
    for named_parameter in module.named_parameters(recurse=recurse):
      name, val = named_parameter
      flag = True
      if hasattr(val,"_data") or hasattr(val,"_scale"):
        if hasattr(val,"_data"):
          yield name + "._data", val._data
        if hasattr(val,"_scale"):
          yield name + "._scale", val._scale
      else:
        yield named_parameter

    for named_buffer in module.named_buffers(recurse=recurse):
      yield named_buffer

In [42]:
def dtype_byte_size(dtype):
    """
    Returns the size (in bytes) occupied by one parameter of type `dtype`.
    """
    import re
    if dtype == torch.bool:
        return 1 / 8
    bit_search = re.search(r"[^\d](\d+)$", str(dtype))
    if bit_search is None:
        raise ValueError(f"`dtype` is not a valid dtype: {dtype}.")
    bit_size = int(bit_search.groups()[0])
    return bit_size // 8

In [43]:
# Function to get model size before and after quantization
def compute_module_sizes(model):
    """
    Compute the size of each submodule of a given model.
    """
    from collections import defaultdict
    module_sizes = defaultdict(int)
    for name, tensor in named_module_tensors(model, recurse=True):
      size = tensor.numel() * dtype_byte_size(tensor.dtype)
      name_parts = name.split(".")
      for idx in range(len(name_parts) + 1):
        module_sizes[".".join(name_parts[:idx])] += size

    return module_sizes
# Check size before quantization
#original_size = get_model_size(model)
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")

# Quantize
quantize(model, weights=qint8, activations=None)
freeze(model)

# Check size after quantization
module_sizes = compute_module_sizes(model)
print(f"The quantized model size is {module_sizes[''] * 1e-9} GB")


The model size is 0.307844608 GB
The quantized model size is 0.12682868 GB


In [57]:
## Testing with a larger model
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")

In [58]:
text = "Once upon a time, there was a"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Once upon a time, there was a man named John. He was a very good man


In [53]:
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")

The model size is 1.9761312000000002 GB


In [54]:

quantize(model, weights=qint8, activations=qint8)
freeze(model)

In [55]:
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")

The model size is 1.040611784 GB


In [56]:
## Testing the quantized model

text = "Once upon a time, there was a"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Once upon a time, there was a()


In [59]:
## Lets try with activations 'None'
quantize(model, weights=qint8, activations=None)
freeze(model)


In [60]:
module_sizes = compute_module_sizes(model)
print(f"The model size is {module_sizes[''] * 1e-9} GB")

The model size is 1.040611784 GB


In [61]:
text = "Once upon a time, there was a"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Once upon a time, there was a man who was very rich. He had a lot


### better output observed. Why is that?

When activations=None:

* Only the weights are quantized
* Intermediate computations (activations) remain in full precision (typically float32)
* The model retains more of its ability to make nuanced predictions
* Result: More natural text generation ("...a man who was very rich")

When activations=qint8:

* Both weights AND intermediate computations are quantized to 8-bit integers
* This aggressive quantization reduces the model's ability to represent subtle differences in token probabilities
* The model becomes more "uncertain" about next token predictions
* Result: Early termination or degraded output ("...a()")
