In [None]:
import torch
from transformers import AutoModelForCausalLM
from collections import defaultdict
# Print floating-point tensor values with 20 digits of precision.
# NOTE(review): presumably so that small numerical differences between the
# bf16 and int8 weights are visible when tensors are printed — confirm against
# the cells that actually print tensors.
torch.set_printoptions(precision=20)

In [None]:
# Quantization with bitsandbytes needs CUDA.
# Also make sure enough GPU memory is available; otherwise some tensors will
# not be quantized.
# The bare expression below is the cell's output: it should display True.
torch.cuda.is_available()

In [None]:
# Imported in this cell so the cell stays runnable on its own.
import os

# Hugging Face access token. Read it from the HF_TOKEN environment variable so
# the secret never has to be pasted into (and committed with) the notebook.
# Falls back to the empty string, which is what the notebook used before.
token = os.environ.get("HF_TOKEN", "")

# Directory prefix under which the models are saved. Override it with the
# MODEL_SAVE_DIR environment variable; the default is the original path.
# Later cells build paths as `main_dir + model_name + suffix`, so the value
# must end with a path separator — os.path.join(..., "") guarantees that
# without doubling an existing one.
main_dir = os.path.join(os.environ.get("MODEL_SAVE_DIR") or "/home/raunaks/", "")


In [None]:
# Hugging Face Hub repository ID of the model to load, save and quantize.
# Exactly one assignment should be active; the others are kept as ready-made
# alternatives.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
# model_name = "mistralai/Mistral-7B-Instruct-v0.3"
# model_name = "qwen/qwen2-audio-7b-instruct"
# model_name = "qwen/qwen2.5-7b-instruct"
# model_name = "qwen/qwen2.5-vl-32b-instruct"
# model_name = "deepseek-ai/deepseek-coder-33b-instruct"
# model_name = "google/gemma-3-27b-it"
# NOTE: the Qwen 2 Audio, Qwen 2.5 VL and Gemma 3 entries may need their
# model-specific class instead of AutoModelForCausalLM when loading.

For most models (e.g. DeepSeek Coder 33B, Llama 3.1 8B, Mistral 7B), we can use `AutoModelForCausalLM.from_pretrained()` directly.

For some specific models (e.g. Qwen 2.5 VL, Qwen 2 Audio, Gemma 3), we may need to use the model-specific class instead of `AutoModelForCausalLM`; the corresponding imports are listed in the next code cell.

Ensure the total GPU memory is sufficient for quantization (i.e. do not offload to CPU). Otherwise, fewer tensors will be quantized and the resulting model will be bigger.

In [None]:
# Model-specific classes, for checkpoints that AutoModelForCausalLM cannot load.
# Swap one of these in for AutoModelForCausalLM below when needed.
from transformers import (
    Gemma3ForConditionalGeneration,
    Qwen2AudioForConditionalGeneration,
    Qwen2_5_VLForConditionalGeneration,
)

# Load the unquantized reference model in 16-bit precision.
# device_map="auto" lets the loader decide where each module is placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # bfloat16 or float16
    cache_dir="/projects/bdjx/rshah6/",
    device_map="auto",
    token=token,
)

In [None]:
# Write the 16-bit reference model to <main_dir><model_name>-bf16.
model.save_pretrained(f"{main_dir}{model_name}-bf16")

In [None]:
# Imported here because only this cell needs it.
from transformers import BitsAndBytesConfig

# 8-bit quantization settings: bitsandbytes will use LLM.int8().
# Passing `load_in_8bit=True` straight to `from_pretrained` is deprecated in
# transformers; the supported way is an explicit `quantization_config`.
int8_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the same checkpoint again, this time quantized to int8.
# torch_dtype is the 16-bit precision requested for the load, as in the
# original call.
q_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # bfloat16 or float16
    cache_dir="/projects/bdjx/rshah6/",
    device_map="auto",
    token=token,
    quantization_config=int8_config,
)

In [None]:
# Write the int8-quantized model to <main_dir><model_name>-bf16-int8.
q_model.save_pretrained(f"{main_dir}{model_name}-bf16-int8")