# Example of loading multiple LoRA PEFT adapters

This notebook showcases an experimental form of extreme low-bit quantization. It simulates right bit-shifting of the quantized weights, an approach with a lot of potential for hardware-centric TPUs and other embedded AI accelerators.

This approach could reduce the model's RAM usage, but it is designed first and foremost to reduce memory traffic between the cache and DRAM.

In [None]:
%%capture
# Dependency installation; the %%capture cell magic above silences pip's output.
import os
# The presence of "COLAB_" in any environment-variable name is used as the
# "running on Google Colab" signal.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: plain install, pip resolves unsloth's dependencies itself.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The --no-deps installs skip pip's dependency resolution for those packages.
    !pip install --no-deps bitsandbytes peft   
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    !pip install evaluate
    # torch is pulled from the PyTorch wheel index for CUDA 12.4 (cu124).
    !pip install torch --index-url https://download.pytorch.org/whl/cu124

In [None]:
import torch
from datasets import load_dataset
from evaluate import load
from peft import PeftModel, PeftMixedModel
import bitsandbytes as bnb
from tqdm.notebook import tqdm
import math
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, GenerationConfig, AwqConfig

In [None]:
import warnings
import gc
# Hide DeprecationWarning messages from here on to keep cell output readable.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# @title Basic variable creation
# Hub coordinates of the base checkpoint, combined below as "<source_user>/<model>".
# NOTE: `model` holds the checkpoint *name* here; a later cell rebinds it to the
# loaded model object.
model = "Meta-Llama-3.1-8B-Instruct-AWQ-INT4" # @param {"type": "string", "placeholder": "Meta-Llama-3.1-8B-bnb-4bit"}
source_user = "hugging-quants" # @param {"type": "string", "placeholder": "unsloth"}
my_user = "Tfloow" # @param {"type": "string", "placeholder": "Tfloow"}

# Half-precision dtype and 4-bit flag used as model-loading settings.
dtype = torch.float16
load_in_4bit = True

# Full repository id of the base model.
model_name = f"{source_user}/{model}"
# First CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Bare expression so the cell displays the selected device.
device

In [None]:
def load_model(model_name):
  """Load a pre-quantized causal LM and report its quantization family.

  No ``quantization_config`` is passed to ``from_pretrained``, so the
  quantization settings must come from the checkpoint itself.

  Args:
    model_name: Hub id or local path of the checkpoint. When it contains
      "awq" (case-insensitive) the AWQ path is taken, otherwise the
      bitsandbytes path.

  Returns:
    A ``(model, quant_type)`` tuple where ``quant_type`` is ``'awq'`` or
    ``'bnb'``.

  Raises:
    Exception: whatever ``from_pretrained`` raises. On the AWQ path the first
      failure is reported and retried once; only a second failure propagates.
  """
  if "awq" in model_name.lower():
    quant_type = 'awq'
    # Need double loading or crashes: the first AWQ load can fail, so the
    # same call is attempted a second time before giving up.
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
      try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="cuda:0",   # Put optimized layers on GPU
            dtype=torch.float16,
            # SECURITY: allows code shipped in the model repository to run.
            trust_remote_code=True,
        )
        break
      except Exception as e:
        if attempt == max_attempts:
          raise
        # Show why the first attempt failed instead of discarding the error.
        print(f"Loading failed ({type(e).__name__}: {e})")
        print("Trying to load again")
  else:
    quant_type = 'bnb'
    # Much slower than the unsloth optimized method sadly but closer to GPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        dtype=dtype,
        device_map=device,
    )
  return model, quant_type

In [None]:
def unload_model(model):
  """
  Moves a model to the CPU and releases cached CUDA memory.

  The function cannot drop references held by the caller. To let the object be
  garbage-collected, the caller must release its own names as well, e.g.
  ``model = unload_model(model)``.

  Args:
    model: Object exposing ``.to(device)`` (e.g. an AutoModelForCausalLM
      instance), or None.

  Returns:
    None when unloading succeeded or when no model was given; the same
    ``model`` object when unloading raised, so the caller keeps a valid
    reference.
  """
  if model is None:
    print("No model provided to unload.")
    return None

  print("Unloading model from GPU...")
  try:
    # 1. Move the weights off the GPU.
    model.to("cpu")

    # 2. Hand the now-unused cached blocks back to the CUDA driver.
    if torch.cuda.is_available():
      torch.cuda.empty_cache()
  except Exception as e:
    # `model` is still bound here (it is never deleted inside the function),
    # so it can always be returned safely.
    print(f"Error during model unloading: {e}")
    return model

  print("Model unloaded and GPU memory cleared.")
  return None # Return None so the caller can overwrite its own reference

In [None]:
# @title Apply a bit mask on weight to simulate 3 bits, 2 bits and 1 bit weight

def _mask_for_dtype(mask, dtype):
  """Return `mask` with the same bit pattern, expressed as a value `dtype` can hold."""
  info = torch.iinfo(dtype)
  fitted = mask & ((1 << info.bits) - 1)   # drop bits beyond the dtype's width
  if info.min < 0 and fitted > info.max:
    fitted -= 1 << info.bits               # two's-complement value for signed dtypes
  return fitted


def apply_quantization(model, simulated_quantization, target_modules, quant_type='awq'):
  """Zero the low bits of packed 4-bit weights, in place, to simulate fewer bits.

  Every 4-bit group of the packed integer weights is AND-ed with a pattern that
  keeps only its top `simulated_quantization` bits (0 -> 0000, 1 -> 1000,
  2 -> 1100, 3 -> 1110, 4 -> 1111).

  Args:
    model: Object exposing ``named_modules()``.
    simulated_quantization: Number of bits to keep per 4-bit group, 0 to 4.
    target_modules: Iterable of substrings; a module is processed when any of
      them occurs in its name.
    quant_type: ``'awq'`` reads ``module.qweight`` and uses 32-bit masks; any
      other value reads ``module.weight.data`` and uses 16-bit masks.

  Returns:
    None. The weight tensors are modified in place.

  Raises:
    IndexError: if `simulated_quantization` is outside 0..4.
  """
  print(simulated_quantization, target_modules)
  if quant_type == 'awq':
    masks = [0x00000000, 0x88888888, 0xCCCCCCCC, 0xEEEEEEEE, 0xFFFFFFFF]
  else:
    masks = [0x0000, 0x8888, 0xCCCC, 0xEEEE, 0xFFFF]
  # Reject negative levels explicitly; they would otherwise index from the end
  # of the list and silently select the wrong mask.
  if not 0 <= simulated_quantization < len(masks):
    raise IndexError(
        f"simulated_quantization must be in 0..{len(masks) - 1}, got {simulated_quantization}")
  mask = masks[simulated_quantization]

  # Substrings looked up in str(type(module)). bitsandbytes layers report
  # themselves as 'bitsandbytes.nn.modules.Linear4bit', which the alias-style
  # 'bnb.nn.Linear4bit' pattern never matches.
  quant_module_name = ["bnb.nn.Linear4bit", "bitsandbytes.nn.modules.Linear4bit",
                       "awq.modules.linear"]

  for module_name, module in model.named_modules():
    module_type = str(type(module))
    if not any(pattern in module_type for pattern in quant_module_name):
      continue
    if not any(str(module_to_shift) in str(module_name) for module_to_shift in target_modules):
      continue

    if quant_type == 'awq':
      qweight = module.qweight
    else:
      qweight = module.weight.data

    if torch.is_floating_point(qweight):
      # Bit masks only make sense on the packed integer representation.
      print("Floating point")
      continue

    # Apply the bitmask only to the quantized values. The mask is first fitted
    # to the tensor's integer dtype so the result does not depend on implicit
    # scalar wrap-around.
    qweight.bitwise_and_(_mask_for_dtype(mask, qweight.dtype))

In [None]:
# Tokenizer of the base checkpoint; in this cell it only supplies the EOS token.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Alpaca-style prompt template. The three {} slots are filled positionally with
# the instruction, the input and the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Appended to every formatted example below
def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into full prompt strings.

    Args:
        examples: Batched mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        record, each terminated by EOS_TOKEN.
    """
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [
        alpaca_prompt.format(task, context, answer) + EOS_TOKEN
        for task, context, answer in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]
    return {"text": rendered}
pass

# Load the Alpaca training split and add the formatted "text" column in one chain.
dataset = load_dataset("yahma/alpaca-cleaned", split="train").map(
    formatting_prompts_func,
    batched=True,
)
# Perplexity is evaluated on the first 50 examples only.
eval_dataset = dataset.select(range(50))
input_texts = eval_dataset["text"]

In [None]:
def tokenize_batch(batch_texts, tokenizer, max_length=512):
    """Tokenize a batch of texts into padded, truncated PyTorch tensors.

    Args:
        batch_texts: Text or list of texts handed to the tokenizer.
        tokenizer: Callable tokenizer.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        Whatever the tokenizer returns for these settings.
    """
    encode_options = {
        "return_tensors": "pt",
        "truncation": True,
        "padding": True,
        "max_length": max_length,
    }
    return tokenizer(batch_texts, **encode_options)

In [None]:
def compute_perplexity(model, tokenizer, texts, batch_size=4, max_length=512):
    """Compute token-level perplexity of a causal LM over `texts`.

    Padding is excluded from the loss by setting the corresponding labels to
    -100, and each batch's mean loss is weighted by the number of tokens that
    were actually predicted. This makes the result valid for any batch size.

    Args:
        model: Causal LM called as ``model(**batch, labels=...)`` and returning
            an object with a ``.loss`` that is the mean cross-entropy over the
            non-ignored (shifted) labels.
        tokenizer: Tokenizer handed to ``tokenize_batch``; it needs a pad token
            when ``batch_size > 1``.
        texts: Sliceable sequence of strings.
        batch_size: Number of texts per forward pass (>= 1).
        max_length: Truncation length in tokens.

    Returns:
        exp(mean negative log-likelihood per predicted token) as a float.

    Raises:
        ValueError: if `batch_size` < 1, or if no token could be scored
            (empty `texts`, or only single-token sequences).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Evaluating"):
            batch_texts = texts[i : i + batch_size]
            batch = tokenize_batch(batch_texts, tokenizer, max_length)
            batch = {k: v.to(device) for k, v in batch.items()}

            attention_mask = batch["attention_mask"]

            # Labels equal the inputs; the model shifts them internally so that
            # position t predicts token t + 1. Padded targets are ignored (-100).
            labels = batch["input_ids"].clone()
            labels[attention_mask == 0] = -100
            # Also ignore targets whose predicting position is padding
            # (first real token of a left-padded sequence).
            labels[:, 1:][attention_mask[:, :-1] == 0] = -100

            # Number of targets that contribute to the mean loss after the shift.
            n_tokens = (labels[:, 1:] != -100).sum().item()
            if n_tokens == 0:
                # Nothing to predict; the loss would be NaN.
                continue

            outputs = model(**batch, labels=labels)

            # Convert the mean back to a sum so batches are weighted correctly.
            total_loss += outputs.loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError("No tokens to evaluate: `texts` is empty or too short.")

    mean_loss = total_loss / total_tokens
    return math.exp(mean_loss)

In [None]:
# Initial Model and Quantization Setup
model, quant_type = load_model(model_name)

# --- Simulated 2-bit weights on Q/K/gate/up (pairs with the first adapter) ---
simulated_quantization = 2
target_modules = ["q_proj", "k_proj", "gate_proj", "up_proj"]
apply_quantization(model, simulated_quantization, target_modules, quant_type=quant_type)

# --- Simulated 3-bit weights on V (pairs with the second adapter) ---
simulated_quantization_2 = 3
target_modules_2 = ['v_proj']
apply_quantization(model, simulated_quantization_2, target_modules_2, quant_type=quant_type)

# --- Loading first adapter (2-bits, QKGateUp) ---
model_name_lora = "Tfloow/Meta-Llama-3.1-8B-Instruct-AWQ-INT4_simulated_2-bits_lora_test_QKGateUp"
model_with_adapters = PeftMixedModel.from_pretrained(
    model,
    model_name_lora,
    adapter_name="adapter_qk_up",
)

# --- Loading second adapter (3-bits, V) ---
model_name_lora_2 = "Tfloow/Meta-Llama-3.1-8B-Instruct-AWQ-INT4_simulated_3-bits_lora_test_V"

# Use load_adapter() to add the second adapter to the existing model.
model_with_adapters.load_adapter(
    model_name_lora_2,
    adapter_name="adapter_v_proj",
)

# Activate both adapters at once
model_with_adapters.set_adapter(["adapter_qk_up", "adapter_v_proj"])
# This represents a ~ 30 % reduction compared to quantize AWQ INT4 only
# This also yields better perplexity than baseline model

# Tokenizer shipped with the adapter repository, loaded once. The EOS token is
# reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name_lora)
tokenizer.pad_token = tokenizer.eos_token

ppl = compute_perplexity(model_with_adapters, tokenizer, input_texts, batch_size=1, max_length=512)
print(f"Perplexity on eval dataset: {ppl}")

In [None]:
# Chat-format prompt: a system persona plus a single user question.
prompt = [
  {"role": "system", "content": "You are a helpful assistant, that responds as a pirate."},
  {"role": "user", "content": "What's Deep Learning?"},
]
# Render the messages with the tokenizer's chat template, append the generation
# prompt, and move the resulting dict of PyTorch tensors to the GPU.
inputs = tokenizer.apply_chat_template(
  prompt,
  tokenize=True,
  add_generation_prompt=True,
  return_tensors="pt",
  return_dict=True,
).to("cuda")

# Sampled generation (do_sample=True); no seed is set in the visible cells, so
# the text can differ between runs.
# NOTE(review): this calls `model`, not `model_with_adapters` - confirm that the
# LoRA adapters are meant to be (and are) active for this generation.
outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
# Slice off the prompt tokens so only the newly generated text is printed.
print(tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0])

In [None]:
# @title Unload the model to free up GPU memory
# Keep the return value: None on success, the model itself if unloading failed.
model = unload_model(model)
# The PEFT wrapper holds its own reference to the base model, so both names
# must be dropped before the garbage collector can reclaim the weights.
del model, model_with_adapters
gc.collect()
torch.cuda.empty_cache()
torch.cuda.ipc_collect()