In [None]:
%pip install torch numpy pandas transformers bitsandbytes huggingface_hub



In [None]:
!pip install --upgrade bitsandbytes



In [None]:
import torch

# Report the CUDA environment. The device name is only queried when CUDA is
# available, because torch.cuda.get_device_name(0) raises on a CPU-only
# runtime and would abort the rest of the notebook.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected")
print(torch.version.cuda)


True
NVIDIA A100-SXM4-80GB
12.6


In [None]:
import pandas as pd
import numpy as np
from google.colab import userdata
from huggingface_hub import login


# Candidate Hugging Face checkpoints; later cells select one of them by index.
models = [
    "Qwen/Qwen2.5-7B-Instruct",
    "google/gemma-3-270m",
    "google/vaultgemma-1b",
    "apple/FastVLM-0.5B",
]

# Log in to the Hugging Face Hub with the token read from Colab `userdata`.
hugging_face_token = userdata.get("HuggingFaceToken")
login(token=hugging_face_token)

# Parquet files of the AG News dataset on the Hub, keyed by split name.
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}

# The balanced training set is read from the Hub; the imbalanced training set
# and the small test set are read from files next to the notebook.
df_train = pd.read_parquet(f"hf://datasets/fancyzhx/ag_news/{splits['train']}")
df_train_imbalanced = pd.read_parquet("./ag_news_train_imbalanced.parquet")
df_test = pd.read_parquet("./ag_news_test_small.parquet")

# Preview the first five rows of each frame.
for heading, frame in (
    ("Balanced Training data", df_train),
    ("Imbalanced Training data", df_train_imbalanced),
    ("Testing data", df_test),
):
    print(f"\n{heading}: \n{frame.head(5)}")





Balanced Training data: 
                                                text  label
0  Wall St. Bears Claw Back Into the Black (Reute...      2
1  Carlyle Looks Toward Commercial Aerospace (Reu...      2
2  Oil and Economy Cloud Stocks' Outlook (Reuters...      2
3  Iraq Halts Oil Exports from Main Southern Pipe...      2
4  Oil prices soar to all-time record, posing new...      2

Imbalanced Training data: 
                                                text  label
0  Low Turnout Sinks Macedonian Bid to Kill Right...      0
1  Rallying Red Sox on Verge of Historic Win  NEW...      1
2  Astros Pick Munro to Pitch Game 6 of N.L.C.S. ...      1
3  Arafat supporters hold overnight vigil outside...      0
4  Twin Car Bombs Explode in Baghdad, Killing at ...      0

Testing data: 
                                                text  label
0  Peruvian Maoist trial thrown into chaos The fi...      0
1  Running may have defined the body Next time yo...      3
2  Report: CARE Hostage Faces 

In [None]:
import transformers
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import bitsandbytes
import torch

# Function to load the model
def load_model_tokenizer(model_name: str, bnb_config: BitsAndBytesConfig = None):
  """
  Load a causal language model and its tokenizer from the Hugging Face Hub.

  Args:
      model_name (str): Hub repository id of the model (e.g. "Qwen/Qwen2.5-7B-Instruct").
      bnb_config (BitsAndBytesConfig): Optional BitsAndBytes quantization configuration.
          When given, the model is loaded quantized with `device_map="auto"`.
          When omitted, the model is loaded unquantized and moved to CUDA if a
          CUDA device is available, otherwise it stays on the CPU.
  Returns:
      Tuple[AutoModelForCausalLM, AutoTokenizer]: The model and tokenizer.
  """
  if bnb_config:
      model = AutoModelForCausalLM.from_pretrained(
          model_name,
          quantization_config=bnb_config,
          device_map="auto"  # let the loader decide where each part of the model is placed
      )
  else:
      use_cuda = torch.cuda.is_available()
      if not use_cuda:
          print("CUDA not available. Loading model to CPU.")
      model = AutoModelForCausalLM.from_pretrained(model_name)
      if use_cuda:
          model.to('cuda')

  # `use_auth_token` is deprecated in transformers in favour of `token`. The
  # credentials stored by huggingface_hub.login() are used without passing
  # either, so no authentication argument is needed here.
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # Fall back to the EOS token only when the tokenizer has no padding token of
  # its own. Overwriting an existing pad token makes pad and EOS
  # indistinguishable, so a collator that masks padding in the labels would
  # also mask genuine EOS tokens.
  if tokenizer.pad_token is None:
      tokenizer.pad_token = tokenizer.eos_token

  return model, tokenizer

In [None]:
# Create a function to prompt the LLM
# Maps the integer class ids used in the `label` column to category names.
label_map = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

def build_shots_prompt(train_df, shots=None, imbalanced_ratio=None):
    """
    Build the instruction header plus few-shot examples of a classification prompt.

    Args:
        train_df: DataFrame with a `text` column and an integer `label` column
            whose values are keys of `label_map`.
        shots: Number of examples to sample for every label present in
            `train_df` (balanced prompt). Takes precedence over
            `imbalanced_ratio` when both are given.
        imbalanced_ratio: Mapping of label -> number of examples to sample for
            that label (imbalanced prompt). Used only when `shots` is falsy.

    Returns:
        str: The prompt lines joined with newlines. Every example is followed
        by a blank line, so the string ends with a newline when at least one
        example was added.

    Raises:
        ValueError: If neither `shots` nor `imbalanced_ratio` is provided.
    """
    # Normalize both modes to one "label -> number of examples" mapping so the
    # example-rendering loop exists only once.
    if shots:
        shots_per_label = {label: shots for label in sorted(train_df['label'].unique())}
    elif imbalanced_ratio:
        shots_per_label = imbalanced_ratio
    else:
        # The message names the actual parameters of this function.
        raise ValueError("Must provide either 'shots' or 'imbalanced_ratio'")

    prompt_lines = ["Classify the following text into one of these categories: World, Sports, Business, Sci/Tech",
                   "",
                   "IMPORTANT: Respond with ONLY the category name, nothing else.",
                   "",
                   "Examples:"]

    for label, n_shots in shots_per_label.items():
        # The fixed random_state makes the sampled examples reproducible.
        class_samples = train_df[train_df['label'] == label].sample(n_shots, random_state=42)
        for _, row in class_samples.iterrows():
            prompt_lines.append(f"Text: {row['text']}")
            prompt_lines.append(f"Category: {label_map[row['label']]}")
            prompt_lines.append("")  # Add a blank line between examples

    # Join all lines into a single string
    return "\n".join(prompt_lines)



In [None]:
# Create a function to preprocess the data, or tokenize words
from functools import partial
from datasets import Dataset, DatasetDict

def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed: int, dataset: pd.DataFrame, shots=None, imbalanced_ratio=None):
  """
    Turn a DataFrame of texts into a tokenized, shuffled Hugging Face Dataset.

    Every row gets a `prompted_text` column of the form
    `<few-shot prefix>\\nText: <text>\\nCategory:`, which is then tokenized
    (padded/truncated to `max_length`).

    Args:
        tokenizer (AutoTokenizer): The tokenizer.
        max_length (int): Length every tokenized example is padded/truncated to.
        seed (int): The seed for shuffling the dataset.
        dataset (pd.DataFrame): DataFrame with at least a `text` column; a
            `label` column is also needed when few-shot examples are requested.
            The frame is not modified.
        shots: Optional number of few-shot examples per class, forwarded to
            `build_shots_prompt`.
        imbalanced_ratio: Optional mapping label -> number of few-shot
            examples, forwarded to `build_shots_prompt`.
            When neither `shots` nor `imbalanced_ratio` is given, the few-shot
            prefix is empty (zero-shot prompt).

    Returns:
        Dataset: The tokenized and shuffled dataset.
    """
  # build_shots_prompt samples with a fixed random_state, so the few-shot block
  # is identical for every row: build it once instead of once per row.
  # NOTE(review): the examples are sampled from `dataset` itself, so a row can
  # appear among its own few-shot examples — confirm this is acceptable.
  if shots or imbalanced_ratio:
    few_shot_prefix = build_shots_prompt(
        train_df=dataset,
        shots=shots,
        imbalanced_ratio=imbalanced_ratio
    )
  else:
    few_shot_prefix = ""

  # Build the prompt column in pandas: a Hugging Face Dataset has no `.apply`
  # and does not support column assignment. `assign` returns a new frame, so
  # the caller's DataFrame is left untouched.
  # NOTE(review): the prompt ends at "Category:" and the label text is not
  # appended — confirm that is the intended training target.
  prompted_df = dataset.assign(
      prompted_text=[
          f"{few_shot_prefix}\nText: {text}\nCategory:" for text in dataset["text"]
      ]
  )

  # Convert Pandas Dataframe into HuggingFace Dataset (without carrying the
  # DataFrame index along as an extra column).
  dataset_hgf = Dataset.from_pandas(prompted_df, preserve_index=False)

  def tokenize_batch(batch):
    return tokenizer(
        batch["prompted_text"],
        padding="max_length",
        truncation=True,
        max_length=max_length
    )

  dataset_hgf = dataset_hgf.map(
      tokenize_batch,
      batched=True
  )

  dataset_hgf = dataset_hgf.shuffle(seed=seed)

  return dataset_hgf

In [None]:
from typing import Union, List, Tuple
from transformers import BitsAndBytesConfig
from peft import LoraConfig

def get_qlora_configs(load_in_4bit: bool,
                         bnb_4bit_use_double_quant: bool,
                         bnb_4bit_quant_type: str,
                         bnb_4bit_compute_dtype: torch.dtype,
                         r: int,
                         lora_alpha: int,
                         target_modules: Union[List[str],str],
                         lora_dropout: float,
                         bias: str,
                         task_type: str) -> Tuple[BitsAndBytesConfig, LoraConfig]:
    """
    Build the two configuration objects used for QLoRA fine-tuning.

    The first four arguments are forwarded unchanged to `BitsAndBytesConfig`,
    the remaining six to `LoraConfig`.

    Args:
        load_in_4bit (bool): Enables 4-bit quantization, replacing the Linear layers with FP4/NF4
            layers from `bitsandbytes`.
        bnb_4bit_use_double_quant (bool): Enables nested quantization, where the quantization
            constants from the first quantization are quantized again.
        bnb_4bit_quant_type (str): Quantization data type of the bnb.nn.Linear4Bit layers,
            either `fp4` or `nf4`.
        bnb_4bit_compute_dtype (torch.dtype): Computation dtype, which may differ from the input
            type (e.g. fp32 inputs with bf16 computation for speed).
        r (int): LoRA attention dimension.
        lora_alpha (int): The alpha parameter for LoRA scaling.
        target_modules (Union[List[str],str]): Names of the modules to apply LoRA to.
        lora_dropout (float): Dropout probability for the LoRA layers.
        bias (str): Bias type for LoRA: 'none', 'all' or 'lora_only'. With 'all' or 'lora_only' the
            corresponding biases are updated during training, so even with the adapters disabled
            the model will not reproduce the output of the unadapted base model.
        task_type (str): The task type for the model.

    Returns:
        Tuple[BitsAndBytesConfig, LoraConfig]: The BitsAndBytes and LoRA configurations, in that order.
    """
    quantization_kwargs = dict(
        load_in_4bit=load_in_4bit,
        bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    )
    adapter_kwargs = dict(
        r=r,
        lora_alpha=lora_alpha,
        target_modules=target_modules,
        lora_dropout=lora_dropout,
        bias=bias,
        task_type=task_type,
    )

    return BitsAndBytesConfig(**quantization_kwargs), LoraConfig(**adapter_kwargs)

In [None]:
# Create a function to prepare the model for fine-tuning
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training,
)

def find_all_linear_names(model, exclude=("lm_head",)):
    """
    Find the linear layers of the model that should be adapted with LoRA.

    Args:
        model: A torch module; its `named_modules()` are scanned.
        exclude: Leaf module names (the last component of the dotted name) to
            leave out. Defaults to the output head `lm_head`: putting an
            adapter on it is what the tied-embedding guidance shown when
            merging adapters is about. Pass `exclude=()` to return every
            linear layer.

    Returns:
        list[str]: Full dotted names of all `torch.nn.Linear` modules whose
        leaf name is not excluded, in `named_modules()` order.
    """
    excluded_leaves = set(exclude or ())
    return [
        name
        for name, module in model.named_modules()
        if isinstance(module, torch.nn.Linear)
        and name.rsplit(".", 1)[-1] not in excluded_leaves
    ]

def create_peft_config(lora_r, lora_alpha, target_modules, lora_dropout, bias, task_type):
    """
    Build a `LoraConfig` from plain hyper-parameter values.

    `task_type` is given as a string; it is upper-cased and looked up by name
    in peft's `TaskType` enum. All other arguments are passed through.
    """
    resolved_task_type = TaskType[task_type.upper()]
    settings = {
        "r": lora_r,
        "lora_alpha": lora_alpha,
        "target_modules": target_modules,
        "lora_dropout": lora_dropout,
        "bias": bias,
        "task_type": resolved_task_type,
    }
    return LoraConfig(**settings)

def prepare_model_for_fine_tune(model: AutoModelForCausalLM,
                                 lora_r: int,
                                 lora_alpha: int,
                                 lora_dropout: float,
                                 bias: str,
                                 task_type: str) -> AutoModelForCausalLM:
    """
    Wrap a loaded causal LM with LoRA adapters so it can be fine-tuned.

    In order: gradient checkpointing is enabled on the model, the model is
    passed through peft's `prepare_model_for_kbit_training`, the layers
    returned by `find_all_linear_names` become the LoRA targets, the model is
    wrapped with `get_peft_model`, and `config.use_cache` is set to False.

    Args:
        model (AutoModelForCausalLM): The model that will be fine-tuned.
        lora_r (int): LoRA attention dimension.
        lora_alpha (int): The alpha parameter for LoRA scaling.
        lora_dropout (float): The dropout probability for LoRA layers.
        bias (str): Bias type for LoRA: 'none', 'all' or 'lora_only'. With 'all' or 'lora_only' the
            corresponding biases are updated during training, so even with the adapters disabled
            the model will not reproduce the output of the unadapted base model.
        task_type (str): The task type for the model.

    Returns:
        AutoModelForCausalLM: The model prepared for fine-tuning.
    """
    model.gradient_checkpointing_enable()
    kbit_ready_model = prepare_model_for_kbit_training(model)

    # The LoRA targets are discovered on the prepared model.
    linear_layer_names = find_all_linear_names(kbit_ready_model)
    adapter_config = create_peft_config(
        lora_r, lora_alpha, linear_layer_names, lora_dropout, bias, task_type
    )

    # Wrap the prepared model with the adapters described by the configuration.
    peft_wrapped_model = get_peft_model(kbit_ready_model, adapter_config)
    peft_wrapped_model.config.use_cache = False

    return peft_wrapped_model

In [None]:
# A function to print trainable params

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

def print_trainable_params(model):
  """
  Print how many of the model's parameters are trainable.

  Counts the elements of every parameter, and separately those whose
  `requires_grad` is set, then prints both counts and the trainable percentage.
  """
  trainable = 0
  total = 0
  for param in model.parameters():
    count = param.numel()
    total += count
    if param.requires_grad:
      trainable += count
  print(f"Trainable params: {trainable} || Total params: {total} || Trainable%: {100 * trainable/total:.2f}")

def trainer(
    tokenizer,
    model_name,
    dataset,
    batch_size_per_device,
    gradient_acc,
    warmup_steps=0,
    lr=2e-4,
    output_dir="output_dir",
    num_train_epochs=10,
    fp16=True
    ):
  """
  Build a Hugging Face `Trainer` for causal-LM fine-tuning.

  Args:
      tokenizer: Tokenizer handed to `DataCollatorForLanguageModeling` (mlm=False).
      model_name: Despite its name, this is the model object to train; it is
          passed to `Trainer(model=...)`.
      dataset: Training dataset.
      batch_size_per_device: Per-device training batch size.
      gradient_acc: Number of gradient accumulation steps.
      warmup_steps: Number of learning-rate warmup steps.
      lr: Learning rate.
      output_dir: Output directory given to `TrainingArguments`.
      num_train_epochs: Number of training epochs.
      fp16: Whether to enable fp16 mixed-precision training.

  Returns:
      Trainer: The configured trainer (training is not started here).
  """
  training_args = TrainingArguments(
      per_device_train_batch_size=batch_size_per_device,
      gradient_accumulation_steps=gradient_acc,
      warmup_steps=warmup_steps,
      learning_rate=lr,
      fp16=fp16,  # honor the caller's choice; this was hard-coded to True before
      output_dir=output_dir,
      num_train_epochs=num_train_epochs
  )
  return Trainer(
      model=model_name,
      train_dataset=dataset,
      args=training_args,
      data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
  )


In [None]:
# Fine-tuning function
def fine_tune(model: AutoModelForCausalLM, trainer: Trainer, output_dir: str, tokenizer=None) -> None:
    """
    Run training, then save the model, the tokenizer and the training metrics.

    Args:
        model (AutoModelForCausalLM): The model to fine-tune.
        trainer (Trainer): The trainer with the training configuration.
        output_dir (str): The output directory to save the model, tokenizer
            and `metrics.json` in. It is created if it does not exist.
        tokenizer: Tokenizer to save next to the model. When omitted, the
            tokenizer of the trainer's data collator is used if it has one;
            otherwise no tokenizer is saved and a notice is printed.
    """
    import gc
    import json
    import os

    print("=== FINE-TUNING STARTS ===")

    train_result = trainer.train()

    # Make sure the target directory exists before anything is written to it.
    os.makedirs(output_dir, exist_ok=True)

    # Save the trained weights first, so that a failure while writing the
    # metrics or the tokenizer cannot cost the result of the training run.
    trainer.save_model(output_dir)

    # Use an explicitly supplied tokenizer or the collator's tokenizer instead
    # of relying on a notebook-global variable named `tokenizer`.
    if tokenizer is None:
        tokenizer = getattr(trainer.data_collator, "tokenizer", None)
    if tokenizer is not None:
        tokenizer.save_pretrained(output_dir)
    else:
        print("No tokenizer available to save; pass `tokenizer=` to fine_tune.")

    with open(os.path.join(output_dir, "metrics.json"), 'w') as f:
        json.dump(train_result.metrics, f, indent=4)

    # Free memory. `del` only drops this function's own references: the caller
    # must also drop its references for the memory to be reclaimed. Collect
    # garbage first so the CUDA cache can release the freed blocks.
    del model, trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


In [None]:
# Driver cell: defines the hyper-parameters, loads the model and tokenizer,
# wraps the model with LoRA adapters, tokenizes the training data, builds the
# Trainer and runs fine-tuning. The statements depend on each other in order.

# Define QLoRA parameters
load_in_4bit = False # Set to False to disable 4-bit quantization
bnb_4bit_use_double_quant = False
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = torch.bfloat16
lora_r = 64
lora_alpha = 16
target_modules = None # Let find_all_linear_names determine the target modules
lora_dropout = 0.1
bias = "none"
task_type = "CAUSAL_LM"


# If load_in_4bit is False, bnb_config is not needed for quantization
# NOTE(review): lora_config is not referenced again in this cell;
# prepare_model_for_fine_tune below receives the raw LoRA hyper-parameters.
bnb_config, lora_config = get_qlora_configs(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias=bias,
    task_type=task_type
)


# Load the model and tokenizer
model_name_to_finetune = models[0] # Use the first model from the list

# Pass bnb_config only if load_in_4bit is True
model, tokenizer = load_model_tokenizer(
    model_name_to_finetune,
    bnb_config if load_in_4bit else None # Pass bnb_config only if needed
)



# Wrap the loaded model with LoRA adapters (rebinds `model` to the wrapped model).
model = prepare_model_for_fine_tune(
    model=model,
    lora_r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias=bias,
    task_type=task_type
)


print_trainable_params(model)


# Tokenize the balanced training frame.
# NOTE(review): shots / imbalanced_ratio are left at their defaults (None)
# in this call — confirm that is intended.
max_length = 512
seed = 42
dataset = preprocess_dataset(tokenizer=tokenizer, max_length=max_length, seed=seed, dataset=df_train)


# Training hyper-parameters.
# NOTE(review): logging_steps and optim are defined here but are not passed to
# trainer() below.
per_device_train_batch_size = 16
gradient_accumulation_steps = 4
fp16 = True
learning_rate = 2e-4
warmup_steps = 50
logging_steps = 10
output_dir = "training_output"
optim = "paged_adamw_8bit"
num_train_epochs = 3


trainer_instance = trainer(
    tokenizer=tokenizer,
    model_name=model,
    dataset=dataset,
    batch_size_per_device=per_device_train_batch_size,
    gradient_acc=gradient_accumulation_steps,
    warmup_steps=warmup_steps,
    lr=learning_rate,
    # Set fp16 to False if not using quantization, or if it causes issues
    fp16=False if not load_in_4bit else fp16,
    output_dir=output_dir,
    num_train_epochs=num_train_epochs
)



# Train, then save the model, tokenizer and metrics into output_dir.
fine_tune(model, trainer_instance, output_dir)

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]



tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Trainable params: 171442176 || Total params: 7787058688 || Trainable%: 2.20


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

=== FINE-TUNING STARTS ===


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhuysuy05[0m ([33mhuysuy05-augustana-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,2.1858
1000,2.0903
1500,2.0586




Step,Training Loss
500,2.1858
1000,2.0903
1500,2.0586


In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Merge the trained LoRA adapter into the base model and save the result.

# Set paths
# NOTE(review): base_model_path is hard-coded rather than taken from the
# training cell — keep it in sync with the model that was fine-tuned.
base_model_path = "Qwen/Qwen2.5-7B-Instruct"
adapter_path = "training_output"  # same directory name as output_dir in the training cell
merged_model_output_path = "fine_tuned_qwen2.5_7B_v1"


# 1. Load the base model in bfloat16 and its tokenizer
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)


# 2. Attach the adapter stored in adapter_path to the base model
peft_model = PeftModel.from_pretrained(base_model, adapter_path)

# 3. Merge and unload the adapter weights
merged_model = peft_model.merge_and_unload()

# 4. Save the merged model and its tokenizer
merged_model.save_pretrained(merged_model_output_path, safe_serialization=True)
tokenizer.save_pretrained(merged_model_output_path)

print(f"Merged model saved to {merged_model_output_path}")

```python
from transformers import AutoModelForCausalLM

# Load original tied model
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", tie_word_embeddings=False)

# Set the randomly initialized lm_head to the previously tied embeddings
model.lm_head.weight.data = model.model.embed_tokens.weight.data.clone()

# Save the untied model
untied_model_dir = "dir/for/untied/model"
model.save_pretrained(untied_model_dir)
model.config.save_pretrained(untied_model_dir)

# Now use the original model but in untied format
model = AutoModelForCausalLM.from_pretrained(untied_model_dir)
```



Merged model saved to fine_tuned_gemma3_270m_v1
