In [1]:
!pip install -q transformers bitsandbytes accelerate torch tokenizer
!pip install -U datasets



In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from google.colab import userdata

# Checkpoint to fine-tune, and the Hugging Face access token kept in Colab secrets.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
hf_token = userdata.get('COLAB_HF_TOKEN')

# Tokenizer and model come from the same repository and use the same token.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Weights are requested in bfloat16; device_map="auto" leaves device placement
# to the library.
model_load_options = {
    "token": hf_token,
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_options)

print("Model loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Model loaded successfully!


In [3]:
from datasets import load_dataset

# Fetch the training split of the Dolly 15k instruction dataset.
dataset_id = "databricks/databricks-dolly-15k"
ds = load_dataset(dataset_id, split="train")

# Print one raw record so the available columns are visible.
first_record = ds[0]
print(first_record)

{'instruction': 'When did Virgin Australia start operating?', 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.", 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.', 'category': 'closed_qa'}


In [4]:
# Turn off gradient tracking for every parameter of the base model.
for base_weight in model.parameters():
    base_weight.requires_grad = False

# Spot-check two weights: the token embedding and one attention projection.
embedding_weight = model.model.embed_tokens.weight
layer15_query_weight = model.model.layers[15].self_attn.q_proj.weight

print("--- After freezing ALL layers ---")
print("Embedding layer frozen?", not embedding_weight.requires_grad)
print("Attention layer 15 frozen?", not layer15_query_weight.requires_grad)

--- After freezing ALL layers ---
Embedding layer frozen? True
Attention layer 15 frozen? True


In [5]:
import torch.nn as nn
import math

class LoRALayer(nn.Module):
    """Wrap a linear layer and add a trainable low-rank update to its output.

    The wrapped layer is called unchanged. On top of its output this module
    adds ``(x @ lora_A @ lora_B) * (alpha / rank)``. ``lora_B`` starts at zero,
    so a freshly built wrapper returns exactly what the wrapped layer returns.

    This class does not change ``requires_grad`` on the wrapped layer; freezing
    the base weights is the caller's responsibility.

    Args:
        original_layer: Layer to wrap. It must expose ``in_features``,
            ``out_features`` and ``weight`` (as ``nn.Linear`` does).
        rank: Inner dimension of the low-rank update; must be positive.
        alpha: Numerator of the scaling factor ``alpha / rank``.

    Raises:
        ValueError: If ``rank`` is not positive.
    """

    def __init__(self, original_layer, rank, alpha):
        super().__init__()

        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        self.original_layer = original_layer

        # Get dimensions from the original layer
        in_features = self.original_layer.in_features
        out_features = self.original_layer.out_features

        # Create the adapter with the wrapped weight's dtype AND device, so the
        # matmuls in forward() never mix tensors from different devices.
        base_weight = self.original_layer.weight
        factory = {"dtype": base_weight.dtype, "device": base_weight.device}

        # kaiming_uniform_ derives fan-in from the tensor's second dimension.
        # Initialise A in (rank, in_features) layout so fan-in is in_features,
        # then store it transposed as (in_features, rank) for `x @ lora_A`.
        lora_a_init = torch.empty((rank, in_features), **factory)
        nn.init.kaiming_uniform_(lora_a_init, a=math.sqrt(5))
        self.lora_A = nn.Parameter(lora_a_init.t().contiguous())

        # B starts at zero so the adapter contributes nothing before training.
        self.lora_B = nn.Parameter(torch.zeros((rank, out_features), **factory))

        # Scaling factor
        self.scaling = alpha / rank

    def forward(self, x):
        """Return the wrapped layer's output plus the scaled low-rank update."""
        # The original linear layer calculation
        original_output = self.original_layer(x)

        # Low-rank update through the A and B matrices
        lora_update = (x @ self.lora_A @ self.lora_B) * self.scaling

        return original_output + lora_update

In [6]:
modules = model.model.layers

# Rank and alpha are the same for every adapter, so define them once.
r = 8
a = 16

# Wrap the query and value projections of every decoder layer with a LoRA adapter.
for module in modules:
    self_attn = module.self_attn

    # Skip projections that are already wrapped so that re-running this cell
    # does not try to wrap a LoRALayer in another LoRALayer.
    if not isinstance(self_attn.q_proj, LoRALayer):
        self_attn.q_proj = LoRALayer(self_attn.q_proj, r, a)
    if not isinstance(self_attn.v_proj, LoRALayer):
        self_attn.v_proj = LoRALayer(self_attn.v_proj, r, a)

In [7]:
# Confirm that only the adapter matrices are trainable on a wrapped projection.

lora_q_proj = model.model.layers[0].self_attn.q_proj
original_weight = lora_q_proj.original_layer.weight

# The two adapter matrices are reported with the same message template.
for adapter_name in ("lora_A", "lora_B"):
    adapter_matrix = getattr(lora_q_proj, adapter_name)
    print(f"Is {adapter_name} trainable? {adapter_matrix.requires_grad}")

print(f"Is the ORIGINAL layer's weight trainable? {original_weight.requires_grad}")

Is lora_A trainable? True
Is lora_B trainable? True
Is the ORIGINAL layer's weight trainable? False


In [8]:
# Give the optimizer only the parameters that are actually trainable. The
# frozen base weights never receive gradients, so there is no reason to
# register them. torch raises ValueError on an empty parameter list, so a
# notebook where nothing is trainable fails here instead of training nothing.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=1e-4)

In [12]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForLanguageModeling

# Requires 'tokenizer' and 'ds' (the dataset) from the earlier cells. tokenizer.pad_token is assigned further down in this cell, before any batch is padded.

# Step 1: Create the 'text' column from the instruction, context, and response.
def format_prompt(example):
    """Render one dataset record as a single prompt string.

    The record's 'instruction' and 'response' are always included. An
    '### Input:' section holding 'context' is inserted between them only when
    'context' is truthy. Sections are separated by a blank line.

    Returns:
        A dict with the single key "text" holding the rendered prompt.
    """
    system_prompt = ""

    context = example['context']
    sections = [f"### Instruction:\n{example['instruction']}"]
    if context:
        sections.append(f"### Input:\n{context}")
    sections.append(f"### Response:\n{example['response']}")

    return {"text": system_prompt + "\n\n".join(sections)}

# Collapse every record into a single 'text' column, dropping the original columns.
formatted_dataset = ds.map(format_prompt, remove_columns=ds.column_names)


def tokenize_function(examples):
    """Tokenize a batch of prompts, truncating each to at most 512 tokens."""
    prompts = examples["text"]
    return tokenizer(prompts, truncation=True, max_length=512)


# Tokenize in batches and drop the raw text once it has been encoded.
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)


# Define the padding token before anything that pads is built: the tokenizer
# reuses its end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# With mlm=False the collator pads each batch and adds a 'labels' tensor that is
# a copy of 'input_ids' with padding positions set to -100. It does not shift
# the labels; the causal-LM model shifts them internally when computing its loss.
# NOTE(review): because pad == eos here, any EOS token present in the inputs is
# masked out of the labels as well.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Create the DataLoader directly on the tokenized dataset.
data_loader = DataLoader(
    tokenized_dataset,
    batch_size=4,
    collate_fn=data_collator,
    shuffle=True
)


# Print the keys of one batch and the shapes of its input_ids and labels
for batch in data_loader:
    print("Batch keys:", batch.keys())
    print("Shape of input_ids:", batch['input_ids'].shape)
    print("Shape of labels:", batch['labels'].shape)
    break

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
Shape of input_ids: torch.Size([4, 512])
Shape of labels: torch.Size([4, 512])


In [None]:
# Train the model (only the LoRA matrices receive gradient updates)

num_epochs = 3
log_every = 100  # optimizer steps between progress lines

# Batches come off the DataLoader as CPU tensors; move them to the device that
# holds the embedding weights, which is where the input ids are consumed first.
input_device = model.model.embed_tokens.weight.device

# Put the model in training mode before the first forward pass.
model.train()

for epoch in range(num_epochs):
    print(f"--- Starting Epoch {epoch+1} ---")

    # Loop over every batch of data from the dataset
    for step, batch in enumerate(data_loader, start=1):
        batch = {name: tensor.to(input_device) for name, tensor in batch.items()}

        # Zero out gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass. The batch carries 'labels', so the model returns the
        # next-token cross-entropy in `outputs.loss`: it shifts the labels by
        # one position and ignores the -100 entries. Feeding the raw
        # (batch, seq, vocab) logits and (batch, seq) labels to
        # CrossEntropyLoss would neither match its expected layout nor shift.
        outputs = model(**batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Update the trainable weights
        optimizer.step()

        if step % log_every == 0:
            print(f"Epoch {epoch+1} | step {step}/{len(data_loader)} | loss {loss.item():.4f}")

    print(f"Epoch {epoch+1} Finished")

--- Starting Epoch 1 ---
