In [8]:
!pip install bitsandbytes
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-



In [1]:
import os
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset, DataLoader
import numpy as np
import json
import math

# Set environment variables for better GPU memory management
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
device = "cuda"

def load_quantized_model_and_tokenizer(model_name="meta-llama/Llama-3.1-8B"):
    # Configure quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load model with quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        # device_map="auto",
        torch_dtype=torch.bfloat16
    ).to("cuda")

    return model, tokenizer

In [2]:
class KFAC:
    def __init__(self, model, damping=1e-3):
        self.model = model
        self.damping = damping
        self.A_dict = {}  # Store activation covariances
        self.G_dict = {}  # Store gradient covariances
        self.registered_modules = []

        # Register hooks for computing Fisher approximation
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                module.register_forward_pre_hook(self._save_input)
                module.register_backward_hook(self._save_grad_output)
                self.registered_modules.append((name, module))

    def _save_input(self, module, input):
        if not hasattr(module, 'input'):
            module.input = []
        module.input.append(input[0].detach().cuda())

    def _save_grad_output(self, module, grad_input, grad_output):
        if not hasattr(module, 'grad_output'):
            module.grad_output = []
        module.grad_output.append(grad_output[0].detach().cuda())

    def update_stats(self):
        for name, module in self.registered_modules:
            if hasattr(module, 'input') and hasattr(module, 'grad_output'):
                # Get the last saved input and gradient
                x = module.input[-1]
                grad_y = module.grad_output[-1]

                # Compute activation covariance (A)
                print(x.shape)
                # Reshape x to ensure it's 3D with shape [batch, seq_len, hidden_dim]
                batch_size = x.size(0) if len(x.shape) > 1 else 1
                x_3d = x.view(batch_size, -1, x.size(-1))

                # Now use bmm with proper 3D tensors
                a = torch.mean(torch.bmm(x_3d.transpose(1, 2), x_3d), dim=0)

                # a = torch.mean(torch.bmm(x.unsqueeze(2), x.unsqueeze(1)), dim=0).cuda()
                if name in self.A_dict:
                    self.A_dict[name] = 0.95 * self.A_dict[name] + 0.05 * a
                else:
                    self.A_dict[name] = a

                # Compute gradient covariance (G)
                batch_size = grad_y.size(0) if len(grad_y.shape) > 1 else 1
                grad_y = grad_y.view(batch_size, -1, grad_y.size(-1))
                g = torch.mean(torch.bmm(grad_y.transpose(1, 2), grad_y), dim=0).cuda()
                if name in self.G_dict:
                    self.G_dict[name] = 0.95 * self.G_dict[name] + 0.05 * g
                else:
                    self.G_dict[name] = g

                # Clear saved tensors to free memory
                module.input.clear()
                module.grad_output.clear()

    def get_kfac_preconditioned_update(self, name, module, weight_grad):
        if name not in self.A_dict or name not in self.G_dict:
            return weight_grad

        # Get KFAC matrices
        A = self.A_dict[name]
        G = self.G_dict[name]

        # Add damping
        A_eigenvalues = torch.linalg.eigvalsh(A)
        G_eigenvalues = torch.linalg.eigvalsh(G)
        damping_A = torch.max(torch.tensor(self.damping, device=A.device),
                             torch.min(A_eigenvalues) * 0.01)
        damping_G = torch.max(torch.tensor(self.damping, device=G.device),
                             torch.min(G_eigenvalues) * 0.01)

        A_damped = A + damping_A * torch.eye(A.shape[0], device=A.device)
        G_damped = G + damping_G * torch.eye(G.shape[0], device=G.device)

        # Compute inverses
        A_inv = torch.inverse(A_damped)
        G_inv = torch.inverse(G_damped)

        # Reshape gradient to match weight matrix
        grad_reshaped = weight_grad.view(weight_grad.shape)

        # Apply KFAC preconditioning
        preconditioned_grad = torch.mm(torch.mm(A_inv, grad_reshaped), G_inv)

        return preconditioned_grad.view_as(weight_grad)


In [3]:
class NeuralReprojection:
    def __init__(self, k=10):
        self.k = k

    def project(self, fisher_matrix, update_vector):
        """
        Project update_vector onto the subspace spanned by the top k eigenvectors of fisher_matrix
        """
        # Ensure computation happens on GPU
        fisher_matrix = fisher_matrix.cuda()
        update_vector = update_vector.cuda()

        # Compute eigendecomposition of the fisher matrix
        eigenvalues, eigenvectors = torch.linalg.eigh(fisher_matrix)

        # Sort eigenvalues and eigenvectors in descending order
        sorted_indices = torch.argsort(eigenvalues, descending=True)
        eigenvalues = eigenvalues[sorted_indices]
        eigenvectors = eigenvectors[:, sorted_indices]

        # Select top k eigenvectors
        U_k = eigenvectors[:, :self.k]

        # Project update vector onto the subspace spanned by U_k
        projected_update = U_k @ (U_k.T @ update_vector)

        return projected_update


In [4]:
class GRITLoRA:
    def __init__(self, model, rank=8, alpha=16, k_proj=10):
        self.model = model
        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank

        # Initialize KFAC and Neural Reprojection
        self.kfac = KFAC(model)
        self.neural_reprojection = NeuralReprojection(k=k_proj)

        # Apply LoRA to the model
        self.setup_lora()

    def setup_lora(self):
        # Define LoRA configuration for quantized model
        lora_config = LoraConfig(
            r=self.rank,
            lora_alpha=self.alpha,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )

        # Apply LoRA to the model
        self.model = get_peft_model(self.model, lora_config)

    def apply_natural_gradient_update(self, optimizer):
        """Apply natural gradient update using KFAC and neural reprojection"""
        # Update KFAC statistics
        self.kfac.update_stats()

        # Get all trainable parameters (LoRA parameters only)
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                # Only process parameters that require gradients (LoRA params)
                if param.grad is not None:
                    # Get the module this parameter belongs to
                    module_name = name.split('.')[0]  # Simplified - adjust as needed

                    # Get KFAC preconditioned gradients
                    precond_grad = self.kfac.get_kfac_preconditioned_update(
                        module_name, None, param.grad
                    )

                    # Get Fisher matrix approximation for this parameter
                    fisher = self.kfac.A_dict.get(module_name,
                                                 torch.eye(param.grad.shape[0],
                                                           device=param.device))

                    # Apply neural reprojection
                    proj_grad = self.neural_reprojection.project(fisher, precond_grad)

                    # Replace gradient with projected one
                    param.grad = proj_grad

        # Let the optimizer apply the updates
        optimizer.step()


In [5]:
class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self.max_length = max_length

        for item in data:
            # Format: instruction, input (optional), output
            if 'input' in item and item['input']:
                prompt = f"Instruction: {item['instruction']}\nInput: {item['input']}\nOutput:"
            else:
                prompt = f"Instruction: {item['instruction']}\nOutput:"

            self.inputs.append(prompt)
            self.targets.append(item['output'])

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]

        # Tokenize input
        input_encoding = self.tokenizer(input_text,
                                       return_tensors='pt',
                                       max_length=self.max_length,
                                       truncation=True,
                                       padding='max_length')

        # Tokenize target with the EOS token
        target_encoding = self.tokenizer(target_text,
                                        return_tensors='pt',
                                        max_length=self.max_length,
                                        truncation=True,
                                        padding='max_length')

        # Create labels by combining input and target
        labels = torch.full_like(input_encoding['input_ids'], -100)  # Ignore loss for input tokens

        # Set target tokens for loss calculation
        target_len = target_encoding['input_ids'].size(1)
        input_len = input_encoding['input_ids'].size(1)

        if input_len + target_len <= self.max_length:
            # If we can fit both input and target
            labels[:, input_len:input_len+target_len] = target_encoding['input_ids']
        else:
            # If we need to truncate
            available_len = self.max_length - input_len
            labels[:, input_len:] = target_encoding['input_ids'][:, :available_len]

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': labels.squeeze()
        }


In [6]:
def train_with_grit(model, tokenizer, train_dataset, val_dataset=None,
                   epochs=3, batch_size=1, learning_rate=1e-4,
                   rank=8, alpha=16, k_proj=10):
    """
    Train a quantized model using GRIT
    """
    # Initialize GRIT LoRA
    grit_model = GRITLoRA(model, rank=rank, alpha=alpha, k_proj=k_proj)

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size) if val_dataset else None

    # Initialize optimizer (we only optimize the LoRA parameters)
    optimizer = torch.optim.AdamW(grit_model.model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(epochs):
        grit_model.model.train()
        total_loss = 0

        for batch in train_loader:
            # Move batch to GPU
            batch = {k: v.cuda() for k, v in batch.items()}

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = grit_model.model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass
            loss.backward()

            # Apply natural gradient update with KFAC and neural reprojection
            grit_model.apply_natural_gradient_update(optimizer)

        # Print epoch stats
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")

        # Validation
        if val_loader:
            grit_model.model.eval()
            val_loss = 0

            with torch.no_grad():
                for batch in val_loader:
                    batch = {k: v.cuda() for k, v in batch.items()}
                    outputs = grit_model.model(
                        input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels']
                    )
                    val_loss += outputs.loss.item()

            avg_val_loss = val_loss / len(val_loader)
            print(f"Validation Loss: {avg_val_loss:.4f}")

    return grit_model


In [7]:
from huggingface_hub import login

login("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")

In [8]:
def main():
    # Check GPU availability
    if not torch.cuda.is_available():
        raise RuntimeError("This implementation requires a GPU with CUDA support")

    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

    # Load quantized model and tokenizer
    model, tokenizer = load_quantized_model_and_tokenizer("meta-llama/Llama-3.1-8B")

    # Load and prepare your dataset
    train_data = [
        {"instruction": "Summarize this text",
         "input": "The effects of climate change are becoming increasingly evident worldwide. Rising temperatures have led to melting ice caps, rising sea levels, and more frequent extreme weather events.",
         "output": "Climate change is causing rising temperatures, melting ice caps, rising sea levels, and more extreme weather."},

        {"instruction": "Translate this text to French",
         "input": "Hello, how are you today? I hope you're doing well.",
         "output": "Bonjour, comment allez-vous aujourd'hui? J'espère que vous allez bien."},

        {"instruction": "Extract the main entities from this text",
         "input": "Apple Inc. announced yesterday that CEO Tim Cook will present the new iPhone 15 at their headquarters in Cupertino, California next month.",
         "output": "Entities: Apple Inc., Tim Cook, iPhone 15, Cupertino, California"},

        # Add more examples as needed
    ]

    # Create datasets
    train_dataset = InstructionDataset(train_data, tokenizer)
    val_dataset = InstructionDataset(train_data[:1], tokenizer)  # Small validation set

    # Train with GRIT
    grit_model = train_with_grit(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        epochs=3,
        batch_size=1,  # Small batch size for quantized model
        learning_rate=1e-4,
        rank=8,
        alpha=16,
        k_proj=10
    )

    # Save the fine-tuned model
    output_dir = "llama-3.1-grit-quantized"
    os.makedirs(output_dir, exist_ok=True)

    # Save the PEFT model
    grit_model.model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print(f"Model saved to {output_dir}")

    # Test the model with a sample instruction
    test_instruction = "Explain the concept of machine learning"
    generate_response(grit_model.model, tokenizer, test_instruction)

if __name__ == "__main__":
    main()


Using GPU: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  self._maybe_warn_non_full_backward_hook(args, result, grad_fn)


torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 14336])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 14336])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])
torch.Size([1, 512, 4096])


OutOfMemoryError: CUDA out of memory. Tried to allocate 392.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 80.12 MiB is free. Process 165302 has 14.66 GiB memory in use. Of the allocated memory 11.41 GiB is allocated by PyTorch, and 3.12 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)