<p align="center">
  <h1 align="center">🍳 Cookbook 01: Deep Learning Auto-Compression</h1>
  <p align="center">
    <strong>Hugging Face Model Pruning & Mixed-Precision Recipes with GradTracer</strong>
  </p>
</p>

---

In this recipe, we will demonstrate how to compress a standard Hugging Face Transformer model (e.g., `distilbert-base-uncased`) using **GradTracer's FlowTracker**.

Unlike naive magnitude pruning (L1 norm), which blindly cuts the smallest weights, GradTracer monitors the **gradient SNR and velocity** during a brief fine-tuning phase to mathematically identify which parameters are "Information Highways" (high SNR) and which are "Dead Zones" (low SNR).

### We will compare 3 Scenarios:
1. **Baseline**: Dense FP32 Model.
2. **Naive L1 Pruning (50%)**: The industry standard unstructured pruning.
3. **GradTracer Recipe (50% Sparsity Target)**: Automatically generated JSON recipe applied to maintain accuracy while maximizing VRAM reduction.

## 1. Setup Environment & Load Hugging Face Model

In [None]:
# !pip install gradtracer torch transformers datasets scikit-learn numpy

In [None]:
import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader
import copy
import json
import time
from gradtracer import FlowTracker
from gradtracer import RecipeGenerator

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Running on: {device}")

# Tokenizer for the DistilBERT checkpoint, plus the Rotten Tomatoes
# dataset used for sentiment analysis.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
dataset = load_dataset("rotten_tomatoes")

def tokenize_fn(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples["text"], **encode_options)

# Tokenize the dataset in batches, then expose only the model inputs and
# the label column as torch tensors.
tokenized_ds = dataset.map(tokenize_fn, batched=True)
tensor_columns = ["input_ids", "attention_mask", "label"]
tokenized_ds.set_format(type="torch", columns=tensor_columns)

# Training batches are shuffled. Note that `test_loader` iterates the
# "validation" split.
train_loader = DataLoader(tokenized_ds["train"], batch_size=32, shuffle=True)
test_loader = DataLoader(tokenized_ds["validation"], batch_size=64)

def evaluate_accuracy(model, loader, device=None):
    """Return the fraction of examples in ``loader`` that ``model`` classifies correctly.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        loader: Iterable of batches; each batch is a mapping holding
            ``input_ids``, ``attention_mask`` and ``label`` tensors.
        device: Device each batch is moved to. When omitted, the device of
            the model's first parameter is used (CPU for a model without
            parameters), so the function does not rely on a module-level
            ``device`` variable.

    Returns:
        ``correct / total`` as a float.

    Raises:
        ValueError: If ``loader`` yields no examples.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in loader:
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            labels = batch['label'].to(device)
            outputs = model(**inputs)
            preds = outputs.logits.argmax(dim=-1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # An empty loader would otherwise surface as an opaque ZeroDivisionError.
    if total == 0:
        raise ValueError("evaluate_accuracy() received no examples to score")
    return correct / total

# Pre-trained DistilBERT checkpoint with a two-label classification head,
# moved onto the selected device.
model_name = "distilbert-base-uncased"
baseline_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
baseline_model = baseline_model.to(device)

## 2. Profiling Training Dynamics (GradTracer)

In [None]:
print("Attaching FlowTracker...")
tracker = FlowTracker(baseline_model, track_gradients=True, track_weights=True)

optimizer = torch.optim.AdamW(baseline_model.parameters(), lr=2e-5)

# Number of training batches scanned while profiling. A short scan (not a
# full epoch) keeps the demo fast.
PROFILE_BATCHES = 50

print(f"Profiling Model (Training {PROFILE_BATCHES} Batches to map dynamics)...")
baseline_model.train()
# range() comes first so zip() stops after exactly PROFILE_BATCHES batches,
# without fetching an extra one from the loader.
for _, batch in zip(range(PROFILE_BATCHES), train_loader):
    optimizer.zero_grad()
    inputs = {'input_ids': batch['input_ids'].to(device), 'attention_mask': batch['attention_mask'].to(device)}
    labels = batch['label'].to(device)

    outputs = baseline_model(**inputs, labels=labels)
    loss = outputs.loss
    loss.backward()

    # ⚡ ONE LINE: Let GradTracer log the dynamics. This runs after
    # backward() and before the optimizer updates the weights.
    tracker.step(loss=loss.item())

    optimizer.step()

# Score the briefly fine-tuned dense model; both pruning runs are compared
# against this number.
base_acc = evaluate_accuracy(baseline_model, test_loader)
print(f"\n✅ Profiling Complete! Baseline Accuracy: {base_acc*100:.2f}%")

# Two independent deep copies, so each pruning strategy starts from the
# same weights and leaves the baseline untouched.
model_l1, model_gradtracer = (copy.deepcopy(baseline_model) for _ in range(2))

## 3. Naive Magnitude Pruning (L1 Unstructured)
We blindly prune the 50% of weights with the smallest absolute values, ranked globally across all Linear layers of the Transformer (so an individual layer may end up more or less than 50% sparse).

In [None]:
print("Applying 50% Global L1 Magnitude Pruning...")
# Pair every nn.Linear submodule of the copy with its 'weight' parameter.
parameters_to_prune = [
    (submodule, 'weight')
    for submodule in model_l1.modules()
    if isinstance(submodule, nn.Linear)
]

# Rank all collected weights together by absolute value and mask the
# smallest half.
prune.global_unstructured(
    parameters_to_prune,
    pruning_method=prune.L1Unstructured,
    amount=0.5,
)

l1_acc = evaluate_accuracy(model_l1, test_loader)
print(f"⚠️ L1 Pruned Accuracy: {l1_acc*100:.2f}%")

## 4. GradTracer AI-Native Recipe Generation
GradTracer automatically builds an execution manifest that assigns FP16 to high-SNR paths and assigns `nvidia_2:4_structured` or INT8 to stagnant, dying layers.

In [None]:
recipe_gen = RecipeGenerator(tracker)
recipe_json = recipe_gen.generate(target_sparsity=0.5)

# Preview only the metadata and the first two layer entries of the recipe.
first_two_layers = {}
for layer_name, layer_spec in recipe_json["layers"].items():
    if len(first_two_layers) == 2:
        break
    first_two_layers[layer_name] = layer_spec

demo_output = {
    "metadata": recipe_json["metadata"],
    "layers (subset)": first_two_layers,
}
print(json.dumps(demo_output, indent=2))

## 5. Applying the Recipe (Auto-Execution)
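We parse the JSON recipe and execute it, protecting critical information pathways and aggressively pruning the rest. In this demo only `unstructured_l1` instructions are executed; layers assigned any other prune type are left untouched.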

In [None]:
print("Applying GradTracer Recipe...")

# Layer types this demo is willing to prune.
PRUNABLE_LAYER_TYPES = ("Linear", "Conv1D", "Conv2d")

applied_layers = 0
skipped_layers = []  # (layer path, reason) for every instruction not applied

for layer_path, instructions in recipe_json["layers"].items():
    # Prune only Linear/Conv layers in this demo that have > 0 prune ratio
    if not (instructions["prune_ratio"] > 0 and instructions["layer_type"] in PRUNABLE_LAYER_TYPES):
        continue

    # "a.b.weight" -> module "a.b", parameter "weight". A path without a dot
    # resolves to a parameter on the model itself (get_submodule("")).
    module_path, _, param_name = layer_path.rpartition('.')

    prune_type = instructions.get("prune_type")
    if prune_type != "unstructured_l1":
        # Only unstructured L1 pruning is implemented here; record the rest
        # instead of ignoring them silently.
        skipped_layers.append((layer_path, f"unsupported prune_type {prune_type!r}"))
        continue

    try:
        module = model_gradtracer.get_submodule(module_path)
        prune.l1_unstructured(module, name=param_name, amount=instructions["prune_ratio"])
    except (AttributeError, ValueError, TypeError) as e:
        # Stay best-effort (one bad entry must not abort the demo), but keep
        # the reason so the accuracy below can be interpreted correctly.
        skipped_layers.append((layer_path, f"{type(e).__name__}: {e}"))
    else:
        applied_layers += 1

print(f"Pruned {applied_layers} layer(s); skipped {len(skipped_layers)}.")
for skipped_path, reason in skipped_layers:
    print(f"  ⚠️ Skipped {skipped_path}: {reason}")

gt_acc = evaluate_accuracy(model_gradtracer, test_loader)
print(f"🌟 GradTracer Recipe Accuracy: {gt_acc*100:.2f}%")

# Report Results: accuracy of each scenario, the drop relative to the dense
# baseline (in percentage points), and the savings estimated by the recipe.
banner = "================================================="
l1_drop = (base_acc - l1_acc) * 100
gt_drop = (base_acc - gt_acc) * 100
recipe_meta = recipe_json['metadata']
vram_saved = recipe_meta['estimated_vram_saving_mb']
flops_saved = recipe_meta['estimated_flops_reduction_ratio'] * 100

print("\n" + banner)
print("📊 HF Transformer Ablation (Target Sparsity: 50%)")
print(banner)
print(f"1. Dense FP32 Baseline     : {base_acc*100:.2f}%")
print(f"2. L1 Magnitude Pruned     : {l1_acc*100:.2f}% (Drop: {l1_drop:.2f}%)")
print(f"3. GradTracer Auto-Recipe  : {gt_acc*100:.2f}% (Drop: {gt_drop:.2f}%)")
print("-------------------------------------------------")
print(f"💡 GradTracer Savings       : {vram_saved} MB VRAM, {flops_saved:.1f}% FLOPs reduction.")
print(banner)