In [None]:
# Import required libraries
import torch
import os
import warnings
warnings.filterwarnings('ignore')

# HuggingFace libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# Import our streamlined modules (token-specific only)
from efficient_gradient_collector import (
    TokenSpecificGradientCollector, 
    collect_token_gradients, 
    load_single_layer_gradients,
    analyze_gradient_diversity,
    check_existing_gradients,
    list_existing_gradients
)
from visualization_utils import (
    create_tsne_with_token_data, 
    create_umap_with_token_data,
    print_token_gradient_summary
)

print("✅ All libraries imported successfully")


✅ All libraries imported successfully


In [2]:
# Configuration for Token-Specific Gradient Analysis
MODEL_NAME = "roneneldan/TinyStories-1M"
DATASET_NAME = "roneneldan/TinyStories"
SAVE_DIR = "tinystories_gradients"

print("🎯 TOKEN-SPECIFIC GRADIENT ANALYSIS")
print("   ✅ Each token gets its own gradient (precise attribution)")
print("   ✅ Clear current_token → next_token mapping")
print("   ✅ Optimized storage format (96x faster loading)")
print("   ✅ No reorganization needed (saves directly in efficient format)")

# Configuration optimized for token-specific gradients
MAX_SAMPLES = 100      # Stories to process (each generates ~100 token gradients)
MAX_LENGTH = 128       # Maximum sequence length
BATCH_SIZE = 4         # Stories processed together
SAVE_BATCH_SIZE = 100  # Token gradients saved per batch file
TOKEN_CONTEXT_WINDOW = 5  # Context tokens before/after for hover display

# Create save directory
os.makedirs(SAVE_DIR, exist_ok=True)

print(f"\nConfiguration set. Saving to: {SAVE_DIR}")
print(f"Max samples: {MAX_SAMPLES}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Save batch size: {SAVE_BATCH_SIZE}")
print(f"Expected token gradients: ~{MAX_SAMPLES * 100:,}")


🎯 TOKEN-SPECIFIC GRADIENT ANALYSIS
   ✅ Each token gets its own gradient (precise attribution)
   ✅ Clear current_token → next_token mapping
   ✅ Optimized storage format (96x faster loading)
   ✅ No reorganization needed (saves directly in efficient format)

Configuration set. Saving to: tinystories_gradients
Max samples: 100
Batch size: 4
Save batch size: 100
Expected token gradients: ~10,000


In [3]:
# Load TinyStories model and tokenizer
print(f"Loading model: {MODEL_NAME}")

# Load tokenizer (use GPT-Neo tokenizer as specified in model card)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load model
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

print(f"Model loaded on {device}")
print(f"Model type: {model.config.model_type}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Show model architecture
print("\nModel architecture:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"  {name}: {param.shape}")


Loading model: roneneldan/TinyStories-1M
Model loaded on cuda
Model type: gpt_neo
Tokenizer vocab size: 50257
Model parameters: 3,745,984

Model architecture:
  transformer.wte.weight: torch.Size([50257, 64])
  transformer.wpe.weight: torch.Size([2048, 64])
  transformer.h.0.ln_1.weight: torch.Size([64])
  transformer.h.0.ln_1.bias: torch.Size([64])
  transformer.h.0.attn.attention.k_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.v_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.q_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.out_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.out_proj.bias: torch.Size([64])
  transformer.h.0.ln_2.weight: torch.Size([64])
  transformer.h.0.ln_2.bias: torch.Size([64])
  transformer.h.0.mlp.c_fc.weight: torch.Size([256, 64])
  transformer.h.0.mlp.c_fc.bias: torch.Size([256])
  transformer.h.0.mlp.c_proj.weight: torch.Size([64, 256])
  transformer.h.0.mlp.c_proj.bias: torch.Siz

In [4]:
# Load TinyStories dataset
print(f"Loading dataset: {DATASET_NAME}")

dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

# Sample some stories
stories = []
for i, example in enumerate(dataset):
    if i >= MAX_SAMPLES:
        break
    stories.append(example['text'])

print(f"Loaded {len(stories)} stories")
print(f"\nExample story:")
print(f"'{stories[0][:200]}...'")


Loading dataset: roneneldan/TinyStories
Loaded 100 stories

Example story:
'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on...'


In [5]:
# Collect token-specific gradients (or use existing ones)
print("🎯 Checking for existing gradients...")
print("   Will use existing gradients if found (much faster!)")
print("   Otherwise will collect new gradients")

collector = collect_token_gradients(
    model=model,
    tokenizer=tokenizer,
    stories=stories,
    save_dir=SAVE_DIR,
    max_samples=MAX_SAMPLES,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    save_batch_size=SAVE_BATCH_SIZE,
    token_context_window=TOKEN_CONTEXT_WINDOW,
    force_recollect=False  # Set to True if you want to recollect gradients
)


🎯 Starting token-specific gradient collection...
   Each token prediction gets its own gradient
   Saving directly in optimized per-layer format
🎯 Token-specific gradient collection initialized
Tracking 108 parameters
Will save batches of 100 token gradients
Using optimized per-layer storage format
Using device: cuda:0 (GPU: True)
🎯 Starting token-specific gradient collection...
   Processing 100 stories
   Each token gets its own gradient (precise attribution!)
   Saving directly in optimized format (no reorganization needed)
Processed 50 token positions (saved 0 batches)
💾 Saved optimized batch 0 with 100 token gradients
Processed 100 token positions (saved 1 batches)
Processed 150 token positions (saved 1 batches)
💾 Saved optimized batch 1 with 100 token gradients
Processed 200 token positions (saved 2 batches)
Processed 250 token positions (saved 2 batches)
💾 Saved optimized batch 2 with 100 token gradients
Processed 300 token positions (saved 3 batches)
Processed 350 token positio

In [None]:
# Optional: List all available layers in existing gradients
print("📋 Listing all available gradient layers...")
list_existing_gradients(SAVE_DIR)


In [5]:
# Load gradients for analysis (optimized single-layer loading)
TARGET_LAYER = "transformer.h.0.attn.attention.v_proj.weight"  # Specify which layer to analyze
'''
TARGET_LAYER = "transformer.h.0.attn.attention.q_proj.weight"
TARGET_LAYER = "transformer.h.0.attn.attention.k_proj.weight"
TARGET_LAYER = "transformer.h.0.attn.attention.v_proj.weight"
TARGET_LAYER = "transformer.h.0.mlp.c_fc.weight"
TARGET_LAYER = "transformer.h.7.attn.attention.q_proj.weight"
TARGET_LAYER = "transformer.h.7.mlp.c_fc.weight"
TARGET_LAYER = "transformer.wte.weight"
TARGET_LAYER = "transformer.wpe.weight"
'''

print("💾 Loading single layer gradients (memory efficient)...")
print(f"Target layer: {TARGET_LAYER}")

# Load from optimized storage format
gradient_tensors, token_data = load_single_layer_gradients(collector, target_layer=TARGET_LAYER)

if gradient_tensors is None:
    print("❌ Failed to load layer! Showing available layers...")
    gradient_tensors, token_data = load_single_layer_gradients(collector, target_layer=None)

if gradient_tensors is None or token_data is None:
    print("❌ No data found! Please run gradient collection first.")
else:
    print(f"✅ Data loaded successfully")
    print(f"Available parameters: {list(gradient_tensors.keys())}")
    print(f"Total token gradients: {len(token_data)}")
    
    # Show memory usage
    total_memory = 0
    for name, grads in gradient_tensors.items():
        mem_mb = grads.nelement() * grads.element_size() / (1024**2)
        total_memory += mem_mb
        print(f"  {name}: {grads.shape} ({mem_mb:.1f} MB)")
        
    print(f"📊 Total gradient data in memory: {total_memory:.1f} MB")
    
    # Show some example token data
    if token_data and len(token_data) > 0:
        print(f"\n🎯 Example token predictions:")
        for i in range(min(3, len(token_data))):
            example = token_data[i]
            print(f"  {i+1}. {example.get('prediction_task', 'Unknown')}")
            print(f"     Context: {example.get('context_text', 'No context')}")


💾 Loading single layer gradients (memory efficient)...
Target layer: transformer.h.0.attn.attention.v_proj.weight


NameError: name 'collector' is not defined

In [None]:
# Create token-specific t-SNE visualization with PCA preprocessing
if gradient_tensors is not None and token_data is not None:
    print(f"🎨 Creating token-specific t-SNE visualization for: {TARGET_LAYER}")
    print(f"💡 Using PCA preprocessing to reduce 4096D → 50D (faster & often better results)")
    
    # Create t-SNE visualization with token-specific hover information + PCA
    tsne_fig = create_tsne_with_token_data(
        gradients=gradient_tensors,
        token_data=token_data,
        layer_name=TARGET_LAYER,
        max_samples=10000,  # Use 10k samples for good visualization
        perplexity=30,
        show_prediction_task=True,  # Show "token A → token B" in hover
        use_pca=True,              # NEW: PCA preprocessing (recommended!)
        pca_components=50          # Reduce from 4096D to 50D
    )
    
    if tsne_fig:
        # Display the figure
        tsne_fig.show()
        
        # Save the visualization
        output_file = f"{SAVE_DIR}/tsne_token_specific_{TARGET_LAYER.replace('.', '_')}.html"
        tsne_fig.write_html(output_file)
        print(f"💾 Visualization saved to: {output_file}")
        print(f"🔍 Hover over points to see specific token predictions!")
        print(f"🔬 PCA helps: reduces noise, speeds up t-SNE, often improves clustering")
    
else:
    print("❌ No gradient data available for visualization")


🎨 Creating token-specific t-SNE visualization for: transformer.h.0.attn.attention.v_proj.weight
🔍 Computing t-SNE for 10000 token gradients...
Gradient shape: torch.Size([10000, 4096])
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.021s...
[t-SNE] Computed neighbors for 10000 samples in 2.695s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[

In [None]:
# Create UMAP visualization with PCA preprocessing (alternative to t-SNE)
if gradient_tensors is not None and token_data is not None:
    print("🗺️ Creating UMAP visualization as alternative to t-SNE...")
    print("💡 UMAP often reveals different clustering patterns and preserves global structure better")
    print("🔧 Also using PCA preprocessing for consistency and performance")
    
    # Create UMAP visualization with token-specific hover information + PCA
    # Using more samples to match t-SNE and different parameters for comparison
    umap_fig = create_umap_with_token_data(
        gradients=gradient_tensors,
        token_data=token_data,
        layer_name=TARGET_LAYER,
        max_samples=2000,  # Use more samples than t-SNE for good coverage
        n_neighbors=30,    # Larger neighborhood for global structure
        min_dist=0.05,     # Tighter clusters
        show_prediction_task=True,
        use_pca=True,              # NEW: PCA preprocessing (consistent with t-SNE)
        pca_components=50          # Same reduction as t-SNE for comparison
    )
    
    if umap_fig:
        # Display the figure
        umap_fig.show()
        
        # Save the visualization
        output_file = f"{SAVE_DIR}/umap_token_specific_{TARGET_LAYER.replace('.', '_')}.html"
        umap_fig.write_html(output_file)
        print(f"💾 UMAP visualization saved to: {output_file}")
        print(f"🔍 Compare with t-SNE to see different clustering perspectives!")
        print(f"🎯 UMAP advantages: Better global structure, faster computation, deterministic")
        print(f"🔬 Both now use PCA 4096D→50D for fair comparison")
    
else:
    print("❌ No gradient data available for UMAP visualization")
