In [1]:
# Import required libraries (standard library first, then third-party)
import os
import warnings

import torch

# Silence library warnings before the HuggingFace imports below
warnings.filterwarnings('ignore')

# HuggingFace libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

# Import our custom modules
from efficient_gradient_collector import (
    TokenAwareGradientCollector, analyze_gradient_diversity,
    collect_gradients_from_stories, load_single_layer_gradients,
    load_single_layer_optimized, save_gradients_by_layer,
)
from visualization_utils import create_multiple_tsne_visualizations, print_gradient_summary

print("✅ All libraries imported successfully")


✅ All libraries imported successfully


In [2]:
# Configuration: every value a reader might want to tune lives in this cell
MODEL_NAME = "roneneldan/TinyStories-1M"
DATASET_NAME = "roneneldan/TinyStories"
SAVE_DIR = "tinystories_gradients"

MAX_SAMPLES = 2500        # how many samples gradients are collected from
MAX_LENGTH = 128          # maximum sequence length
BATCH_SIZE = 8            # batch size used during gradient collection
SAVE_BATCH_SIZE = 50      # flush to disk every N samples (memory efficient)
TOKEN_CONTEXT_WINDOW = 5  # number of tokens before/after to show

# Creating the nested 'batches' folder also creates SAVE_DIR itself,
# so one call prepares both output locations.
os.makedirs(os.path.join(SAVE_DIR, 'batches'), exist_ok=True)

print(
    f"Configuration set. Saving to: {SAVE_DIR}",
    f"Will save in batches of {SAVE_BATCH_SIZE} samples",
    sep="\n",
)


Configuration set. Saving to: tinystories_gradients
Will save in batches of 50 samples


In [3]:
# Load the TinyStories model together with its tokenizer
print(f"Loading model: {MODEL_NAME}")

# Tokenizer: the GPT-Neo tokenizer is used, as specified in the model card.
# When it defines no padding token, the end-of-sequence token is reused.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Model: pick the GPU when one is available, move the weights there and
# switch the module to evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)
model.eval()

print(f"Model loaded on {device}")
print(f"Model type: {model.config.model_type}")
print(f"Tokenizer vocab size: {tokenizer.vocab_size}")
print(f"Model parameters: {sum(weights.numel() for weights in model.parameters()):,}")

# List every trainable parameter with its shape
print("\nModel architecture:")
for param_name, tensor in model.named_parameters():
    if not tensor.requires_grad:
        continue
    print(f"  {param_name}: {tensor.shape}")


Loading model: roneneldan/TinyStories-1M
Model loaded on cuda
Model type: gpt_neo
Tokenizer vocab size: 50257
Model parameters: 3,745,984

Model architecture:
  transformer.wte.weight: torch.Size([50257, 64])
  transformer.wpe.weight: torch.Size([2048, 64])
  transformer.h.0.ln_1.weight: torch.Size([64])
  transformer.h.0.ln_1.bias: torch.Size([64])
  transformer.h.0.attn.attention.k_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.v_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.q_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.out_proj.weight: torch.Size([64, 64])
  transformer.h.0.attn.attention.out_proj.bias: torch.Size([64])
  transformer.h.0.ln_2.weight: torch.Size([64])
  transformer.h.0.ln_2.bias: torch.Size([64])
  transformer.h.0.mlp.c_fc.weight: torch.Size([256, 64])
  transformer.h.0.mlp.c_fc.bias: torch.Size([256])
  transformer.h.0.mlp.c_proj.weight: torch.Size([64, 256])
  transformer.h.0.mlp.c_proj.bias: torch.Siz

In [4]:
# Load TinyStories dataset
print(f"Loading dataset: {DATASET_NAME}")

# The training split is opened in streaming mode and consumed lazily
dataset = load_dataset(DATASET_NAME, split="train", streaming=True)

# Keep only the text of the first MAX_SAMPLES stories. `take` ends the
# iteration after exactly MAX_SAMPLES examples, so no extra example is pulled
# from the stream just to detect the stopping condition.
stories = [example['text'] for example in dataset.take(MAX_SAMPLES)]

print(f"Loaded {len(stories)} stories")
if stories:
    print(f"\nExample story:")
    print(f"'{stories[0][:200]}...'")
else:
    # Guard against indexing an empty list (e.g. MAX_SAMPLES = 0)
    print("⚠️  No stories were loaded - check DATASET_NAME and MAX_SAMPLES")


Loading dataset: roneneldan/TinyStories
Loaded 2500 stories

Example story:
'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on...'


In [5]:
# Run gradient collection over the loaded stories with the imported helper.
# All tunable options are taken from the constants defined in the
# configuration cell and grouped here so they can be read at a glance.
collection_options = dict(
    save_dir=SAVE_DIR,
    max_samples=MAX_SAMPLES,
    max_length=MAX_LENGTH,
    batch_size=BATCH_SIZE,
    save_batch_size=SAVE_BATCH_SIZE,
    token_context_window=TOKEN_CONTEXT_WINDOW,
)

collector = collect_gradients_from_stories(
    model=model, tokenizer=tokenizer, stories=stories, **collection_options
)


Tracking 108 parameters
Will save batches of 50 samples to tinystories_gradients/batches
Using device: cuda:0 (GPU: True)
🔄 Starting gradient collection with progressive batch saving...
Processed 8 / 2500 stories
Processed 25 samples (saved 0 batches)
💾 Saved batch 0 with 50 samples
Processed 50 samples (saved 1 batches)
Processed 75 samples (saved 1 batches)
Processed 88 / 2500 stories
💾 Saved batch 1 with 50 samples
Processed 100 samples (saved 2 batches)
Processed 125 samples (saved 2 batches)
💾 Saved batch 2 with 50 samples
Processed 150 samples (saved 3 batches)
Processed 168 / 2500 stories
Processed 175 samples (saved 3 batches)
💾 Saved batch 3 with 50 samples
Processed 200 samples (saved 4 batches)
Processed 225 samples (saved 4 batches)
Processed 248 / 2500 stories
💾 Saved batch 4 with 50 samples
Processed 250 samples (saved 5 batches)
Processed 275 samples (saved 5 batches)
💾 Saved batch 5 with 50 samples
Processed 300 samples (saved 6 batches)
Processed 325 samples (saved 6 b

In [None]:
# Guide for single layer analysis: lists example parameter names that can be
# used as TARGET_LAYER and explains which switches control the loading mode.
print("🎯 Single Layer Analysis Configuration")
print("Available layer options:")
for layer_option in (
    "transformer.wte.weight (word embeddings)",
    "transformer.wpe.weight (position embeddings)",
    "transformer.h.0.attn.attention.q_proj.weight (first layer query projection)",
    "transformer.h.0.attn.attention.k_proj.weight (first layer key projection)",
    "transformer.h.0.attn.attention.v_proj.weight (first layer value projection)",
    "transformer.h.0.mlp.c_fc.weight (first layer MLP)",
    "transformer.h.7.attn.attention.q_proj.weight (last layer query projection)",
    "transformer.h.7.mlp.c_fc.weight (last layer MLP)",
):
    print(f"  - {layer_option}")
print()
# TARGET_LAYER and USE_SINGLE_LAYER are both set in the gradient loading cell,
# which is not the cell immediately following this one.
print("💡 To change the target layer, modify TARGET_LAYER in the gradient loading cell below")
print("💡 To load all layers, set USE_SINGLE_LAYER = False")


In [6]:
# OPTIONAL: Reorganize storage for maximum efficiency
# This reorganizes the current 50 × 715MB files into layer-specific storage
# Benefits: 108x faster loading, same storage, better organization

REORGANIZE_STORAGE = True  # Set to True to reorganize existing data

if not REORGANIZE_STORAGE:
    # Reorganization was explicitly switched off
    print("ℹ️  Skipping storage reorganization (set REORGANIZE_STORAGE=True to enable)")
    print("   Current format works but is less efficient for single-layer analysis")
elif 'collector' not in locals():
    # The flag is on but there is nothing to reorganize yet; say so instead of
    # telling the reader to enable a flag that is already enabled.
    print("⚠️  Cannot reorganize storage: `collector` is not defined")
    print("   Run the gradient collection cell first, then re-run this cell")
else:
    print("🔄 Reorganizing gradient storage by layer for optimal efficiency...")
    print("This is a one-time process that will make future analysis much faster.")
    print()

    # The helper's result is used as a success flag
    success = save_gradients_by_layer(collector)

    if success:
        print("✅ Storage reorganization complete!")
        print("   Future single-layer analysis will be 108x faster")
        print("   You can now use load_single_layer_optimized() for best performance")
    else:
        print("❌ Storage reorganization failed. Will use original format.")


🔄 Reorganizing gradient storage by layer for optimal efficiency...
This is a one-time process that will make future analysis much faster.

🔄 Reorganizing 50 batch files by layer...
📊 Found 108 layers to reorganize


Reorganizing batches: 100%|██████████| 50/50 [01:36<00:00,  1.93s/batch]

✅ Reorganization complete!
📁 Original storage: 34.89 GB
📁 New storage: 34.90 GB
💾 Storage identical (expected - same data, different organization)
🚀 Loading efficiency: Up to 108x faster for single layer analysis
📂 New structure: tinystories_gradients/layers/{layer_name}/batch_XXXX_gradients.pt
✅ Storage reorganization complete!
   Future single-layer analysis will be 108x faster
   You can now use load_single_layer_optimized() for best performance





In [11]:
# Option to use single layer gradient loading for memory efficiency
USE_SINGLE_LAYER = True  # Set to False to load all layers
TARGET_LAYER = "transformer.h.0.attn.attention.v_proj.weight"  # Specify which layer to analyze

if USE_SINGLE_LAYER:
    print("💾 Loading single layer gradients for memory efficiency...")
    print(f"Target layer: {TARGET_LAYER}")

    # Step 1: try the optimized (layer-specific) storage first
    try:
        gradient_tensors, token_data = load_single_layer_optimized(collector, target_layer=TARGET_LAYER)
    except Exception as e:
        print(f"⚠️  Optimized loading failed: {e}")
        gradient_tensors, token_data = None, None

    # Step 2: fall back to the standard loader when the optimized one either
    # raised or returned no data, so a None result no longer skips this step
    if gradient_tensors is None:
        print("🔄 Falling back to standard loading...")
        gradient_tensors, token_data = load_single_layer_gradients(collector, target_layer=TARGET_LAYER)

    # Step 3: last resort, call the standard loader without a target layer
    if gradient_tensors is None:
        print("❌ Failed to load single layer gradients! Trying to show available layers...")
        gradient_tensors, token_data = load_single_layer_gradients(collector, target_layer=None)

else:
    print("💾 Loading all saved batches for visualization...")
    gradient_tensors, token_data = collector.load_all_batches()

if gradient_tensors is None or token_data is None:
    print("❌ No data found! Please run gradient collection first.")
else:
    print(f"✅ Data loaded successfully")
    print(f"Available parameters: {list(gradient_tensors.keys())}")
    print(f"Total token samples: {len(token_data)}")

    # Show memory usage: element count × bytes per element, reported in MB
    total_memory = 0
    for name, grads in gradient_tensors.items():
        mem_mb = grads.nelement() * grads.element_size() / (1024**2)
        total_memory += mem_mb
        print(f"  {name}: {grads.shape} ({mem_mb:.1f} MB)")

    print(f"📊 Total gradient data in memory: {total_memory:.1f} MB")


💾 Loading single layer gradients for memory efficiency...
Target layer: transformer.h.0.attn.attention.v_proj.weight
🚀 Loading from optimized layer-specific storage...
📁 Found 50 optimized batch files


Loading layer batches: 100%|██████████| 50/50 [00:00<00:00, 1273.01file/s]


✅ Loaded 2500 samples for layer transformer.h.0.attn.attention.v_proj.weight
   Memory: 39.1 MB (vs 4218.8 MB for all layers)
✅ Data loaded successfully
Available parameters: ['transformer.h.0.attn.attention.v_proj.weight']
Total token samples: 2500
  transformer.h.0.attn.attention.v_proj.weight: torch.Size([2500, 64, 64]) (39.1 MB)
📊 Total gradient data in memory: 39.1 MB


In [12]:
# Build the t-SNE figures for whatever gradients were loaded above
if gradient_tensors is None:
    print("❌ No gradient data available for visualization")
elif USE_SINGLE_LAYER:
    print(f"🎨 Creating t-SNE visualization for single layer: {TARGET_LAYER}")

    # Pick the keyword list from the kind of layer named by TARGET_LAYER.
    # Rules are checked in order; each pairs the substrings to look for with
    # the keyword list to use when one of them is present.
    lowered_target = TARGET_LAYER.lower()
    keyword_rules = (
        (('embed',), ['embed']),
        (('attn', 'attention'), ['attn', 'attention']),
        (('mlp',), ['mlp']),
    )
    keywords = next(
        (
            selected
            for markers, selected in keyword_rules
            if any(marker in lowered_target for marker in markers)
        ),
        None,
    )
    if keywords is None:
        # No rule matched: use the second-to-last component of the name
        keywords = [TARGET_LAYER.split('.')[-2]]

    figures = create_multiple_tsne_visualizations(
        gradient_tensors=gradient_tensors,
        token_data=token_data,
        save_dir=SAVE_DIR,
        keywords=keywords,
        n_samples=500,  # a single layer is passed, so more samples are used
        perplexity=30,
    )
else:
    print("🎨 Creating t-SNE visualizations for all layers...")
    figures = create_multiple_tsne_visualizations(
        gradient_tensors=gradient_tensors,
        token_data=token_data,
        save_dir=SAVE_DIR,
        keywords=['embed', 'attention', 'attn', 'mlp'],
        n_samples=300,
        perplexity=30,
    )


🎨 Creating t-SNE visualization for single layer: transformer.h.0.attn.attention.v_proj.weight
🎨 Creating t-SNE visualizations...
Visualizing parameters: ['transformer.h.0.attn.attention.v_proj.weight']

Creating t-SNE for transformer.h.0.attn.attention.v_proj.weight...
Original gradient shape: torch.Size([2500, 64, 64])
Using 500 samples for t-SNE
Running t-SNE...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 500 samples in 0.002s...
[t-SNE] Computed neighbors for 500 samples in 0.060s...
[t-SNE] Computed conditional probabilities for sample 500 / 500
[t-SNE] Mean sigma: 17.024346
[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.289207
[t-SNE] KL divergence after 1000 iterations: 1.569001


Saved interactive plot to tinystories_gradients/tsne_transformer_h_0_attn_attention_v_proj_weight.html

✅ t-SNE visualizations complete!


In [None]:
# Analyze gradient diversity
# `diversity_stats` stays None when no gradients were loaded, so the summary
# cell can test it without risking a NameError.
diversity_stats = None

if gradient_tensors is not None:
    print("📊 Analyzing gradient diversity...")

    if USE_SINGLE_LAYER:
        # For single layer, analyze the gradients already loaded in memory.
        # `analyze_gradient_diversity` is imported in the notebook's import
        # cell together with the other efficient_gradient_collector helpers.
        diversity_stats = analyze_gradient_diversity(gradient_tensors)
        print(f"✅ Analyzed diversity for {len(gradient_tensors)} layer(s)")
    else:
        # For all layers, use the collector's batch-based analysis
        diversity_stats = collector.analyze_diversity_from_batches()
        print(f"✅ Analyzed diversity from saved batches")


In [None]:
# Print comprehensive summary of the analysis performed in the cells above.
# Single-layer mode prints the statistics inline; multi-layer mode delegates
# to print_gradient_summary.
if gradient_tensors is not None and diversity_stats is not None:
    if USE_SINGLE_LAYER:
        print(f"\n🎯 SINGLE LAYER ANALYSIS SUMMARY")
        print("=" * 50)
        print(f"Model: {MODEL_NAME}")
        print(f"Target Layer: {TARGET_LAYER}")
        print(f"Total samples analyzed: {len(token_data) if token_data else 0}")
        print()

        # Show gradient statistics for the single layer; entries that are not
        # dictionaries are skipped, and missing statistics fall back to the
        # defaults given to .get()
        if isinstance(diversity_stats, dict):
            for layer_name, layer_stats in diversity_stats.items():
                if isinstance(layer_stats, dict):
                    print(f"📊 Layer: {layer_name}")
                    print(f"  Samples: {layer_stats.get('num_samples', 'N/A')}")
                    print(f"  Mean cosine similarity: {layer_stats.get('mean_similarity', 0):.4f}")
                    print(f"  Std cosine similarity: {layer_stats.get('std_similarity', 0):.4f}")
                    print(f"  High similarity pairs (>0.95): {layer_stats.get('high_similarity_count', 0)}")
                    print(f"  Mean gradient norm: {layer_stats.get('gradient_norm_mean', 0):.6f}")
                    print(f"  Std gradient norm: {layer_stats.get('gradient_norm_std', 0):.6f}")

        # Show visualization info. Listing figures is best-effort, but only
        # ordinary errors are tolerated: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide the reason for the failure.
        try:
            if 'figures' in locals() and figures and len(figures) > 0:
                print(f"\n🎨 Visualizations created: {len(figures)}")
                for fig_name in figures.keys():
                    print(f"  - {fig_name}")
        except Exception as exc:
            print("\n🎨 Visualization status: Check above for creation status")
            print(f"   (could not list figures: {type(exc).__name__}: {exc})")

        print(f"\n💾 Data saved to: {SAVE_DIR}")

    else:
        print("📈 Using full multi-layer summary...")
        print_gradient_summary(
            collector=collector,
            gradient_tensors=gradient_tensors,
            token_data=token_data,
            diversity_stats=diversity_stats,
            figures=figures if 'figures' in locals() else {},
            model_name=MODEL_NAME
        )
else:
    print("❌ No data available for summary")
