# Understanding Padding Tokens in CLIP Embeddings

## The Question

A prompt like **"a beaver with blue teeth"** uses only about 10 of the 77 available token positions (counting the start and end markers). The remaining ~67 positions are filled with **padding tokens**.

**Key Questions:**
1. Are padding tokens always the same?
2. How do they affect the final embedding?
3. Do padding embeddings at position 10 differ from padding at position 50?
4. Can we manipulate padding tokens to affect Flux output?

Let's investigate!

## Setup

In [None]:
import torch
from transformers import CLIPTextModel, CLIPTokenizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os

# Prefer the GPU when PyTorch reports CUDA support; otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

# The tokenizer and the text encoder are loaded from the same CLIP checkpoint.
model_name = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(model_name)
model = CLIPTextModel.from_pretrained(model_name)
model = model.to(device)
model.eval()  # evaluation mode: this notebook only runs inference

print("âœ“ Model loaded")
print(f"  Vocab size: {tokenizer.vocab_size}")
print(f"  Max length: {tokenizer.model_max_length}")

# Show the text and ID of each special token the tokenizer defines.
for tok_label, tok_text, tok_id in (
    ("Pad", tokenizer.pad_token, tokenizer.pad_token_id),
    ("EOS", tokenizer.eos_token, tokenizer.eos_token_id),
    ("BOS", tokenizer.bos_token, tokenizer.bos_token_id),
):
    print(f"  {tok_label} token: '{tok_text}' (ID: {tok_id})")

## Investigation 1: What Are the Token IDs?

In [None]:
# A short prompt, so that most of the 77 positions end up as padding.
prompt = "a beaver with blue teeth"

# Pad (or truncate) to exactly 77 positions and return PyTorch tensors.
tokens = tokenizer(
    prompt,
    padding="max_length",
    max_length=77,
    truncation=True,
    return_tensors="pt",
)

# Plain Python lists for the single sequence in the batch.
token_ids = tokens["input_ids"][0].tolist()
attention_mask = tokens["attention_mask"][0].tolist()

print(f"Prompt: '{prompt}'")
print(f"\nToken IDs (first 15 positions):")
print("Position | Token ID | Attention | Decoded")
print("-" * 60)
# One table row per position: index, token ID, attention-mask bit, decoded text.
for pos, (tok_id, mask_bit) in enumerate(zip(token_ids[:15], attention_mask[:15])):
    decoded = tokenizer.decode([tok_id])
    print(f"{pos:8} | {tok_id:8} | {mask_bit:9} | '{decoded}'")

print("\n...")
print("\nLast 5 positions:")
for pos, tok_id, mask_bit in zip(range(72, 77), token_ids[72:77], attention_mask[72:77]):
    decoded = tokenizer.decode([tok_id])
    print(f"{pos:8} | {tok_id:8} | {mask_bit:9} | '{decoded}'")

# The attention mask sums to the number of non-padding positions;
# everything else in the 77-slot window is padding.
num_real_tokens = sum(attention_mask)
num_padding = 77 - num_real_tokens
print(f"\nâœ“ Real tokens: {num_real_tokens}")
print(f"âœ“ Padding tokens: {num_padding}")

## Investigation 2: Are Padding Embeddings Identical?

Let's check if padding embeddings at different positions are the same or different.

In [None]:
# Run the tokenized prompt through the CLIP text encoder (no gradients needed)
# and keep the per-position hidden states of the single sequence in the batch.
with torch.no_grad():
    tokens_device = {k: v.to(device) for k, v in tokens.items()}
    outputs = model(**tokens_device)
    embedding = outputs.last_hidden_state[0]  # [77, 768]

embedding_np = embedding.cpu().numpy()
# Number of token positions, read from the data instead of hard-coding 77.
seq_len = embedding_np.shape[0]

print(f"Embedding shape: {embedding_np.shape}")
print(f"\nLet's compare padding embeddings at different positions...\n")

# Real tokens occupy positions 0..num_real_tokens-1, so padding starts here.
first_padding_pos = num_real_tokens
print(f"First padding position: {first_padding_pos}")

# Compare padding embeddings at different positions
if num_padding > 1:
    # Sample the first padding slot, two later slots and the very last slot.
    # Out-of-range candidates are dropped, and duplicates are removed so that
    # a position is never compared with itself (which would happen whenever
    # first_padding_pos + 10 or + 20 lands exactly on the last position).
    candidates = [
        first_padding_pos,
        first_padding_pos + 10,
        first_padding_pos + 20,
        seq_len - 1,
    ]
    padding_positions = sorted({p for p in candidates if p < seq_len})

    print(f"\nComparing padding embeddings at positions: {padding_positions}")
    print("-" * 60)

    # Per-position summary statistics.
    for pos in padding_positions:
        emb = embedding_np[pos]
        print(f"\nPosition {pos}:")
        print(f"  First 10 values: {emb[:10]}")
        print(f"  Mean: {emb.mean():.6f}")
        print(f"  Std: {emb.std():.6f}")
        print(f"  L2 norm: {np.linalg.norm(emb):.6f}")

    # Calculate pairwise differences
    print("\n" + "="*60)
    print("Pairwise Cosine Similarities Between Padding Embeddings:")
    print("="*60)

    # Visit every unordered pair of sampled positions exactly once.
    for idx, pos_i in enumerate(padding_positions):
        for pos_j in padding_positions[idx + 1:]:
            emb_i = embedding_np[pos_i]
            emb_j = embedding_np[pos_j]

            # Cosine similarity: direction agreement, independent of magnitude.
            cos_sim = np.dot(emb_i, emb_j) / (np.linalg.norm(emb_i) * np.linalg.norm(emb_j))

            # L2 distance: absolute difference between the two vectors.
            l2_dist = np.linalg.norm(emb_i - emb_j)

            print(f"Positions {pos_i} vs {pos_j}:")
            print(f"  Cosine similarity: {cos_sim:.8f}")
            print(f"  L2 distance: {l2_dist:.6f}")
            print()

## Investigation 3: Are Padding Embeddings the Same Across Different Prompts?

Do the padding embeddings change when we use different prompts?

In [None]:
# Prompts of increasing length, so padding begins at a different position
# for each of them.
test_prompts = [
    "cat",
    "a red cat",
    "a beaver with blue teeth",
    "an elephant standing in a field of flowers",
]

# prompt -> token counts plus the hidden states captured at positions 50 and 76.
padding_embeddings = {}

for prompt in test_prompts:
    # Tokenize to the fixed 77-position window.
    tokens = tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt"
    )

    # Encode without tracking gradients; keep the single sequence as NumPy.
    with torch.no_grad():
        tokens_device = {k: v.to(device) for k, v in tokens.items()}
        outputs = model(**tokens_device)
        embedding = outputs.last_hidden_state[0].cpu().numpy()

    # The attention mask sums to the number of non-padding positions, which is
    # also the index of the first padding position.
    attention_mask = tokens['attention_mask'][0].tolist()
    num_real = sum(attention_mask)
    first_padding = num_real

    # Only keep prompts for which position 50 is actually a padding slot,
    # so that the cross-prompt comparison below compares padding with padding.
    if first_padding < 50:
        padding_embeddings[prompt] = {
            'num_real_tokens': num_real,
            'first_padding': first_padding,
            'padding_at_50': embedding[50],
            'padding_at_76': embedding[76]
        }

    print(f"Prompt: '{prompt}'")
    print(f"  Real tokens: {num_real}")
    print(f"  First padding position: {first_padding}")
    print()

# Compare padding embeddings across prompts
print("="*60)
print("Comparing Padding at Position 50 Across Different Prompts:")
print("="*60)

prompts_list = list(padding_embeddings.keys())
# Every pairwise similarity is recorded so the conclusion below reflects all
# pairs, not just whichever pair happened to be compared last.
pairwise_cos_sims = []
for i in range(len(prompts_list)):
    for j in range(i+1, len(prompts_list)):
        prompt_i = prompts_list[i]
        prompt_j = prompts_list[j]

        emb_i = padding_embeddings[prompt_i]['padding_at_50']
        emb_j = padding_embeddings[prompt_j]['padding_at_50']

        cos_sim = np.dot(emb_i, emb_j) / (np.linalg.norm(emb_i) * np.linalg.norm(emb_j))
        l2_dist = np.linalg.norm(emb_i - emb_j)
        pairwise_cos_sims.append(cos_sim)

        print(f"\n'{prompt_i[:30]}...' vs '{prompt_j[:30]}...'")
        print(f"  Cosine similarity: {cos_sim:.10f}")
        print(f"  L2 distance: {l2_dist:.10f}")

print("\n" + "="*60)
if not pairwise_cos_sims:
    # Fewer than two prompts had padding at position 50: nothing was compared.
    print("Not enough prompts with padding at position 50 to draw a conclusion.")
elif min(pairwise_cos_sims) > 0.9999:
    # "Nearly identical" is only claimed when even the least similar pair
    # clears the threshold.
    print("âœ“ CONCLUSION: Padding embeddings at the same position are")
    print("  NEARLY IDENTICAL across different prompts!")
else:
    print("âœ“ CONCLUSION: Padding embeddings differ across prompts")

## Visualization: Real Tokens vs Padding Tokens

In [None]:
# Re-encode the beaver prompt so this cell does not depend on earlier state.
prompt = "a beaver with blue teeth"

tokens = tokenizer(
    prompt,
    padding="max_length",
    max_length=77,
    truncation=True,
    return_tensors="pt",
)

with torch.no_grad():
    tokens_device = {key: tensor.to(device) for key, tensor in tokens.items()}
    outputs = model(**tokens_device)
    embedding = outputs.last_hidden_state[0].cpu().numpy()

attention_mask = tokens['attention_mask'][0].tolist()
num_real = sum(attention_mask)

# The boundary is drawn half a cell to the left of the first padding position,
# i.e. between the last real token and the first padding token.
boundary_x = num_real - 0.5

# Two stacked panels: value heatmap on top, per-position norms below.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
ax1, ax2 = axes[0], axes[1]

# Panel 1: heatmap with token position on x and embedding dimension on y.
im = ax1.imshow(embedding.T, aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1)
ax1.axvline(x=boundary_x, color='lime', linewidth=3, label='Padding starts here')
ax1.set_xlabel('Token Position', fontsize=12)
ax1.set_ylabel('Embedding Dimension', fontsize=12)
ax1.set_title(
    f'CLIP Text Embedding: "{prompt}"\n(Green line = where padding starts)',
    fontsize=14,
    fontweight='bold',
)
ax1.legend(loc='upper right')
plt.colorbar(im, ax=ax1, label='Embedding Value')

# Panel 2: one bar per position, coloured by real token vs padding.
norms = [np.linalg.norm(row) for row in embedding]
colors = ['steelblue'] * num_real + ['orange'] * (77 - num_real)
ax2.bar(range(77), norms, color=colors, alpha=0.7)
ax2.axvline(
    x=boundary_x,
    color='lime',
    linewidth=3,
    linestyle='--',
    label='Padding starts here',
)
ax2.set_xlabel('Token Position', fontsize=12)
ax2.set_ylabel('L2 Norm', fontsize=12)
ax2.set_title(
    'L2 Norm of Each Token Embedding\n(Blue = real tokens, Orange = padding)',
    fontsize=14,
    fontweight='bold',
)
ax2.legend()
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary: counts and the average norm on each side of the boundary.
real_norms, padding_norms = norms[:num_real], norms[num_real:]
print(f"\nâœ“ Visualization complete")
print(f"  Real tokens: {num_real}")
print(f"  Padding tokens: {77 - num_real}")
print(f"  Average norm of real tokens: {np.mean(real_norms):.4f}")
print(f"  Average norm of padding tokens: {np.mean(padding_norms):.4f}")

## Experiment: What If We Zero Out Padding?

What happens if we replace padding embeddings with zeros?

In [None]:
import json

prompt = "a beaver with blue teeth"

# Baseline: encode the prompt exactly as in the earlier cells.
tokens = tokenizer(
    prompt,
    padding="max_length",
    max_length=77,
    truncation=True,
    return_tensors="pt",
)

with torch.no_grad():
    tokens_device = {key: tensor.to(device) for key, tensor in tokens.items()}
    outputs = model(**tokens_device)
    embedding_normal = outputs.last_hidden_state[0].cpu().numpy()

attention_mask = tokens['attention_mask'][0].tolist()
num_real = sum(attention_mask)

# Variant: same real-token rows, but every padding row overwritten with zeros.
# Work on a copy so the baseline array stays untouched.
embedding_zeroed = embedding_normal.copy()
embedding_zeroed[num_real:] = 0

print(f"Prompt: '{prompt}'")
print(f"Real tokens: {num_real}")

# Whole-matrix statistics for the baseline and for the zeroed variant.
for stats_heading, stats_source in (
    ("\nOriginal embedding stats:", embedding_normal),
    ("\nZeroed padding embedding stats:", embedding_zeroed),
):
    print(stats_heading)
    print(f"  Mean: {stats_source.mean():.6f}")
    print(f"  Std: {stats_source.std():.6f}")
    print(f"  Norm: {np.linalg.norm(stats_source):.6f}")

# Three stacked heatmaps: baseline, zeroed variant, and what was removed.
diff = embedding_normal - embedding_zeroed
fig, axes = plt.subplots(3, 1, figsize=(15, 12))
panels = (
    (embedding_normal, 'Original Embedding (with padding)'),
    (embedding_zeroed, 'Zeroed Padding Embedding'),
    (diff, 'Difference (what we removed)'),
)
for panel_ax, (panel_data, panel_title) in zip(axes, panels):
    panel_im = panel_ax.imshow(panel_data.T, aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1)
    # Green line marks the boundary between real tokens and padding.
    panel_ax.axvline(x=num_real-0.5, color='lime', linewidth=2)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_ylabel('Dimension')
    plt.colorbar(panel_im, ax=panel_ax)
# Only the bottom panel carries the shared x-axis label.
axes[2].set_xlabel('Token Position')

plt.tight_layout()
plt.show()

# Output goes to ../data/embeddings/CLIP relative to the working directory;
# the directory is created on demand.
current_dir = Path.cwd()
output_dir = current_dir.parent / "data" / "embeddings" / "CLIP"
output_dir.mkdir(parents=True, exist_ok=True)

# JSON payloads: prompt text, the embedding as nested lists, and its shape.
normal_data = {
    "prompt": prompt,
    "embedding": embedding_normal.tolist(),
    "shape": [77, 768],
}
zeroed_data = {
    "prompt": prompt + " (zeroed padding)",
    "embedding": embedding_zeroed.tolist(),
    "shape": [77, 768],
}

for json_name, payload in (
    ("beaver_normal.json", normal_data),
    ("beaver_zeroed_padding.json", zeroed_data),
):
    with open(output_dir / json_name, 'w') as f:
        json.dump(payload, f)

print("\nâœ“ Saved both embeddings to:")
print(f"  {output_dir / 'beaver_normal.json'}")
print(f"  {output_dir / 'beaver_zeroed_padding.json'}")
print("\nYou can test both in Flux to see if padding affects the output!")

## Summary: What We Learned About Padding

### Key Findings:

1. **Padding Token Structure**:
   - Padding uses a special token ID (usually the EOS token repeated)
   - Appears in every position after the real tokens, through position 76 (the last of the 77 positions)

2. **Are Padding Embeddings Identical?**
   - Padding at the **same position** across different prompts is nearly identical
   - Padding at **different positions** within the same prompt may vary slightly (due to positional encodings)

3. **Do They Affect Flux? YES, DRAMATICALLY!**
   - **Normal padding** → Generates expected content (beavers)
   - **Zero padding** → Generates random but coherent imagery (NOT beavers!)
   - **Padding values matter significantly** to Flux's output
   - This is a HUGE finding for creative manipulation!

4. **Padding Variants Created**:
   - uniform_pos_0.05 - very small positive values
   - uniform_pos_0.1 - small positive values
   - uniform_pos_0.2 - moderate positive values
   - uniform_neg_0.1 - small negative values
   - uniform_neg_0.2 - moderate negative values
   - random_noise_std_0.1 - Gaussian noise
   - scaled_50pct - 50% of normal padding magnitude

5. **Practical Implications**:
   - Padding is NOT just filler - it actively influences generation!
   - We can use padding manipulation for creative effects
   - Different padding values might produce different styles or themes
   - This opens up a new dimension of embedding manipulation

### Experiments to Run:

1. Test all padding variants in Flux
2. Compare outputs - look for patterns:
   - Do positive/negative values produce different moods?
   - Does random noise create more variation?
   - What's the gradient effect as we vary magnitude?
3. Try combining padding manipulation with token-level manipulations
4. Explore if padding affects style more than content

### Research Questions:

- Why does padding affect output if it's supposed to be "nothing"?
- Does Flux's attention mechanism weight padding tokens?
- Can we use padding as a "style knob" while keeping content tokens intact?
- What is the ideal padding for maximum creativity vs. prompt adherence?

## Quick Reference: All Generated Embeddings

Use these files in your Flux pipeline to test how different padding affects generation:

In [None]:
# Create summary table
import pandas as pd

# Human-readable description for each padding variant, keyed by variant name.
variant_descriptions = {
    'uniform_pos_0.05': 'Very small positive (0.05)',
    'uniform_pos_0.1': 'Small positive (0.1)',
    'uniform_pos_0.2': 'Moderate positive (0.2)',
    'uniform_neg_0.1': 'Small negative (-0.1)',
    'uniform_neg_0.2': 'Moderate negative (-0.2)',
    'random_noise_std_0.1': 'Random Gaussian (Ïƒ=0.1)',
    'scaled_50pct': '50% of normal padding',
}

# Table rows: the two reference files first, then one row per variant.
summary_data = [
    {
        'Filename': 'beaver_normal.json',
        'Description': 'Original CLIP padding',
        'Expected Output': 'Beavers (baseline)',
        'Padding Type': 'normal',
    },
    {
        'Filename': 'beaver_zeroed_padding.json',
        'Description': 'All padding = 0',
        'Expected Output': 'Random coherent imagery',
        'Padding Type': 'zeroed',
    },
]
summary_data.extend(
    {
        'Filename': f'beaver_padding_{name}.json',
        'Description': desc,
        'Expected Output': '? (test in Flux!)',
        'Padding Type': name,
    }
    for name, desc in variant_descriptions.items()
)

df = pd.DataFrame(summary_data)

# Banner, table, and the output directory the files are expected in.
wide_rule = "="*80
print("\n" + wide_rule)
print("SUMMARY OF ALL GENERATED EMBEDDINGS")
print(wide_rule)
print(df.to_string(index=False))
print("\n" + wide_rule)
print(f"All files saved in: {output_dir}")
print(wide_rule)

# Suggested order for trying the files, followed by what to watch for.
guidance_lines = (
    "\nðŸ“Š Testing Strategy:",
    "  1. Start with beaver_normal.json (baseline)",
    "  2. Test beaver_zeroed_padding.json (maximum deviation)",
    "  3. Test moderate values (0.1, -0.1) to see direction effects",
    "  4. Test higher values (0.2, -0.2) to see magnitude effects",
    "  5. Test random_noise to see if structure matters",
    "  6. Test scaled_50pct to see if relative scaling matters",
    "\nðŸŽ¨ Look for:",
    "  - Style changes (lighting, mood, color palette)",
    "  - Content changes (does it still show beavers?)",
    "  - Coherence (is output still realistic?)",
    "  - Patterns (do positive values trend one way, negative another?)",
)
for guidance_line in guidance_lines:
    print(guidance_line)

## Experiment 2: Moderate Padding Values

**Discovery:** Zeroing out padding creates random but coherent imagery!

Now let's test with more moderate values. CLIP embeddings typically have small values (roughly -0.3 to 0.3 range). Let's try:
- Small uniform positive (0.1)
- Small uniform negative (-0.1)
- Moderate positive (0.2)
- Moderate negative (-0.2)
- Small random noise (Gaussian, std=0.1)

In [None]:
def _print_value_stats(heading, values):
    """Print a heading followed by the min/max/mean/std of *values*."""
    print(heading)
    print(f"  Min: {values.min():.6f}")
    print(f"  Max: {values.max():.6f}")
    print(f"  Mean: {values.mean():.6f}")
    print(f"  Std: {values.std():.6f}")


def _with_padding(fill):
    """Return a copy of the baseline embedding with its padding rows set to *fill*.

    *fill* is either a scalar or an array matching the padding rows; the
    real-token rows are left exactly as in ``embedding_normal``.
    """
    variant = embedding_normal.copy()
    variant[num_real:] = fill
    return variant


# Measure the real value ranges first: padding rows, then real-token rows.
_print_value_stats("Normal padding statistics:", embedding_normal[num_real:])
_print_value_stats("\nReal token statistics:", embedding_normal[:num_real])

# 1-4. Uniform fills: small/moderate, positive/negative.
emb_pos_01 = _with_padding(0.1)
emb_neg_01 = _with_padding(-0.1)
emb_pos_02 = _with_padding(0.2)
emb_neg_02 = _with_padding(-0.2)

# 5. Gaussian noise (mean 0, std 0.1); the seed is fixed for reproducibility.
np.random.seed(42)
random_padding = np.random.normal(0, 0.1, size=embedding_normal[num_real:].shape)
emb_random = _with_padding(random_padding)

# 6. Very small uniform fill.
emb_tiny = _with_padding(0.05)

# 7. The original padding rows at half their magnitude.
emb_scaled = _with_padding(embedding_normal[num_real:] * 0.5)

# Insertion order here determines the order used when plotting and saving.
padding_variants = {
    "uniform_pos_0.1": emb_pos_01,
    "uniform_neg_0.1": emb_neg_01,
    "uniform_pos_0.2": emb_pos_02,
    "uniform_neg_0.2": emb_neg_02,
    "random_noise_std_0.1": emb_random,
    "uniform_pos_0.05": emb_tiny,
    "scaled_50pct": emb_scaled,
}

print("\n" + "="*70)
print("Created padding variants:")
print("="*70)
# Sanity check: statistics of the padding rows of every variant.
for name, emb in padding_variants.items():
    padding_section = emb[num_real:]
    _print_value_stats(f"\n{name}:", padding_section)

In [None]:
# One heatmap per embedding: the baseline, the zeroed version, then every
# variant in the order it was stored in padding_variants.
panels = [
    ('NORMAL PADDING (original)', embedding_normal),
    ('ZEROED PADDING', embedding_zeroed),
]
panels.extend((f'PADDING: {name}', emb) for name, emb in padding_variants.items())

n_variants = len(panels)
fig, axes = plt.subplots(n_variants, 1, figsize=(15, 3*n_variants))

for ax, (panel_title, panel_emb) in zip(axes, panels):
    # Shared colour scale (-0.5..0.5) keeps the panels directly comparable.
    im = ax.imshow(panel_emb.T, aspect='auto', cmap='RdBu_r', vmin=-0.5, vmax=0.5)
    # Green line marks the boundary between real tokens and padding.
    ax.axvline(x=num_real-0.5, color='lime', linewidth=2)
    ax.set_title(panel_title, fontweight='bold', fontsize=12)
    ax.set_ylabel('Dim')
    plt.colorbar(im, ax=ax)

# Only the bottom panel carries the shared x-axis label.
axes[-1].set_xlabel('Token Position', fontsize=12)

plt.tight_layout()
plt.show()

print("âœ“ Visualization complete")

In [None]:
# Write every padding variant to its own JSON file for testing in Flux.
rule = "="*70
print("Saving all padding variants...")
print(rule)

saved_files = []

for name, emb in padding_variants.items():
    filename = f"beaver_padding_{name}.json"
    filepath = output_dir / filename

    # Payload: annotated prompt, embedding as nested lists, shape, variant name.
    data = dict(
        prompt=f"{prompt} (padding: {name})",
        embedding=emb.tolist(),
        shape=[77, 768],
        padding_type=name,
    )

    with filepath.open('w') as f:
        json.dump(data, f)

    saved_files.append(filename)
    print(f"âœ“ Saved: {filename}")

print("\n" + rule)
print(f"âœ“ All {len(saved_files)} variants saved to:")
print(f"  {output_dir}")
print("\n" + rule)
print("Test each in Flux to see how padding values affect generation!")
print("\nVariants to test:")
print("1. beaver_normal.json - baseline (should produce beavers)")
print("2. beaver_zeroed_padding.json - random coherent images")
# Every variant file is listed after the two reference files.
for saved_name in saved_files:
    print(f"3+. {saved_name}")