# CLIP Text Embeddings for FLUX

This notebook generates **CLIP text embeddings** for use with FLUX image generation.

- **Model**: OpenAI CLIP-ViT-Large-Patch14
- **Embedding dimension**: 768
- **Sequence length**: 77 tokens
- **Output shape**: [77, 768]

```mermaid
flowchart LR
    T[Text Prompt]
    
    TOK[CLIP Tokenizer]
    ENC[CLIP-L Text Encoder]
    
    EMB[Text Embedding<br/>77 × 768]
    
    FLUX[FLUX<br/>Diffusion Transformer]
    
    T --> TOK --> ENC --> EMB
    EMB -->|sequence conditioning| FLUX
```

In [4]:
from transformers import CLIPTextModel, CLIPTokenizer
import torch
import os
from pathlib import Path

# Resolve the models directory: the stripped contents of misc/paths/models.txt
# are joined onto the parent of the notebook's working directory.
current_dir = Path.cwd()
models_path_file = current_dir.parent / "misc/paths/models.txt"
with open(models_path_file, 'r', encoding='utf-8') as f:
    models_path = f.read().strip()
MODELS_DIR = current_dir.parent / models_path

# Define CLIP model path
CLIP_MODEL_PATH = MODELS_DIR / "clip-vit-large-patch14"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CLIP from local folder
print(f"Loading CLIP model from: {CLIP_MODEL_PATH}...")
# Check for config.json instead of the bare folder: an empty or partially
# written folder (e.g. after an interrupted download) would otherwise skip the
# download and make the local_files_only load below fail.
if not (CLIP_MODEL_PATH / "config.json").is_file():
    print("\n⚠️  Model not found locally. Downloading from Hugging Face...")
    print("This model is ~1.7GB and will take a few minutes.")
    print("Please be patient...\n")

    # Download and save to local folder
    clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    clip_model = CLIPTextModel.from_pretrained(
        "openai/clip-vit-large-patch14",
        torch_dtype=torch.bfloat16  # Use bfloat16 to match FLUX
    )

    # Save to local folder
    print(f"Saving model to {CLIP_MODEL_PATH}...")
    clip_tokenizer.save_pretrained(CLIP_MODEL_PATH)
    clip_model.save_pretrained(CLIP_MODEL_PATH)
    print("✓ Model downloaded and saved locally!\n")
else:
    print("✓ Loading from local folder...\n")

# Always (re)load from the local folder so both branches end up with the same
# objects and a fresh download is verified to be loadable offline.
clip_tokenizer = CLIPTokenizer.from_pretrained(CLIP_MODEL_PATH, local_files_only=True)
clip_model = CLIPTextModel.from_pretrained(
    CLIP_MODEL_PATH,
    torch_dtype=torch.bfloat16,  # Use bfloat16 to match FLUX
    local_files_only=True
).to(device)
clip_model.eval()  # Set to evaluation mode

print("✓ CLIP loaded successfully!")
print(f"  Embedding dimension: {clip_model.config.hidden_size}")
print(f"  Max sequence length: {clip_tokenizer.model_max_length}")
print(f"  Loaded from: {CLIP_MODEL_PATH}")
print(f"  Model dtype: {next(clip_model.parameters()).dtype}")

Loading CLIP model from: /shares/weddigen.ki.uzh/laura_wagner/latent_vandalism_workshop/data/models/clip-vit-large-patch14...
✓ Loading from local folder...

✓ CLIP loaded successfully!
  Embedding dimension: 768
  Max sequence length: 77
  Loaded from: /shares/weddigen.ki.uzh/laura_wagner/latent_vandalism_workshop/data/models/clip-vit-large-patch14
  Model dtype: torch.bfloat16


## Generate Single Embedding

Enter a prompt to generate its CLIP embedding.

In [5]:
import ipywidgets as widgets
from IPython.display import display

# Slots shared between the generate and save callbacks; both start empty
# until an embedding has been generated.
current_clip_embedding = current_clip_tokens = None

# Output pane that receives the generate callback's printed log
clip_output_area = widgets.Output()

# Button that triggers embedding generation
clip_generate_button = widgets.Button(
    button_style='success',
    description='Generate CLIP Embedding',
)

# Free-text prompt box, pre-filled with an example prompt
clip_prompt_input = widgets.Textarea(
    description='Prompt:',
    placeholder='Enter your prompt here',
    value='a puffy european robin sitting on a tree branch',
    layout=widgets.Layout(height='80px', width='80%'),
)

def generate_clip_embedding(b):
    """Button callback: embed the textarea prompt with the CLIP text encoder.

    Tokenizes the prompt to a fixed-length sequence, runs ``clip_model`` without
    gradients and stores the per-token hidden states (as a float32 NumPy array)
    and the decoded token strings in the module-level
    ``current_clip_embedding`` / ``current_clip_tokens``. A short report is
    printed into ``clip_output_area``.

    Args:
        b: The clicked button (supplied by ``on_click``; unused).
    """
    global current_clip_embedding, current_clip_tokens

    max_length = 77  # CLIP uses 77 tokens

    with clip_output_area:
        clip_output_area.clear_output()

        prompt = clip_prompt_input.value
        print(f"Generating CLIP embedding for: '{prompt}'\n")

        # Tokenize
        tokens = clip_tokenizer(
            prompt,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt"
        )

        # Get token strings for display
        token_ids = tokens['input_ids'][0].tolist()
        token_strings = [clip_tokenizer.decode([tid]) for tid in token_ids]

        # Count real (non-padding) tokens from the attention mask. Comparing ids
        # with pad_token_id under-counts whenever the tokenizer pads with its
        # end-of-text token (the CLIPTokenizer default), because the genuine
        # end-of-text token is then indistinguishable from padding.
        if 'attention_mask' in tokens:
            num_real_tokens = int(tokens['attention_mask'][0].sum().item())
        else:
            num_real_tokens = (tokens['input_ids'][0] != clip_tokenizer.pad_token_id).sum().item()

        print(f"Tokenized into {num_real_tokens} real tokens (+ {max_length - num_real_tokens} padding):")
        print("First 10 tokens:", token_strings[:10])
        print()

        # Generate embedding
        with torch.no_grad():
            model_inputs = {k: v.to(device) for k, v in tokens.items()}
            outputs = clip_model(**model_inputs)
            embedding = outputs.last_hidden_state  # Shape: [1, 77, embedding_dim]

        # NumPy has no bfloat16, so upcast to float32 before leaving torch
        current_clip_embedding = embedding.float().cpu().numpy()[0]  # Shape: [77, embedding_dim]
        current_clip_tokens = token_strings

        total_numbers = current_clip_embedding.shape[0] * current_clip_embedding.shape[1]

        print("✓ CLIP embedding generated!")
        print(f"  Shape: {current_clip_embedding.shape}")
        print(f"  Total numbers: {total_numbers:,}")
        print(f"  Size: {current_clip_embedding.nbytes / 1024:.2f} KB")
        print()
        print(f"First token '{token_strings[0]}' embedding (first 10 values):")
        print(current_clip_embedding[0, :10])



## Save Embedding

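Save the generated embedding to a JSON file. The file name is built from the first four tokens of the prompt. Running this cell also displays the prompt box, the generate button, and the save button.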

In [6]:
import json
from pathlib import Path

# Single-prompt embeddings go to <project root>/data/embeddings/CLIP,
# where the project root is the parent of the working directory.
current_dir = Path.cwd()
EMBEDDINGS_DIR = current_dir.parent / "data" / "embeddings" / "CLIP"
EMBEDDINGS_DIR.mkdir(parents=True, exist_ok=True)

# Save button and the output pane its callback prints into
clip_save_button = widgets.Button(button_style='primary', description='Save Embedding')
clip_save_output = widgets.Output()

def save_clip_embedding(b):
    """Button callback: write the most recently generated embedding to JSON.

    The file name is built from the first four non-empty tokens in
    ``current_clip_tokens`` (special markers and path-unsafe characters
    removed) and the file is written into ``EMBEDDINGS_DIR``. The JSON object
    holds the prompt, the embedding as nested lists, and its shape. Status
    messages are printed into ``clip_save_output``.

    Note: the stored prompt is read from ``clip_prompt_input`` at save time, so
    editing the textarea after generating but before saving records the edited
    text next to the older embedding.

    Args:
        b: The clicked button (supplied by ``on_click``; unused).
    """
    with clip_save_output:
        clip_save_output.clear_output()

        if current_clip_embedding is None:
            print("❌ No embedding to save! Generate an embedding first.")
            return

        # Tokenizer markers that must not end up in the file name
        special_markers = ('</w>', '<|startoftext|>', '<|endoftext|>')
        # Characters that are path separators or reserved on common filesystems;
        # a token such as '/' would otherwise point into a missing subfolder.
        unsafe_chars = set('\\/:*?"<>|')

        # Collect the first 4 usable tokens (special tokens and padding clean to '')
        filename_tokens = []
        for token in current_clip_tokens:
            cleaned_token = token
            for marker in special_markers:
                cleaned_token = cleaned_token.replace(marker, '')
            cleaned_token = ''.join(
                ch for ch in cleaned_token
                if ch.isprintable() and ch not in unsafe_chars
            ).strip()
            if cleaned_token:
                filename_tokens.append(cleaned_token)
            if len(filename_tokens) >= 4:
                break

        # Fall back to a fixed stem so an empty prompt does not produce ".json"
        filename = ("_".join(filename_tokens) or "embedding") + ".json"
        filepath = EMBEDDINGS_DIR / filename

        # Save embedding
        embedding_data = {
            "prompt": clip_prompt_input.value,
            "embedding": current_clip_embedding.tolist(),
            "shape": list(current_clip_embedding.shape)
        }

        with open(filepath, 'w') as f:
            json.dump(embedding_data, f)

        print("✓ Embedding saved to:")
        print(f"  {filepath}")
        print(f"  Size: {os.path.getsize(filepath) / 1024:.2f} KB")

# Attach both callbacks first ...
clip_generate_button.on_click(generate_clip_embedding)
clip_save_button.on_click(save_clip_embedding)

# ... then render the UI: prompt box, generate button and its log,
# followed by the save button and its log.
display(clip_prompt_input, clip_generate_button, clip_output_area)
display(clip_save_button, clip_save_output)

Textarea(value='a puffy european robin sitting on a tree branch', description='Prompt:', layout=Layout(height=…

Button(button_style='success', description='Generate CLIP Embedding', style=ButtonStyle())

Output()

Button(button_style='primary', description='Save Embedding', style=ButtonStyle())

Output()

## Batch Generation from Text Input

Enter multiple prompts (one per line) to generate and save embeddings for all of them.

In [8]:
# Batch generate CLIP embeddings from text input.
# The textarea is pre-filled with four example prompts, one per line
# (default text corrected: "bird-of-paradife" -> "bird-of-paradise").
batch_prompt_input = widgets.Textarea(
    value='A curious Raggiana bird-of-paradise peeking through dense green leaves.\nA magnificent riflebird with iridescent feathers perched on a mossy log.\nA vibrant King of Saxony bird-of-paradise showing off its long head plumes.\nA stunning Superb bird-of-paradise doing a courtship dance on the forest floor.',
    placeholder='Enter prompts, one per line',
    description='Prompts:',
    layout=widgets.Layout(width='80%', height='150px')
)

# Button that embeds and saves every prompt in the textarea
batch_generate_button = widgets.Button(
    description='Batch Generate & Save',
    button_style='warning'
)

# Output pane that receives the batch callback's printed log
batch_output_area = widgets.Output()

def batch_generate_clip_embeddings(b):
    """Button callback: embed every prompt in the batch textarea and save each.

    Each non-empty line of ``batch_prompt_input`` is tokenized to 77 tokens,
    encoded with ``clip_model`` and written to ``EMBEDDINGS_DIR`` as a JSON
    object with the prompt, the embedding (nested lists) and its shape.
    File names come from the first four non-empty tokens of each prompt, with
    path-unsafe characters removed. If two prompts in the same batch produce
    the same name, later ones get a numeric suffix instead of overwriting the
    earlier file. Progress is printed into ``batch_output_area``.

    Args:
        b: The clicked button (supplied by ``on_click``; unused).
    """
    with batch_output_area:
        batch_output_area.clear_output()

        # Parse prompts (one per line), dropping blank lines
        prompts = [p.strip() for p in batch_prompt_input.value.strip().split('\n') if p.strip()]

        if not prompts:
            print("No prompts provided!")
            return

        print(f"Generating {len(prompts)} CLIP embeddings...\n")

        # Tokenizer markers that must not end up in the file name
        special_markers = ('</w>', '<|startoftext|>', '<|endoftext|>')
        # Path separators / characters reserved on common filesystems
        unsafe_chars = set('\\/:*?"<>|')
        # File names already written during this click, to avoid overwrites
        used_filenames = set()

        for i, prompt in enumerate(prompts, 1):
            print(f"[{i}/{len(prompts)}] '{prompt[:50]}{'...' if len(prompt) > 50 else ''}'")

            # Tokenize
            tokens = clip_tokenizer(
                prompt,
                padding="max_length",
                max_length=77,
                truncation=True,
                return_tensors="pt"
            )

            # Get token strings for filename
            token_ids = tokens['input_ids'][0].tolist()
            token_strings = [clip_tokenizer.decode([tid]) for tid in token_ids]

            # Generate embedding (upcast to float32: NumPy has no bfloat16)
            with torch.no_grad():
                model_inputs = {k: v.to(device) for k, v in tokens.items()}
                outputs = clip_model(**model_inputs)
                embedding = outputs.last_hidden_state.float().cpu().numpy()[0]

            # Create filename from first 4 usable tokens
            filename_tokens = []
            for token in token_strings:
                cleaned = token
                for marker in special_markers:
                    cleaned = cleaned.replace(marker, '')
                cleaned = ''.join(
                    ch for ch in cleaned
                    if ch.isprintable() and ch not in unsafe_chars
                ).strip()
                if cleaned:
                    filename_tokens.append(cleaned)
                if len(filename_tokens) >= 4:
                    break

            # Fall back to a positional stem if nothing usable remained
            stem = "_".join(filename_tokens) or f"prompt_{i}"
            filename = stem + ".json"
            # Names are unchanged unless they collide within this batch
            suffix = 2
            while filename in used_filenames:
                filename = f"{stem}_{suffix}.json"
                suffix += 1
            used_filenames.add(filename)
            filepath = EMBEDDINGS_DIR / filename

            # Save embedding
            embedding_data = {
                "prompt": prompt,
                "embedding": embedding.tolist(),
                "shape": list(embedding.shape)
            }

            with open(filepath, 'w') as f:
                json.dump(embedding_data, f)

            print(f"   ✓ Saved: {filename}")

        print(f"\n✓ All {len(prompts)} embeddings saved to:")
        print(f"  {EMBEDDINGS_DIR}")



# Hook the batch callback up to its button
batch_generate_button.on_click(batch_generate_clip_embeddings)

# Short header above the widgets, emitted as one multi-line print
print(
    "Batch CLIP Embedding Generator",
    f"Output directory: {EMBEDDINGS_DIR}",
    "Enter prompts (one per line):\n",
    sep="\n",
)
display(batch_prompt_input, batch_generate_button, batch_output_area)

Batch CLIP Embedding Generator
Output directory: /shares/weddigen.ki.uzh/laura_wagner/latent_vandalism_workshop/data/embeddings/CLIP
Enter prompts (one per line):



Textarea(value='A curious Raggiana bird-of-paradise peeking through dense green leaves.\nA magnificent riflebi…



Output()

## Batch Generation from Example Prompts File

Load CLIP prompts from `misc/example_prompts.txt` and generate embeddings. Files are saved under `data/embeddings/examples/clip/`, in one subfolder per prompt set (`short/` or `77_tokens/`).

In [9]:
import os
import json
import torch
import ipywidgets as widgets
from pathlib import Path
from IPython.display import display

# ------------------------------------------------------------------
# Paths
# ------------------------------------------------------------------

# Project root is taken to be the parent of the working directory.
current_dir = Path.cwd()

# Example embeddings are written below this folder (created if missing).
BASE_EXAMPLES_DIR = current_dir.parent / "data" / "embeddings" / "examples" / "clip"
BASE_EXAMPLES_DIR.mkdir(parents=True, exist_ok=True)

# Text file holding the example prompts, grouped into named sections.
prompts_file = current_dir.parent / "misc" / "example_prompts.txt"

# ------------------------------------------------------------------
# CLIP UX config
# ------------------------------------------------------------------

# One entry per selectable prompt set:
#   section_name - text used to find the section in the prompts file
#   subdir       - output subfolder below BASE_EXAMPLES_DIR
#   max_length   - tokenizer padding/truncation length
CLIP_CONFIG = {
    'short': dict(section_name='Short prompts', subdir='short', max_length=77),
    '77_tokens': dict(section_name='CLIP prompts', subdir='77_tokens', max_length=77),
}

# ------------------------------------------------------------------
# Prompt loader (section-aware)
# ------------------------------------------------------------------

def load_clip_prompts_from_file(filepath, section_name):
    """Load the prompts of one named section from a prompts text file.

    The file is read line by line. A line starting with ``#`` is a heading; the
    first heading whose text contains ``section_name`` opens the target
    section, and the next heading with any alphanumeric text closes it.
    Headings made only of decoration (e.g. ``# -----``) are skipped. Every
    non-blank, non-heading line inside the section is returned as one prompt.

    Parsing by line (rather than splitting the whole file on ``#``) keeps the
    heading text out of the returned prompts and lets prompts contain ``#``.

    Args:
        filepath: ``pathlib.Path`` of the prompts file.
        section_name: Text to look for in a heading line (case-sensitive
            substring match).

    Returns:
        List of stripped prompt strings; empty if the file does not exist or
        no heading matches.
    """
    if not filepath.exists():
        return []

    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.read().splitlines()

    prompts = []
    in_target_section = False

    for raw_line in lines:
        line = raw_line.strip()

        if line.startswith('#'):
            heading = line.strip('#').strip()
            # Pure decoration lines neither open nor close a section
            if not any(ch.isalnum() for ch in heading):
                continue
            # Only the first matching section is used
            if in_target_section:
                break
            in_target_section = section_name in heading
            continue

        if in_target_section and line:
            prompts.append(line)

    return prompts

# ------------------------------------------------------------------
# Widgets
# ------------------------------------------------------------------

# Output pane that receives the file-batch callback's printed log
file_batch_output = widgets.Output()

# Button that starts generation for the selected prompt set
file_batch_button = widgets.Button(button_style='info', description='Generate from File')

# Prompt-set chooser; option values are the keys of CLIP_CONFIG
clip_token_selector = widgets.Dropdown(
    description='Prompt Set:',
    options=[
        ('Short prompts', 'short'),
        ('CLIP prompts (77 tokens)', '77_tokens'),
    ],
    value='77_tokens',
    style=dict(description_width='initial'),
)

# ------------------------------------------------------------------
# Batch generation
# ------------------------------------------------------------------

def batch_generate_from_file(b):
    """Button callback: embed every prompt of the selected example section.

    Looks up the selected entry of ``CLIP_CONFIG``, loads that section's
    prompts from ``prompts_file`` via ``load_clip_prompts_from_file``, encodes
    each prompt with ``clip_model`` and writes one JSON file per prompt into
    ``BASE_EXAMPLES_DIR / <subdir>``. Each file holds the prompt, the embedding
    (nested lists), its shape, the token length and the section name.

    File names come from the first four non-empty tokens of each prompt, with
    path-unsafe characters removed. If two prompts in the same run produce the
    same name, later ones get a numeric suffix instead of overwriting the
    earlier file. Progress is printed into ``file_batch_output``.

    Args:
        b: The clicked button (supplied by ``on_click``; unused).
    """
    with file_batch_output:
        file_batch_output.clear_output()

        selection = clip_token_selector.value
        config = CLIP_CONFIG[selection]

        max_length = config['max_length']   # always 77 for CLIP
        output_dir = BASE_EXAMPLES_DIR / config['subdir']
        os.makedirs(output_dir, exist_ok=True)

        prompts = load_clip_prompts_from_file(
            prompts_file,
            config['section_name']
        )

        if not prompts:
            print(f"❌ No prompts found for section '{config['section_name']}'.")
            return

        print(f"Loaded {len(prompts)} prompts")
        print(f"Section: {config['section_name']}")
        print(f"Token length: {max_length}")
        print(f"Output: {output_dir}\n")

        # Tokenizer markers that must not end up in the file name
        special_markers = ('</w>', '<|startoftext|>', '<|endoftext|>')
        # Path separators / characters reserved on common filesystems
        unsafe_chars = set('\\/:*?"<>|')
        # File names already written during this click, to avoid overwrites
        used_filenames = set()

        for i, prompt in enumerate(prompts, 1):
            print(f"[{i}/{len(prompts)}] {prompt[:60]}{'...' if len(prompt) > 60 else ''}")

            tokens = clip_tokenizer(
                prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt"
            )

            # Decode token strings for the file name
            token_ids = tokens['input_ids'][0].tolist()
            token_strings = [clip_tokenizer.decode([tid]) for tid in token_ids]

            # Encode (upcast to float32: NumPy has no bfloat16)
            with torch.no_grad():
                model_inputs = {k: v.to(device) for k, v in tokens.items()}
                outputs = clip_model(**model_inputs)
                embedding = outputs.last_hidden_state.float().cpu().numpy()[0]

            # Filename from first meaningful tokens
            filename_tokens = []
            for token in token_strings:
                cleaned = token
                for marker in special_markers:
                    cleaned = cleaned.replace(marker, '')
                cleaned = ''.join(
                    ch for ch in cleaned
                    if ch.isprintable() and ch not in unsafe_chars
                ).strip()
                if cleaned:
                    filename_tokens.append(cleaned)
                if len(filename_tokens) >= 4:
                    break

            # Fall back to a positional stem if nothing usable remained
            stem = "_".join(filename_tokens) or f"prompt_{i}"
            filename = stem + ".json"
            # Names are unchanged unless they collide within this run
            suffix = 2
            while filename in used_filenames:
                filename = f"{stem}_{suffix}.json"
                suffix += 1
            used_filenames.add(filename)
            filepath = output_dir / filename

            with open(filepath, 'w') as f:
                json.dump({
                    "prompt": prompt,
                    "embedding": embedding.tolist(),
                    "shape": list(embedding.shape),
                    "max_length": max_length,
                    "prompt_set": config['section_name']
                }, f)

            print(f"   ✓ Saved: {filename}")

        print("\n✅ All CLIP embeddings generated successfully.")

# ------------------------------------------------------------------
# UI wiring
# ------------------------------------------------------------------

# Hook the file-batch callback up to its button
file_batch_button.on_click(batch_generate_from_file)

# Short header above the widgets, emitted as one multi-line print
print(
    "Generate CLIP embeddings from example_prompts.txt",
    f"Source: {prompts_file}",
    f"Base output: {BASE_EXAMPLES_DIR}\n",
    sep="\n",
)

display(clip_token_selector, file_batch_button, file_batch_output)


Generate CLIP embeddings from example_prompts.txt
Source: /shares/weddigen.ki.uzh/laura_wagner/latent_vandalism_workshop/misc/example_prompts.txt
Base output: /shares/weddigen.ki.uzh/laura_wagner/latent_vandalism_workshop/data/embeddings/examples/clip



Dropdown(description='Prompt Set:', index=1, options=(('Short prompts', 'short'), ('CLIP prompts (77 tokens)',…

Button(button_style='info', description='Generate from File', style=ButtonStyle())

Output()

---
<sub>Latent Vandalism Workshop • Laura Wagner, 2026 • [laurajul.github.io](https://laurajul.github.io/)</sub>