# CLIP-L and CLIP-G Embeddings for SD3

SD3 uses **two** CLIP text encoders whose pooled outputs are concatenated:
- **CLIP-L** (OpenAI): 768-dim pooled embedding
- **CLIP-G** (OpenCLIP): 1280-dim pooled embedding
- **Combined**: 2048-dim pooled embedding for SD3

This notebook generates both and saves them together for use with SD3.

```mermaid
flowchart LR
    T[Text Prompt]

    CLIPL[CLIP-L Encoder]
    CLIPG[CLIP-G Encoder]

    PL[Pooled CLIP-L embedding]
    PG[Pooled CLIP-G embedding]

    F[Fusion and Projection]

    SD35[SD 3.5\nDiffusion Transformer]

    T --> CLIPL --> PL
    T --> CLIPG --> PG

    PL --> F
    PG --> F

    F -->|global conditioning| SD35
```


## Load CLIP-L and CLIP-G Models

In [None]:
from transformers import CLIPTextModel, CLIPTokenizer
import torch
import os
from pathlib import Path

# Setup paths.
# Everything below is resolved against the *parent* of the current working
# directory, so the notebook is expected to run from a sub-folder of the project.
current_dir = Path.cwd()

# Load models path from config: the stripped contents of models.txt are joined
# onto the parent directory to form the models folder.
models_path_file = current_dir.parent / "misc/paths/models.txt"
with open(models_path_file, 'r', encoding="utf-8") as f:
    models_path = f.read().strip()
MODELS_DIR = current_dir.parent / models_path

CLIP_L_PATH = MODELS_DIR / "clip-vit-large-patch14"
# CLIP-G is OpenCLIP ViT-bigG/14. "clip-vit-large-patch14-336" is a 336px
# CLIP-L checkpoint, not CLIP-G, so point at the folder the CLIP-G loading
# cell actually uses.
CLIP_G_PATH = MODELS_DIR / "CLIP-ViT-bigG-14-laion2B"

# Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

os.makedirs(MODELS_DIR, exist_ok=True)

In [None]:
# Load CLIP-L (OpenAI CLIP - 768 dim)
#
# Each branch loads the tokenizer and the text encoder exactly once; the
# previous version downloaded the model, saved it, and then loaded it a second
# time from disk on the first run.
print("Loading CLIP-L (OpenAI)...")

if os.path.exists(CLIP_L_PATH):
    # A local copy exists: load it without touching the network.
    print("  Loading from local...")
    clip_l_tokenizer = CLIPTokenizer.from_pretrained(CLIP_L_PATH, local_files_only=True)
    clip_l_model = CLIPTextModel.from_pretrained(
        CLIP_L_PATH,
        torch_dtype=torch.bfloat16,
        local_files_only=True,
    )
else:
    # First run: fetch from the Hub, save to CLIP_L_PATH for later runs, and
    # keep using the objects that were just downloaded.
    print("  Downloading CLIP-L from Hugging Face...")
    clip_l_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
    clip_l_model = CLIPTextModel.from_pretrained(
        "openai/clip-vit-large-patch14",
        torch_dtype=torch.bfloat16,
    )
    clip_l_tokenizer.save_pretrained(CLIP_L_PATH)
    clip_l_model.save_pretrained(CLIP_L_PATH)
    print("  ✓ Downloaded and saved")

# Inference only: move to the selected device and switch to eval mode.
clip_l_model = clip_l_model.to(device)
clip_l_model.eval()

print(f"✓ CLIP-L loaded!")
print(f"  Embedding dimension: {clip_l_model.config.hidden_size}")
print(f"  Max sequence length: {clip_l_tokenizer.model_max_length}")

In [None]:
# Load CLIP-G (OpenCLIP bigG - 1280 dim)
# Note: We'll use laion/CLIP-ViT-bigG-14-laion2B-39B-b160k which is CLIP-G
#
# Each branch loads the tokenizer and the text encoder exactly once; the
# previous version downloaded the model, saved it, and then loaded it a second
# time from disk on the first run.
print("\nLoading CLIP-G (OpenCLIP bigG)...")

# Local folder for the CLIP-G text encoder (set here so this cell does not
# depend on the value assigned in the setup cell).
CLIP_G_PATH = MODELS_DIR / "CLIP-ViT-bigG-14-laion2B"

if os.path.exists(CLIP_G_PATH):
    # A local copy exists: load it without touching the network.
    print("  Loading from local...")
    clip_g_tokenizer = CLIPTokenizer.from_pretrained(CLIP_G_PATH, local_files_only=True)
    clip_g_model = CLIPTextModel.from_pretrained(
        CLIP_G_PATH,
        torch_dtype=torch.bfloat16,
        local_files_only=True,
    )
else:
    # First run: fetch from the Hub, save to CLIP_G_PATH for later runs, and
    # keep using the objects that were just downloaded.
    print("  Downloading CLIP-G from Hugging Face...")
    clip_g_tokenizer = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k")
    clip_g_model = CLIPTextModel.from_pretrained(
        "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k",
        torch_dtype=torch.bfloat16,
    )
    clip_g_tokenizer.save_pretrained(CLIP_G_PATH)
    clip_g_model.save_pretrained(CLIP_G_PATH)
    print("  ✓ Downloaded and saved")

# Inference only: move to the selected device and switch to eval mode.
clip_g_model = clip_g_model.to(device)
clip_g_model.eval()

print(f"✓ CLIP-G loaded!")
print(f"  Embedding dimension: {clip_g_model.config.hidden_size}")
print(f"  Max sequence length: {clip_g_tokenizer.model_max_length}")

## Generate Combined CLIP Embeddings

In [None]:
import ipywidgets as widgets
from IPython.display import display
import numpy as np

# Shared state: filled in by the generate callback, read by the save callback.
current_clip_l_embedding = current_clip_g_embedding = None
current_combined_pooled = current_prompt = None

# Output pane that collects whatever the generate callback prints.
output_area = widgets.Output()

# Button that triggers embedding generation.
generate_button = widgets.Button(
    button_style='success',
    description='Generate CLIP-L + CLIP-G Embeddings',
)

# Prompt entry box, pre-filled with an example prompt.
prompt_input = widgets.Textarea(
    layout=widgets.Layout(height='80px', width='80%'),
    description='Prompt:',
    placeholder='Enter your prompt here',
    value='an elephant',
)

def generate_clip_embeddings(b):
    """Button callback: encode the prompt with CLIP-L and CLIP-G for SD3.

    Reads the text from ``prompt_input``, runs it through both text encoders,
    prints the resulting shapes into ``output_area`` and publishes the results
    in the module-level ``current_*`` variables.

    The globals are written only after *both* encoders have succeeded, so a
    failure half-way through (for example while running CLIP-G) cannot leave a
    new prompt paired with the previous prompt's embeddings.

    Args:
        b: The clicked button (supplied by ipywidgets, unused).
    """
    global current_clip_l_embedding, current_clip_g_embedding
    global current_combined_pooled, current_prompt

    def encode(tokenizer, model, text):
        """Return ``(sequence, pooled)`` float32 NumPy arrays for one prompt."""
        tokens = tokenizer(
            text,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            tokens = {k: v.to(device) for k, v in tokens.items()}
            outputs = model(**tokens)
        # NOTE(review): diffusers' SD3 pipeline appears to use
        # CLIPTextModelWithProjection (projected `text_embeds` as the pooled
        # vector, penultimate hidden state as the sequence). The unprojected
        # `pooler_output` / `last_hidden_state` used here may differ from what
        # SD3 expects - confirm before feeding these into the model.
        # The models run in bfloat16, which NumPy cannot represent: upcast first.
        hidden = outputs.last_hidden_state.float().cpu().numpy()[0]
        pooled = outputs.pooler_output.float().cpu().numpy()[0]
        return hidden, pooled

    with output_area:
        output_area.clear_output()

        prompt = prompt_input.value
        print(f"Generating embeddings for: '{prompt}'\n")

        # Generate CLIP-L embeddings
        print("=== CLIP-L (OpenAI) ===")
        clip_l_hidden_np, clip_l_pooled_np = encode(clip_l_tokenizer, clip_l_model, prompt)
        print(f"  Sequence shape: {clip_l_hidden_np.shape} (77 tokens × 768 dims)")
        print(f"  Pooled shape: {clip_l_pooled_np.shape} (768 dims)")

        # Generate CLIP-G embeddings
        print("\n=== CLIP-G (OpenCLIP bigG) ===")
        clip_g_hidden_np, clip_g_pooled_np = encode(clip_g_tokenizer, clip_g_model, prompt)
        print(f"  Sequence shape: {clip_g_hidden_np.shape} (77 tokens × 1280 dims)")
        print(f"  Pooled shape: {clip_g_pooled_np.shape} (1280 dims)")

        # Concatenate pooled embeddings for SD3 (CLIP-L first, then CLIP-G)
        print("\n=== Combined for SD3 ===")
        combined_pooled = np.concatenate([clip_l_pooled_np, clip_g_pooled_np])
        print(f"  Combined pooled shape: {combined_pooled.shape} (768 + 1280 = 2048 dims)")

        # Commit all four globals together so they always describe one prompt.
        current_prompt = prompt
        current_clip_l_embedding = clip_l_hidden_np
        current_clip_g_embedding = clip_g_hidden_np
        current_combined_pooled = combined_pooled
        print(f"  ✓ Ready for SD3!")

        print(f"\nFirst 10 values of combined pooled embedding:")
        print(current_combined_pooled[:10])

generate_button.on_click(generate_clip_embeddings)
display(prompt_input, generate_button, output_area)

## Save Combined CLIP Embeddings

In [None]:
import json

# Folder that receives single-prompt embedding files; created if missing.
CLIP_COMBINED_DIR = current_dir.parent / "data/embeddings/CLIP_SD3"
CLIP_COMBINED_DIR.mkdir(parents=True, exist_ok=True)

# Output pane that collects whatever the save callback prints.
save_output = widgets.Output()

# Button that triggers the save.
save_button = widgets.Button(
    button_style='primary',
    description='Save CLIP Embeddings',
)

def _filename_stem_from_prompt(prompt, max_tokens=4):
    """Return a filesystem-safe name built from the first CLIP-L tokens of *prompt*.

    Special tokens and the ``</w>`` word-end marker are dropped. Path
    separators, characters reserved on common filesystems and ASCII control
    characters are removed, so a prompt such as ``"cat/dog"`` cannot create a
    nested or invalid path. Falls back to ``"untitled"`` when nothing usable
    remains (for example for an empty prompt).
    """
    unsafe = str.maketrans('', '', '<>:"/\\|?*' + ''.join(map(chr, range(32))))

    tokens_l = clip_l_tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt"
    )

    pieces = []
    # Decode lazily: only the leading tokens are needed, not all 77.
    for tid in tokens_l['input_ids'][0].tolist():
        text = clip_l_tokenizer.decode([tid]).strip()
        # Strip tokenizer markers *before* removing '<', '>' and '|', otherwise
        # the special tokens would survive as plain words.
        for marker in ('</w>', '<|startoftext|>', '<|endoftext|>'):
            text = text.replace(marker, '')
        text = text.translate(unsafe).strip()
        if text:
            pieces.append(text)
        if len(pieces) >= max_tokens:
            break

    return "_".join(pieces) or "untitled"


def save_clip_embeddings(b):
    """Button callback: write the most recently generated embeddings to JSON.

    Saves the prompt, both CLIP token sequences, the combined pooled vector and
    their shapes to ``CLIP_COMBINED_DIR``. The file is named after the first
    four CLIP-L tokens of the prompt; an existing file with the same name is
    overwritten. Prints a hint and returns when nothing has been generated yet.

    Args:
        b: The clicked button (supplied by ipywidgets, unused).
    """
    with save_output:
        save_output.clear_output()

        if current_combined_pooled is None:
            print("❌ No embeddings to save! Generate embeddings first.")
            return

        filename = _filename_stem_from_prompt(current_prompt) + ".json"
        filepath = CLIP_COMBINED_DIR / filename

        # Save all embeddings (arrays become nested lists so JSON can hold them)
        embedding_data = {
            "prompt": current_prompt,
            "clip_l_sequence": current_clip_l_embedding.tolist(),
            "clip_g_sequence": current_clip_g_embedding.tolist(),
            "combined_pooled": current_combined_pooled.tolist(),
            "shapes": {
                "clip_l_sequence": list(current_clip_l_embedding.shape),
                "clip_g_sequence": list(current_clip_g_embedding.shape),
                "combined_pooled": list(current_combined_pooled.shape)
            }
        }

        with open(filepath, 'w', encoding="utf-8") as f:
            json.dump(embedding_data, f)

        print(f"✓ CLIP embeddings saved!")
        print(f"  File: {filepath}")
        print(f"  Size: {os.path.getsize(filepath) / 1024:.2f} KB")
        print(f"\nContains:")
        print(f"  - CLIP-L sequence: [77, 768]")
        print(f"  - CLIP-G sequence: [77, 1280]")
        print(f"  - Combined pooled: [2048] (for SD3)")

save_button.on_click(save_clip_embeddings)
display(save_button, save_output)

## Batch Generate CLIP-L + CLIP-G Embeddings

In [None]:
# Batch generation UI: one prompt per line, results go to a sub-folder of the
# single-prompt embeddings directory.
BATCH_DIR = CLIP_COMBINED_DIR / "examples"
BATCH_DIR.mkdir(parents=True, exist_ok=True)

# Output pane that collects whatever the batch callback prints.
batch_output_area = widgets.Output()

# Button that triggers batch generation and saving.
batch_generate_button = widgets.Button(
    button_style='warning',
    description='Batch Generate & Save',
)

# Multi-line prompt box, pre-filled with three example prompts.
batch_prompt_input = widgets.Textarea(
    layout=widgets.Layout(height='200px', width='80%'),
    description='Prompts:',
    placeholder='Enter up to 10 prompts, one per line',
    value='an elephant\na red sports car\na mountain landscape with snow',
)

def _batch_filename_stem(prompt, max_tokens=4):
    """Return a filesystem-safe name built from the first CLIP-L tokens of *prompt*.

    Special tokens and the ``</w>`` word-end marker are dropped. Path
    separators, characters reserved on common filesystems and ASCII control
    characters are removed so the result is always a single valid file name.
    Falls back to ``"untitled"`` when nothing usable remains.
    """
    unsafe = str.maketrans('', '', '<>:"/\\|?*' + ''.join(map(chr, range(32))))

    tokens = clip_l_tokenizer(
        prompt,
        padding="max_length",
        max_length=77,
        truncation=True,
        return_tensors="pt"
    )

    pieces = []
    # Decode lazily: only the leading tokens are needed, not all 77.
    for tid in tokens['input_ids'][0].tolist():
        text = clip_l_tokenizer.decode([tid]).strip()
        # Strip tokenizer markers *before* removing '<', '>' and '|', otherwise
        # the special tokens would survive as plain words.
        for marker in ('</w>', '<|startoftext|>', '<|endoftext|>'):
            text = text.replace(marker, '')
        text = text.translate(unsafe).strip()
        if text:
            pieces.append(text)
        if len(pieces) >= max_tokens:
            break

    return "_".join(pieces) or "untitled"


def batch_generate_combined_embeddings(b):
    """Button callback: encode up to 10 prompts and save one JSON file per prompt.

    Prompts are read from ``batch_prompt_input`` (one per line, blank lines
    skipped). Each prompt is encoded with CLIP-L and CLIP-G, the pooled outputs
    are concatenated, and everything is written to ``BATCH_DIR``. Files are
    named after the first four CLIP-L tokens; prompts that would share a name
    within one batch get a numeric suffix (``_2``, ``_3``, ...) instead of
    overwriting each other. Files left over from earlier runs are overwritten.

    Args:
        b: The clicked button (supplied by ipywidgets, unused).
    """
    max_prompts = 10

    def encode(tokenizer, model, text):
        """Return ``(sequence, pooled)`` float32 NumPy arrays for one prompt."""
        tokens = tokenizer(
            text,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            tokens = {k: v.to(device) for k, v in tokens.items()}
            outputs = model(**tokens)
        # The models run in bfloat16, which NumPy cannot represent: upcast first.
        hidden = outputs.last_hidden_state.float().cpu().numpy()[0]
        pooled = outputs.pooler_output.float().cpu().numpy()[0]
        return hidden, pooled

    with batch_output_area:
        batch_output_area.clear_output()

        # Parse prompts (one per line, max 10)
        prompts = [p.strip() for p in batch_prompt_input.value.strip().split('\n') if p.strip()]
        ignored = len(prompts) - max_prompts
        prompts = prompts[:max_prompts]

        if not prompts:
            print("❌ No prompts provided!")
            return

        if ignored > 0:
            # Tell the user instead of silently dropping the extra lines.
            print(f"⚠ Only the first {max_prompts} prompts are processed; ignoring {ignored} more.\n")

        print(f"Generating {len(prompts)} CLIP-L + CLIP-G embeddings...\n")

        taken = set()  # file stems already written during this batch

        for i, prompt in enumerate(prompts, 1):
            print(f"[{i}/{len(prompts)}] '{prompt[:50]}{'...' if len(prompt) > 50 else ''}'")

            clip_l_hidden, clip_l_pooled = encode(clip_l_tokenizer, clip_l_model, prompt)
            clip_g_hidden, clip_g_pooled = encode(clip_g_tokenizer, clip_g_model, prompt)

            # Concatenate pooled embeddings for SD3 (CLIP-L first, then CLIP-G)
            combined_pooled = np.concatenate([clip_l_pooled, clip_g_pooled])

            # Create filename from first 4 tokens, made unique within this batch
            base = _batch_filename_stem(prompt)
            stem = base
            suffix = 1
            while stem in taken:
                suffix += 1
                stem = f"{base}_{suffix}"
            taken.add(stem)

            filename = stem + ".json"
            filepath = BATCH_DIR / filename

            # Save all embeddings (arrays become nested lists so JSON can hold them)
            embedding_data = {
                "prompt": prompt,
                "clip_l_sequence": clip_l_hidden.tolist(),
                "clip_g_sequence": clip_g_hidden.tolist(),
                "combined_pooled": combined_pooled.tolist(),
                "shapes": {
                    "clip_l_sequence": list(clip_l_hidden.shape),
                    "clip_g_sequence": list(clip_g_hidden.shape),
                    "combined_pooled": list(combined_pooled.shape)
                }
            }

            with open(filepath, 'w', encoding="utf-8") as f:
                json.dump(embedding_data, f)

            print(f"   ✓ Saved: {filename}")

        print(f"\n✓ All {len(prompts)} embeddings saved to:")
        print(f"  {BATCH_DIR}")

batch_generate_button.on_click(batch_generate_combined_embeddings)

print("Batch CLIP-L + CLIP-G Embedding Generator")
print(f"Output directory: {BATCH_DIR}")
print("Enter up to 10 prompts (one per line):\n")
display(batch_prompt_input, batch_generate_button, batch_output_area)

---
<sub>Latent Vandalism Workshop • Laura Wagner, 2026 • [laurajul.github.io](https://laurajul.github.io/)</sub>