# Stable Diffusion 2.1 Dataset Generation

In [None]:
!pip install -q diffusers transformers accelerate
!pip install -q xformers torch torchvision lpips
!pip install -q kornia scikit-image scikit-learn
!pip install -q scipy opencv-python
!pip install -q colormath scipy
!pip install -U diffusers
!pip install -U safetensors pillow tqdm pandas


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for colormath (setup.py) ... [?25l[?25hdone
Collecting pillow
  Downloading pillow-12.0.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (8.8 kB)
Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Downloading pillow-12.0.0-cp312-cp312-ma

In [None]:
# Attach Google Drive to the Colab VM; the pipeline checkpoint is written
# beneath this mount point in a later cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Prompt corpus from the Hugging Face Hub.
# Shape per split: {'train': (73718, 1), 'test': (8192, 1)}
from datasets import load_dataset

PROMPTS_DATASET = "Gustavosta/Stable-Diffusion-Prompts"
ds = load_dataset(PROMPTS_DATASET)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/777 [00:00<?, ?B/s]

data/train.parquet:   0%|          | 0.00/9.23M [00:00<?, ?B/s]

data/eval.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/73718 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8192 [00:00<?, ? examples/s]

In [None]:
import pandas as pd
import numpy as np

# Materialise the train split as a DataFrame, shuffle it, and record each
# prompt's character count for the length filter applied further down.
#
# The shuffle is seeded on purpose: the later fixed-seed `sample(...)` draws
# rows by position, so an unseeded shuffle here would make the selected
# prompts differ on every run. 261 mirrors RANDOM_SEED from the config cell,
# which is defined after this cell and therefore cannot be referenced here.
hf_prompts_df = (
    pd.DataFrame(ds['train'])
    .sample(frac=1, random_state=261)
    .reset_index(drop=True)
)
# Vectorised length; unlike apply(len) it yields NaN instead of raising if a
# prompt is missing.
hf_prompts_df['prompt_length'] = hf_prompts_df['Prompt'].str.len()


In [None]:
# Snapshot the shuffled prompt table (including prompt_length) as CSV,
# without writing the index column.
hf_prompts_df.to_csv(path_or_buf='hf_prompts.csv', index=False)

In [None]:
# Preview the prompt table (columns: Prompt, prompt_length) as the cell's rich output.
hf_prompts_df


Unnamed: 0,Prompt,prompt_length
0,"Concept art sketch ninja warrior, anime, volum...",295
1,a bunny made of mirrors in a forest. highly de...,168
2,"a golden woman, eyes closed, glowing lavender ...",177
3,"fall gnomes svg vector, detailed, concept art,...",205
4,a highly detailed epic cinematic concept art C...,481
...,...,...
73713,"ancient alien portal, crowd of androids, beams...",200
73714,intricate oil painting of barrack obama eating...,201
73715,a beautiful illustration of a satanic altar in...,224
73716,( ( ( ( ( hyperrealist distant portrait of sha...,383


In [None]:
import os, hashlib, pandas as pd, torch
from tqdm.auto import tqdm
from PIL import Image
from diffusers import StableDiffusionPipeline, EulerAncestralDiscreteScheduler

In [None]:
# --- Generation settings ---------------------------------------------------
MODEL_ID = "stabilityai/stable-diffusion-2-1-base"
HEIGHT = 512
WIDTH = 512
STEPS = 50                    # passed as num_inference_steps
GUIDANCE = 7.5                # passed as guidance_scale
SCHEDULER = "EulerA"          # label written to the manifest
N_SAMPLES = 1000              # number of prompts to keep
RANDOM_SEED = 261             # seed for the prompt sampling
PROMPT_COL = "Prompt"
USE_SAFETY_CHECKER = False

# --- Output layout: one sub-folder per image resolution ---------------------
OUT_DIR = "/content/sd21_images"
OUT_512_DIR = os.path.join(OUT_DIR, "512x512")
OUT_400_DIR = os.path.join(OUT_DIR, "400x400")
for _out_dir in (OUT_512_DIR, OUT_400_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Keep prompts whose length is strictly between 200 and 500 characters, then
# take the first N_SAMPLES rows of a seeded shuffle of that subset.
lengths = hf_prompts_df['prompt_length']
df = hf_prompts_df[lengths.lt(500) & lengths.gt(200)]

gsel = (
    df.sample(frac=1.0, random_state=RANDOM_SEED)
    .head(N_SAMPLES)
    .reset_index(drop=True)
)

# Plain, whitespace-trimmed prompt strings in the same row order as gsel.
prompt_series = gsel[PROMPT_COL].astype(str)
prompts = prompt_series.str.strip().tolist()

def seed_from_text(t: str) -> int:
    """Derive a deterministic 32-bit seed from a piece of text.

    The seed is the leading four bytes of the SHA-256 digest of the UTF-8
    encoded text, read as a big-endian unsigned integer, so the same text
    always maps to the same value in the range [0, 2**32).
    """
    digest = hashlib.sha256(t.encode("utf-8")).digest()
    return int.from_bytes(digest[:4], byteorder="big")

# One record per selected prompt: its position, its text, and the seed
# derived from that text.
pairs = [
    dict(id=idx, prompt=text, seed=seed_from_text(text))
    for idx, text in enumerate(prompts)
]

In [None]:
# Adding a column with the character length of each prompt, for filtering.
gsel['prompt_length'] = gsel['Prompt'].map(len)


In [None]:
# Write the selected prompts (with prompt_length) to CSV, omitting the index.
gsel.to_csv(path_or_buf='filtered_prompts.csv', index=False)

In [None]:
# Load SD 2.1 pipeline (fp16)
#
# `safety_checker` is only overridden when the checker is disabled; otherwise
# the argument is left out so from_pretrained uses whatever the checkpoint
# provides. (Previously both branches of the conditional evaluated to None,
# so USE_SAFETY_CHECKER had no effect.)
load_kwargs = {"torch_dtype": torch.float16, "use_safetensors": True}
if not USE_SAFETY_CHECKER:
    load_kwargs["safety_checker"] = None

pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, **load_kwargs)

# Swap in the Euler-ancestral scheduler, built from the loaded scheduler's config.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

if torch.cuda.is_available():
    pipe = pipe.to("cuda")
    pipe.enable_attention_slicing()
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception as exc:
        # xformers is optional: keep going without it, but report why it was
        # not enabled instead of swallowing the error silently.
        print(f"xformers attention not enabled ({type(exc).__name__}: {exc})")
else:
    print("CUDA not found")



model_index.json:   0%|          | 0.00/543 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

scheduler_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/911 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# Persist the loaded Stable Diffusion pipeline components under the mounted Drive path.
SAVE_DIR = "/content/drive/MyDrive/Colab Notebooks/Watermarking/models/sd21_base_pipe"
pipe.save_pretrained(save_directory=SAVE_DIR)

In [None]:
# Image Generation
#
# For every (id, prompt, seed) record: render one image with a generator
# seeded from the prompt, save it as PNG, save a 400x400 Lanczos-resized copy,
# and append a manifest row holding the run settings plus the SHA-256 digest
# of each written file.

def _sha256_of_file(path):
    """Return the hex SHA-256 digest of the bytes stored at `path`."""
    with open(path, "rb") as fh:
        return hashlib.sha256(fh.read()).hexdigest()


# Fields that are identical for every manifest row of this run.
run_settings = {
    "model": MODEL_ID,
    "scheduler": SCHEDULER,
    "steps": STEPS,
    "guidance": GUIDANCE,
    "height": HEIGHT,
    "width": WIDTH,
    "safety_checker": USE_SAFETY_CHECKER,
}

manifest_rows = []
pipe.set_progress_bar_config(disable=True)

with torch.inference_mode():
    for item in tqdm(pairs, total=len(pairs), desc="Generating SD2.1 images"):
        # Fresh generator per prompt, seeded with that prompt's derived seed.
        gen = torch.Generator(device=pipe.device).manual_seed(item["seed"])
        img = pipe(
            prompt=item["prompt"],
            num_inference_steps=STEPS,
            guidance_scale=GUIDANCE,
            height=HEIGHT,
            width=WIDTH,
            generator=gen,
        ).images[0]

        # Both files share the "<zero-padded id>_<seed>" stem.
        stem = f"{item['id']:06d}_{item['seed']}"

        fpath_512 = os.path.join(OUT_512_DIR, f"{stem}.png")
        img.save(fpath_512, format="PNG")

        # saving 400x400 images for StegaStamp watermark encoding
        img_400 = img.resize((400, 400), resample=Image.LANCZOS)
        fpath_400 = os.path.join(OUT_400_DIR, f"{stem}_400.png")
        img_400.save(fpath_400, format="PNG")

        # Digests are taken from the files as written to disk; paths are
        # stored relative to OUT_DIR.
        manifest_rows.append({
            "id": item["id"],
            "prompt": item["prompt"],
            "seed": item["seed"],
            **run_settings,
            "file_512": os.path.relpath(fpath_512, OUT_DIR),
            "sha256_512": _sha256_of_file(fpath_512),
            "file_400": os.path.relpath(fpath_400, OUT_DIR),
            "sha256_400": _sha256_of_file(fpath_400),
        })

Generating SD2.1 images:   0%|          | 0/1000 [00:00<?, ?it/s]

The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['octane render']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['cker , wlop , boris vallejo ))), octane render , unreal engine , 3 d render , macro mugshot !!!!!, ugly !!!!!!, octane render , nvidia raytracing demo , grainy , muted']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['draws']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['m , award - winning']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['on artstation pixiv makoto shinkai']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['enson , trending on artstation']
The following part of your input was truncated because CLIP can only handle sequence