In [1]:
from diffusers import StableDiffusionXLPipeline, ControlNetModel,StableDiffusionXLImg2ImgPipeline
import torch
from PIL import Image
import os, json, time, gc, shutil


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

CUDA available: True
Device name: NVIDIA GeForce RTX 4070


In [3]:
# Load your dataset (replace with your path)
with open("pokemon_dataset_100.json", "r") as f:
    pokemon_pairs = json.load(f)[:]

print(type(pokemon_pairs))   # <class 'list'>
print(len(pokemon_pairs))    # 100 entries

<class 'list'>
100


In [4]:
pokemon_pairs = [
    {
        "base_name": "Embermane",
        "evolved_name": "Pyrograth",
        "base": {
            "prompt": "small elemental lion with embers flowing through its mane, smoldering eyes, faint heat haze, anime cel-shaded, magical aura, full body, arcane fantasy style, clean lineart",
            "negative": "text, watermark, signature, cartoonish, extra limbs, bad anatomy, humanoid, armor"
        },
        "evolved": {
            "prompt": (
                "Evolved volcanic lion deity version of the creature in the image: massive bipedal fire guardian with magma armor and flowing ember mane, "
                "runic symbols glowing across its body, retains orange-gold palette, anime fantasy RPG art, cel-shaded, dramatic lighting, "
                "high detail, coherent humanoid anatomy"
            ),
            "negative": (
                "small lion, quadruped animal, horse-like, flat lighting, low quality, overexposed, text, watermark, malformed limbs"
            ),
            "strength": 0.85,
            "guidance_scale": 7.4
        }
    },
    {
        "base_name": "Thornveil",
        "evolved_name": "Sylvarch",
        "base": {
            "prompt": "mystical forest reptile with vine scales and moss-covered body, faint glowing runes along its tail, anime cel-shaded, natural lighting, full body, fantasy forest tone",
            "negative": "text, watermark, signature, mechanical, humanoid, metallic, distorted anatomy"
        },
        "evolved": {
            "prompt": (
                "Evolved druid guardian version of the creature in the image: tall humanoid dragon wrapped in living vines and bark armor, "
                "leafy horns and glowing emerald eyes, runic patterns pulsing with life energy, retains green-brown palette, "
                "anime RPG art, cel-shaded, detailed foliage textures, coherent anatomy"
            ),
            "negative": (
                "tiny reptile, small lizard, bug, robotic, metal body, low quality, flat lighting, malformed face, text, watermark"
            ),
            "strength": 0.75,
            "guidance_scale": 7.2
        }
    },
    {
        "base_name": "Mireling",
        "evolved_name": "Abythral",
        "base": {
            "prompt": "shadowy amphibian creature emerging from mist, deep indigo skin with faint bioluminescent markings, anime fantasy style, full body, glowing eyes, mysterious tone",
            "negative": "text, watermark, signature, humanoid, armor, cartoon, extra limbs, bad anatomy"
        },
        "evolved": {
            "prompt": (
                "Evolved abyssal warden version of the creature in the image: towering aquatic specter with flowing dark robes of shadow-water, "
                "four glowing eyes, ornate trident, retains indigo and teal palette, anime fantasy art, ethereal lighting, high detail, "
                "cel-shaded, powerful composition"
            ),
            "negative": (
                "frog, tadpole, small creature, cute, simple design, low detail, flat color, distorted limbs, text, watermark"
            ),
            "strength": 0.8,
            "guidance_scale": 7.3
        }
    },
    {
        "base_name": "Voltusk",
        "evolved_name": "Stormarok",
        "base": {
            "prompt": "electric boar spirit wreathed in stormlight, azure sparks crackling along tusks, glowing patterns in fur, anime cel-shaded, full body, dramatic fantasy tone",
            "negative": "text, watermark, signature, humanoid, mechanical, messy lighting, bad anatomy"
        },
        "evolved": {
            "prompt": (
                "Evolved thunder colossus version of the creature in the image: massive armored beast of lightning with glowing tusks and storm halo, "
                "electric energy coursing through veins, retains blue-yellow palette, anime RPG art, cel-shaded, dynamic lighting, detailed rendering"
            ),
            "negative": (
                "small piglet, quadruped fox, metal plating, humanoid form, low quality, flat pose, text, watermark, malformed proportions"
            ),
            "strength": 0.7,
            "guidance_scale": 7.5
        }
    },
    {
        "base_name": "Frostbane",
        "evolved_name": "Cryovain",
        "base": {
            "prompt": "ice wolf familiar with crystalline fur and misty breath, piercing cyan eyes, arcane energy shimmer, anime cel-shaded, high contrast lighting, full body",
            "negative": "text, watermark, signature, humanoid, armor, deformed, fire effects, low quality"
        },
        "evolved": {
            "prompt": (
                "Evolved frost deity version of the creature in the image: regal humanoid wolf spirit cloaked in flowing ice robes, "
                "antler-like ice horns, glowing cyan veins, retains blue-white palette, anime fantasy art, cel-shaded, elegant yet fierce stance, "
                "magical frost aura, intricate details"
            ),
            "negative": (
                "quadruped animal, bulky beast, molten textures, messy shading, text, watermark, distorted proportions"
            ),
            "strength": 0.8,
            "guidance_scale": 7.5
        }
    },
    {
        "base_name": "Cindrel",
        "evolved_name": "Ashdrake",
        "base": {
            "prompt": "small draconic ember spirit with glowing scales and smoldering wings, faint ash falling around, anime cel-shaded, fantasy fire glow, full body",
            "negative": "text, watermark, signature, humanoid, flat lighting, bad anatomy, low quality"
        },
        "evolved": {
            "prompt": (
                "Evolved dragon champion version of the creature in the image: powerful humanoid dragon warrior with blazing wing-blades, "
                "crimson magma armor, horns of obsidian, retains fiery red-gold palette, anime RPG art, cel-shaded, detailed scales, "
                "coherent anatomy, heroic pose"
            ),
            "negative": (
                "tiny dragon, quadruped, wyvern shape, cartoon, simple shading, low detail, text, watermark, distorted anatomy"
            ),
            "strength": 0.8,
            "guidance_scale": 7.6
        }
    },
    {
        "base_name": "Lumora",
        "evolved_name": "Seraphis",
        "base": {
            "prompt": "floating spirit of light with veil-like wings and radiant runes on body, gentle aura, anime cel-shaded, magical fantasy glow, full body",
            "negative": "text, watermark, signature, humanoid face, extra limbs, low quality, mechanical"
        },
        "evolved": {
            "prompt": (
                "Evolved divine guardian version of the creature in the image: majestic angelic humanoid formed of pure light energy, "
                "runic armor inscribed with glyphs, radiant wings spreading cosmic glow, retains white-gold palette, "
                "anime celestial art, high detail, coherent anatomy, awe-inspiring stance"
            ),
            "negative": (
                "tiny fairy, opaque body, flat shading, robotic, distorted proportions, text, watermark, lowres"
            ),
            "strength": 0.9,
            "guidance_scale": 7.7
        }
    },
    {
        "base_name": "Obsidim",
        "evolved_name": "Gravemorn",
        "base": {
            "prompt": "small shadow elemental creature made of black glass shards, glowing cracks and violet core, dark fantasy anime art, cel-shaded, ominous glow, full body",
            "negative": "text, watermark, signature, cute, humanoid, metallic, distorted anatomy, cartoonish"
        },
        "evolved": {
            "prompt": (
                "Evolved wraith-lord version of the creature in the image: towering humanoid figure cloaked in broken obsidian armor, "
                "purple fire burning beneath, cracked crystal sword, retains black-violet palette, anime dark fantasy art, "
                "cel-shaded, dramatic lighting, high detail"
            ),
            "negative": (
                "small creature, ghost blob, flat silhouette, low quality, blurry details, text, watermark"
            ),
            "strength": 0.85,
            "guidance_scale": 7.5
        }
    },
    {
        "base_name": "Runeling",
        "evolved_name": "Arcanor",
        "base": {
            "prompt": "tiny golem inscribed with glowing runes, levitating stones forming its limbs, arcane core pulsing with light, anime cel-shaded, magical tone, full body",
            "negative": "text, watermark, signature, humanoid, animal, cartoon, mechanical"
        },
        "evolved": {
            "prompt": (
                "Evolved grand construct version of the creature in the image: colossal humanoid golem covered in layered runic armor, "
                "floating sigils orbiting its chest, runes glowing in shifting colors, retains stone-gray and cyan palette, "
                "anime arcane fantasy art, cel-shaded, intricate magic patterns, high detail"
            ),
            "negative": (
                "small golem, blob, melted texture, flat light, simple design, text, watermark, malformed proportions"
            ),
            "strength": 0.75,
            "guidance_scale": 7.3
        }
    },
    {
        "base_name": "Tempyr",
        "evolved_name": "Chronavon",
        "base": {
            "prompt": "mystical owl creature with floating clockwork feathers, glowing eyes that shift color, anime fantasy art, cel-shaded, elegant design, full body, magical atmosphere",
            "negative": "text, watermark, signature, humanoid, messy lineart, bad anatomy"
        },
        "evolved": {
            "prompt": (
                "Evolved timekeeper form of the creature in the image: divine humanoid owl sage with flowing time-sigil robes, "
                "ethereal feathers of gold and azure, clockwork halo of spinning runes, retains original palette, "
                "anime RPG fantasy art, cel-shaded, coherent anatomy, majestic pose, glowing magical symbols"
            ),
            "negative": (
                "tiny bird, animal owl, simple textures, flat colors, distorted wings, low quality, text, watermark"
            ),
            "strength": 0.8,
            "guidance_scale": 7.6
        }
    }
]


In [5]:
device = "cuda"
dtype = torch.float16

In [16]:
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=dtype,
    use_safetensors=True
).to(device)

Loading pipeline components...: 100%|██████████| 7/7 [00:04<00:00,  1.69it/s]


In [17]:
# Prevent black outputs on some setups:
pipe.enable_attention_slicing()
pipe.enable_vae_tiling()

In [11]:
output_dir = "pokemon_out" 

In [None]:
 # ensure defined
try:
    shutil.rmtree(output_dir)
except Exception as e:
    print(e)
    pass
os.makedirs(output_dir, exist_ok=True)

[WinError 3] The system cannot find the path specified: 'pokemon_out'


In [6]:
SINGLE_SUBJECT_ADDON = (
    "SINGLE SUBJECT, SOLO, ONE CREATURE ONLY, CENTERED, FULL BODY, "
    "plain neutral background, subject isolated, studio backdrop"
)

SINGLE_SUBJECT_NEG = (
    "multiple creatures, duplicate creature, twins, second creature, extra subject, group, crowd, swarm, "
    "background characters, reflection duplicates, photobomb, collage, split screen, multi-panel"
)

def make_single_subject(prompt: str, negative: str = ""):
    p = f"{prompt}, {SINGLE_SUBJECT_ADDON}"
    n = (negative + ", " if negative else "") + SINGLE_SUBJECT_NEG
    return p, n

In [7]:
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()   # free unattached cached blocks
    torch.cuda.synchronize()   # finalize GPU work before timing

In [20]:
start_global = time.perf_counter()
for i,pair in enumerate(pokemon_pairs):
    print(f"MAKING {i} / {len(pokemon_pairs)}")
    base_name = pair["base_name"]
    
    base_p, base_n = make_single_subject(pair["base"]["prompt"], pair["base"]["negative"])
    print(f"MAKING {base_name}")

    out_folder = os.path.join(output_dir, base_name)
    os.makedirs(out_folder, exist_ok=True)

    base_info = pair["base"]

    t0 = time.perf_counter()

    # Optional: ensure no stale work on the device
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    result = pipe(
        prompt=base_p,
        negative_prompt=base_n + ', text, extra limbs, watermark, multiple bodies, character sheet, concept sheet, turnaround, orthographic, reference sheet, multiple angles, extra head, extra limbs, dismembered parts, split view, multiple views, alternate poses, layout, blueprint, overlay, design board, draft, cutout, outline, diagram, showcase, dissection, duplicated face, extra body',
        num_inference_steps=26,
        guidance_scale=8.0,
        height=832, width=832,
    )

    img = result.images[0]
    img.save(os.path.join(out_folder, f"{base_name}.png"))

    # ---- cleanup to avoid iteration-to-iteration slowdown ----
    del img, result
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()   # free unattached cached blocks
        torch.cuda.synchronize()   # finalize GPU work before timing

    t1 = time.perf_counter()
    print(f"{base_name} took {t1 - t0:.2f}s")

end_global = time.perf_counter()
print(f"Total: {end_global - start_global:.1f}s (avg {(end_global - start_global)/100:.2f}s/img)")

MAKING Embermane


100%|██████████| 26/26 [00:08<00:00,  3.06it/s]


Embermane took 16.75s
MAKING Thornveil


100%|██████████| 26/26 [00:13<00:00,  1.94it/s]


Thornveil took 48.89s
MAKING Mireling


100%|██████████| 26/26 [00:18<00:00,  1.42it/s]


Mireling took 51.93s
MAKING Voltusk


100%|██████████| 26/26 [00:12<00:00,  2.09it/s]


Voltusk took 46.20s
MAKING Frostbane


100%|██████████| 26/26 [00:12<00:00,  2.14it/s]


Frostbane took 16.71s
MAKING Cindrel


100%|██████████| 26/26 [00:14<00:00,  1.85it/s]


Cindrel took 37.52s
MAKING Lumora


100%|██████████| 26/26 [00:40<00:00,  1.55s/it]


Lumora took 66.23s
MAKING Obsidim


100%|██████████| 26/26 [00:08<00:00,  2.97it/s]


Obsidim took 36.67s
MAKING Runeling


100%|██████████| 26/26 [01:40<00:00,  3.87s/it]


Runeling took 106.32s
MAKING Tempyr


100%|██████████| 26/26 [01:46<00:00,  4.08s/it]


Tempyr took 145.26s
Total: 572.6s (avg 5.73s/img)


In [8]:
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=dtype,
    use_safetensors=True
).to(device)


pipe.enable_attention_slicing()   # reduces peak VRAM (slightly slower but more stable)
pipe.enable_vae_tiling()          # helpful if you go larger than 1024

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]`torch_dtype` is deprecated! Use `dtype` instead!
Loading pipeline components...: 100%|██████████| 7/7 [00:05<00:00,  1.19it/s]


In [9]:
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()   # free unattached cached blocks
    torch.cuda.synchronize()   # finalize GPU work before timing

In [13]:
start_global = time.perf_counter()
for i,pair in enumerate(pokemon_pairs):
    print(f"MAKING {i} / {len(pokemon_pairs)}")
    base_name = pair["base_name"]
    evolved_name = pair["evolved_name"]
    print(f"MAKING {base_name}")

    

    t0 = time.perf_counter()

    # Optional: ensure no stale work on the device
    if torch.cuda.is_available():
        torch.cuda.synchronize()

   
    out_folder = os.path.join(output_dir, base_name)
    init_img = Image.open(os.path.join(out_folder, f"{base_name}.png")).convert("RGB")
    evolved_p, evolved_n = make_single_subject(pair["evolved"]["prompt"], pair["evolved"]["negative"])

    
    result = pipe(
        prompt=evolved_p,
        evolved_n=evolved_n + ', text, extra limbs, watermark, multiple bodies, character sheet, concept sheet, turnaround, orthographic, reference sheet, multiple angles, extra head, extra limbs, dismembered parts, split view, multiple views, alternate poses, layout, blueprint, overlay, design board, draft, cutout, outline, diagram, showcase, dissection, duplicated face, extra body',
        num_inference_steps=26,
        stength = 0.9,
        image=init_img,
        guidance_scale=8.0,
        height=832, width=832,
    )

    img = result.images[0]
    img.save(os.path.join(out_folder, f"{evolved_name}.png"))

    # ---- cleanup to avoid iteration-to-iteration slowdown ----
    del img, result
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()   # free unattached cached blocks
        torch.cuda.synchronize()   # finalize GPU work before timing

    t1 = time.perf_counter()
    print(f"{evolved_name} took {t1 - t0:.2f}s")

end_global = time.perf_counter()
print(f"Total: {end_global - start_global:.1f}s (avg {(end_global - start_global)/100:.2f}s/img)")

Token indices sequence length is longer than the specified maximum sequence length for this model (85 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['neutral background, subject isolated, studio backdrop']


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Token indices sequence length is longer than the specified maximum sequence length for this model (85 > 77). Running this sequence through the model will result in indexing errors
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['neutral background, subject isolated, studio backdrop']


MAKING 0 / 10
MAKING Embermane


100%|██████████| 26/26 [00:07<00:00,  3.38it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['neutral background, subject isolated, studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['neutral background, subject isolated, studio backdrop']


Pyrograth took 9.45s
MAKING 1 / 10
MAKING Thornveil


100%|██████████| 26/26 [00:07<00:00,  3.32it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [', studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [', studio backdrop']


Sylvarch took 11.36s
MAKING 2 / 10
MAKING Mireling


100%|██████████| 26/26 [00:07<00:00,  3.61it/s]


Abythral took 10.97s
MAKING 3 / 10
MAKING Voltusk


100%|██████████| 26/26 [00:07<00:00,  3.54it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['plain neutral background, subject isolated, studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['plain neutral background, subject isolated, studio backdrop']


Stormarok took 11.22s
MAKING 4 / 10
MAKING Frostbane


100%|██████████| 26/26 [00:07<00:00,  3.57it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['isolated, studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['isolated, studio backdrop']


Cryovain took 10.69s
MAKING 5 / 10
MAKING Cindrel


100%|██████████| 26/26 [00:07<00:00,  3.62it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['isolated, studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['isolated, studio backdrop']


Ashdrake took 10.50s
MAKING 6 / 10
MAKING Lumora


100%|██████████| 26/26 [00:06<00:00,  3.74it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [', studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: [', studio backdrop']


Seraphis took 9.50s
MAKING 7 / 10
MAKING Obsidim


100%|██████████| 26/26 [00:06<00:00,  3.76it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['full body, plain neutral background, subject isolated, studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['full body, plain neutral background, subject isolated, studio backdrop']


Gravemorn took 9.32s
MAKING 8 / 10
MAKING Runeling


100%|██████████| 26/26 [00:06<00:00,  3.72it/s]
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['plain neutral background, subject isolated, studio backdrop']
The following part of your input was truncated because CLIP can only handle sequences up to 77 tokens: ['plain neutral background, subject isolated, studio backdrop']


Arcanor took 9.45s
MAKING 9 / 10
MAKING Tempyr


100%|██████████| 26/26 [00:06<00:00,  3.72it/s]


Chronavon took 8.94s
Total: 101.4s (avg 1.01s/img)
