In [1]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from diffusers import DDPMScheduler, StableDiffusionPipeline
from transformers import CLIPTextModel, CLIPTokenizer
from datasets import load_dataset
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 8
num_epochs = 5
learning_rate = 5e-5
model_checkpoint = "runwayml/stable-diffusion-v1-5"

In [3]:
# Load CelebA dataset
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

dataset = load_dataset("./data/img_align_celeba", split="train")
dataset.set_transform(transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

Downloading data: 100%|██████████| 17582/17582 [00:00<00:00, 31450.08files/s] 
Generating train split: 17582 examples [00:00, 20732.90 examples/s]


In [4]:
pipe = StableDiffusionPipeline.from_pretrained(model_checkpoint, torch_dtype=torch.float16, variant="fp16")

Fetching 15 files: 100%|██████████| 15/15 [05:30<00:00, 22.02s/it]
Loading pipeline components...: 100%|██████████| 7/7 [00:01<00:00,  5.97it/s]


In [6]:
pipe

StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.28.0.dev0",
  "_name_or_path": "runwayml/stable-diffusion-v1-5",
  "feature_extractor": [
    "transformers",
    "CLIPImageProcessor"
  ],
  "image_encoder": [
    null,
    null
  ],
  "requires_safety_checker": true,
  "safety_checker": [
    "stable_diffusion",
    "StableDiffusionSafetyChecker"
  ],
  "scheduler": [
    "diffusers",
    "PNDMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

In [11]:
# Load Stable Diffusion model components
text_encoder = CLIPTextModel.from_pretrained(model_checkpoint)
tokenizer = CLIPTokenizer.from_pretrained(model_checkpoint)
model = StableDiffusionPipeline.from_pretrained(model_checkpoint, use_auth_token=True)
model.to(device)

# Set up the noise scheduler
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)

# Optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

OSError: runwayml/stable-diffusion-v1-5 does not appear to have a file named config.json. Checkout 'https://huggingface.co/runwayml/stable-diffusion-v1-5/main' for available files.

In [None]:
# Training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        images = batch['image'].to(device)
        
        # Generate random noise
        noise = torch.randn(images.shape).to(device)
        
        # Sample timesteps uniformly
        timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (images.shape[0],), device=device).long()
        
        # Add noise to the images according to the timesteps
        noisy_images = noise_scheduler.add_noise(images, noise, timesteps)
        
        # Get text embeddings for conditioning (you can use any text, here we use a placeholder)
        text = ["a photo of a celebrity"] * images.shape[0]
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
        text_embeddings = text_encoder(**inputs).pooler_output
        
        # Forward pass
        noise_pred = model(noisy_images, timesteps, text_embeddings).sample
        
        # Compute the loss (mean squared error)
        loss = torch.nn.functional.mse_loss(noise_pred, noise)
        
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {loss.item()}")

In [None]:




# Save the model
model.save_pretrained("./stable_diffusion_celeba")
tokenizer.save_pretrained("./stable_diffusion_celeba")

print("Training complete and model saved!")
