# Google Drive Mount

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and output paths configured
# later in this notebook all live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Install dependencies

In [2]:
!pip install -q diffusers transformers accelerate datasets safetensors peft xformers tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.2/117.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports

In [3]:
import torch
import random
from pathlib import Path
from PIL import Image
from tqdm import tqdm

from diffusers import StableDiffusionPipeline, DDPMScheduler
from diffusers.optimization import get_scheduler
from transformers import CLIPTextModel, CLIPTokenizer
from accelerate import Accelerator
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Config

In [4]:
# Global config
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BASE_MODEL = "runwayml/stable-diffusion-v1-5"

# Everything (training data and outputs) lives under a single Drive folder.
LORA_ROOT = "/content/drive/MyDrive/Colab Notebooks/sd_lora"
DATA_ROOT = f"{LORA_ROOT}/train"
IMAGES_DIR = f"{DATA_ROOT}/images"
CAPTIONS_DIR = f"{DATA_ROOT}/captions"
OUTPUT_DIR = f"{LORA_ROOT}/output"
# Pure-Python equivalent of `mkdir -p`; safe to re-run.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Training Hyperparameters (Adjusted for T4 GPU)
LR = 5e-5
BATCH_SIZE = 2                   # Per-device micro-batch size
GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS = 8

# Training config
EPOCHS = 5
NUM_IMAGES = 1000  # Expected number of training images; keep in sync with the dataset
# Round up: the DataLoader is built without drop_last, so a final partial batch
# is still yielded each epoch.
steps_per_epoch = (NUM_IMAGES + BATCH_SIZE - 1) // BATCH_SIZE
# Round up here as well.
# NOTE(review): assumes a final, partially filled accumulation window still
# triggers an optimizer step at the end of each epoch — confirm against Accelerate.
effective_steps_per_epoch = (
    steps_per_epoch + GRADIENT_ACCUMULATION_STEPS - 1
) // GRADIENT_ACCUMULATION_STEPS
TARGET_STEPS = effective_steps_per_epoch * EPOCHS   # Total optimizer steps
WARMUP_STEPS = max(1, int(0.05 * TARGET_STEPS))     # 5% of training, at least one step
SAVE_EVERY_STEPS = 200
RESOLUTION = 512
SEED = 42

# Seed both RNGs used in this notebook.
torch.manual_seed(SEED)
random.seed(SEED)

# Dataset Definition

In [5]:
class ImageCaptionDataset(Dataset):
    """Image/caption pairs for text-to-image fine-tuning.

    Each ``<id>.jpg`` found in ``images_dir`` is paired with the caption stored
    in ``<id>.txt`` inside ``captions_dir``.

    Args:
        images_dir: Directory containing the ``*.jpg`` training images.
        captions_dir: Directory containing one UTF-8 ``.txt`` caption per image.
        resolution: Side length passed to ``RandomResizedCrop``.
        tokenizer: Callable tokenizer applied to each caption. Required before
            items are fetched.

    Raises:
        FileNotFoundError: If any image has no matching caption file.
    """

    def __init__(self, images_dir, captions_dir, resolution=512, tokenizer=None):
        self.images_dir = Path(images_dir)
        self.captions_dir = Path(captions_dir)
        self.ids = sorted(p.stem for p in self.images_dir.glob("*.jpg"))
        self.resolution = resolution
        self.tokenizer = tokenizer

        # Fail fast: a missing caption would otherwise only surface at some
        # random point in the middle of training.
        missing = [
            id_ for id_ in self.ids
            if not (self.captions_dir / f"{id_}.txt").is_file()
        ]
        if missing:
            raise FileNotFoundError(
                f"{len(missing)} image(s) in {self.images_dir} have no caption file "
                f"in {self.captions_dir}, e.g. {missing[:5]}"
            )

        # Optimized augmentation for food images
        self.transform = transforms.Compose([
            # Slight random crop and scale
            transforms.RandomResizedCrop(
                resolution, scale=(0.85, 1.0), ratio=(0.95, 1.05)
            ),
            # Horizontal flip
            transforms.RandomHorizontalFlip(p=0.5),
            # Small rotation
            transforms.RandomRotation(degrees=5),
            # Subtle color variation
            transforms.ColorJitter(
                brightness=0.1,
                contrast=0.1,
                saturation=0.05,
                hue=0.02
            ),
            # Random Gaussian blur applied to ~30% of images
            # NOTE(review): per the torchvision docs a kernel_size sequence is a
            # per-axis kernel size, not a random range — confirm (3, 5) is intended.
            transforms.RandomApply([
                transforms.GaussianBlur(kernel_size=(3, 5), sigma=(0.1, 1.0))
            ], p=0.3),
            # Convert to tensor and normalize to [-1, 1]
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ])

    def __len__(self):
        """Return the number of images discovered in ``images_dir``."""
        return len(self.ids)


    def __getitem__(self, idx):
        """Load, augment and tokenize the sample at position ``idx``.

        Returns:
            dict with ``pixel_values`` (augmented, normalized image tensor),
            ``input_ids`` and ``attention_mask`` (caption tokenized and padded
            or truncated to 77 tokens).

        Raises:
            TypeError: If the dataset was created without a tokenizer.
        """
        if self.tokenizer is None:
            raise TypeError(
                "ImageCaptionDataset was created without a tokenizer; "
                "pass tokenizer=... to the constructor"
            )

        id_ = self.ids[idx]
        # The context manager closes the underlying file handle promptly;
        # convert() returns an independent, fully loaded RGB copy.
        with Image.open(self.images_dir / f"{id_}.jpg") as raw:
            img = raw.convert("RGB")
        img = self.transform(img)

        caption = (self.captions_dir / f"{id_}.txt").read_text(encoding="utf-8").strip()
        inputs = self.tokenizer(caption, padding="max_length", truncation=True, max_length=77, return_tensors="pt")
        # return_tensors="pt" adds a leading batch axis of size 1; drop it.
        return {
            "pixel_values": img,
            "input_ids": inputs.input_ids[0],
            "attention_mask": inputs.attention_mask[0]
        }

# Load pipeline

In [6]:
# Build the full Stable Diffusion pipeline once, only to pull the sub-models
# needed for training out of it.
pipe = StableDiffusionPipeline.from_pretrained(BASE_MODEL, safety_checker=None)
pipe = pipe.to(DEVICE)

# Memory-efficient attention is optional: report the outcome either way and
# carry on if it cannot be enabled.
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as e:
    print("xformers not available or failed to enable:", e)
else:
    print("xformers enabled")

# Keep references to the components used by the later cells
vae, unet, scheduler = pipe.vae, pipe.unet, pipe.scheduler

# Drop the pipeline wrapper and release cached GPU memory
del pipe
torch.cuda.empty_cache()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

text_encoder/model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

unet/diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

vae/diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


xformers enabled


# Load Tokenizer & Text Encoder

In [7]:
# Tokenizer and text encoder come from the same base checkpoint as the UNet
tokenizer = CLIPTokenizer.from_pretrained(BASE_MODEL, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(BASE_MODEL, subfolder="text_encoder")
text_encoder = text_encoder.to(DEVICE)

# LoRA adapters (rank 8, alpha 16) on the attention query/key/value projections
lora_config = LoraConfig(
    r=8, lora_alpha=16,
    target_modules=["to_q", "to_k", "to_v"],
    lora_dropout=0.05, bias="none",
)

# Wrap the UNet with the adapters and report how many parameters are trainable
unet = get_peft_model(unet, lora_config)
unet.print_trainable_parameters()

# Optional: LoRA on the text encoder
# To help the model associate new dish names (e.g. "sarma") with the visual
# concept, a second, smaller LoRA can be attached to the text encoder in the
# same way as above. Example (uncomment to use):
# te_lora = LoraConfig(r=4, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.0)
# text_encoder = get_peft_model(text_encoder, te_lora)
# text_encoder.print_trainable_parameters()

trainable params: 1,195,008 || all params: 860,715,972 || trainable%: 0.1388


# Dataloader Setup

In [8]:
# Training set: images paired with their caption files, tokenized with the CLIP tokenizer
dataset = ImageCaptionDataset(
    images_dir=IMAGES_DIR,
    captions_dir=CAPTIONS_DIR,
    resolution=RESOLUTION,
    tokenizer=tokenizer,
)

def collate_fn(examples):
    """Stack per-sample tensors into one batch dict.

    Args:
        examples: Sequence of dicts, each holding equally shaped tensors under
            ``pixel_values``, ``input_ids`` and ``attention_mask``.

    Returns:
        dict with the same three keys, each stacked along a new leading axis.
    """
    fields = ("pixel_values", "input_ids", "attention_mask")
    return {
        field: torch.stack([sample[field] for sample in examples])
        for field in fields
    }

# Shuffled mini-batches of BATCH_SIZE samples, assembled by collate_fn
dataloader = DataLoader(
    dataset=dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)



# Training loop

In [11]:
# -------------------- Accelerator --------------------
# fp16 autocast plus gradient accumulation over GRADIENT_ACCUMULATION_STEPS micro-batches
accelerator = Accelerator(
    mixed_precision="fp16",
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS
)

# -------------------- Optimizer & Schedulers --------------------
# Only parameters that require gradients are optimized and clipped; passing the
# whole UNet would also hand the optimizer every frozen base weight.
trainable_params = [p for p in unet.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=LR)

# Linear warmup followed by linear decay, measured in optimizer steps
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=TARGET_STEPS
)

# Scheduler used to add noise to the latents during training
noise_scheduler = DDPMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000
)

# -------------------- Prepare dataloader and models --------------------
unet, vae, text_encoder, optimizer, dataloader, lr_scheduler = accelerator.prepare(
    unet, vae, text_encoder, optimizer, dataloader, lr_scheduler
)

# The VAE and text encoder are only used for inference inside the loop
vae.eval()
text_encoder.eval()

# Latent scaling factor: read from the VAE config when available, falling back
# to the value this notebook previously hard-coded.
latent_scale = getattr(vae.config, "scaling_factor", 0.18215)

# -------------------- Training loop --------------------
loss_history = []   # one entry per optimizer step
global_step = 0     # counts optimizer steps, not micro-batches
pbar = tqdm(total=TARGET_STEPS, desc="Training LoRA", dynamic_ncols=True, position=0)

for epoch in range(EPOCHS):
    unet.train()
    for batch in dataloader:

        with accelerator.accumulate(unet):
            pixel_values = batch["pixel_values"].to(DEVICE, dtype=torch.float32)
            input_ids = batch["input_ids"].to(DEVICE)

            # Encode images to latents (no grad)
            with torch.no_grad():
                latents = vae.encode(pixel_values).latent_dist.sample() * latent_scale

            latents = latents.to(unet.dtype)

            # Encode text (no grad)
            with torch.no_grad():
                text_embeddings = text_encoder(input_ids)[0]

            # Diffusion noise: the timestep range comes from the same scheduler
            # that adds the noise, so the two can never disagree.
            timesteps = torch.randint(
                0,
                noise_scheduler.config.num_train_timesteps,
                (latents.shape[0],),
                device=latents.device,
            )
            noise = torch.randn_like(latents)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # UNet forward
            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states=text_embeddings).sample

            # Compute MSE loss between the prediction and the added noise
            loss = torch.nn.functional.mse_loss(model_pred.float(), noise.float(), reduction="mean")

            # Backprop (Handles scaling for fp16)
            accelerator.backward(loss)

            # Clip gradients only when an optimizer step is about to happen
            if accelerator.sync_gradients:
                accelerator.clip_grad_norm_(trainable_params, 1.0)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        # sync_gradients is True only on micro-batches that completed an optimizer step
        if accelerator.sync_gradients:
            global_step += 1
            # Loss of the last micro-batch in the window; detached so logging
            # never holds on to the autograd graph.
            log_loss = accelerator.gather(loss.detach()).mean().item()

            # Store loss
            loss_history.append(log_loss)

            # Update tqdm
            pbar.set_postfix({
                "epoch": epoch,
                "step": global_step,
                "loss": f"{log_loss:.4f}",
                "recent": [f"{l:.4f}" for l in loss_history[-10:]]
            })
            pbar.update(1)

            # Save checkpoint every SAVE_EVERY_STEPS
            if global_step % SAVE_EVERY_STEPS == 0:
                unwrapped = accelerator.unwrap_model(unet)
                cp_path = Path(OUTPUT_DIR)/f"checkpoint-epoch{epoch}-step{global_step}"
                cp_path.mkdir(parents=True, exist_ok=True)
                unwrapped.save_pretrained(str(cp_path))
                # tqdm-friendly log
                pbar.write(f"Saved checkpoint to {cp_path}")

            if global_step >= TARGET_STEPS:
                break
    # Propagate the early exit out of the epoch loop as well
    if global_step >= TARGET_STEPS:
        break

pbar.close()

# -------------------- Save final LoRA weights --------------------
unwrapped_unet = accelerator.unwrap_model(unet)
unwrapped_unet.save_pretrained(f"{OUTPUT_DIR}/lora_weights")
print("LoRA weights saved to:", f"{OUTPUT_DIR}/lora_weights")


Training LoRA:   0%|          | 0/625 [06:08<?, ?it/s]
Training LoRA:  32%|███▏      | 200/625 [46:54<30:07,  4.25s/it, epoch=1, step=200, loss=0.0871]

Saved checkpoint to /content/drive/MyDrive/Colab Notebooks/sd_lora/output/checkpoint-epoch1-step200


Training LoRA:  64%|██████▍   | 400/625 [1:01:04<15:56,  4.25s/it, epoch=3, step=400, loss=0.0953]

Saved checkpoint to /content/drive/MyDrive/Colab Notebooks/sd_lora/output/checkpoint-epoch3-step400


Training LoRA:  96%|█████████▌| 600/625 [1:15:12<01:45,  4.21s/it, epoch=4, step=600, loss=0.1018]

Saved checkpoint to /content/drive/MyDrive/Colab Notebooks/sd_lora/output/checkpoint-epoch4-step600


Training LoRA: 100%|██████████| 625/625 [1:16:58<00:00,  7.39s/it, epoch=4, step=625, loss=0.2234]


LoRA weights saved to: /content/drive/MyDrive/Colab Notebooks/sd_lora/output/lora_weights


# Load and try LoRA weights

In [13]:
# Half precision is only used on GPU; fall back to fp32 when DEVICE is "cpu".
inference_dtype = torch.float16 if DEVICE == "cuda" else torch.float32

pipe = StableDiffusionPipeline.from_pretrained(
    BASE_MODEL,
    torch_dtype=inference_dtype,
    safety_checker=None
).to(DEVICE)

# xformers is optional, but report a failure instead of silently ignoring it
# (same handling as in the training-time pipeline cell).
try:
    pipe.enable_xformers_memory_efficient_attention()
except Exception as e:
    print("xformers not available or failed to enable:", e)

# Load the PEFT-trained LoRA weights using the pipeline method
# NOTE(review): the weights were written by the PEFT model's save_pretrained;
# confirm that load_lora_weights actually applies that format (compare outputs
# with and without the adapter) rather than loading nothing.
pipe.load_lora_weights(f"{OUTPUT_DIR}/lora_weights")

prompt = "Hearty Serbian bean stew with smoked pork"
# A dedicated, seeded generator makes the sample reproducible regardless of how
# much global RNG state earlier cells have consumed.
generator = torch.Generator(device=DEVICE).manual_seed(SEED)
image = pipe(
    prompt,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=generator
).images[0]

test_image_path = f"{LORA_ROOT}/test_img.png"
image.save(test_image_path)
print(f"Image generated and saved to {test_image_path}")

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


  0%|          | 0/30 [00:00<?, ?it/s]

Image generated and saved to /content/drive/MyDrive/Colab Notebooks/sd_lora/test_img.png
