In [None]:
#@markdown Check type of GPU and VRAM available.
# Prints the GPU name, total memory and free memory as header-less CSV.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

Tesla T4, 15109 MiB, 15109 MiB


https://github.com/ShivamShrirao/diffusers/tree/main/examples/ldm_clip

## Install Requirements

In [None]:
# Download train_ldm_clip.py and clip_loss.py from the katsugeneration fork of diffusers
# into the current working directory (-q silences wget's progress output).
!wget -q https://github.com/katsugeneration/diffusers/raw/main/examples/ldm_clip/train_ldm_clip.py
!wget -q https://github.com/katsugeneration/diffusers/raw/main/examples/ldm_clip/clip_loss.py
# Install diffusers from the same fork the scripts above were downloaded from.
%pip install -qq git+https://github.com/katsugeneration/diffusers
# --pre allows pip to pick a pre-release build of triton.
%pip install -q -U --pre triton
# NOTE(review): only accelerate is version-pinned here; the remaining packages resolve
# to whatever is latest at install time, so results may differ between runs.
%pip install -q accelerate==0.12.0 transformers ftfy bitsandbytes gradio datasets datasets[vision]

In [None]:
#@title Login to HuggingFace 🤗

#@markdown You need to accept the model license before downloading or using the Stable Diffusion weights. Please, visit the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4), read the license and tick the checkbox if you agree. You have to be a registered user in 🤗 Hugging Face Hub, and you'll also need to use an access token for the code to work.
from huggingface_hub import notebook_login
# Enable git's "store" credential helper globally before logging in.
# NOTE(review): the "store" helper keeps credentials unencrypted on disk; that is
# tolerable on a disposable Colab VM but should be avoided on shared machines.
!git config --global credential.helper store
# Interactive login using a Hugging Face access token.
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


### Install xformers from precompiled wheel.

In [None]:
# Install a prebuilt xformers wheel instead of compiling from source.
# NOTE(review): the wheel filename carries the cp37 tag (CPython 3.7), so pip is
# expected to reject it on runtimes with a different Python version — confirm the
# runtime version, or use the source build below.
%pip install -q https://github.com/metrolobo/xformers_wheels/releases/download/1d31a3ac_various_6/xformers-0.0.14.dev0-cp37-cp37m-linux_x86_64.whl
# These were compiled on Tesla T4, should also work on P100, thanks to https://github.com/metrolobo

# If precompiled wheels don't work, install it with the following command. It will take around 40 minutes to compile.
# %pip install git+https://github.com/facebookresearch/xformers@1d31a3a#egg=xformers

## Settings and run

In [None]:
import os

#@markdown Name/Path of the initial model.
MODEL_NAME = "CompVis/stable-diffusion-v1-4" #@param {type:"string"}

#@markdown Target Text.
TARGET_TEXT = "illustration" #@param {type:"string"}

#@markdown Target Dataset.
TARGET_DATASET = "Imagenet" #@param {type:"string"}

#@markdown Source Text.
SOURCE_TEXT = "photo" #@param {type:"string"}

#@markdown If model weights should be saved directly in google drive (takes around 4-5 GB).
save_to_gdrive = True #@param {type:"boolean"}
if save_to_gdrive:
    # Imported lazily so the cell still runs outside Colab when gdrive saving is off.
    from google.colab import drive
    drive.mount('/content/drive')

#@markdown Enter the directory name to save model at.
OUTPUT_DIR = "stable_diffusion_weights/ldm_clip" #@param {type:"string"}
# The form value is a relative directory name; prefix it with the chosen root.
# OUTPUT_DIR is re-assigned from the form literal above on every run, so re-running
# this cell never double-prefixes the path.
if save_to_gdrive:
    OUTPUT_DIR = "/content/drive/MyDrive/" + OUTPUT_DIR
else:
    OUTPUT_DIR = "/content/" + OUTPUT_DIR

print(f"[*] Weights will be saved at {OUTPUT_DIR}")
# Create the directory from Python instead of interpolating the user-entered path
# into an unquoted shell command, which would break on spaces or shell metacharacters.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Start Training

Add the `--gradient_checkpointing` flag to reduce VRAM usage (the command below already includes it).

Remove the `--use_8bit_adam` flag for full precision; this uses more VRAM.

In [None]:
!accelerate launch train_ldm_clip.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --output_dir=$OUTPUT_DIR \
  --target_text="{TARGET_TEXT}" \
  --target_dataset="{TARGET_DATASET}" \
  --source_text="{SOURCE_TEXT}" \
  --seed=3434554 \
  --resolution=512 \
  --mixed_precision="fp16" \
  --use_8bit_adam \
  --train_batch_size 1 \
  --gradient_accumulation_steps=1 \
  --gradient_checkpointing \
  --train_inference_steps 5 \
  --ddim_steps 50 \
  --learning_rate=8e-6 \
  --max_train_steps=300 \
  --l1_w 1.0


## Convert weights to ckpt to use in web UIs like AUTOMATIC1111.

In [None]:
#@markdown Download script
# Fetches the diffusers -> original Stable Diffusion checkpoint converter into the
# current working directory.
# NOTE(review): this comes from the ShivamShrirao fork, whereas the training scripts
# are downloaded from the katsugeneration fork — confirm the two are compatible.
!wget -q https://github.com/ShivamShrirao/diffusers/raw/main/scripts/convert_diffusers_to_original_stable_diffusion.py

In [None]:
#@markdown Run conversion.
ckpt_path = OUTPUT_DIR + "/model.ckpt"

half_arg = ""
#@markdown  Whether to convert to fp16, takes half the space (2GB), might loose some quality.
fp16 = False #@param {type: "boolean"}
if fp16:
    half_arg = "--half"
!python convert_diffusers_to_original_stable_diffusion.py --model_path $OUTPUT_DIR  --checkpoint_path $ckpt_path $half_arg
print(f"[*] Converted ckpt saved at {ckpt_path}")

## Inference

In [16]:
import os
import torch
from torch import autocast
from diffusers import StableDiffusionImg2ImgPipeline, DDIMScheduler
from IPython.display import display

# Location of the weights to load. To reuse a model trained in an earlier session and
# saved in gdrive, replace this with the full path of that model in gdrive.
model_path = OUTPUT_DIR

# DDIM scheduler handed to the pipeline below.
scheduler = DDIMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
)

# Load the img2img pipeline with float16 weights, then move it to the GPU.
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    model_path,
    scheduler=scheduler,
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")

# Placeholder; the image-generation cell assigns a seeded torch.Generator to this name.
g_cuda = None

In [None]:
#@title Run for generating images.
from PIL import Image

num_samples = 4 #@param {type:"number"}
guidance_scale = 2 #@param {type:"number"}
strength = 0.2 #@param {type:"number"}
num_inference_steps = 200 #@param {type:"number"}
height = 512 #@param {type:"number"}
width = 512 #@param {type:"number"}
input_image = "" #@param {type:"string"}

#@markdown Can set random seed here for reproducibility.
g_cuda = torch.Generator(device='cuda')
seed = 4324 #@param {type:"number"}
g_cuda.manual_seed(seed)

# Load the source image, closing the file handle as soon as the pixels are read.
# PIL's resize() expects (width, height); the previous (height, width) order produced
# a transposed size whenever the two form values differed. convert("RGB") normalizes
# RGBA, palette and grayscale inputs to three channels.
with Image.open(input_image) as source_image:
    init_image = source_image.convert("RGB").resize((width, height))

# Run the pipeline under CUDA autocast with autograd tracking disabled.
with autocast("cuda"), torch.inference_mode():
    images = pipe(
        init_image=init_image,
        prompt=TARGET_TEXT,
        height=height,
        width=width,
        num_images_per_prompt=num_samples,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        strength=strength,
        generator=g_cuda
    ).images

for img in images:
    display(img)