### LoRA Training

#### Data Pre-processing
To train a LoRA with diffusers, the dataset must first be put into a specific format. More details: https://huggingface.co/docs/diffusers/training/create_dataset

Set up the environment and load the BLIP-2 captioning model

In [None]:
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess

#blip2 environment: https://github.com/salesforce/LAVIS/tree/main/projects/blip2


# Choose where the model runs: the CUDA device when one is usable, otherwise "cpu".
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = "cpu"

# Load the BLIP-2 captioning model (blip2_opt / pretrain_opt6.7b) in eval mode
# together with its image preprocessors; the third return value is discarded.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_opt",
    model_type="pretrain_opt6.7b",
    is_eval=True,
    device=device,
)

caption function

In [None]:

def img2txt(image, prompt="a clothing photo of"):
    """Generate a caption for one image with the globally loaded captioning model.

    Relies on the notebook-level names ``vis_processors``, ``model`` and
    ``device`` created in the model-loading cell.

    Args:
        image: Image to caption. It is handed to ``vis_processors["eval"]``;
            the caption loop in this notebook passes an RGB ``PIL.Image``.
        prompt: Text prompt given to the model alongside the image. Defaults
            to the short prefix this notebook has always used, so existing
            ``img2txt(image)`` calls behave exactly as before.

    Returns:
        The value returned by ``model.generate``. The caption loop in this
        notebook reads element 0 of it as the caption text.
    """
    # A longer, question-style prompt can be supplied through `prompt=`, e.g.:
    #   "Question: Please describe the following aspects of the clothing in the image, "
    #   "1.Style and Design: Detail the overall style and any distinctive design features. "
    #   "2.Color and Pattern: Describe the primary color(s) and any patterns or prints present. "
    #   "3.Fabric and Material: Identify the type of fabric and material quality. "
    #   "4.Size and Fit: Comment on the size, fit, and cut of the clothing. "
    #   "5.Details and Embellishments: Note any specific details or decorative elements. "
    #   "Appropriate Occasions: Suggest occasions or settings where this clothing would be suitable to wear. Answer:"

    # Preprocess, add a leading batch dimension, and move to the model's device.
    batch = vis_processors["eval"](image).unsqueeze(0).to(device)
    return model.generate({"image": batch, "prompt": prompt})

Caption!

In [None]:
import os
import tqdm
import json
import time
from PIL import Image

# Folder holding the source images, and folder that receives `metadata.jsonl`.
# NOTE(review): the imagefolder dataset format resolves each `file_name` relative
# to the metadata file — confirm the captioned images are also placed in `out_path`.
img_path = '/home/sd/Harddisk/ZXP/DressCode/images'
out_path = '/home/sd/Harddisk/ZXP/SDXL_LOra/Data_set/DressCode'

# exist_ok makes this a no-op when the folder already exists (no check-then-create).
os.makedirs(out_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so the output is reproducible.
imgs = sorted(os.listdir(img_path))

# Only files whose name (extension removed) ends in '1' are captioned.
# Testing the stem instead of a fixed offset (`img_name[-5]`) cannot raise
# IndexError on short names and does not depend on the extension length.
selected_imgs = [name for name in imgs if os.path.splitext(name)[0].endswith('1')]
print(f"{len(selected_imgs)} of {len(imgs)} files selected for captioning")

captions = []
for img_name in tqdm.tqdm(selected_imgs):
    # The context manager closes the file handle; convert() returns a new,
    # fully loaded image, so `img` stays usable after the block.
    with Image.open(os.path.join(img_path, img_name)) as img_file:
        img = img_file.convert('RGB')
    txt = img2txt(img)
    # One record per image, in the {file_name, text} layout of metadata.jsonl.
    captions.append({'file_name': img_name, 'text': txt[0]})

# Write one JSON object per line (JSON Lines).
with open(os.path.join(out_path, 'metadata.jsonl'), 'w', encoding='utf-8') as file:
    for caption in captions:
        file.write(json.dumps(caption) + '\n')

#### LoRA training
Use the training script provided by diffusers for training. For specific tutorials, refer to: https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/README_sdxl.md
Note: the train_text_to_image_lora_sdxl.py used in this notebook has been edited.

In [None]:
# Each `!` line runs in its own subshell, so `!export VAR=...` is lost before the
# next line executes. `%env` sets the variable in the kernel's environment, which
# every later `!` command inherits. `%env` takes the value literally, so no quotes.
%env MODEL_NAME=stabilityai/stable-diffusion-xl-base-1.0
%env VAE_NAME=madebyollin/sdxl-vae-fp16-fix
# NOTE(review): DATASET_NAME is not referenced by the command below, which passes a
# hard-coded --train_data_dir instead — confirm which path is intended.
%env DATASET_NAME=/home/sd/Harddisk/ZXP/SDXL_LOra/Data_set/test/cloth

!accelerate launch diffusers/examples/text_to_image/train_text_to_image_lora_sdxl.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --pretrained_vae_model_name_or_path=$VAE_NAME \
  --caption_column="text" \
  --resolution=1024 --random_flip \
  --train_data_dir /home/sd/Harddisk/ZXP/SDXL_LOra/Data_set \
  --train_batch_size=2 \
  --num_train_epochs=4 --checkpointing_steps=5000 \
  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
  --mixed_precision="fp16" \
  --seed=42 \
  --output_dir="sd-finetune-model-lora-sdxl" \
  --validation_prompt="a photo of blue dress" --report_to="wandb" \
  --push_to_hub \
  --validation_steps 2500

Lora inference

In [None]:
from diffusers import DiffusionPipeline
import torch

import os  # imported here so this cell also runs on its own

# LoRA checkpoint produced by the training step, and where the sample is written.
model_path = "/home/sd/Harddisk/ZXP/SDXL_LOra/sd-finetune-model-lora-sdxl/checkpoint-50000"
output_file = "Lora_output/test.png"

# Load the SDXL base pipeline in half precision, move it to the GPU, attach the LoRA.
pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
pipe.to("cuda")
pipe.load_lora_weights(model_path)

prompt = "a photo of blue jeans skirt, White background"
image = pipe(
    prompt,
    num_inference_steps=50,
    guidance_scale=2.5,  # text prompt guidance scale
).images[0]

# Image.save does not create missing folders; create the target folder first so
# the result of a full generation run is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
image.save(output_file)

Multi Lora inference (Not compatible with other adapters, for now.) More details: https://huggingface.co/docs/diffusers/main/en/tutorials/using_peft_for_inference

In [None]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install peft

In [None]:
from diffusers import DiffusionPipeline
import torch

# SDXL base pipeline in half precision, moved to the GPU.
pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# First LoRA, registered under the adapter name "toy".
pipe.load_lora_weights(
    "CiroN2022/toy-face",
    weight_name="toy_face_sdxl.safetensors",
    adapter_name="toy",
)
# Second LoRA, registered under the adapter name "pixel".
pipe.load_lora_weights(
    "nerijs/pixel-art-xl",
    weight_name="pixel-art-xl.safetensors",
    adapter_name="pixel",
)
# Activate both adapters together, each with its own weight.
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])

# Run inference with a fixed seed and a global LoRA scale.
prompt = "toy_face of a hacker with a hoodie"
lora_scale = 0.9
image = pipe(
    prompt,
    num_inference_steps=30,
    cross_attention_kwargs={"scale": lora_scale},  # lora scale
    generator=torch.manual_seed(0),
).images[0]
image

### T2I-Adapter (ControlNet) Training

#### Training code will be uploaded soon

#### Inference (LoRA + T2I-Adapter)

In [None]:
from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, DiffusionPipeline
from diffusers.utils import load_image
import torch
import peft

# Locations of the SDXL base model, the T2I-Adapter weights and the LoRA checkpoint.
base_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
adapter_path = "/home/sd/Harddisk/ZXP/SDXL_LOra/T2IAdapter/openpose"
model_path = "/home/sd/Harddisk/ZXP/SDXL_LOra/sd-pokemon-model-lora-sdxl/checkpoint-50000"

# Build the adapter-conditioned SDXL pipeline in half precision.
adapter = T2IAdapter.from_pretrained(adapter_path, torch_dtype=torch.float16)
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
    base_model_path,
    adapter=adapter,
    torch_dtype=torch.float16,
)

# Attach the LoRA under an explicit adapter name.
pipe.load_lora_weights(model_path, adapter_name="black_ground_clothing")

# Optional settings, currently disabled:
#   pipe.set_adapters("black_ground_clothing")
#   faster scheduler:
#   pipe.scheduler = EulerAncestralDiscreteSchedulerTest.from_config(pipe.scheduler.config)
#   memory-efficient attention (skip if xformers is not installed or when using Torch 2.0):
#   pipe.enable_xformers_memory_efficient_attention()

# memory optimization.
pipe.enable_model_cpu_offload()

# Conditioning image for the adapter, and the text prompt.
control_image = load_image("/home/sd/Harddisk/ZXP/DressCode/skeletons/000003_5.jpg")
prompt = "a photo of a blue off the shoulder top, White background"

# Generate one image with a fixed seed.
generator = torch.manual_seed(0)
image = pipe(
    prompt,
    image=control_image,
    generator=generator,
    num_inference_steps=50,
    cross_attention_kwargs={"scale": 1},  # lora_scale
    guidance_scale=2.5,                   # prompt_scale
    adapter_conditioning_scale=0.8,       # T2I-adapter_scale
).images[0]
image