In [1]:
import torch

#全局变量
hub_token = open('/root/hub_token.txt').read().strip()
repo_id = 'lansinuote/diffusion.3.dream_boothimages'
push_to_hub = True
checkpoint = 'CompVis/stable-diffusion-v1-4'

In [2]:
from datasets import Dataset, DatasetDict, load_dataset
import torchvision
import PIL.Image


def get_dataset():
    images = [{
        'image': PIL.Image.open('images/%d.jpeg' % i)
    } for i in range(5)]

    dataset = Dataset.from_list(images)

    return DatasetDict({'train': dataset})


if push_to_hub:
    dataset = get_dataset()
    dataset.push_to_hub(repo_id=repo_id, token=hub_token)

#直接使用我处理好的数据集
dataset = load_dataset(path=repo_id, split='train')

dataset, dataset[0]

Pushing split train to the Hub.


  0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration lansinuote--diffusion.3.dream_boothimages-73d13cf74f9b46f0


Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--diffusion.3.dream_boothimages-73d13cf74f9b46f0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.59M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/lansinuote___parquet/lansinuote--diffusion.3.dream_boothimages-73d13cf74f9b46f0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


(Dataset({
     features: ['image'],
     num_rows: 5
 }),
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=2469x2558>})

In [3]:
import torchvision

#数据增强
compose = torchvision.transforms.Compose([
    torchvision.transforms.Resize(
        512, interpolation=torchvision.transforms.InterpolationMode.BILINEAR),
    torchvision.transforms.RandomCrop(512),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize([0.5], [0.5]),
])


def f(data):
    image = [compose(i) for i in data['image']]
    return {'image': image}


dataset = dataset.with_transform(f)

dataset, dataset[0]

(Dataset({
     features: ['image'],
     num_rows: 5
 }),
 {'image': tensor([[[ 0.7725,  0.7725,  0.7725,  ...,  0.7647,  0.7647,  0.7725],
           [ 0.7804,  0.7882,  0.7804,  ...,  0.7804,  0.7569,  0.7490],
           [ 0.7725,  0.7804,  0.7804,  ...,  0.7647,  0.7725,  0.7647],
           ...,
           [ 0.7176,  0.7020,  0.7176,  ...,  0.7176,  0.7020,  0.7020],
           [ 0.7098,  0.7098,  0.7020,  ...,  0.7020,  0.6941,  0.7098],
           [ 0.7098,  0.7176,  0.7176,  ...,  0.6941,  0.7098,  0.7020]],
  
          [[-0.1529, -0.1529, -0.1529,  ..., -0.1451, -0.1373, -0.1216],
           [-0.1451, -0.1373, -0.1451,  ..., -0.1373, -0.1529, -0.1529],
           [-0.1529, -0.1451, -0.1451,  ..., -0.1373, -0.1373, -0.1373],
           ...,
           [-0.2471, -0.2627, -0.2471,  ..., -0.2235, -0.2392, -0.2392],
           [-0.2392, -0.2471, -0.2549,  ..., -0.2392, -0.2471, -0.2314],
           [-0.2549, -0.2471, -0.2471,  ..., -0.2471, -0.2314, -0.2392]],
  
          [[-0.6

In [4]:
def collate_fn(datas):
    image = [i['image'] for i in datas]
    return torch.stack(image, dim=0)


loader = torch.utils.data.DataLoader(dataset,
                                     batch_size=1,
                                     shuffle=True,
                                     collate_fn=collate_fn)

len(loader), next(iter(loader)).shape

(5, torch.Size([1, 3, 512, 512]))

In [5]:
from transformers.models.clip.modeling_clip import CLIPTextModel
from diffusers import AutoencoderKL, UNet2DConditionModel

encoder = CLIPTextModel.from_pretrained(checkpoint, subfolder='text_encoder')
vae = AutoencoderKL.from_pretrained(checkpoint, subfolder='vae')
unet = UNet2DConditionModel.from_pretrained(checkpoint, subfolder='unet')

vae.requires_grad_(False)
encoder.requires_grad_(False)


def print_model_size(name, model):
    print(name, sum(i.numel() for i in model.parameters()) / 10000)


print_model_size('encoder', encoder)
print_model_size('vae', vae)
print_model_size('unet', unet)

encoder 12306.048
vae 8365.3863
unet 85952.0964


In [6]:
from diffusers import DDPMScheduler
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint,
                                          subfolder='tokenizer',
                                          use_fast=False)

scheduler = DDPMScheduler.from_pretrained(checkpoint, subfolder='scheduler')

optimizer = torch.optim.AdamW(unet.parameters(),
                              lr=5e-6,
                              betas=(0.9, 0.999),
                              weight_decay=0.01,
                              eps=1e-8)

criterion = torch.nn.MSELoss()

tokenizer, scheduler, optimizer, criterion

(CLIPTokenizer(name_or_path='CompVis/stable-diffusion-v1-4', vocab_size=49408, model_max_length=77, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}),
 DDPMScheduler {
   "_class_name": "DDPMScheduler",
   "_diffusers_version": "0.12.1",
   "beta_end": 0.012,
   "beta_schedule": "scaled_linear",
   "beta_start": 0.00085,
   "clip_sample": false,
   "num_train_timesteps": 1000,
   "prediction_type": "epsilon",
   "set_alpha_to_one": false,
   "skip_prk_steps": true,
   "steps_offset": 1,
   "trained_betas": null,
   "variance_type": "fixed_small"
 },
 AdamW (
 Parameter Group 0
     amsgrad: False
     betas: (0.9,

In [7]:
def get_loss(data):
    device = data.device

    #只需要input ids就可以了
    #全程都是一句话,其实只需要编码一次即可
    #77 = tokenizer.model_max_length
    input_ids = tokenizer('a photo of sks dog',
                          truncation=True,
                          padding='max_length',
                          max_length=77,
                          return_tensors='pt')['input_ids']

    #编码文字,由于encoder不训练,其实这一步也可以只运算一次
    #[1, 77] -> [1, 77, 768]
    out_encoder = encoder(input_ids.to(device))[0]

    #vae计算特征图
    #[1, 3, 512, 512] -> [1, 4, 64, 64]
    out_vae = vae.encode(data).latent_dist.sample().detach()

    #0.18215 = vae.config.scaling_factor
    out_vae = out_vae * 0.18215

    #随机噪声
    #[1, 4, 64, 64]
    noise = torch.randn_like(out_vae)

    #随机噪声步
    #1000 = scheduler.config.num_train_timesteps
    #1 = b
    noise_step = torch.randint(0, 1000, (1, ), device=device).long()

    #添加噪声
    #[1, 4, 64, 64]
    out_vae_noise = scheduler.add_noise(out_vae, noise, noise_step)

    #unet从噪声图中把噪声计算出来
    #[1, 4, 64, 64],[1, 77, 768] -> [1, 4, 64, 64]
    out_unet = unet(out_vae_noise, noise_step, out_encoder).sample

    return criterion(out_unet, noise)


get_loss(torch.randn(1, 3, 512, 512))

tensor(0.0034, grad_fn=<MseLossBackward0>)

In [8]:
from diffusers import DiffusionPipeline
from huggingface_hub import Repository, create_repo


def train():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    unet.to(device)
    vae.to(device)
    encoder.to(device)
    unet.train()

    loss_sum = 0
    for epoch in range(400):
        for i, data in enumerate(loader):
            loss = get_loss(data.to(device))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(unet.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

            loss_sum += loss.item()

        if epoch % 20 == 0:
            print(epoch, loss_sum)
            loss_sum = 0

    DiffusionPipeline.from_pretrained(
        checkpoint, unet=unet, text_encoder=encoder).save_pretrained('./save')


if push_to_hub:
    create_repo(repo_id, exist_ok=True, token=hub_token)
    repo = Repository('./save', clone_from=repo_id, token=hub_token)
    train()
    repo.push_to_hub()

/root/code/Diffusers/3.dream booth/./save is already a clone of https://huggingface.co/lansinuote/diffusion.3.dream_boothimages. Make sure you pull the latest changes with `repo.git_pull()`.


0 1.2774939220398664
20 8.995061814552173
40 7.317525086225942
60 7.997788276174106
80 8.736398497479968
100 8.037149989278987
120 6.951517629553564
140 8.025313258636743
160 7.182113338029012
180 5.828842404880561
200 4.075685135438107
220 4.641587202670053
240 8.873134223162197
260 3.9080546323675662
280 6.689343460020609
300 5.815101726853754
320 6.1464903430314735
340 5.386142439674586
360 4.5272802241379395
380 4.679458085098304


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Upload file unet/diffusion_pytorch_model.bin:   0%|          | 32.0k/3.20G [00:00<?, ?B/s]

Upload file vae/diffusion_pytorch_model.bin:   0%|          | 32.0k/319M [00:00<?, ?B/s]

Upload file safety_checker/pytorch_model.bin:   0%|          | 32.0k/1.13G [00:00<?, ?B/s]

Upload file text_encoder/pytorch_model.bin:   0%|          | 32.0k/470M [00:00<?, ?B/s]

remote: Scanning LFS files for validity...[K
remote: LFS file scan complete.[K
To https://user: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@huggingface.co/lansinuote/diffusion.3.dream_boothimages
   e15cbf4..9451540  main -> main

