In [1]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import argparse
import torch
import json
from unilatent import UniLatentPipeline

from data.builder import build_dataset, build_dataloader
from aspect_ratio_sampler import AspectRatioBatchSampler
from torch.utils.data import RandomSampler

from tqdm import tqdm
from accelerate import Accelerator



In [7]:
# data_config = {
#     'type': 'FlexibleInternalDataMS',
#     'roots': [
#         # '/mnt/bn/us-aigc-temp/henry/coco_2014/val/val2014/',
#         '/mnt/bn/aigc-us/zjl/laion-coco-aesthetic/data_max1024/',
#     ],
#     'json_lst': [
#         # '/mnt/bn/us-aigc-temp/henry/test.json',
#         '/mnt/bn/aigc-us/zjl/laion-coco-aesthetic/data_max1024/meta_data_coco_edited.json',
#     ],
#     'load_vae_feat': False,
#     'load_t5_feat': False
# }
# dataset = build_dataset(
#     data_config, resolution=512, aspect_ratio_type='ASPECT_RATIO_512',
#     real_prompt_ratio=0.0, max_length=77, return_image_id=True
# )
# batch_sampler = AspectRatioBatchSampler(sampler=RandomSampler(dataset), dataset=dataset,
#                                     batch_size=1, aspect_ratios=dataset.aspect_ratio, drop_last=True,
#                                     ratio_nums=dataset.ratio_nums, valid_num=0)
# dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=10)

# Accelerator configured for fp16 mixed precision. Later cells read
# `accelerator.device` and call `accelerator.prepare` on the pipeline modules.
accelerator = Accelerator(mixed_precision='fp16')

# Dataset description handed to `build_dataset`; both feature-loading flags are off.
data_config = dict(
    type='FlexibleInternalDataMS',
    roots=['/mnt/bn/aigc-us/zjl/laion-coco-aesthetic/data_max1024/'],
    json_lst=['/mnt/bn/aigc-us/zjl/laion-coco-aesthetic/data_max1024/meta_data_coco_edited.json'],
    load_vae_feat=False,
    load_t5_feat=False,
)

# NOTE(review): unlike the commented-out variant of this setup, `return_image_id`
# is not passed here, so downstream code that looks for an 'image_id' entry in the
# batch may not find one - confirm this is intended.
dataset = build_dataset(
    data_config,
    resolution=512,
    aspect_ratio_type='ASPECT_RATIO_512',
    real_prompt_ratio=0.0,
    max_length=77,
)

# Batches of size 1 drawn in random order; the sampler receives the dataset's own
# `aspect_ratio` and `ratio_nums` attributes.
batch_sampler = AspectRatioBatchSampler(
    sampler=RandomSampler(dataset),
    dataset=dataset,
    batch_size=1,
    aspect_ratios=dataset.aspect_ratio,
    drop_last=True,
    ratio_nums=dataset.ratio_nums,
    valid_num=0,
)

dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=10)

Detected kernel version 5.4.143, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Constructing dataset FlexibleInternalDataMS...




Dataset FlexibleInternalDataMS constructed. time: 42.67 s, length (use/ori): 7591625/7596238


In [13]:
def prepare(accelerator, pipe):
    """Pass the pipeline's sub-modules through ``accelerator.prepare`` and store them back.

    The six attributes ``transformer``, ``text_encoder``, ``text_encoder_2``,
    ``clip_image_encoder``, ``text_decoder`` and ``vae`` are read from ``pipe``,
    handed to a single ``accelerator.prepare`` call in that order, and the
    returned objects are written back onto the same attributes.

    Args:
        accelerator: object exposing ``prepare(*modules)`` that returns one
            prepared object per argument, in order.
        pipe: pipeline object carrying the six attributes listed above.

    Returns:
        The same ``pipe`` object, mutated in place.
    """
    modules = [
        pipe.transformer,
        pipe.text_encoder,
        pipe.text_encoder_2,
        pipe.clip_image_encoder,
        pipe.text_decoder,
        pipe.vae,
    ]

    # Unpack into locals first so nothing on `pipe` is touched unless the
    # accelerator returned exactly six objects.
    (
        transformer,
        text_encoder,
        text_encoder_2,
        clip_image_encoder,
        text_decoder,
        vae,
    ) = accelerator.prepare(*modules)

    pipe.transformer = transformer
    pipe.text_encoder = text_encoder
    pipe.text_encoder_2 = text_encoder_2
    pipe.clip_image_encoder = clip_image_encoder
    pipe.text_decoder = text_decoder
    pipe.vae = vae

    return pipe

def dift_sampler(batch, pipe, index, block_num):
    """Caption the first item of ``batch[0]`` using the pipeline's DIFT features.

    ``pipe.dift_features`` is called on ``batch[0][:1]`` together with a
    one-element ``torch.long`` tensor holding ``index`` and
    ``return_layer=block_num``. The two tensors it returns are concatenated
    along dimension 1, passed to ``pipe.text_decoder.generate_captions``, and
    the first element of that result is decoded with
    ``pipe.decoder_tokenizer.batch_decode``.

    NOTE(review): the generation device comes from the module-level
    ``accelerator``, not from an argument - that cell must have been run first.
    NOTE(review): ``index`` is presumably a diffusion timestep and ``block_num``
    a transformer block - confirm against ``UniLatentPipeline.dift_features``.

    Returns:
        The first string produced by ``batch_decode`` (not yet cleaned of
        special tokens).
    """
    step_index = torch.zeros((1,), dtype=torch.long) + index
    first_item = batch[0][:1]
    features, pooled_features = pipe.dift_features(first_item, step_index, return_layer=block_num)

    decoder_input = torch.cat((features, pooled_features), dim=1)
    generated = pipe.text_decoder.generate_captions(
        decoder_input,
        eos_token_id=pipe.decoder_tokenizer.eos_token_id,
        device=accelerator.device,
    )
    token_ids = generated[0]

    return pipe.decoder_tokenizer.batch_decode(token_ids)[0]

def clip_sampler(batch, pipe):
    """Caption the first item of ``batch[0]`` using ``pipe.encode_image``.

    The two tensors returned by ``pipe.encode_image(batch[0][:1])`` are
    concatenated along dimension 1 and handed to
    ``pipe.text_decoder.generate_captions``; the first element of that result
    is decoded with ``pipe.decoder_tokenizer.batch_decode``.

    NOTE(review): the generation device comes from the module-level
    ``accelerator``, not from an argument - that cell must have been run first.

    Returns:
        The first string produced by ``batch_decode`` (not yet cleaned of
        special tokens).
    """
    first_item = batch[0][:1]
    image_embeds, pooled_image_embeds = pipe.encode_image(first_item)

    decoder_input = torch.cat((image_embeds, pooled_image_embeds), dim=1)
    generated = pipe.text_decoder.generate_captions(
        decoder_input,
        eos_token_id=pipe.decoder_tokenizer.eos_token_id,
        device=accelerator.device,
    )
    token_ids = generated[0]

    return pipe.decoder_tokenizer.batch_decode(token_ids)[0]

def generate_captions(pipe, dataloader, save_path, sampler, sampler_kwargs=None):
    """Run ``sampler`` over every batch and collect the cleaned captions as JSON records.

    For each batch the sampler is called under ``torch.no_grad()`` as
    ``sampler(batch, pipe, **sampler_kwargs)`` and must return a string. The
    string is stripped of leading/trailing ``'!'`` characters, the markers
    ``'<|endoftext|>'`` and ``'<|EOS|>'`` are removed, and surrounding
    whitespace is trimmed.

    Args:
        pipe: pipeline object forwarded untouched to ``sampler``.
        dataloader: iterable of batches. ``batch[1][0]`` is shown in the
            progress bar as the reference caption, and ``batch[-1]`` is a
            mapping that may hold an ``'image_id'`` entry exposing ``.item()``.
        save_path: file the JSON list of records is written to.
        sampler: callable ``(batch, pipe, **kwargs) -> str``.
        sampler_kwargs: optional extra keyword arguments for ``sampler``.
            Defaults to no extra arguments.

    Returns:
        List of ``{'image_id': ..., 'caption': ...}`` dicts, one per batch.
        ``image_id`` falls back to ``0`` when the batch carries none.

    The records are checkpointed to ``save_path`` every 50 batches and once
    more when the loop ends - normally, or through an exception such as
    ``KeyboardInterrupt`` - so the trailing batches of a run are not lost.
    """
    # Avoid a shared mutable default argument.
    if sampler_kwargs is None:
        sampler_kwargs = {}

    def _write(records):
        # Rewrite the whole file each time so it always holds valid JSON.
        with open(save_path, 'w') as f:
            json.dump(records, f)

    json_list = []
    unsaved = False
    progbar = tqdm(dataloader)
    try:
        for i, batch in enumerate(progbar):
            with torch.no_grad():
                decoded_text = sampler(batch, pipe, **sampler_kwargs)

            caption = decoded_text.strip('!').replace('<|endoftext|>', '').replace('<|EOS|>', '').strip()
            image_id = batch[-1]['image_id'].item() if 'image_id' in batch[-1] else 0
            json_list.append({'image_id': image_id, 'caption': caption})
            unsaved = True

            progbar.set_description(f"Image: {i:05d} | Predicted: {caption} | True: {batch[1][0]}")

            # Periodic checkpoint.
            if (i + 1) % 50 == 0:
                _write(json_list)
                unsaved = False
    finally:
        # Flush records gathered since the last checkpoint; without this the
        # final (len % 50) captions, or an interrupted short run, never reach disk.
        if unsaved:
            _write(json_list)

    return json_list

In [16]:
# NOTE: despite its name, `epoch` is the training step embedded in the checkpoint
# directory (`epoch_0_step_{epoch}`) and in the output file name.
epoch = 49999
pipe = None
for block_num in [6]:
    # for index in [0, 250, 500, 750]:
    for index in [500, 750, 250, 0]:
        name = f'index_{index:03d}_block_{block_num}'
        save_path = f'/mnt/bn/us-aigc-temp/henry/data/captions/dift/dift_{name}_step_{epoch}.json'
        load_path = f'/mnt/bn/us-aigc-temp/henry/data/dift/{name}/epoch_0_step_{epoch}/'

        # Release the previous checkpoint before loading the next one. The
        # accelerator keeps references to every model passed to `prepare`, so
        # without this each iteration would leave another full pipeline resident.
        pipe = None
        accelerator.free_memory()

        print(f"Loading pipeline from {load_path}")
        pipe = UniLatentPipeline.from_pretrained(load_path, torch_dtype=torch.float32)
        # Sanity check: the decoder's output head must match its input token embedding.
        assert torch.allclose(pipe.text_decoder.transformer.lm_head.weight, pipe.text_decoder.transformer.transformer.wte.weight)

        pipe = prepare(accelerator, pipe)
        print(f"Running sampler for {name}:")
        sampler_kwargs = {'index': index, 'block_num': block_num}
        generate_captions(pipe, dataloader, save_path, dift_sampler, sampler_kwargs)

Loading pipeline from /mnt/bn/us-aigc-temp/henry/data/dift/index_500_block_6/epoch_0_step_49999/


Loading pipeline components...:   0%|          | 0/11 [00:00<?, ?it/s]

Some weights of TextDecoder were not initialized from the model checkpoint at /mnt/bn/us-aigc-temp/henry/data/dift/index_500_block_6/epoch_0_step_49999/text_decoder and are newly initialized: ['pooled_image_embedder.weight', 'transformer.lm_head.weight', 'image_embedder.bias', 'image_embedder.weight', 'pooled_image_embedder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running sampler for index_500_block_6:


  0%|          | 0/7591625 [00:00<?, ?it/s]

WHOA INSIDE CAPTION DECODER tensor(2803.8152, device='cuda:0') tensor(0.0344, device='cuda:0') tensor(4.4712, device='cuda:0')


Image: 00000 | Predicted: a man in a suit and tie standing outside. | True: Two red and white quartzite point standing on top of a wooden slab.:   0%|          | 1/7591625 [00:05<12546:25:04,  5.95s/it]

WHOA INSIDE CAPTION DECODER tensor(2852.0764, device='cuda:0') tensor(0.0228, device='cuda:0') tensor(4.5482, device='cuda:0')


Image: 00001 | Predicted: a man in a suit and tie standing outside. | True: Food is displayed on wooden crates at an event.:   0%|          | 2/7591625 [00:06<6465:52:09,  3.07s/it]                     

WHOA INSIDE CAPTION DECODER tensor(2801.2339, device='cuda:0') tensor(0.0035, device='cuda:0') tensor(4.4672, device='cuda:0')


Image: 00002 | Predicted: a man in a suit and tie standing outside. | True: The head and shoulders of an orange bearded lizard.:   0%|          | 3/7591625 [00:08<4506:16:10,  2.14s/it]

WHOA INSIDE CAPTION DECODER tensor(2920.0674, device='cuda:0') tensor(0.0568, device='cuda:0') tensor(4.6563, device='cuda:0')


Image: 00003 | Predicted: a man in man in man in a suit. | True: Three red bags sitting on the steps.:   0%|          | 4/7591625 [00:08<3427:19:52,  1.63s/it]                          

WHOA INSIDE CAPTION DECODER tensor(2806.8984, device='cuda:0') tensor(0.0176, device='cuda:0') tensor(4.4762, device='cuda:0')


Image: 00004 | Predicted: a man is smiling for the camera. | True: The kitchen has an island with wine racks.:   0%|          | 5/7591625 [00:09<2975:49:41,  1.41s/it]

WHOA INSIDE CAPTION DECODER tensor(2868.6042, device='cuda:0') tensor(0.0095, device='cuda:0') tensor(4.5746, device='cuda:0')


Image: 00005 | Predicted: a man in a suit and tie standing outside. | True: Two men on the sidelines with one pointing at something.:   0%|          | 6/7591625 [00:10<2666:40:49,  1.26s/it]

WHOA INSIDE CAPTION DECODER tensor(2797.3787, device='cuda:0') tensor(0.0138, device='cuda:0') tensor(4.4610, device='cuda:0')


Image: 00006 | Predicted: a man in a suit and tie standing outside. | True: The bracelet is made with lava and green jade.:   0%|          | 7/7591625 [00:11<2465:34:00,  1.17s/it]          

WHOA INSIDE CAPTION DECODER tensor(2931.7610, device='cuda:0') tensor(0.0046, device='cuda:0') tensor(4.6753, device='cuda:0')


Image: 00007 | Predicted: a man in a suit and tie standing outside. | True: Two people looking at the Belugas in an aquarium.:   0%|          | 8/7591625 [00:12<2338:12:24,  1.11s/it]

WHOA INSIDE CAPTION DECODER tensor(2862.6892, device='cuda:0') tensor(0.0167, device='cuda:0') tensor(4.5652, device='cuda:0')


Image: 00008 | Predicted: a man is smiling for the camera. | True: A group of military tanks sitting on top of a table.:   0%|          | 9/7591625 [00:13<2214:54:50,  1.05s/it]      

WHOA INSIDE CAPTION DECODER tensor(2929.7568, device='cuda:0') tensor(0.0413, device='cuda:0') tensor(4.6720, device='cuda:0')


Image: 00009 | Predicted: a man is smiling for the camera. | True: The calendar for 2016 year with colorful circles.:   0%|          | 10/7591625 [00:14<2207:03:00,  1.05s/it]  

WHOA INSIDE CAPTION DECODER tensor(2902.4514, device='cuda:0') tensor(0.0103, device='cuda:0') tensor(4.6286, device='cuda:0')


Image: 00010 | Predicted: a man in a suit and tie standing outside. | True: A gray wedding dress with white lace and tulle.:   0%|          | 11/7591625 [00:15<2160:52:03,  1.02s/it]

WHOA INSIDE CAPTION DECODER tensor(2823.6553, device='cuda:0') tensor(0.0370, device='cuda:0') tensor(4.5028, device='cuda:0')


Image: 00011 | Predicted: a man in a suit and tie standing outside. | True: The screen protector for Samsung Galaxy Core Prime.:   0%|          | 12/7591625 [00:16<2166:00:07,  1.03s/it]

WHOA INSIDE CAPTION DECODER tensor(2865.7021, device='cuda:0') tensor(0.0147, device='cuda:0') tensor(4.5700, device='cuda:0')


Image: 00012 | Predicted: a man is smiling for the camera. | True: Two dirt bikes parked on the side of a road.:   0%|          | 13/7591625 [00:17<2131:35:21,  1.01s/it]                

WHOA INSIDE CAPTION DECODER tensor(2820.8591, device='cuda:0') tensor(0.0158, device='cuda:0') tensor(4.4985, device='cuda:0')


Image: 00013 | Predicted: a man in a suit and tie standing outside. | True: The child's crocheted shoes are brightly colored.:   0%|          | 14/7591625 [00:18<2126:44:46,  1.01s/it]

WHOA INSIDE CAPTION DECODER tensor(2857.0354, device='cuda:0') tensor(0.0277, device='cuda:0') tensor(4.5561, device='cuda:0')


Image: 00014 | Predicted: a man in a suit and tie standing outside. | True: A baby is wrapped in a blue bear blanket.:   0%|          | 15/7591625 [00:19<2136:22:48,  1.01s/it]        

WHOA INSIDE CAPTION DECODER tensor(2822.0171, device='cuda:0') tensor(0.0385, device='cuda:0') tensor(4.5002, device='cuda:0')


Image: 00015 | Predicted: the man is smiling for the camera. | True: various bearing and seal kits for the rollers:   0%|          | 16/7591625 [00:20<2148:00:39,  1.02s/it]   

WHOA INSIDE CAPTION DECODER tensor(2788.3735, device='cuda:0') tensor(0.0346, device='cuda:0') tensor(4.4465, device='cuda:0')


Image: 00016 | Predicted: a man in a suit and tie standing outside. | True: The words for salsa are in red and white on a women's tank top.:   0%|          | 17/7591625 [00:22<2250:37:37,  1.07s/it]

WHOA INSIDE CAPTION DECODER tensor(2812.1951, device='cuda:0') tensor(0.0696, device='cuda:0') tensor(4.4841, device='cuda:0')


Image: 00017 | Predicted: a man in a suit and tie standing in front of a white background. | True: An Apple mouse is shown on the white surface.:   0%|          | 18/7591625 [00:23<2241:01:59,  1.06s/it]

WHOA INSIDE CAPTION DECODER tensor(2832.5264, device='cuda:0') tensor(0.0214, device='cuda:0') tensor(4.5170, device='cuda:0')


Image: 00018 | Predicted: a man in a suit and tie standing outside. | True: A police officer points at the window in his office.:   0%|          | 19/7591625 [00:24<2221:15:10,  1.05s/it]                

WHOA INSIDE CAPTION DECODER tensor(2775.8013, device='cuda:0') tensor(0.0250, device='cuda:0') tensor(4.4266, device='cuda:0')


Image: 00019 | Predicted: a man in a suit and tie standing outside. | True: The model is wearing an orange and green dress.:   0%|          | 20/7591625 [00:25<2204:14:44,  1.05s/it]     

WHOA INSIDE CAPTION DECODER tensor(2839.4426, device='cuda:0') tensor(0.0317, device='cuda:0') tensor(4.5280, device='cuda:0')


Image: 00020 | Predicted: a man in a suit and tie standing outside. | True: Two children sitting on the floor playing with toys.:   0%|          | 21/7591625 [00:26<2197:51:13,  1.04s/it]

WHOA INSIDE CAPTION DECODER tensor(2849.9678, device='cuda:0') tensor(0.0138, device='cuda:0') tensor(4.5449, device='cuda:0')


Image: 00021 | Predicted: a man in a suit and tie standing outside. | True: The refrigerator is decorated with an artistic flower design.:   0%|          | 22/7591625 [00:27<2157:13:30,  1.02s/it]

WHOA INSIDE CAPTION DECODER tensor(2857.4897, device='cuda:0') tensor(0.0241, device='cuda:0') tensor(4.5568, device='cuda:0')


Image: 00022 | Predicted: the man is smiling for the camera. | True: Boats are docked on the river in front of buildings.:   0%|          | 23/7591625 [00:28<2158:44:08,  1.02s/it]                

WHOA INSIDE CAPTION DECODER tensor(2867.0889, device='cuda:0') tensor(0.0311, device='cuda:0') tensor(4.5721, device='cuda:0')


Image: 00023 | Predicted: a man is smiling for the camera. | True: The well - connected man is being shown with his dog.:   0%|          | 24/7591625 [00:29<2162:22:30,  1.03s/it] 

WHOA INSIDE CAPTION DECODER tensor(2834.8398, device='cuda:0') tensor(0.0351, device='cuda:0') tensor(4.5206, device='cuda:0')


Image: 00024 | Predicted: a man in a suit and tie standing in front of a white wall. | True: Two palm trees stand in front of the mountains.:   0%|          | 25/7591625 [00:30<2166:42:14,  1.03s/it]

WHOA INSIDE CAPTION DECODER tensor(2826.8181, device='cuda:0') tensor(0.0097, device='cuda:0') tensor(4.5080, device='cuda:0')


Image: 00025 | Predicted: a man in a suit and tie standing outside. | True: The young boy is standing in front of a door.:   0%|          | 26/7591625 [00:31<2146:48:27,  1.02s/it]                   

WHOA INSIDE CAPTION DECODER tensor(2948.4783, device='cuda:0') tensor(0.0550, device='cuda:0') tensor(4.7017, device='cuda:0')


Image: 00026 | Predicted: a man in man in man in a suit. | True: An outdoor fire pit with the words Amazing London Rooftops.:   0%|          | 27/7591625 [00:32<1999:12:22,  1.05it/s]

WHOA INSIDE CAPTION DECODER tensor(2856.2441, device='cuda:0') tensor(0.0329, device='cuda:0') tensor(4.5548, device='cuda:0')


Image: 00027 | Predicted: a man in a suit and tie standing outside. | True: The silhouette of a man with a goat on his head.:   0%|          | 28/7591625 [00:33<2053:41:35,  1.03it/s]

WHOA INSIDE CAPTION DECODER tensor(2815.9836, device='cuda:0') tensor(0.0249, device='cuda:0') tensor(4.4906, device='cuda:0')


Image: 00028 | Predicted: a man in a suit and tie standing outside. | True: an image of the beautiful actress in yellow:   0%|          | 29/7591625 [00:34<2087:57:08,  1.01it/s]     

WHOA INSIDE CAPTION DECODER tensor(2805.9497, device='cuda:0') tensor(0.0529, device='cuda:0') tensor(4.4744, device='cuda:0')


Image: 00029 | Predicted: a man in a white shirt and black pants. | True: A glass table with metal legs and a round top.:   0%|          | 30/7591625 [00:36<2539:50:53,  1.20s/it]


KeyboardInterrupt: 