In [1]:
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import argparse
import torch
import json
from unilatent import UniLatentPipeline

from data.builder import build_dataset, build_dataloader
from aspect_ratio_sampler import AspectRatioBatchSampler
from torch.utils.data import RandomSampler

from tqdm import tqdm
from accelerate import Accelerator



In [3]:
# --- Evaluation data -------------------------------------------------------
# Dataset description handed to the project's `build_dataset` factory.
# NOTE(review): the paths suggest COCO-2014 val images plus a JSON index of
# test captions — confirm against the data builder.
data_config = dict(
    type='FlexibleInternalDataMS',
    roots=['/mnt/bn/us-aigc-temp/henry/coco_2014/val/val2014/'],
    json_lst=['/mnt/bn/us-aigc-temp/henry/test.json'],
    load_vae_feat=False,
    load_t5_feat=False,
)

# Keyword options forwarded verbatim to `build_dataset`.
dataset_options = dict(
    resolution=512,
    aspect_ratio_type='ASPECT_RATIO_512',
    real_prompt_ratio=0.0,
    max_length=77,
    return_image_id=True,  # generate_captions reads batch[-1]['image_id']
)
dataset = build_dataset(data_config, **dataset_options)

# One sample per batch, drawn in random order. No seed is set here, so the
# visiting order differs between runs.
random_sampler = RandomSampler(dataset)
batch_sampler = AspectRatioBatchSampler(
    sampler=random_sampler,
    dataset=dataset,
    batch_size=1,
    aspect_ratios=dataset.aspect_ratio,
    drop_last=True,
    ratio_nums=dataset.ratio_nums,
    valid_num=0,
)
dataloader = build_dataloader(dataset, batch_sampler=batch_sampler, num_workers=10)

# --- Accelerate wrapper ----------------------------------------------------
# Shared by `prepare` and by the samplers (which read `accelerator.device`).
accelerator = Accelerator(mixed_precision='fp16')

Detected kernel version 5.4.143, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Constructing dataset FlexibleInternalDataMS...
Dataset FlexibleInternalDataMS constructed. time: 0.06 s, length (use/ori): 5000/5000


In [4]:
def prepare(accelerator, pipe):
    """Run the pipeline's sub-modules through ``accelerator.prepare``.

    The six components (transformer, both text encoders, CLIP image encoder,
    text decoder and VAE) are passed to ``accelerator.prepare`` in a single
    call, and whatever it returns is written back onto the same attributes.

    Args:
        accelerator: Object exposing ``prepare(*modules)``; in this notebook
            an ``accelerate.Accelerator``.
        pipe: Pipeline object carrying the six attributes listed above.

    Returns:
        The same ``pipe`` object, mutated in place.
    """
    modules = (
        pipe.transformer,
        pipe.text_encoder,
        pipe.text_encoder_2,
        pipe.clip_image_encoder,
        pipe.text_decoder,
        pipe.vae,
    )

    # Unpack into locals first so that a wrong-length result raises before
    # any attribute on `pipe` has been overwritten.
    (
        transformer,
        text_encoder,
        text_encoder_2,
        clip_image_encoder,
        text_decoder,
        vae,
    ) = accelerator.prepare(*modules)

    pipe.transformer = transformer
    pipe.text_encoder = text_encoder
    pipe.text_encoder_2 = text_encoder_2
    pipe.clip_image_encoder = clip_image_encoder
    pipe.text_decoder = text_decoder
    pipe.vae = vae
    return pipe

def dift_sampler(batch, pipe, index, block_num, device=None):
    """Caption the first image of ``batch`` from DIFT features.

    Features are obtained from ``pipe.dift_features`` at the given ``index``
    and ``block_num``, the two returned tensors are concatenated along
    dim 1, and the result is decoded to text with the pipeline's text decoder
    and decoder tokenizer.

    Args:
        batch: Indexable batch; ``batch[0][:1]`` is passed as the image input.
        pipe: Pipeline exposing ``dift_features``, ``text_decoder`` and
            ``decoder_tokenizer``.
        index: Value placed in the one-element long tensor handed to
            ``dift_features``.
            NOTE(review): presumably a diffusion timestep index — confirm.
        block_num: Forwarded as ``return_layer`` to ``dift_features``.
        device: Device forwarded to ``generate_captions``. When omitted, the
            notebook-level ``accelerator.device`` is used, which was the
            previous behaviour, so existing callers are unaffected.

    Returns:
        The first decoded string; special tokens are not stripped here.
    """
    if device is None:
        # Backward-compatible fallback to the notebook global.
        device = accelerator.device

    # One-element long tensor holding `index`.
    index_ = torch.zeros(size=(1,), dtype=torch.long) + index
    embeds, pooled_embeds = pipe.dift_features(batch[0][:1], index_, return_layer=block_num)

    # assumes both tensors agree on every dim except dim 1 — TODO confirm
    embeds = torch.cat([embeds, pooled_embeds], dim=1)

    decoded_tokens = pipe.text_decoder.generate_captions(
        embeds,
        eos_token_id=pipe.decoder_tokenizer.eos_token_id,
        device=device,
    )[0]
    return pipe.decoder_tokenizer.batch_decode(decoded_tokens)[0]

def clip_sampler(batch, pipe, device=None):
    """Caption the first image of ``batch`` from ``pipe.encode_image`` features.

    The two tensors returned by ``pipe.encode_image`` are concatenated along
    dim 1 and decoded to text with the pipeline's text decoder and decoder
    tokenizer.

    Args:
        batch: Indexable batch; ``batch[0][:1]`` is passed as the image input.
        pipe: Pipeline exposing ``encode_image``, ``text_decoder`` and
            ``decoder_tokenizer``.
        device: Device forwarded to ``generate_captions``. When omitted, the
            notebook-level ``accelerator.device`` is used, which was the
            previous behaviour, so existing callers are unaffected.

    Returns:
        The first decoded string; special tokens are not stripped here.
    """
    if device is None:
        # Backward-compatible fallback to the notebook global.
        device = accelerator.device

    embeds, pooled_embeds = pipe.encode_image(batch[0][:1])

    # assumes both tensors agree on every dim except dim 1 — TODO confirm
    embeds = torch.cat([embeds, pooled_embeds], dim=1)

    decoded_tokens = pipe.text_decoder.generate_captions(
        embeds,
        eos_token_id=pipe.decoder_tokenizer.eos_token_id,
        device=device,
    )[0]
    return pipe.decoder_tokenizer.batch_decode(decoded_tokens)[0]

def generate_captions(pipe, dataloader, save_path, sampler, sampler_kwargs=None, save_every=50):
    """Caption every batch of ``dataloader`` and save the results as JSON.

    For each batch, ``sampler(batch, pipe, **sampler_kwargs)`` is called under
    ``torch.no_grad()``. The returned text is cleaned up and stored together
    with the batch's image id. The accumulated list is rewritten to
    ``save_path`` every ``save_every`` batches and once more after the loop,
    so a batch count that is not a multiple of ``save_every`` no longer loses
    its tail.

    Args:
        pipe: Pipeline object, passed through to ``sampler`` untouched.
        dataloader: Iterable of batches. Each batch must provide
            ``batch[1][0]`` (shown as the reference caption in the progress
            bar) and ``batch[-1]['image_id']`` with an ``.item()`` method.
        save_path: Destination JSON file; overwritten on every checkpoint.
        sampler: Callable ``(batch, pipe, **kwargs) -> str``.
        sampler_kwargs: Extra keyword arguments for ``sampler``; ``None``
            means no extra arguments.
        save_every: Checkpoint interval in batches. A falsy value disables
            the periodic writes; the final write still happens.

    Returns:
        List of ``{'image_id': ..., 'caption': ...}`` dicts in visiting order.
    """
    if sampler_kwargs is None:
        # Avoid a shared mutable default.
        sampler_kwargs = {}

    json_list = []

    def _flush():
        # Rewrite the whole file so it is always a complete, valid JSON list.
        with open(save_path, 'w') as f:
            json.dump(json_list, f)

    progbar = tqdm(iter(dataloader))
    for i, batch in enumerate(progbar):
        with torch.no_grad():
            decoded_text = sampler(batch, pipe, **sampler_kwargs)

        # str.strip('!') only trims leading/trailing '!' characters.
        # NOTE(review): presumably '!' is the decoded padding token — confirm.
        caption = decoded_text.strip('!').replace('<|endoftext|>', '').replace('<|EOS|>', '').strip()
        json_list.append({'image_id': batch[-1]['image_id'].item(), 'caption': caption})

        progbar.set_description(f"Image: {i:05d} | Predicted: {caption} | True: {batch[1][0]}")

        if save_every and (i + 1) % save_every == 0:
            _flush()

    # Final write: persists captions produced after the last checkpoint.
    _flush()

    return json_list

In [5]:
# Sweep over DIFT checkpoints: one pipeline per (block_num, index) pair, each
# captioned over the full dataloader and written to its own JSON file.
DIFT_CHECKPOINT_ROOT = '/mnt/bn/us-aigc-temp/henry/data/dift'
DIFT_CAPTION_ROOT = '/mnt/bn/us-aigc-temp/henry/data/captions/dift'

# generate_captions opens save_path with mode 'w', which fails if the
# directory is missing — create it up front.
os.makedirs(DIFT_CAPTION_ROOT, exist_ok=True)

for block_num in [6, 12]:
    for index in [0, 250, 500, 750]:
        name = f'index_{index:03d}_block_{block_num}'
        save_path = f'{DIFT_CAPTION_ROOT}/dift_{name}_step_34999.json'
        load_path = f'{DIFT_CHECKPOINT_ROOT}/{name}/epoch_0_step_34999/'
        print(f"Loading pipeline for {name}:")
        pipe = UniLatentPipeline.from_pretrained(load_path, torch_dtype=torch.float32)

        pipe = prepare(accelerator, pipe)
        print(f"Running sampler for {name}:")
        sampler_kwargs = {'index': index, 'block_num': block_num}
        generate_captions(pipe, dataloader, save_path, dift_sampler, sampler_kwargs)

        # The shared Accelerator keeps a reference to every model passed to
        # `prepare`; without this, all eight pipelines stay alive at once.
        # Drop our reference and ask Accelerate to release its own before
        # loading the next checkpoint.
        del pipe
        accelerator.free_memory()

Loading pipeline for index_000_block_6:


Loading pipeline components...:   0%|          | 0/11 [00:00<?, ?it/s]

Some weights of TextDecoder were not initialized from the model checkpoint at /mnt/bn/us-aigc-temp/henry/data/dift/index_000_block_6/epoch_0_step_34999/text_decoder and are newly initialized: ['transformer.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running sampler for index_000_block_6:


Image: 00151 | Predicted: a man in a suit and tie standing on the red carpet. | True: A building front at the corner of N. Astor and E. Division St.        | 151/5000 [02:23<1:14:32,  1.08it/s]                                                         
Image: 00151 | Predicted: a man in a suit and tie standing on the red carpet. | True: A building front at the corner of N. Astor and E. Division St.
Image: 00309 | Predicted: he front view of this two story house. | True: Various people are acknowledging life and having a good time.          | 309/5000 [04:50<1:13:05,  1.07it/s]                                                                                                                                                                                                                
Image: 00309 | Predicted: he front view of this two story house. | True: Various people are acknowledging life and having a good time. 
Image: 00730 | Predicted: a kitchen with white cabinets and black counter 

Loading pipeline for index_250_block_6:





Loading pipeline components...:   0%|          | 0/11 [00:00<?, ?it/s]

Some weights of TextDecoder were not initialized from the model checkpoint at /mnt/bn/us-aigc-temp/henry/data/dift/index_250_block_6/epoch_0_step_34999/text_decoder and are newly initialized: ['transformer.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running sampler for index_250_block_6:


Image: 00176 | Predicted: side side view of a man wearing a black and white checkered shirt. | True: A stop sign posted in a foreign languageth a green fence.:   4%|▎         | 176/5000 [03:15<1:27:18,  1.09s/it]                                                                   
Image: 00176 | Predicted: side side view of a man wearing a black and white checkered shirt. | True: A stop sign posted in a foreign language
Image: 00225 | Predicted: side side view of a man wearing a black and white checkered shirt. | True: A boy throwing out a pitch in a ball game.4%|▍         | 225/5000 [04:08<1:24:55,  1.07s/it]                                                          
Image: 00225 | Predicted: side side view of a man wearing a black and white checkered shirt. | True: A boy throwing out a pitch in a ball game.
Image: 00376 | Predicted: side side view of a man wearing a black and white checkered shirt. | True: A photograph of an outside with numerous things in the scene.dress.:   8%|▊      

Loading pipeline for index_500_block_6:





Loading pipeline components...:   0%|          | 0/11 [00:00<?, ?it/s]

Some weights of TextDecoder were not initialized from the model checkpoint at /mnt/bn/us-aigc-temp/henry/data/dift/index_500_block_6/epoch_0_step_34999/text_decoder and are newly initialized: ['transformer.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running sampler for index_500_block_6:


Image: 00069 | Predicted: a man in a suit and tie standing outside. | True: A photo of an outside with various things in the scene.69/5000 [00:53<1:04:17,  1.28it/s]                                                      
Image: 00069 | Predicted: a man in a suit and tie standing outside. | True: A photo of an outside with various things in the scene.
Image: 00465 | Predicted: a man! a man in a suit and tie. | True: A person riding a baby blue motorcycle near haystacksing.:   9%|▉         | 465/5000 [06:01<57:12,  1.32it/s]                                                                    
Image: 00465 | Predicted: a man! a man in a suit and tie. | True: A person riding a baby blue motorcycle near haystacks
Image: 00483 | Predicted: a man in a suit and tie standing outside. | True: Various people are acknowledging life and having a good time. | 483/5000 [06:15<59:45,  1.26it/s]                                             
Image: 00483 | Predicted: a man in a suit and tie standing outsid

Loading pipeline for index_750_block_6:





Loading pipeline components...:   0%|          | 0/11 [00:00<?, ?it/s]

Some weights of TextDecoder were not initialized from the model checkpoint at /mnt/bn/us-aigc-temp/henry/data/dift/index_750_block_6/epoch_0_step_34999/text_decoder and are newly initialized: ['transformer.lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running sampler for index_750_block_6:


Image: 00393 | Predicted: a man in a suit and tie standing in front of a white background. | True: Two trains in an urban station with people waiting.<1:18:37,  1.02s/it]                                                                                  

Image: 00393 | Predicted: a man in a suit and tie standing in front of a white background. | True: Two trains in an urban station with people waiting.

Image: 00402 | Predicted: a man in a suit and tie standing. | True: some people and a dog under an open umbrella:   8%|▊         | 403/5000 [06:48<1:18:51,  1.03s/it]                                        