In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
import json

import numpy as np
import pandas as pd

import msgpack

import zipfile

import torch

import glob

from PIL import Image

from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModel, CLIPImageProcessor

In [3]:
# Checkpoint name handed to the `from_pretrained` calls in this notebook.
MODEL_NAME = 'openai/clip-vit-large-patch14'

# `max_length` used by `worker` when padding / truncating tokenized prompts.
MAX_LENGTH = 77

# Number of prompts (text) or images (vision) sent through the model at once.
BATCH_SIZE = 64

# Alternative settings kept for reference:
# MODEL_NAME = 'BAAI/bge-base-en-v1.5'
# MODEL_NAME = 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k'
# MODEL_NAME = 'johngiorgi/declutr-base'
# MAX_LENGTH = 88
# MODEL_NAME = '../Llama-2-7b-hf'

# load data

In [15]:
# Root of the pipeline output: `save_json` globs under ROOT/data and
# `save_vision_npz` reads from ROOT/clip.
ROOT = '../kcg-ml-image-pipeline/output/dataset/'

# Datasets processed by the per-dataset loops in this notebook.
# Currently disabled: 'environmental', 'character', 'icons', 'mech'.
DATASETs = ['waifu', 'propaganda-poster']

## save json

In [16]:
def save_json(dataset_name):
    """Collect prompt metadata of one dataset into ``data/<dataset_name>/prompt.json``.

    Every ``*_data.msgpack`` file found under ``ROOT/data/<dataset_name>/`` is
    read, provided that the matching ``*_clip.msgpack`` file exists under the
    parallel ``/clip/`` tree. Rows are de-duplicated on ``file_hash`` (first
    occurrence wins) and written as a JSON object keyed by ``file_hash``.

    Args:
        dataset_name: sub-directory name of the dataset, used both below
            ``ROOT/data`` (input) and below the local ``data`` folder (output).

    Raises:
        KeyError: if a msgpack record lacks one of the expected fields.
    """
    columns = ['file_path', 'creation_time', 'job_uuid', 'positive_prompt', 'negative_prompt', 'file_hash']

    # NOTE(review): `glob.glob` is called without `recursive=True`, so `**`
    # matches exactly one directory level here — confirm that is intended.
    pattern = os.path.join(ROOT, 'data', dataset_name, '**/*_data.msgpack')

    records = list()
    for path in tqdm(sorted(glob.glob(pattern)), leave=False):

        # Only keep jobs whose CLIP vector file is present as well.
        clip_path = path.replace('_data.msgpack', '_clip.msgpack').replace('/data/', '/clip/')
        if not os.path.exists(clip_path):
            continue

        with open(path, 'rb') as f:
            mp = msgpack.load(f)

        records.append(tuple(mp[column] for column in columns))

    output_dir = os.path.join('data', dataset_name)
    os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(records, columns=columns)
    df = df.drop_duplicates(['file_hash']).set_index('file_hash')

    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(os.path.join(output_dir, 'prompt.json'), 'w') as f:
        json.dump(df.to_dict(orient='index'), f)

In [17]:
# Write data/<dataset_name>/prompt.json for every dataset listed in DATASETs.
for dataset_name in DATASETs:
    save_json(dataset_name)

  0%|          | 0/8634 [00:00<?, ?it/s]

  0%|          | 0/8816 [00:00<?, ?it/s]

## load json

In [18]:
def load_json(dataset_name):
    """Read ``data/<dataset_name>/prompt.json`` back into parallel lists.

    Args:
        dataset_name: sub-directory of the local ``data`` folder to read from.

    Returns:
        Tuple ``(file_hashs, file_paths, positive_prompts, negative_prompts)``
        of equally long lists, in the key order of the JSON object.

    Raises:
        KeyError: if an entry lacks ``file_path``, ``positive_prompt`` or
            ``negative_prompt``.
    """
    # Context manager closes the file instead of leaking the handle.
    with open(os.path.join('data', dataset_name, 'prompt.json')) as f:
        prompt_info = json.load(f)

    file_hashs = list(prompt_info.keys())
    entries = list(prompt_info.values())

    file_paths = [info['file_path'] for info in entries]
    positive_prompts = [info['positive_prompt'] for info in entries]
    negative_prompts = [info['negative_prompt'] for info in entries]

    return file_hashs, file_paths, positive_prompts, negative_prompts

In [19]:
# Sanity check: reload each prompt.json and print its number of entries.
# The unpacked names are notebook globals and keep the last dataset's values.
for dataset_name in DATASETs:
    file_hashs, file_paths, positive_prompts, negative_prompts = load_json(dataset_name)
    print(len(file_hashs))

8634
8816


# load text embedder

## from transformers

In [20]:
# Tokenizer for MODEL_NAME, used by `worker`; loaded with local_files_only=True.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True)

### from CLIP model

In [21]:
# Load the model, keep only its `.text_model` submodule, move it to the GPU
# and switch it to eval mode. `worker` calls this global.
transformer = AutoModel.from_pretrained(MODEL_NAME, local_files_only=True).text_model.cuda().eval()

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


In [22]:
def worker(texts, use_penultimate=False):
    """Tokenize `texts` and run them through the global text `transformer`.

    Prompts are truncated / padded to ``MAX_LENGTH`` tokens by the global
    `tokenizer`. Gradient tracking is not disabled here; callers are expected
    to wrap the call in ``torch.no_grad()`` if they want that.

    Args:
        texts: prompts, passed unchanged to the tokenizer call.
        use_penultimate: when True, the first returned array is taken from
            ``hidden_states[-2]`` (the output of the second-to-last encoder
            layer) instead of ``last_hidden_state``.

    Returns:
        Tuple ``(last_hidden_state, pooler_output, attention_mask)`` of NumPy
        arrays copied to host memory.
    """
    batch_encoding = tokenizer(
        texts,
        truncation=True, max_length=MAX_LENGTH, return_length=True,
        return_overflowing_tokens=False, padding="max_length", return_tensors="pt"
    )

    tokens = batch_encoding["input_ids"].cuda()

    # Per-layer hidden states are only materialised when they are needed.
    clip_text_opt = transformer(
        input_ids=tokens, output_hidden_states=use_penultimate, return_dict=True
    )

    attention_mask = batch_encoding.attention_mask.detach().cpu().numpy()
    pooler_output = clip_text_opt.pooler_output.detach().cpu().numpy()

    if use_penultimate:
        # `hidden_states` is ordered (embeddings, layer 1, ..., layer N), so
        # index -1 is the final layer and -2 is the penultimate one.
        # NOTE(review): no final layer norm is applied to this tensor —
        # confirm whether downstream consumers expect it.
        hidden_state = clip_text_opt.hidden_states[-2]
    else:
        hidden_state = clip_text_opt.last_hidden_state

    last_hidden_state = hidden_state.detach().cpu().numpy()

    return last_hidden_state, pooler_output, attention_mask

### from LLM model

In [9]:
# def worker(texts):
    
#     batch_encoding = tokenizer(
#         texts,
#         truncation=True, max_length=MAX_LENGTH, return_length=True,
#         return_overflowing_tokens=False, padding="max_length", return_tensors="pt"
#     )

#     tokens = batch_encoding["input_ids"].cuda()

#     clip_text_opt = transformer(input_ids=tokens)

#     last_hidden_state = clip_text_opt.last_hidden_state.detach().cpu().numpy()
#     attention_mask = batch_encoding.attention_mask.detach().cpu().numpy()
    
#     pooler_output = clip_text_opt.pooler_output.detach().cpu().numpy()
#     # pooler_output = None
    
#     return last_hidden_state, pooler_output, attention_mask

In [None]:
# transformer = AutoModel.from_pretrained(MODEL_NAME).cuda().eval()

### from LLM model

In [8]:
# tokenizer.pad_token = "[PAD]"
# tokenizer.padding_side = "left"

# transformer = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, load_in_8bit=True, device_map='auto').eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at ../Llama-2-7b-hf were not used when initializing LlamaModel: ['lm_head.weight']
- This IS expected if you are initializing LlamaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# embed & save

In [23]:
def _embed_pooler_outputs(prompts):
    """Run `worker` over `prompts` in BATCH_SIZE chunks and stack the pooled outputs.

    Returns the concatenated array, or an empty list when no batch produced a
    pooled output (no prompts, or `worker` returned ``None`` for all of them).
    """
    chunks = list()

    for i in tqdm(range(0, len(prompts), BATCH_SIZE), leave=False):
        _, pooler_output, _ = worker(prompts[i:i+BATCH_SIZE])
        if pooler_output is not None:
            chunks.append(pooler_output)

    if len(chunks) > 0:
        return np.concatenate(chunks, axis=0)
    return chunks


def save_text_npz(dataset_name):
    """Embed the prompts of a dataset and save them to ``data/<dataset_name>/clip_text_emb.npz``.

    The archive holds ``file_hashs``, ``file_paths``, ``positive_pooler_outputs``
    and ``negative_pooler_outputs``. Only the pooled outputs returned by
    `worker` are stored; hidden states and attention masks are discarded.

    Args:
        dataset_name: dataset whose ``prompt.json`` is read through `load_json`.
    """
    file_hashs, file_paths, positive_prompts, negative_prompts = load_json(dataset_name)

    with torch.no_grad():
        positive_pooler_outputs = _embed_pooler_outputs(positive_prompts)
        # The negative side is guarded by its own emptiness check; previously
        # it was guarded by the positive list (copy-paste slip).
        negative_pooler_outputs = _embed_pooler_outputs(negative_prompts)

    np.savez(
        os.path.join('data', dataset_name, 'clip_text_emb.npz'),
        file_hashs=np.array(file_hashs),
        file_paths=np.array(file_paths),
        positive_pooler_outputs=positive_pooler_outputs,
        negative_pooler_outputs=negative_pooler_outputs
    )

In [24]:
# Write data/<dataset_name>/clip_text_emb.npz for every dataset in DATASETs.
for dataset_name in DATASETs:
    save_text_npz(dataset_name)

  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/135 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

# vision

## from msgpack

In [25]:
def save_vision_npz(dataset_name):
    """Gather stored CLIP image vectors into ``data/<dataset_name>/clip_vision_emb.npz``.

    For each ``file_path`` listed in the dataset's ``prompt.json`` the matching
    ``*_clip.msgpack`` file below ``ROOT/clip`` is decoded and its
    ``'clip-feature-vector'`` entry collected. The archive holds ``file_hashs``,
    ``file_paths`` and ``image_embeds`` (vectors concatenated along axis 0).

    Args:
        dataset_name: dataset whose ``prompt.json`` is read through `load_json`.
    """
    file_hashs, file_paths, _, _ = load_json(dataset_name)

    clip_root = os.path.join(ROOT, 'clip')

    def read_vector(file_path):
        """Decode the CLIP msgpack that belongs to `file_path`."""
        relative_path = (
            file_path
            .replace('_data.msgpack', '_clip.msgpack')
            .replace('.jpg', '_clip.msgpack')
        )
        with open(os.path.join(clip_root, relative_path), 'rb') as f:
            payload = f.read()
        return np.array(msgpack.unpackb(payload)['clip-feature-vector'])

    vectors = [read_vector(file_path) for file_path in tqdm(file_paths, leave=False)]

    np.savez(
        os.path.join('data', dataset_name, 'clip_vision_emb.npz'),
        file_hashs=np.array(file_hashs),
        file_paths=np.array(file_paths),
        image_embeds=np.concatenate(vectors, axis=0)
    )

In [26]:
# Write data/<dataset_name>/clip_vision_emb.npz for every dataset in DATASETs.
for dataset_name in DATASETs:
    save_vision_npz(dataset_name)

  0%|          | 0/8634 [00:00<?, ?it/s]

  0%|          | 0/8816 [00:00<?, ?it/s]

## from file system

In [3]:
# Folder of images to embed, and the folder receiving one `.npy` per image.
INPUT_DIR = '../dataset/scrap/steam/screenshot/'
OUTPUT_DIR = '../dataset/scrap/steam/clip/'

# Other INPUT_DIR -> OUTPUT_DIR pairs kept for reference:
#   '../dataset/civitai-stable-diffusion-337k/images/' -> '../dataset/civitai-stable-diffusion-337k/clip/'
#   '../dataset/scrap/leonardo/images/'                -> '../dataset/scrap/leonardo/clip/'
#   '../dataset/midjourney-messages/images/'           -> '../dataset/midjourney-messages/clip/'

In [4]:
# Create the output folder up front; exist_ok=True makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Image preprocessor for MODEL_NAME; it produces the `pixel_values` tensors used below.
preprocessor = CLIPImageProcessor.from_pretrained(MODEL_NAME, local_files_only=True)

# Full model (not just `.text_model`) on the GPU in eval mode; its
# `get_image_features` method is called in the embedding loop.
clip_model = AutoModel.from_pretrained(MODEL_NAME, local_files_only=True).cuda().eval()

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


In [6]:
# Images in INPUT_DIR that do not yet have a `.npy` embedding in OUTPUT_DIR.
file_names = list()

for file_name in os.listdir(INPUT_DIR):
    if file_name.lower().endswith(('.jpg', '.png', '.jpeg', '.bmp')):
        clip_path = os.path.join(OUTPUT_DIR, f'{os.path.splitext(file_name)[0]}.npy')

        # Only queue images that were not embedded by an earlier run.
        if not os.path.exists(clip_path):
            file_names.append(file_name)

In [7]:
# Embed the pending images batch by batch and write one `.npy` file per image.
for i in tqdm(range(0, len(file_names), BATCH_SIZE)):

    images = list()
    names = list()
    for file_name in file_names[i:i+BATCH_SIZE]:
        try:
            # The context manager closes the underlying file once the image
            # has been converted to a tensor.
            with Image.open(os.path.join(INPUT_DIR, file_name)) as raw_image:
                image = preprocessor(images=raw_image, return_tensors="pt")
        except Exception as exc:
            # Best effort: unreadable images are skipped, but reported, and
            # KeyboardInterrupt is no longer swallowed.
            tqdm.write(f'skipping {file_name}: {exc!r}')
            continue
        images.append(image['pixel_values'])
        names.append(file_name)

    # Every image of the batch failed to load: nothing to embed.
    if len(images) == 0:
        continue

    images = torch.concat(images, dim=0)

    with torch.no_grad():
        image_features = clip_model.get_image_features(pixel_values=images.to(clip_model.device))
    image_features = image_features.detach().cpu().numpy()

    for file_name, image_feature in zip(names, image_features):
        clip_path = os.path.join(OUTPUT_DIR, f'{os.path.splitext(file_name)[0]}.npy')
        # A leading axis of length 1 is added so each file holds one row.
        np.save(clip_path, image_feature[None, ...])

  0%|          | 0/407 [00:00<?, ?it/s]

# from zip

In [42]:
# Folder receiving the aggregated `.npz` files written at the end of this section.
EMB_DIR = './generated/1119/'

# Archive opened with `zipfile.ZipFile` in this section.
ZIP_PATH = './generated-1119.zip'

In [43]:
# Create the embedding output folder; exist_ok=True makes re-running the cell harmless.
os.makedirs(EMB_DIR, exist_ok=True)

In [None]:
# Open the archive once; the following cells read members through this global `f`.
# NOTE(review): the handle is never closed in the visible cells — consider `f.close()` at the end.
f = zipfile.ZipFile(ZIP_PATH)

In [None]:
# Select the archived images that have both a text-embedding `.npz` and a CLIP `.npy` member.
files = set(f.namelist())
file_paths = list()

for file_path in f.namelist():

    # Only `.jpg` members below generated/image/ are candidates.
    if not (file_path.startswith('generated/image/') and file_path.endswith('.jpg')):
        continue

    embedding_path = file_path.replace('/image/', '/embedding/').replace('.jpg', '.npz')
    clip_path = file_path.replace('/image/', '/clip/').replace('.jpg', '.npy')

    if embedding_path in files and clip_path in files:
        file_paths.append(file_path)

# Base names only; the next cell rebuilds member paths from them.
file_names = np.array([os.path.basename(file_path) for file_path in file_paths])

In [None]:
# Read the stored text and image embeddings of every selected image from the archive.
positive_embs = list()
negative_embs = list()
image_embs = list()

for file_name in tqdm(file_names):

    # Zip member names always use '/', so they are built explicitly instead of
    # with os.path.join (which would use '\\' on Windows).
    embedding_path = 'generated/embedding/' + file_name.replace('.jpg', '.npz')

    # Arrays are pulled out inside the `with` so that both the member stream
    # and the lazily-loading NpzFile can be closed right away.
    with f.open(embedding_path) as embedding_file, np.load(embedding_file) as npz:
        positive_embs.append(npz['positive_pooler_output'])
        negative_embs.append(npz['negative_pooler_output'])

    clip_path = 'generated/clip/' + file_name.replace('.jpg', '.npy')

    with f.open(clip_path) as clip_file:
        image_embs.append(np.load(clip_file))

positive_embs = np.concatenate(positive_embs, axis=0)
negative_embs = np.concatenate(negative_embs, axis=0)
image_embs = np.concatenate(image_embs, axis=0)

In [41]:
# Image embeddings gathered from the archive, stored under the key `image_embeds`.
np.savez(
    os.path.join(EMB_DIR, 'clip_vision_emb.npz'), 
    image_embeds=image_embs
)

# Matching text embeddings, under the same key names that `save_text_npz` uses
# for its pooled outputs.
np.savez(
    os.path.join(EMB_DIR, 'clip_text_emb.npz'), 
    positive_pooler_outputs=positive_embs,
    negative_pooler_outputs=negative_embs
)