In [21]:
import os
import datasets

import json
import re
import hashlib

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

from matplotlib import pyplot

In [2]:
from prompt_process import is_ascii_string, check_complexity, check_word_redundancy, format_prompt, hash_prompt
from prompt_process import contains_cfg, contains_info, contains_junk, contains_link
from prompt_process import remove_weight, remove_extra, remove_redundant_tags

In [3]:
MAX_LENGTH = 500
MAX_TAG_LENGTH = 150

In [4]:
def single_worker(items, prompt_worker):
    
    results = list()

    for index, positive_prompt in tqdm(items):

        positive_raw_length = len(positive_prompt)

        positive_prompt = prompt_worker(positive_prompt)

        if len(positive_prompt) == 0:
            continue
            
        positive_hash = hash_prompt(positive_prompt)

        results.append((index, positive_prompt, positive_hash, positive_raw_length))
            
    results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
    results.drop_duplicates('positive_hash', inplace=True)

    return results

def pair_worker(items, prompt_worker):
    
    results = list()

    for index, positive_prompt, negative_prompt in tqdm(items):

        positive_raw_length = len(positive_prompt)

        positive_prompt = prompt_worker(positive_prompt)

        positive_hash = ''
        if len(positive_prompt) > 0:
            positive_hash = hash_prompt(positive_prompt)

        negative_raw_length = len(negative_prompt)

        negative_prompt = prompt_worker(negative_prompt)

        negative_hash = ''
        if len(negative_prompt) > 0:
            negative_hash = hash_prompt(negative_prompt)

        if len(positive_hash) > 0 or len(negative_hash) > 0:
            results.append((index, positive_prompt, positive_hash, positive_raw_length, negative_prompt, negative_hash, negative_raw_length))
            
    results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length', 'negative_prompt', 'negative_hash', 'negative_raw_length'])
    results.drop_duplicates(['positive_hash', 'negative_hash'], keep='last', inplace=True)

    return results

In [5]:
def civitai_worker(prompt):
    
    # check midjourney prompts
    if '::' in prompt:
        return ''
        
    if prompt[:2] == '**':
        return ''
        
    if '--' in prompt:
        return ''
    
    # extra prompt info
        
    if 'prompt' in prompt.lower():
        return ''
        
    if contains_cfg(prompt):
        return ''
    #
        
    if contains_link(prompt):
        return ''
    
    # if not check_brackets(positive_prompt)[0]:
    #     return ''
    
    prompt = remove_extra(prompt)
    
    if check_complexity(prompt):
        return ''
            
    prompt = format_prompt(prompt)

    prompt = remove_weight(prompt)

    prompt = prompt.lower()

    prompt = format_prompt(prompt)

    prompt = remove_redundant_tags(prompt)

    prompt = format_prompt(prompt)

    if not contains_info(prompt):
        return ''
    
    if check_word_redundancy(prompt):
        return ''
    
    if contains_junk(prompt):
        return ''

    return prompt

# discord

In [11]:
dataset = datasets.load_dataset("parquet", data_files={'train': '../dataset/diffusiondb/metadata-large.parquet'})

  table = cls._concat_blocks(blocks, axis=0)


In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image_name', 'prompt', 'part_id', 'seed', 'step', 'cfg', 'sampler', 'width', 'height', 'user_name', 'timestamp', 'image_nsfw', 'prompt_nsfw'],
        num_rows: 14000000
    })
})

In [8]:
items = list()

raw_hashs = set()

for i, item in tqdm(enumerate(dataset['train'])):
    
    if not is_ascii_string(item.get('prompt')):
        continue

    raw_hash = hash_prompt(item['prompt'])
    if raw_hash in raw_hashs:
        continue
    raw_hashs.add(raw_hash)
        
    positive_prompt = item['prompt']

    items.append((i, positive_prompt))

0it [00:00, ?it/s]

In [9]:
results = single_worker(items, civitai_worker)
results

  0%|          | 0/1763386 [00:00<?, ?it/s]

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,0,beautiful porcelain ivory fair face woman biom...,35e00c6cc4f7cdc8fbd4c1e97415084c,262
1,1,complex 3 d render hyper detailed ultra sharp ...,99122ff8e047cf8ff8e1b2b4609af60c,329
2,15,complex 3 d render hyper detailed ultra sharp ...,c0ec51b309a3d81e251d01371b8500cd,345
3,16,complex 3 d render hyper detailed ultra sharp ...,afe380f2f9b46392d99f156bc370c37c,338
4,33,complex 3 d render hyper detailed ultra sharp ...,644fe0a5da6abba388beb68567b73024,338
...,...,...,...,...
1743992,13999977,dreaming electric bicycle and electric car by ...,965c0c4fb20f0c7413c9a94ff61ea14f,54
1743993,13999978,"riding neon bycicles in the woods, painted by ...",4dec3e891e2a86cd3d981b0acaabacbb,96
1743994,13999987,"ibai llanos dressed as willy wonka, highly det...",39916ba8edba64c95161fdb3757a2e4c,66
1743995,13999993,"ibai berto romero as willy wonka, highly detai...",16c26bd944f4c77820d52119d653e67e,64


In [10]:
results.to_csv('../dataset/nonredundant-discord_prompts.tsv', sep='\t', index=False)

# civitai

## scrap

In [11]:
import requests
import json
import time
from tqdm.auto import tqdm

In [None]:
proxies = {
    # 'http': 'http://localhost:7890',
    # 'https': 'http://localhost:7890'
    'http': 'http://10.68.98.153:7890',
    'https': 'http://10.68.98.153:7890'
}

headers = {'Content-Type': 'application/json'}

In [None]:
url = 'https://civitai.com/api/v1/images'

results = list()

In [None]:
tbar = tqdm()

wait_time = 2
flag = True
while flag:
    
    try:
        data = requests.get(url, proxies=proxies)
        data = json.loads(data.content)
        url = data['metadata'].get('nextPage')
    except KeyboardInterrupt:
        break
    except:
        wait_time = min(60, wait_time * 2)
        time.sleep(wait_time)
        continue
    
    if url == None: break

    items = data['items']
    # now do whatever you want with data
            
    wait_time = 2
    time.sleep(wait_time)
    results.extend(items)
    tbar.update()

In [None]:
with open('../dataset/scrap/civitai-2023-11-14.jsonl', 'wt') as f:
    for i in tqdm(results):
        f.write(json.dumps(i) + '\n')

## from scrap

In [12]:
items = list()

sc_hashs = set()

with open('../dataset/scrap/civitai-2023-11-14.jsonl') as f:
    for i, line in tqdm(enumerate(f)):
        item = json.loads(line)
        
        sc_hashs.add(item['hash'])
        
        item = item['meta']
        
        if item is None:
            continue
        
        positive_prompt = ''
        if is_ascii_string(item.get('prompt')):
            positive_prompt = item['prompt']
        
        negative_prompt = ''
        if is_ascii_string(item.get('negativePrompt')):
            negative_prompt = item['negativePrompt']
        
        if len(positive_prompt) > 0 or len(negative_prompt) > 0:
            items.append((i, positive_prompt, negative_prompt))

0it [00:00, ?it/s]

## from hugging face

In [13]:
dataset = datasets.load_dataset("parquet", data_files={'train': '../dataset/civitai-stable-diffusion-337k/data/train-00000-of-00001-ace5b28cebba25a7.parquet'})

  table = cls._concat_blocks(blocks, axis=0)


In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'negativePrompt', 'steps', 'sampler', 'seed', 'Model', 'url', 'hash', 'nsfw', 'width', 'height', 'Size', 'createdAt', 'postId', 'stats', 'meta', 'username'],
        num_rows: 327138
    })
})

In [15]:
hf_hashs = set()

for i, item in tqdm(enumerate(dataset['train'])):
    
    hf_hashs.add(item['hash'])

    positive_prompt = ''
    if is_ascii_string(item.get('prompt')):
        positive_prompt = item['prompt']

    negative_prompt = ''
    if is_ascii_string(item.get('negativePrompt')):
        negative_prompt = item['negativePrompt']

    if len(positive_prompt) > 0 or len(negative_prompt) > 0:
        items.append((i, positive_prompt, negative_prompt))

0it [00:00, ?it/s]

In [16]:
len(hf_hashs & sc_hashs)

297377

In [17]:
results = pair_worker(items, civitai_worker)
results

  0%|          | 0/2561325 [00:00<?, ?it/s]

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length,negative_prompt,negative_hash,negative_raw_length
28,43,"ultra-realistic 8k cg, masterpiece, best quali...",70b886cea194b03aeb065b91eb5cdae3,642,"easynegative, worst quality, low quality, norm...",43a0c1dcfef3551b6ebe46a8e4bd76ff,245
30,45,"onoff, a dog in a vest",189609bbee1dd9be136973bd736ae48b,42,,,0
36,55,"sex, on floor, squatting cowgirl position, gir...",ab0037482a0499da81b41f19888ceb8e,573,"worst quality, bad quality, low quality, norma...",c83850626f176f2838dadfd434726c5b,961
39,59,"nsfw, 1girl, 1boy, a girl fucked by a boy, eks...",c5347fbee6fb83d36135f37bb9981644,358,"easynegativev2, worst quality, low quality, ba...",d93d163b6935a1f6bfc402393d3ad86c,182
42,63,"masterpiece, best quality, beautiful, extremel...",7e282e7767adaa9d2b9586498b89c98f,239,,,1754
...,...,...,...,...,...,...,...
2507983,326599,portrait of a ancient greek medium carmine dra...,9ae651f2f78951e49a680abb8d3fddaf,204,"cartoon, anime, horns",c1e03cb1b3996398a0f8e93bb1a6292a,22
2507984,326619,samdoesarts style drunken beautiful woman as d...,1165adbda1f896b6f320e0b13da9ff48,236,"cartoon, disfigured, bad art, deformed, poorly...",61e970afe2d6fc3d8ae61becc707bd3e,130
2507985,326620,"samdoesarts style woman by agnes cecile, lumin...",d344645b87ea82a301165439307acc0f,98,"cartoon, disfigured, bad art, deformed, poorly...",61e970afe2d6fc3d8ae61becc707bd3e,130
2507988,326623,"samdoesarts style symmetry, portrait of floral...",a2395962a5420d633e2dce3868189865,343,"cartoon, disfigured, bad art, deformed, poorly...",61e970afe2d6fc3d8ae61becc707bd3e,130


In [18]:
results.to_csv('../dataset/nonredundant-civitai_prompts.tsv', sep='\t', index=False)

## add id

In [23]:
df = pd.read_csv('../dataset/nonredundant-civitai_prompts.tsv', sep='\t')

In [24]:
last = 0
for offset, index in enumerate(df['index']):
    if index < last:
        break
    last = index

In [43]:
indices = list(reversed(list(df['index'].iloc[:offset])))

index = indices.pop()

ids = list()
has_controlnets = list()

with open('../dataset/scrap/civitai-2023-11-14.jsonl') as f:
    for i, line in tqdm(enumerate(f)):
        item = json.loads(line)
    
        if i > index:
            break
        if i < index:
            continue

        ids.append(item['id'])
        
        has_controlnet = False
        for key in item['meta'].keys():
            if 'control' in key.lower():
                has_controlnet = True
                break
        
        has_controlnets.append(has_controlnet)
        
        try:
            index = indices.pop()
        except:
            break

0it [00:00, ?it/s]

In [44]:
dataset = datasets.load_dataset("parquet", data_files={'train': '../dataset/civitai-stable-diffusion-337k/data/train-00000-of-00001-ace5b28cebba25a7.parquet'})

  table = cls._concat_blocks(blocks, axis=0)


In [55]:
indices = list(reversed(list(df['index'].iloc[offset:])))

index = indices.pop()

for i, item in tqdm(enumerate(dataset['train'])):
    
    if i > index:
        break
    if i < index:
        continue
    
    ids.append(item['id'])

    has_controlnet = False
    for key in eval(item['meta']).keys():
        if 'control' in key.lower():
            has_controlnet = True
            break

    has_controlnets.append(has_controlnet)
    
    try:
        index = indices.pop()
    except:
        break

0it [00:00, ?it/s]

In [56]:
df['id'] = ids
df['has_controlnet'] = has_controlnets

In [57]:
df.to_csv('../dataset/nonredundant-civitai_prompts.tsv', sep='\t', index=False)

## download paired images

In [3]:
save_dir = '../dataset/civitai-stable-diffusion-337k/images/'

In [4]:
df = pd.read_csv('../dataset/nonredundant-civitai_prompts.tsv', sep='\t')

df.dropna(inplace=True)

df['positive_length'] = df['positive_prompt'].str.len()
df['negative_length'] = df['negative_prompt'].str.len()

In [5]:
selected = df.query('positive_length > 25 and negative_length > 25').query('not has_controlnet')
# selected = selected.query('positive_length < 250 and negative_length < 250')
# selected = selected.query('positive_raw_length - positive_length < 100').query('negative_raw_length - negative_length < 100')
# selected = selected.query('positive_raw_length / positive_length < 1.5').query('negative_raw_length / negative_length < 1.5')
selected = selected.drop_duplicates('positive_prompt', keep='last').drop_duplicates('negative_prompt', keep='last')

In [6]:
import requests

from io import BytesIO
from PIL import Image
import time

proxies = {
    # 'http': 'http://localhost:7890',
    # 'https': 'http://localhost:7890'
    'http': 'http://10.68.98.153:7890',
    'https': 'http://10.68.98.153:7890'
}

In [19]:
ids = set(selected['index'])

with open('../dataset/scrap/civitai-2023-11-14-sorted.jsonl') as f:
    for i, line in tqdm(enumerate(f)):
        item = json.loads(line)
        
        if item is None:
            continue
            
        if item['id'] not in ids:
            continue
        
        if item['width'] * item['height'] > 2e6:
            continue
        
        if item['width'] != item['height']:
            continue
        
        output_path = os.path.join(save_dir, f'{item["id"]}.jpeg')

        if os.path.exists(output_path):
            continue
        
        try:

            data = requests.get(item['url'], proxies=proxies, timeout=(3.0, 10.0))

            if data.status_code == 200:
                Image.open(BytesIO(data.content)).save(output_path)
            else:
                continue

        except KeyboardInterrupt:
            break
        except:
            continue

0it [00:00, ?it/s]

## stats

In [12]:
metas = list()

for item in tqdm(dataset['train']):
    
    try:
        meta = eval(item['meta'])
    except:
        continue
    
    metas.append({
        'steps': meta.get('steps'),
        'sampler': meta.get('sampler'),
        'cfgScale': meta.get('cfgScale'),
        'Clip skip': meta.get('Clip skip'),
        'Denoising strength': meta.get('Denoising strength'),
    })

  0%|          | 0/327138 [00:00<?, ?it/s]

In [14]:
metas = pd.DataFrame(metas)

## get tags

In [26]:
nsfws = list()
nsfwLevels = list()

ids = list()
has_controlnets = list()

with open('../dataset/scrap/civitai-2023-11-14.jsonl') as f:
    for i, line in tqdm(enumerate(f)):
        
        item = json.loads(line)
        
        nsfws.append(item['nsfw'])
        nsfwLevels.append(item['nsfwLevel'])

        ids.append(item['id'])
        
        has_controlnet = False
        if item['meta'] is not None:
            for key in item['meta'].keys():
                if 'control' in key.lower():
                    has_controlnet = True
                    break
        has_controlnets.append(has_controlnet)

0it [00:00, ?it/s]

In [27]:
np.savez_compressed(
    '../dataset/scrap/civitai-2023-11-14.npz', 
    nsfw=np.array(nsfws).astype(bool), 
    nsfwLevel=np.array(nsfwLevels), 
    id=np.array(ids).astype('int32'), 
    has_controlnet=np.array(has_controlnet).astype(bool)
)

# leonardo ai

In [19]:
items = list()
with open('../dataset/scrap/leonardo-text-data.jsonl') as f:
    for i, line in tqdm(enumerate(f)):
        item = json.loads(line)
        item = item['generation']
        
        positive_prompt = ''
        if is_ascii_string(item['prompt']):
            positive_prompt = item['prompt']
        
        negative_prompt = ''
        if is_ascii_string(item['negativePrompt']):
            negative_prompt = item['negativePrompt']
        
        if len(positive_prompt) > 0 or len(negative_prompt) > 0:
            items.append((i, positive_prompt, negative_prompt))

0it [00:00, ?it/s]

In [20]:
results = pair_worker(items, civitai_worker)
results

  0%|          | 0/701684 [00:00<?, ?it/s]

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length,negative_prompt,negative_hash,negative_raw_length
3,3,"bohemian florals, t-shirt graphic, white backg...",54f5662d66804655c00261169767b139,51,,,0
35,37,"cute stickers, style cartoon, cute super defor...",2557eea36a8a3ec3a5486370babc04f9,246,,,0
59,62,"orange cat, drinking beer, bar, cute",ba4a2727c0c1e5bdc40d2389834b5bfd,36,,,0
61,64,delegates from britain and zanzibar sit down a...,275f2c69573028216fb9cfd8245f0132,158,,,0
66,69,"high quality, 8k ultra hd, high detailed, ench...",2233de0b246b2a6f62214618cb512bdd,665,,,0
...,...,...,...,...,...,...,...
670500,751077,indian emperor,272a10a54499605a3ee63bb5034fb5b4,14,,,0
670503,751080,"dark-skinned man, brown eyes, blackpower afro ...",c206bcd9641e729b7f7757f4195d6ae7,94,"low quality, ugly, crooked hands, ugly hair, c...",1e014bde2ff23b15cc87b72be75c9591,151
670505,751082,"backgroung, tarot card, high quality, 8k ultra...",124a78ce5d8ea952304f0162ececa4e3,144,,,0
670506,751083,mechanical worker with blue overall holding a ...,4013e002e70399f79c08dafb4891a375,139,,,0


In [21]:
results.to_csv('../dataset/nonredundant-leonardo_prompts.tsv', sep='\t', index=False)

## download paired images

In [10]:
df = pd.read_csv('../dataset/nonredundant-leonardo_prompts.tsv', sep='\t')

df.dropna(inplace=True)

df['positive_length'] = df['positive_prompt'].str.len()
df['negative_length'] = df['negative_prompt'].str.len()

In [30]:
selected = df.query('positive_length > 25 and negative_length > 25')#.query('not has_controlnet')
# selected = selected.query('positive_length < 500 and negative_length < 500')
selected = selected.query('positive_raw_length - positive_length < 100').query('negative_raw_length - negative_length < 100')
selected = selected.query('positive_raw_length / positive_length < 1.5').query('negative_raw_length / negative_length < 1.5')
# selected = selected.drop_duplicates('positive_prompt', keep='first').drop_duplicates('negative_prompt', keep='first')

selected

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length,negative_prompt,negative_hash,negative_raw_length,positive_length,negative_length
5,6,"in a cozy room, the curtains danced in the nig...",49d15617dd592c398d7ffd37275fefd6,135,"realistic, photo-real, multiple limbs, abnorma...",eb2c04d972b4547a3c7bde3edceef2a1,52,133,52
20,22,beautiful light watercolor cute baby rabbit su...,61217ad2bf6de962dada8d503d09b3dd,136,"dark colors, tree branches",e3d6efa04e36df02103b2c9ef1665622,27,136,26
32,39,ultra detailed illustration in a psychedelic s...,ae4d704ee1ab7924519dc8330f523599,219,"3d, low saturation, nudity, boobs, nipples, do...",0c420536151f03a094e1aeb2c5895001,794,219,785
36,43,ultra detailed photography of a dark room illu...,54022294fa231a42a4060502ab072262,316,"3d, low saturation, nudity, boobs, nipples, do...",0c420536151f03a094e1aeb2c5895001,794,316,785
41,48,ultra detailed illustration of a extremely bea...,bbaa549793d3503eb4342c60d3fba36a,323,"3d, low saturation, nudity, boobs, nipples, do...",0c420536151f03a094e1aeb2c5895001,794,303,785
...,...,...,...,...,...,...,...,...,...
249471,750974,"dark-skinned man, brown eyes, blackpower afro ...",c206bcd9641e729b7f7757f4195d6ae7,94,"low quality, ugly, crooked hands, ugly hair, c...",1e014bde2ff23b15cc87b72be75c9591,151,94,151
249474,750982,an finely detailed image of lady astor shoutin...,ba55c918fe1a7c8bfca801bbbcac3284,117,"distorted face, distorted eyes, distorted legs...",90bd69c0a0c0053795ba1047ad910322,84,116,79
249476,750986,disney pixar movie posters. a chubby young man...,af044103a69022ed39951a868fbeffa6,304,"double body, double face, double features, inc...",52178f49ebe397769e8a0c9cc191bbb0,933,304,893
249480,750999,"masterpiece, photography of beautiful 20 years...",ebeb7be366dd98ff22199ab106dca950,421,"2 heads, western, duplicate, malformed hand, d...",1562ea7205cb90f18088d469ac1e2303,733,419,697


In [20]:
import requests

from io import BytesIO
from PIL import Image
import time

proxies = {
    # 'http': 'http://localhost:7890',
    # 'https': 'http://localhost:7890'
    'http': 'http://10.68.98.153:7890',
    'https': 'http://10.68.98.153:7890'
}

In [21]:
save_dir = '../dataset/scrap/images/'

In [15]:
ids = set(selected['index'])

with open('../dataset/scrap/leonardo-text-data.jsonl') as f:
    for i, line in tqdm(enumerate(f)):
        item = json.loads(line)
        
        if item is None:
            continue
            
        if i not in ids:
            continue
            
#         if item['generation']['imageWidth'] != item['generation']['imageHeight']:
#             continue
            
        for info in item['images']:
            
            file_name = info['path']
        
            output_path = os.path.join(save_dir, file_name)

            if os.path.exists(output_path):
                continue
        
            try:

                data = requests.get(info['url'], proxies=proxies, timeout=(3.0, 10.0))

                if data.status_code == 200:
                    Image.open(BytesIO(data.content)).save(output_path)
                else:
                    continue

            except KeyboardInterrupt:
                break
            except:
                continue

0it [00:00, ?it/s]

# lexica

In [22]:
dataset = datasets.load_dataset("parquet", data_files={
    'train': '../dataset/Stable-Diffusion-Prompts/data/train.parquet',
    'eval': '../dataset/Stable-Diffusion-Prompts/data/eval.parquet'
})

In [23]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Prompt'],
        num_rows: 73718
    })
    eval: Dataset({
        features: ['Prompt'],
        num_rows: 8192
    })
})

In [24]:
items = list()

for i, item in tqdm(enumerate(dataset['train'])):
    
    if not is_ascii_string(item.get('Prompt')):
        continue
        
    positive_prompt = item['Prompt']

    items.append((i, positive_prompt))

0it [00:00, ?it/s]

In [25]:
results = single_worker(items, civitai_worker)
results

  0%|          | 0/72206 [00:00<?, ?it/s]

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,0,realistic car 3 d render sci - fi car and sci ...,9e1a7202a7dc32e8f6768a2a91f69c1b,431
1,1,a comic potrait of a female necromamcer with b...,a969789be9274f9090ec8800b87a1c19,336
2,2,"steampunk market interior, colorful, 3 d scene...",d5972456b82dbda080542af28126b54c,182
3,4,a full portrait of a beautiful post apocalypti...,4e922e02af2202868766035850596c5e,244
4,5,"beautiful victorian raven digital painting, ar...",6ed6649b76454b18bb82dea130295d3c,104
...,...,...,...,...
72006,73713,ismail inceoglu epic oil on canvas painting of...,de0b9b7c1c4367bf54fe114a5b11a4d4,397
72007,73714,eating crayons and being reborn in the loving ...,a5edefd2488c52ce874f796ee109f93a,258
72008,73715,"ilya kuvshinov with long hair, sky blue hair, ...",97a2db4bb697ae13665e296ebcbcf447,251
72009,73716,cyberpunk woman with green hair wearing futuri...,f72ca702b285aed42dd3a6d6634bbd22,198


In [26]:
results.to_csv('../dataset/nonredundant-lexica_prompts-train.tsv', sep='\t', index=False)

In [27]:
items = list()

for i, item in tqdm(enumerate(dataset['eval'])):
    
    if not is_ascii_string(item.get('Prompt')):
        continue
        
    positive_prompt = item['Prompt']

    items.append((i, positive_prompt))

0it [00:00, ?it/s]

In [28]:
results = single_worker(items, civitai_worker)
results

  0%|          | 0/8024 [00:00<?, ?it/s]

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,0,"young, curly haired, redhead natalie portman a...",1badf27e234909094c8b0e256a92f9bf,244
1,1,a mystical tribal goddess adorned with feather...,4317874d9990cd94cac73cb9b986d1e4,304
2,2,"molly millions, portrait of a beautiful cyberp...",beb676ffc28813c3ae1e71157be28152,296
3,3,"cyborg sweating water, big drops of sweat, for...",0a92b67f63cb3e3cca15d93abc117ceb,230
4,4,"max headroom in a perfume advertisement, magic...",1cc3de04bf5d44c962eb122305ec97db,269
...,...,...,...,...
7994,8187,"portrait of samuel jackson as gandalf, by alan...",6ad38b9e3c1f97420e27c9d733f36752,171
7995,8188,a photorealistic hyperrealistic render of an i...,09343d3ecb95cce22b18903b67fb2761,248
7996,8189,"occult leader by artgerm, tooth wu, dan mumfor...",0d3d5d3037bce4c4239f1d12d96c9187,225
7997,8190,"front shot of an alien cargo ship, intricate, ...",2fce18a1047c3ba2d98c5424bcf48570,312


In [29]:
results.to_csv('../dataset/nonredundant-lexica_prompts-eval.tsv', sep='\t', index=False)

# laion2B-aesthetic

In [4]:
dataset = datasets.load_dataset('../dataset/laion2B-en-aesthetic/')

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

  table = cls._concat_blocks(blocks, axis=0)


In [None]:
def worker(prompt):
    
    prompt = remove_extra(prompt)
    
    prompt = remove_weight(prompt)
    
    prompt = remove_complex(prompt)
    
    prompt = remove_brackets(prompt)
    
    prompt = format_prompt(prompt)
    
    prompt = prompt.lower()
    
    prompt = remove_redundancy(prompt)
    
    prompt = format_prompt(prompt)
    
    return prompt

In [117]:
results = list()

raw_hashs = set()

for i, item in tqdm(enumerate(dataset['train']), total=len(dataset['train'])):
    
    if item['TEXT'] is None:
        continue
        
    if not is_long(item['TEXT']):
        continue

    raw_hash = hash_prompt(item['TEXT'])
    if raw_hash in raw_hashs:
        continue
    raw_hashs.add(raw_hash)
        
    if not validate_prompt(item['TEXT']):
        continue
        
    positive_raw_length = len(item['TEXT'])
    
    positive_prompt = format_prompt(item['TEXT'])
    
    positive_prompt = positive_prompt.replace('"', ', ')
        
    positive_prompt = re.sub('[^,\s]*\.(co)|(org)[^,\s]+', ', ', positive_prompt)
    positive_prompt = re.sub('[^,\s]*[\d\s]{5,}[^,\s]*', ',', positive_prompt)

    positive_prompt = worker(positive_prompt)
    
    if not is_long(positive_prompt):
        continue
        
    results.append((i, positive_prompt, positive_hash, positive_raw_length))
    
len(results)

100%|██████████| 52068913/52068913 [47:24<00:00, 18303.02it/s] 


48768

In [None]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

print(results.shape[0])

results.to_csv('../dataset/nonredundant-laion2B_aesthetic.tsv', sep='\t', index=False)

In [None]:
df = pd.read_csv('../dataset/nonredundant-laion2B_aesthetic.tsv', sep='\t')

## long

In [115]:
def worker(prompt):
    
    prompt = remove_extra(prompt)
    
    prompt = remove_weight(prompt)
    
    prompt = remove_complex(prompt)
    
    prompt = remove_brackets(prompt)
    
    prompt = format_prompt(prompt)
    
    prompt = prompt.lower()
    
    prompt = remove_redundancy(prompt)
    
    prompt = format_prompt(prompt)
    
    return prompt

DatasetDict({
    train: Dataset({
        features: ['URL', 'TEXT', 'WIDTH', 'HEIGHT', 'similarity', 'hash', 'punsafe', 'pwatermark', 'aesthetic'],
        num_rows: 52068913
    })
})

In [117]:
results = list()

raw_hashs = set()

for i, item in tqdm(enumerate(dataset['train']), total=len(dataset['train'])):
    
    if item['TEXT'] is None:
        continue
        
    if not is_long(item['TEXT']):
        continue

    raw_hash = hash_prompt(item['TEXT'])
    if raw_hash in raw_hashs:
        continue
    raw_hashs.add(raw_hash)
        
    if not validate_prompt(item['TEXT']):
        continue
        
    positive_raw_length = len(item['TEXT'])
    
    positive_prompt = format_prompt(item['TEXT'])
    
    positive_prompt = positive_prompt.replace('"', ', ')
        
    positive_prompt = re.sub('[^,\s]*\.(co)|(org)[^,\s]+', ', ', positive_prompt)
    positive_prompt = re.sub('[^,\s]*[\d\s]{5,}[^,\s]*', ',', positive_prompt)

    positive_prompt = worker(positive_prompt)
    
    if not is_long(positive_prompt):
        continue
        
    results.append((i, positive_prompt, positive_hash, positive_raw_length))
    
len(results)

100%|██████████| 52068913/52068913 [47:24<00:00, 18303.02it/s] 


48768

In [151]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,240,walt disney company chairman and ceo michael e...,a23bd335ca09dd8a3f1fd946379e5618,73
1,2509,wooden bowl teak red hwb19 sold viafrika tagss...,041fd059667762c2268f382f7e4de831,73
2,2861,patterson custom homes - boy's rooms - bunk ro...,5ae01696c035bbedd273ab4ee58f4deb,73
3,4946,mice cream truck driver arrested for dui. hey ...,73dba67db6970fb55e277a505d88db8c,73
4,5617,"writes dalrymple, 'the asylum notes show richa...",edb54f66e2a7e9a1e9dd1ba0be6eab77,73
...,...,...,...,...
33208,52061219,"angela holt, sunset at storm's pass, sandia cr...",321027869964f13a2f4906adb6115dd7,73
33209,52063478,motorcycle craftsman xanti garcia corb motorcy...,f54a642795e208542e2c21a49036f507,73
33210,52063710,"all students in, degree nursing class at east ...",7e015ab94bc9135d9a8d1f0e4a202915,73
33211,52067088,this vertical still life painting shows a gran...,0139ffb8725e2c88800b36df60d44c58,73


In [152]:
results.to_csv('../dataset/long-laion2B-en-aesthetic.tsv', sep='\t', index=False)

# midjourney

In [30]:
dataset = datasets.load_dataset('../dataset/midjourney-messages/')

Resolving data files:   0%|          | 0/59 [00:00<?, ?it/s]

  table = cls._concat_blocks(blocks, axis=0)


In [31]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'channel_id', 'content', 'timestamp', 'image_id', 'height', 'width', 'url', 'size'],
        num_rows: 55082563
    })
})

In [32]:
def extract_prompt(prompt):
    result = re.search('\*\*(.+)\*\*', prompt)
    if result is None:
        return ''
    return result.group(1)

def validate_prompt(prompt):
    if len(prompt) == 0:
        return False
    return max(map(ord, prompt)) <= 127

In [None]:
results = list()

hashs = set()

pairs = list()

for i, item in tqdm(enumerate(dataset['train'])):
    
    if not validate_prompt(item['content']):
        continue
    
    if contains_link(item['content']):
        continue
        
    positive_prompt = extract_prompt(item['content']).strip()
        
    positive_hash = hash_prompt(positive_prompt)
    if positive_hash in hashs:
        continue
    hashs.add(positive_hash)
    
    # if not check_brackets(positive_prompt)[0]:
    #     continue
        
    positive_raw_length = len(positive_prompt)
    
    positive_prompt = re.sub('[\x00-\x08]', '', positive_prompt)
    
    if '--no ' in positive_prompt.lower() or '::-' in positive_prompt:
        pairs.append((i, positive_prompt, positive_raw_length))
        continue

    positive_prompt = positive_prompt.split('--')[0]
        
    positive_prompt = positive_prompt.replace('\\', ', ')
        
    positive_prompt = format_prompt(positive_prompt)
    
    if len(positive_prompt) == 0:
        continue
        
    positive_hash = hash_prompt(positive_prompt)
        
    results.append((i, positive_prompt, positive_hash, positive_raw_length))

0it [00:00, ?it/s]

## paired

In [42]:
paired_results = list()

for index, positive_prompt, positive_raw_length in tqdm(pairs):

    if type(positive_prompt) != str:
        continue
    
    if '|' in positive_prompt:
        continue
        
    parts = re.findall('[a-zA-Z]+:[^:\.]+\.', positive_prompt)
    if len(parts) > 0:
        continue
        
    positive_prompt = positive_prompt.replace('"', ', ')
    
    negs = list()
    
    for i, part in enumerate(positive_prompt.split('--')):
        if i == 0:
            positive_prompt = part
        elif part.lower().startswith('no '):
            negs.append(' '.join(part.split()[1:]))
            
    if re.search(':\s*-[0-9,\.\s]*', positive_prompt):
        negs.extend(re.findall('([^,:]+):\s*:\s*-[0-9,\.\s]*', positive_prompt))
        positive_prompt = re.sub('([^,:]+):\s*:\s*-[0-9,\.\s]*', ', ', positive_prompt)
    
    positive_prompt = remove_weight(positive_prompt)
    positive_prompt = positive_prompt.lower()
    positive_prompt = format_prompt(positive_prompt)
    positive_prompt = remove_redundant_tags(positive_prompt)
    positive_prompt = format_prompt(positive_prompt)
    
    if len(positive_prompt) == 0:
        continue
        
    positive_hash = hash_prompt(positive_prompt)
    
    if len(negs) == 0:
        results.append((index, positive_prompt, positive_hash, positive_raw_length))
        continue
    
    negative_prompt = ', '.join(negs)
    
    negative_prompt = remove_weight(negative_prompt)
    negative_prompt = negative_prompt.lower()
    negative_prompt = format_prompt(negative_prompt)
    negative_prompt = remove_redundant_tags(negative_prompt)
    negative_prompt = format_prompt(negative_prompt)
    
    if len(negative_prompt) == 0:
        continue
        
    negative_hash = hash_prompt(negative_prompt)
        
    negative_raw_length = len(negative_prompt)
        
    paired_results.append((index, positive_prompt, positive_hash, positive_raw_length, negative_prompt, negative_hash, negative_raw_length))

  0%|          | 0/257735 [00:00<?, ?it/s]

In [43]:
paired_results = pd.DataFrame(paired_results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length', 'negative_prompt', 'negative_hash', 'negative_raw_length'])
paired_results.drop_duplicates(['positive_hash', 'negative_hash'], inplace=True)

paired_results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length,negative_prompt,negative_hash,negative_raw_length
0,2026,"teamwork, garden, happiness, sunset, house",fc83d8514dee80f4373fac06c3e80d01,64,"hands, white",3e80f433334416e843cc61080c51ec60,12
1,2073,"boy helping a girl to escape from danger, scar...",9c26b159e7c8b1717234a07075cc686d,94,"hands, white",3e80f433334416e843cc61080c51ec60,12
2,2140,help,657f8b8da628ef83cf69101b6817150a,26,"hands, white",3e80f433334416e843cc61080c51ec60,12
3,2184,"help each other, supporting people, nature",7a56829fb6592e123273d3b7bc9814df,53,hands,6c8d61508321ac444175370124200350,5
4,2217,"sunset, nature, lovely house and people",eae284a91e5e1e8954ef8c5d1ff46b83,61,"hands, white",3e80f433334416e843cc61080c51ec60,12
...,...,...,...,...,...,...,...
250809,55082056,"coloring book page, an illustration of a witch...",66688c434d225cb6ac8f4dd910232c65,410,"noise, book, logo, page, letters, words, marke...",3ae9f136b7632f2580526e138d818165,77
250810,55082128,"coloring book page, detailed illustration of a...",d8c378869987b80994e7d69db98b71a4,415,"noise, book, logo, page, letters, words, marke...",3ae9f136b7632f2580526e138d818165,77
250811,55082296,"coloring book page, different types of protect...",af637e73e10902a5d9aeed9cf9a0e73b,384,"noise, book, logo, page, letters, words, marke...",3ae9f136b7632f2580526e138d818165,77
250812,55082364,"coloring book page, detailed illustration of a...",14c337ee9b25620acbfde8d65e49b009,405,"noise, book, logo, page, letters, words, marke...",3ae9f136b7632f2580526e138d818165,77


In [44]:
paired_results.to_csv('../dataset/nonredundant-midjourney_prompts-paired.tsv', sep='\t', index=False)

In [45]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,0,"adult Goku in Dragonball Z, walking on a beach...",44fa41e731d78cdf1c47b02b99f0e094,79
1,1,"hyperrealism,In the valley, there lies a laven...",93fbc69be2d2de98f3efd5ce619a2e2c,430
2,2,thai chicken bowl,b64f2f2dbbe9e243dff262bb4b87fe6c,17
3,6,"hyperrealism, In the valley lies a lavender ga...",345b236dc8e4c65012d653f25e1ed59d,305
4,11,"a Scandinavian modern-style sauna, front on vi...",8f2ff7168016b70f700734de3229f597,191
...,...,...,...,...
17072568,54704620,"inflated, oversized, gas pump with person stan...",936d3caba360fd8f94d96e04ff65b484,68
17072569,54878665,a mars outpost,b50a8d7bf8689762a2b683fde222ecb2,305
17072570,55013640,"lion as human wearing a leather jacket, smoke ...",d06b4a8ff0023a67abe6eca22d5f7f15,112
17072571,55013685,"lion as human wearing a white shirt, portrait ...",01cb4118b59a773f450f68dced87c8ab,80


In [46]:
results.to_csv('../dataset/midjourney_prompts.tsv', sep='\t', index=False)

## download paired images

In [None]:
import requests

from io import BytesIO
from PIL import Image
import time

proxies = {
    # 'http': 'http://localhost:7890',
    # 'https': 'http://localhost:7890'
    'http': 'http://10.68.98.153:7890',
    'https': 'http://10.68.98.153:7890'
}

In [None]:
df = pd.read_csv('../dataset/nonredundant-midjourney_prompts-paired.tsv', sep='\t')
selected = df.query('positive_raw_length > 25 and positive_raw_length < 250 and negative_raw_length > 25 and negative_raw_length < 250')
selected

In [None]:
save_dir = '../dataset/midjourney-messages/images/'
os.makedirs(save_dir, exist_ok=True)

In [None]:
indices = list(reversed(list(selected['index'])))

index = indices.pop()

errors = list()

for i, item in tqdm(enumerate(dataset['train'])):
    
    if i > index:
        break
    if i < index:
        continue
        
    output_path = os.path.join(save_dir, f'{index}.jpg')
    
    if not os.path.exists(output_path):
        
        try:

            data = requests.get(item['url'], proxies=proxies, timeout=(3.0, 10.0))

            if data.status_code == 200:
                Image.open(BytesIO(data.content)).save(output_path)
            else:
                errors.append(index)

        except KeyboardInterrupt:
            break
        except:
            errors.append(index)
        
    try:
        index = indices.pop()
    except:
        break
        
    time.sleep(0.5)

## step2

In [47]:
# df = pd.read_csv('../dataset/midjourney_prompts.tsv', sep='\t')
df = results

In [49]:
def remove_brackets(prompt):
    
    while re.search(r'\([^\)]*\)', prompt):
        prompt = re.sub(r'\(([^\)]*)\)', r', \1,', prompt)
    
    while re.search(r'\[[^\]]*\]', prompt):
        prompt = re.sub(r'\[([^\]]*)\]', r', \1,', prompt)
    
    while re.search(r'\{[^\}]*\}', prompt):
        prompt = re.sub(r'\{([^\}]*)\}', r', \1,', prompt)
    
    while re.search(r'\<[^\>]*\>', prompt):
        prompt = re.sub(r'\<([^\>]*)\>', r', \1,', prompt)
    
    return prompt

In [None]:
results = list()

complexs = list()

for index, positive_prompt, positive_hash, positive_raw_length in tqdm(df.itertuples(index=False, name=None)):

    if type(positive_prompt) != str:
        continue

    if '|' in positive_prompt:
        continue
    
    if '"' in positive_prompt:
        continue
    
    if contains_link(positive_prompt):
        continue
        
    # remove neg weighted tags
    positive_prompt = re.sub('([^,:]+)[:\s]{2,}-[0-9,\.]*', ', ', positive_prompt)
    # remove weight
    positive_prompt = re.sub('(:\s?){2,}[0-9,\.]*', ', ', positive_prompt)
    
    positive_prompt = re.sub('[^,\s]*[\d\s]{5,}[^,\s]*', ',', positive_prompt)
    
    parts = re.findall('[a-zA-Z]+:[^:\.]+\.', positive_prompt)
    
    if len(parts) > 0:
        complexs.append((index, positive_prompt, positive_raw_length))
        continue
        
    positive_prompt = remove_brackets(positive_prompt)
        
    positive_prompt = format_prompt(positive_prompt)

    positive_prompt = positive_prompt.lower()

    positive_prompt = remove_redundant_tags(positive_prompt)

    if len(positive_prompt) == 0:
        continue

    positive_hash = hash_prompt(positive_prompt)

    results.append((index, positive_prompt, positive_hash, positive_raw_length))

0it [00:00, ?it/s]

In [None]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

In [None]:
results.to_csv('../dataset/nonredundant-midjourney_prompts.tsv', sep='\t', index=False)

# dalle

In [None]:
dataset = datasets.load_dataset('../dataset/dalle-3-contrastive-captions/')

In [None]:
dataset

In [None]:
results = list()

for i, item in tqdm(enumerate(dataset['train'])):
    
    for key in ['caption', 'dense_caption_1', 'dense_caption_2', 'dense_caption_3', 'dense_caption_4', 'dense_caption_5', 'dense_caption_6', 'dense_caption_7', 'dense_caption_8', 'dense_caption_9', 'dense_caption_10']:
        
        positive_prompt = item[key]
    
        if not validate_prompt(positive_prompt):
            continue

        if not check_brackets(positive_prompt)[0]:
            continue

        positive_raw_length = len(positive_prompt)

        positive_prompt = format_prompt(positive_prompt)

        if len(positive_prompt) == 0:
            continue

        positive_hash = hash_prompt(positive_prompt)

        results.append((i, positive_prompt, positive_hash, positive_raw_length))

In [None]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

In [None]:
results.to_csv('../dataset/dalle_prompts.tsv', sep='\t', index=False)

In [None]:
df = pd.read_csv('../dataset/dalle_prompts.tsv', sep='\t')

In [None]:
results = list()

for index, positive_prompt, positive_hash, positive_raw_length in tqdm(df.itertuples(index=False, name=None)):

    if type(positive_prompt) != str:
        continue
        
    positive_prompt = remove_brackets(positive_prompt)
        
    positive_prompt = format_prompt(positive_prompt)

    positive_prompt = positive_prompt.lower()

    redundant, max_tag_length = check_redundancy(positive_prompt)

    if redundant or max_tag_length > MAX_TAG_LENGTH or len(positive_prompt) == 0:
        continue

    positive_hash = hash_prompt(positive_prompt)

    results.append((index, positive_prompt, positive_hash, positive_raw_length))

In [None]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

In [None]:
results.to_csv('../dataset/nonredundant-dalle_captions.tsv', sep='\t', index=False)

# dalle gpt

In [None]:
dataset = datasets.load_dataset('arrow', data_files={'train': '../dataset/DALL-E-Prompts-OpenAI-ChatGPT/train/data-00000-of-00001.arrow'})

In [None]:
dataset

In [None]:
dataset['train'][0]

In [None]:
results = list()

for i, item in tqdm(enumerate(dataset['train'])):
    
    positive_prompt, positive_hash = '', ''
    if item['Prompt'] is not None and validate_prompt(item['Prompt']):
        positive_prompt = format_prompt(item['Prompt'])
        positive_hash = hash_prompt(positive_prompt)
    
    if len(positive_prompt) == 0:
        continue
        
    positive_raw_length = len(item['Prompt'])
    
    results.append((i, positive_prompt, positive_hash, positive_raw_length))
    
len(results)

In [None]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

In [None]:
results.to_csv('../dataset/dalle_chatgpt_prompts.tsv', sep='\t', index=False)

## step2

In [None]:
df = pd.read_csv('../dataset/dalle_chatgpt_prompts.tsv', sep='\t')

In [None]:
results = list()

for index, positive_prompt, positive_hash, positive_raw_length in tqdm(df.itertuples(index=False, name=None)):

    if type(positive_prompt) != str:
        continue
        
    positive_prompt = positive_prompt.lower()

    redundant, max_tag_length = check_redundancy(positive_prompt)

    if redundant or max_tag_length > MAX_TAG_LENGTH or len(positive_prompt) == 0:
        continue

    positive_hash = hash_prompt(positive_prompt)

    results.append((index, positive_prompt, positive_hash, positive_raw_length))

In [None]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

In [None]:
results.to_csv('../dataset/nonredundant-dalle_chatgpt_prompts.tsv', sep='\t', index=False)

# dalle discord

In [None]:
df = pd.read_csv('../dataset/dalle_discord_prompts.tsv', sep='\t')

In [None]:
df

In [None]:
results = list()

for index, positive_prompt in tqdm(enumerate(df['caption'])):

    if type(positive_prompt) != str or not validate_prompt(positive_prompt):
        continue
        
    positive_raw_length = len(positive_prompt)
        
    positive_prompt = format_prompt(positive_prompt)
    
    if '|' in positive_prompt:
        continue
    if '"' in positive_prompt:
        continue
    
    if len(positive_prompt) == 0:
        continue
        
    positive_prompt = positive_prompt.lower()

    redundant, max_tag_length = check_redundancy(positive_prompt)

    if redundant or max_tag_length > MAX_TAG_LENGTH or len(positive_prompt) == 0:
        continue

    positive_hash = hash_prompt(positive_prompt)

    results.append((index, positive_prompt, positive_hash, positive_raw_length))

In [None]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

In [None]:
results.to_csv('../dataset/nonredundant-dalle_discord_prompts.tsv', sep='\t', index=False)