In [1]:
import os
import datasets

import re
import hashlib

import pandas as pd

# from tqdm.auto import tqdm
from tqdm import tqdm

from matplotlib import pyplot

In [2]:
MIN_LENGTH = 500
MAX_TAG_LENGTH = 150

In [77]:
def validate_prompt(prompt):
    
    if type(prompt) != str or len(prompt) == 0:
        return False
    
    if r'\\' in prompt or r'//' in prompt or '#' in prompt:
        return False
    if max(map(ord, prompt)) > 127:
        return False
    
    if not is_long(prompt):
        return False
    
    if not check_brackets(item['prompt'])[0]:
        return False
    
    return True

def is_long(prompt):
    return len(prompt) > MIN_LENGTH

def format_prompt(prompt):
    
    prompt = prompt.strip()
    
    # prompt = re.sub(r'\\n', ', ', prompt)
    # prompt = re.sub(r'\\([\(\)\[\]])', r'\1', prompt)
    # prompt = re.sub(r'\\[\\\s]+', ' ', prompt)
    # prompt = re.sub(r'\\', ' ', prompt)
    # prompt = re.sub(r'[/\/]{2,}', ' ', prompt)
    
    while re.search(r'(\([\s,]*\))|(\<[\s,]*\>)|(\[[\s,]*\])|(\{[\s,]*\})', prompt):
        prompt = re.sub(r'(\([\s,]*\))|(\<[\s,]*\>)|(\[[\s,]*\])|(\{[\s,]*\})', '', prompt)
    
    prompt = re.sub(r'([\[\(\{\<])\s', r'\1', prompt)
    prompt = re.sub(r'\s([\]\)\}\>])', r'\1', prompt)
    prompt = re.sub(r'\s+', ' ', prompt)
    prompt = re.sub(r'(\s?[,;])+', r',', prompt)
    
    prompt = re.sub(r'^[\.,;\s]+', '', prompt)
    prompt = re.sub(r'[\.,;\s]+$', '', prompt)
    
    return prompt

def hash_prompt(prompt):
    return hashlib.md5(prompt.encode()).hexdigest()

bracket_map = {')': '(', ']': '[', '}': '{'}

def check_brackets(prompt):
    stack = list()
    has_brackets = False
    for i, c in enumerate(prompt):
        if c in {'[', '(', '{'}:
            stack.append(c)
            has_brackets = True
        elif c in {']', ')', '}'}:
            if len(stack) == 0:
                return False, True
            if bracket_map[c] != stack.pop():
                return False, True
    return len(stack) == 0, has_brackets

def remove_brackets(prompt):
    
    while re.search(r'\([^\)]*\)', prompt):
        prompt = re.sub(r'\(([^\)]*)\)', r', \1,', prompt)
    
    while re.search(r'\[[^\]]*\]', prompt):
        prompt = re.sub(r'\[([^\]]*)\]', r', \1,', prompt)
    
    while re.search(r'\{[^\}]*\}', prompt):
        prompt = re.sub(r'\{([^\}]*)\}', r', \1,', prompt)
    
    while re.search(r'\<[^\>]*\>', prompt):
        prompt = re.sub(r'\<([^\>]*)\>', r', \1,', prompt)
    
    return prompt

#formula to check if user prompt contains a link
# url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
def contains_link(prompt):
    # if url_pattern.search(prompt):
    prompt = prompt.lower()
    
    if '://' in prompt or '.jpg' in prompt or '.png' in prompt or '.co' in prompt or '.org' in prompt:
        return True
    return False

In [82]:
def remove_extra(prompt):
    
    # remove lora, hypernets
    prompt = re.sub(r'<[^<>]+>', '', prompt)
    
    return prompt

def remove_weight(prompt):
    
    prompt = re.sub(r':[\d,\.\s]+', '', prompt)
    prompt = re.sub(r'[\(\[\{\<\>\}\]\)]+', '', prompt)
    
    return prompt

def remove_complex(prompt):
    
    prompt = re.sub(r'[:\|][^:\|,]*', ',', prompt)
    
    return prompt

def remove_redundancy(prompt):
    
    tags = list()
    exists = set()
    for tag in prompt.split(','):
        tag = tag.strip()
        t = re.sub(r'\s+', '', tag)
        if len(t) > 0 and not tag in exists:
            exists.add(t)
        tags.append(tag)
    
    return ', '.join(tags)

# discord

In [4]:
dataset = datasets.load_dataset("parquet", data_files={'train': '../dataset/diffusiondb/metadata-large.parquet'})

  table = cls._concat_blocks(blocks, axis=0)


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image_name', 'prompt', 'part_id', 'seed', 'step', 'cfg', 'sampler', 'width', 'height', 'user_name', 'timestamp', 'image_nsfw', 'prompt_nsfw'],
        num_rows: 14000000
    })
})

In [5]:
dataset['train'][0]

{'image_name': '3ccdc650-871a-4ad9-9bf2-dc475b83ed32.webp',
 'prompt': 'beautiful porcelain ivory fair face woman biomechanical cyborg, close - up, sharp focus, studio light, iris van herpen haute couture headdress made of rhizomorphs, daisies, brackets, colorful corals, fractal mushrooms, puffballs, octane render, ultra sharp, 8 k ',
 'part_id': 1,
 'seed': 2625978990,
 'step': 50,
 'cfg': 21.0,
 'sampler': 8,
 'width': 512,
 'height': 704,
 'user_name': '01f4e782b48faedf416083b2fbabaca2a45621b15ead2333f03f0979b10e7266',
 'timestamp': datetime.datetime(2022, 8, 20, 10, 3, tzinfo=<UTC>),
 'image_nsfw': 0.03846566379070282,
 'prompt_nsfw': 0.003088998841121793}

In [18]:
results = list()

for i, item in tqdm(enumerate(dataset['train']), total=len(dataset['train'])):
    
    if item['prompt'] is None or not validate_prompt(item['prompt']):
        continue
 
    if contains_link(item['prompt']):
        continue

    if re.search('\d{5,}', item['prompt']):
        continue

    positive_prompt = format_prompt(item['prompt'])

    if not is_long(positive_prompt):
        continue
        
    positive_hash = hash_prompt(positive_prompt)
    
    results.append((i, positive_prompt, positive_hash))
    
len(results)

100%|██████████| 14000000/14000000 [21:05<00:00, 11064.90it/s]


104966

In [19]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash
0,50394,"beautiful painting by jeremy mann, only one he...",0ced9d76fd0731f1576c58f700545eb7
9,50403,"beautiful painting by jeremy mann, alphonse mu...",93971a04c78bd3692f4396fffe6ccb33
18,54352,"beautiful painting by jeremy mann, only one he...",cad4f5c6b5fff00d9a70f50fdd45e56e
27,56371,perfectly - centered!! looking at the camera!!...,7d2804cc0d5a724e42371396d5aefb91
36,73102,a extremely ultra highly detailed majestic hi ...,020db4f13e8ef108dd3a56062455d9ff
...,...,...,...
104929,13989691,a night photo of a minimalist contemporary hou...,81b6bd8622e2077606eb3ef953f6b9d6
104930,13989692,a night photo of a multistory minimalist conte...,03da3b5d6a386162efff1db26738a399
104931,13994166,photorealistic Emma Watson closeup angry tired...,ad46b24ed20796d12744a337ace73b19
104940,13996179,Emma Watson conjoined closeup angry tired figh...,1877dadfea04b2c42bd4c92c4233d6b7


In [20]:
df = results

In [42]:
results = list()

for index, positive_prompt, positive_hash in tqdm(df.itertuples(index=False, name=None)):
    
    positive_raw_length = len(positive_prompt)

    positive_prompt = remove_extra(positive_prompt)
    
    positive_prompt = remove_weight(positive_prompt)
    
    positive_prompt = remove_brackets(positive_prompt)
    
    positive_prompt = format_prompt(positive_prompt)
    
    positive_prompt = positive_prompt.lower()
    
    positive_prompt = remove_redundancy(positive_prompt)
    
    positive_prompt = format_prompt(positive_prompt)
    
    if not is_long(positive_prompt):
        continue
    
    positive_hash = hash_prompt(positive_prompt)

    results.append((index, positive_prompt, positive_hash, positive_raw_length))

9272it [00:02, 3212.06it/s]


In [43]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,50394,"beautiful painting by jeremy mann, only one he...",0ced9d76fd0731f1576c58f700545eb7,618
1,50403,"beautiful painting by jeremy mann, alphonse mu...",93971a04c78bd3692f4396fffe6ccb33,636
2,54352,"beautiful painting by jeremy mann, only one he...",cad4f5c6b5fff00d9a70f50fdd45e56e,624
3,56371,perfectly - centered!! looking at the camera!!...,7d2804cc0d5a724e42371396d5aefb91,505
4,73102,a extremely ultra highly detailed majestic hi ...,020db4f13e8ef108dd3a56062455d9ff,654
...,...,...,...,...
9231,13989691,a night photo of a minimalist contemporary hou...,86869e189cd6549f7f6d498a788fd9aa,524
9232,13989692,a night photo of a multistory minimalist conte...,9ec7a7d8198da07eea754de9d08b27cd,512
9233,13994166,photorealistic emma watson closeup angry tired...,6dc6e460f42514fb5ad9e8bf2eb988d2,697
9234,13996179,emma watson conjoined closeup angry tired figh...,acfbd2495b96a9ef1850ec2a95b5f995,520


In [44]:
results.to_csv('../dataset/long-discord_prompts.tsv', sep='\t', index=False)

# civitai

In [60]:
dataset = datasets.load_dataset("parquet", data_files={'train': '../dataset/civitai-stable-diffusion-337k/data/train-00000-of-00001-ace5b28cebba25a7.parquet'})

  table = cls._concat_blocks(blocks, axis=0)


In [61]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'negativePrompt', 'steps', 'sampler', 'seed', 'Model', 'url', 'hash', 'nsfw', 'width', 'height', 'Size', 'createdAt', 'postId', 'stats', 'meta', 'username'],
        num_rows: 327138
    })
})

In [62]:
dataset['train'][0]

{'id': 100657,
 'prompt': '<lora:hiqcg_body-epoch-000004:0.5>, <lora:hiqcg_face-epoch-000004:0.4>, hiqcgbody, hiqcgface, 1girl, full body, standing, \ndetailed skin texture, detailed cloth texture,  beautiful detailed face,\nmasterpiece, best quality, ultra detailed, 8k, intricate details,',
 'negativePrompt': 'EasyNegative, extra fingers,fewer fingers, multiple girls, multiple views,',
 'steps': 20.0,
 'sampler': 'DPM++ 2M Karras',
 'seed': 3994946333.0,
 'Model': 'AbyssOrangeMix2_sfw',
 'url': 'https://imagecache.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/2338276a-87f7-4a1e-f92a-776a18ee4200/width=768/2338276a-87f7-4a1e-f92a-776a18ee4200.jpeg',
 'hash': 'U5Exz_00.8D$t89Z%M0100~VD*RktQxaIU~p',
 'nsfw': True,
 'width': 768,
 'height': 1368,
 'Size': '512x912',
 'createdAt': '2023-02-14T10:05:11.498Z',
 'postId': 60841,
 'stats': "{'cryCount': 0, 'laughCount': 0, 'likeCount': 26, 'dislikeCount': 0, 'heartCount': 50, 'commentCount': 4}",
 'meta': "{'ENSD': '31337', 'Size': '512x912', 'seed': 399

In [63]:
results = list()

for i, item in tqdm(enumerate(dataset['train'])):
    
    positive_prompt, positive_hash, positive_raw_length = '', '', 0
    if item['prompt'] is not None and validate_prompt(item['prompt']) and not contains_link(item['prompt']) and not re.search('\d{5,}', item['prompt']):
        positive_prompt = format_prompt(item['prompt'])
        positive_hash = hash_prompt(positive_prompt)
        positive_raw_length = len(item['prompt'])
    
    negative_prompt, negative_hash, negative_raw_length = '', '', 0
    if item['negativePrompt'] is not None and validate_prompt(item['negativePrompt']) and not contains_link(item['negativePrompt']) and not re.search('\d{5,}', item['negativePrompt']):
        negative_prompt = format_prompt(item['negativePrompt'])
        negative_hash = hash_prompt(negative_prompt)
        negative_raw_length = len(item['negativePrompt'])
    
    if len(positive_hash + negative_hash) == 0:
        continue
    
    results.append((i, positive_prompt, positive_hash, positive_raw_length, negative_prompt, negative_hash, negative_raw_length))
    
len(results)

327138it [01:16, 4284.68it/s]


94348

In [64]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length', 'negative_prompt', 'negative_hash', 'negative_raw_length'])
results.drop_duplicates(['positive_hash', 'negative_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length,negative_prompt,negative_hash,negative_raw_length
0,3,"20 year old k-idol, (legs spread on cock:1.4),...",902137f79cb79089fcb9c7587407d1f2,627,,,0
1,5,,,0,"(pubic hair:2), paintings, sketches, (worst qu...",5a67e23bef1d074e50f34db9d51e9af8,669
2,6,"(8k, RAW photo:1.2),cityscape,(portrait:1.4),1...",74c8c3ed90c6e8cea6d56f316a956b0f,599,"paintings, sketches, fingers, (worst quality:2...",cd5f4181fb008b932564211ef9d253ff,652
3,8,<lora:fashionGirl_v20SmallFileSize:0.8> unpara...,b529850151129b0b6667f3973433299c,620,,,0
4,11,"(RAW photo:1.2), (photorealistic:1.4),(masterp...",75cdccf02173e44b58121ecf64f6320b,679,"paintings, sketches, (worst quality:2), (low q...",559d97422a99f14bbe1adbb3b4de3575,678
...,...,...,...,...,...,...,...
94333,325768,,,0,"nsfw, lowres, bad anatomy, bad hands, text, er...",7f003b0df3a84d650e83c7beca385f47,896
94334,325769,,,0,"nsfw, lowres, bad anatomy, bad hands, text, er...",7b25ee4860c02bb6ec551e62011091f8,820
94336,325771,,,0,"nsfw, lowres, bad anatomy, bad hands, text, er...",86d87eb24cc7f14137a294780590f2cc,816
94343,326173,"fisheye, 1girl, bangs, black long hair, blush,...",3f73ab1a2062a3bafefa3b64e1d92746,531,,,0


In [65]:
df = results

In [124]:
def worker(prompt):
    
    prompt = remove_extra(prompt)
    
    prompt = remove_weight(prompt)
    
    prompt = remove_complex(prompt)
    
    prompt = remove_brackets(prompt)
    
    prompt = format_prompt(prompt)
    
    prompt = prompt.lower()
    
    prompt = remove_redundancy(prompt)
    
    prompt = format_prompt(prompt)
    
    return prompt

In [83]:
results = list()

for index, positive_prompt, positive_hash, positive_raw_length, negative_prompt, negative_hash, negative_raw_length in tqdm(df.itertuples(index=False, name=None)):
        
        positive_hash = ''
        if type(positive_prompt) == str:
            positive_prompt = worker(positive_prompt)
            if is_long(positive_prompt):
                positive_hash = hash_prompt(positive_prompt)
                    
        negative_hash = ''
        if type(negative_prompt) == str:
            negative_prompt = worker(negative_prompt)
            if is_long(negative_prompt):
                negative_hash = hash_prompt(negative_prompt)
        
        if positive_hash == '' and negative_hash == '':
            continue

        results.append((index, positive_prompt, positive_hash, positive_raw_length, negative_prompt, negative_hash, negative_raw_length))

41619it [00:25, 1614.17it/s]


In [84]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length', 'negative_prompt', 'negative_hash', 'negative_raw_length'])
results.drop_duplicates(['positive_hash', 'negative_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length,negative_prompt,negative_hash,negative_raw_length
0,5,,,0,"pubic hair, paintings, sketches, worst quality...",6f4cea02c4539931989e9c5c02ac6c18,669
1,6,"8k, raw photo, cityscape, portrait, 1girl, lon...",,599,"paintings, sketches, fingers, worst quality, l...",36e418acf25ce2c7af8ee6bed0e4a288,652
2,8,"unparalleled masterpiece, ultra realistic 8k c...",81fa53a0e33bd637aa27d5d13e15cc94,620,,,0
3,11,"raw photo, photorealistic, masterpiece, best q...",de09ca0fd886971cf8ac84eecf191230,679,"paintings, sketches, worst quality, low qualit...",3a748fa53f86e3ac8d30ac943acbcff5,678
4,12,"raw photo, photorealistic, masterpiece, best q...",5eca6a564af2445057ac90cebea52ea9,675,"paintings, sketches, worst quality, low qualit...",3a748fa53f86e3ac8d30ac943acbcff5,678
...,...,...,...,...,...,...,...
35878,325670,,,0,"wet pussy, cum in pussy, purple lips, deformed...",b94640f6447e4e471fa7e5f2a88d429d,637
35879,325768,,,0,"nsfw, lowres, bad anatomy, bad hands, text, er...",7f003b0df3a84d650e83c7beca385f47,896
35880,325769,,,0,"nsfw, lowres, bad anatomy, bad hands, text, er...",7b25ee4860c02bb6ec551e62011091f8,820
35881,325771,,,0,"nsfw, lowres, bad anatomy, bad hands, text, er...",86d87eb24cc7f14137a294780590f2cc,816


In [85]:
results.to_csv('../dataset/long-civitai_prompts.tsv', sep='\t', index=False)

# lexica

In [86]:
dataset = datasets.load_dataset("parquet", data_files={
    'train': '../dataset/Stable-Diffusion-Prompts/data/train.parquet',
    'eval': '../dataset/Stable-Diffusion-Prompts/data/eval.parquet'
})

In [87]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Prompt'],
        num_rows: 73718
    })
    eval: Dataset({
        features: ['Prompt'],
        num_rows: 8192
    })
})

In [88]:
dataset['train'][0]

{'Prompt': 'realistic car 3 d render sci - fi car and sci - fi robotic factory structure in the coronation of napoleon painting and digital billboard with point cloud in the middle, unreal engine 5, keyshot, octane, artstation trending, ultra high detail, ultra realistic, cinematic, 8 k, 1 6 k, in style of zaha hadid, in style of nanospace michael menzelincev, in style of lee souder, in plastic, dark atmosphere, tilt shift, depth of field,'}

In [89]:
results = list()

for i, item in tqdm(enumerate(dataset['train'])):
    
    if not validate_prompt(item['Prompt']):
        continue
 
    if contains_link(item['Prompt']):
        continue

    if re.search('\d{5,}', item['Prompt']):
        continue

    positive_prompt = format_prompt(item['Prompt'])
    
    if len(positive_prompt) == 0:
        continue
        
    positive_hash = hash_prompt(positive_prompt)
    positive_raw_length = len(item['Prompt'])
    
    results.append((i, positive_prompt, positive_hash, positive_raw_length))
    
len(results)

73718it [00:01, 40422.07it/s]


1344

In [96]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,193,"glowing cracks, elven princess, meditating, pe...",5dd89f21b93f6c7f103b81a12d5a780e,617
1,236,a highly detailed epic cinematic concept art C...,1bdecd01416c240f0fa65cc1ae5a5046,514
2,281,"medieval hobbit homes, ornate, beautiful, atmo...",d47517a214618138d9bb4663bfa7c35f,509
3,348,baroque and cyberpunk style full-body sculptur...,f4adf90ee2ad05a2e80f31b2634d30ce,526
4,387,old pipe organ near lake with battle raging ne...,a22a573c3fec9d5d466ed479557fe4d6,545
...,...,...,...,...
123,7799,a painting of a XXL wise elder from Kenya in a...,f1decd18757e502971d9f7464c5a74bf,520
124,7966,"photo of a emo manic pixie dream girl, 8 k, po...",fa1f2a22c2b38ec9e0352326624b5b95,711
125,7986,An extremely psychedelic portrait of SalvadorD...,6974e6566dff3bc4d0310201e9d58db1,503
126,8166,photo of an extremely cute alien fish swimming...,6ce7273ac895d97f140ad964d1ff41cb,575


In [97]:
df = results

In [98]:
results = list()

for index, positive_prompt, positive_hash, positive_raw_length in tqdm(df.itertuples(index=False, name=None)):

    if type(positive_prompt) != str:
        continue
        
    positive_prompt = worker(positive_prompt)
    if not is_long(positive_prompt):
        continue
        
    positive_hash = hash_prompt(positive_prompt)

    results.append((index, positive_prompt, positive_hash, positive_raw_length))

127it [00:00, 2207.54it/s]


In [99]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,193,"glowing cracks, elven princess, meditating, pe...",5dd89f21b93f6c7f103b81a12d5a780e,617
1,236,a highly detailed epic cinematic concept art c...,27bdb178dc01c4c661385846e07a3c84,514
2,281,"medieval hobbit homes, ornate, beautiful, atmo...",d47517a214618138d9bb4663bfa7c35f,509
3,348,baroque and cyberpunk style full-body sculptur...,9b6deeb14f9a779624003a5430256e27,526
4,387,old pipe organ near lake with battle raging ne...,a22a573c3fec9d5d466ed479557fe4d6,545
...,...,...,...,...
121,7616,"cyber punk, oni mask, 3 d render beeple, portr...",63c3ead6b04e2b46d152003c1d6ef4cf,786
122,7799,a painting of a xxl wise elder from kenya in a...,04f24f69218f1c276c494fc39cce40a3,520
123,7986,an extremely psychedelic portrait of salvadord...,d256ee51af8e3732e426dd024a2aec02,503
124,8166,photo of an extremely cute alien fish swimming...,aae4b477f13c22723874b32655c06786,575


In [94]:
results.to_csv('../dataset/long-lexica_prompts-train.tsv', sep='\t', index=False)

In [95]:
results = list()

for i, item in tqdm(enumerate(dataset['eval'])):
    
    if not validate_prompt(item['Prompt']):
        continue
 
    if contains_link(item['Prompt']):
        continue

    if re.search('\d{5,}', item['Prompt']):
        continue

    positive_prompt = format_prompt(item['Prompt'])
    
    if len(positive_prompt) == 0:
        continue
        
    positive_hash = hash_prompt(positive_prompt)
    positive_raw_length = len(item['Prompt'])
    
    results.append((i, positive_prompt, positive_hash, positive_raw_length))
    
len(results)

8192it [00:00, 27876.93it/s]


128

In [100]:
results.to_csv('../dataset/long-lexica_prompts-eval.tsv', sep='\t', index=False)

# midjourney

In [102]:
df = pd.read_csv('../dataset/nonredundant-midjourney_prompts.tsv', sep='\t')

In [104]:
results = list()

for index, positive_prompt, positive_hash, positive_raw_length in tqdm(df.itertuples(index=False, name=None)):

    if not is_long(positive_prompt):
        continue
        
    results.append((index, positive_prompt, positive_hash, positive_raw_length))

15525301it [00:08, 1736742.75it/s]


In [106]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,766,design a 2d schematic representation of a tire...,59fceccd0aad9af272dbd6a1dffee759,506
1,894,"midget, you're a creative artist of the highes...",dbfe6562763c71db887f6dc1812f4cff,525
2,940,"8k, 3d animated style, teen panda driving a su...",e810ce8acf85a4358a60114c84b50cc4,538
3,945,"8k, 3d animated style, teen turtle driving a s...",dc4dd3df36b84b256ff723337eab2181,552
4,948,"8k, 3d animated style, child turtle driving a ...",9484b1c93111e3920b04a2ec5a8063aa,554
...,...,...,...,...
204606,55080785,"human brain, the human brain is a complex and ...",40d3481721056b33effe290c1d11e465,744
204607,55080850,"fist human hand, a human hand has transformed ...",38702aaec0a05e49f51d4b233d44bb8d,666
204608,55080906,"chrono fist human hand, a human hand has trans...",3e829007231bc5730545fdf0ce42b2f5,700
204609,55080952,"the human heart of stone, a heart made of ston...",7610ef7e78f0b2da249756ce09ef08f1,604


In [107]:
results.to_csv('../dataset/long-midjourney_prompts.tsv', sep='\t', index=False)

# aesthetic

In [114]:
dataset = datasets.load_dataset('../dataset/laion2B-en-aesthetic/')

Resolving data files:   0%|          | 0/24 [00:00<?, ?it/s]

  table = cls._concat_blocks(blocks, axis=0)


In [115]:
dataset

DatasetDict({
    train: Dataset({
        features: ['URL', 'TEXT', 'WIDTH', 'HEIGHT', 'similarity', 'hash', 'punsafe', 'pwatermark', 'aesthetic'],
        num_rows: 52068913
    })
})

In [117]:
results = list()

raw_hashs = set()

for i, item in tqdm(enumerate(dataset['train']), total=len(dataset['train'])):
    
    if item['TEXT'] is None:
        continue
        
    if not is_long(item['TEXT']):
        continue

    raw_hash = hash_prompt(item['TEXT'])
    if raw_hash in raw_hashs:
        continue
    raw_hashs.add(raw_hash)
        
    if not validate_prompt(item['TEXT']):
        continue
        
    positive_raw_length = len(item['TEXT'])
    
    positive_prompt = format_prompt(item['TEXT'])
    
    positive_hash = hash_prompt(positive_prompt)
        
    results.append((i, positive_prompt, positive_hash, positive_raw_length))
    
len(results)

100%|██████████| 52068913/52068913 [47:24<00:00, 18303.02it/s] 


48768

In [118]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

48471


Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,240,Walt Disney Company Chairman and CEO Michael E...,a23bd335ca09dd8a3f1fd946379e5618,73
1,1373,This 1760s gown features a rose-red silk with ...,5a861602bb209632ba2893d401333bdb,73
2,2509,Wooden Bowl teak red HWB19 SOLD (ViAfrika) Tag...,041fd059667762c2268f382f7e4de831,73
3,2861,Patterson Custom Homes - boy's rooms - bunk ro...,5ae01696c035bbedd273ab4ee58f4deb,73
4,4946,FARK.com: (8475391) Ice Cream Truck Driver Arr...,73dba67db6970fb55e277a505d88db8c,73
...,...,...,...,...
48763,52061219,"""""""Angela Holt """"""""Sunset at Storm's Pass, San...",321027869964f13a2f4906adb6115dd7,73
48764,52063478,"""Motorcycle craftsman Xanti Garcia (Corb Motor...",f54a642795e208542e2c21a49036f507,73
48765,52063710,All students in the 2018 Associate Degree Nurs...,7e015ab94bc9135d9a8d1f0e4a202915,73
48766,52067088,This vertical still life painting shows a gran...,0139ffb8725e2c88800b36df60d44c58,73


In [119]:
df = results

In [150]:
results = list()

for index, positive_prompt, positive_hash, positive_raw_length in tqdm(df.itertuples(index=False, name=None)):
    
    positive_prompt = positive_prompt.replace('"', ', ')
        
    positive_prompt = re.sub('[^,\s]*\.(co)|(org)[^,\s]+', ', ', positive_prompt)
    positive_prompt = re.sub('[^,\s]*[\d\s]{5,}[^,\s]*', ',', positive_prompt)

    positive_prompt = worker(positive_prompt)
    
    if not is_long(positive_prompt):
        continue
        
    results.append((index, positive_prompt, positive_hash, positive_raw_length))

48471it [00:21, 2269.78it/s]


In [151]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,240,walt disney company chairman and ceo michael e...,a23bd335ca09dd8a3f1fd946379e5618,73
1,2509,wooden bowl teak red hwb19 sold viafrika tagss...,041fd059667762c2268f382f7e4de831,73
2,2861,patterson custom homes - boy's rooms - bunk ro...,5ae01696c035bbedd273ab4ee58f4deb,73
3,4946,mice cream truck driver arrested for dui. hey ...,73dba67db6970fb55e277a505d88db8c,73
4,5617,"writes dalrymple, 'the asylum notes show richa...",edb54f66e2a7e9a1e9dd1ba0be6eab77,73
...,...,...,...,...
33208,52061219,"angela holt, sunset at storm's pass, sandia cr...",321027869964f13a2f4906adb6115dd7,73
33209,52063478,motorcycle craftsman xanti garcia corb motorcy...,f54a642795e208542e2c21a49036f507,73
33210,52063710,"all students in, degree nursing class at east ...",7e015ab94bc9135d9a8d1f0e4a202915,73
33211,52067088,this vertical still life painting shows a gran...,0139ffb8725e2c88800b36df60d44c58,73


In [152]:
results.to_csv('../dataset/long-laion2B-en-aesthetic.tsv', sep='\t', index=False)

# midjourney -2

In [3]:
df = pd.read_csv('../dataset/midjourney_prompts.tsv', sep='\t')

In [4]:
results = list()

for index, positive_prompt, positive_hash, positive_raw_length in tqdm(df.itertuples(index=False, name=None)):

    if type(positive_prompt) != str:
        continue

    if '|' in positive_prompt or '"' in positive_prompt:
        results.append((index, positive_prompt, positive_raw_length))
        continue
        
    positive_prompt = re.sub('(:\s?){2,}', ', ', positive_prompt)
    
    parts = re.findall('[a-zA-Z]+:[^:\.]+\.', positive_prompt)
    
    if len(parts) > 0:
        results.append((index, positive_prompt, positive_raw_length))
        continue

16826971it [02:22, 117958.97it/s]


In [9]:
arch = results

In [46]:
results = list()

for index, positive_prompt, positive_raw_length in tqdm(arch):

    if type(positive_prompt) != str:
        continue
        
    positive_prompt = positive_prompt.lower()
        
    if '.jpg' in positive_prompt or '.png' in positive_prompt:
        continue
        
    positive_prompt = positive_prompt.replace('"', ', ')
        
    positive_prompt = re.sub('[^,\s]*\.(co)|(org)[^,\s]+', ', ', positive_prompt)
    positive_prompt = re.sub('[^,\s]*[\d\s]{5,}[^,\s]*', ',', positive_prompt)
    
    positive_prompt = worker(positive_prompt)
    
    positive_hash = hash_prompt(positive_prompt)
        
    results.append((index, positive_prompt, positive_hash, positive_raw_length))
    
#     parts = re.findall('[a-zA-Z]+:[^:\.]+\.', positive_prompt)
    
#     if len(parts) > 0:
#         positive_hash = hash_prompt(positive_prompt)
#         complexs.append((index, positive_prompt, positive_hash, positive_raw_length))
#     else:
#         pass

100%|██████████| 507915/507915 [01:59<00:00, 4238.96it/s]


In [48]:
results = pd.DataFrame(results, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
results.drop_duplicates(['positive_hash'], inplace=True)

results

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,1,"hyperrealism, in the valley, there lies a lave...",204bd5fd3580fcbb656d75b48858da34,430
1,6,"hyperrealism, in the valley lies a lavender ga...",35068c27b360fdf8f6d57ba0613b585c,305
2,317,contenta sophisticated travel booking mobile a...,03a32fa7b2058e5df9e3d80992409f5c,991
3,328,contenta sophisticated travel booking mobile a...,06b3495371496398e14c5fc0e1d6da89,998
4,341,a captivating landing page for a travel bookin...,80f08794a0e08734695097401ef8c3fa,956
...,...,...,...,...
505438,55081537,"1980s dvd screengrab, yellow gorilla alien cre...",a30f927a26d66c735f68f4325a3f2b7d,277
505439,55081576,"1980s dvd screengrab, alien creatures in an un...",32192e4814140770e90a5b10f3f6f824,262
505440,55081626,halo themed a tall white male with a lean buil...,d84eec327455138f105d008ec557ba34,975
505441,55082231,"chinese iron swords, red, chinese paladin",995d16d2c61e59078205091cc0e470cd,41


In [49]:
results.to_csv('../dataset/long-midjourney_prompts-2.tsv', sep='\t', index=False)

In [27]:
# complexs = pd.DataFrame(complexs, columns=['index', 'positive_prompt', 'positive_hash', 'positive_raw_length'])
# complexs.drop_duplicates(['positive_hash'], inplace=True)

# complexs

Unnamed: 0,index,positive_prompt,positive_hash,positive_raw_length
0,317,content: a sophisticated travel booking mobile...,888960de7d76d5abd652c06e341168f0,991
1,328,content: a sophisticated travel booking mobile...,b61990918c9d7099cd75feb1fd443913,998
2,341,a captivating landing page for a travel bookin...,3ff6c1df3e38ff06e2a44f8b0295a1a9,956
3,351,a sleek landing page for a financial managemen...,24d6643d1a11a465c09b1710c8888afd,914
4,381,"dynamic weather-themed landing page, featuring...",b25385ec8a0764cb85e3dc2a8090bafe,891
...,...,...,...,...
140912,55080885,"scene: underground passage, three people, 1. h...",34a6b6da9bad9fc96a62c8e93df1d99e,306
140913,55081423,"coloring book page, the terraformed planet col...",c8069797fce1a1bd73a9f1be34550b07,892
140914,55081476,"coloring book page, the space station habitat:...",944be49bb7d2de0be3b69bdd771187ad,910
140915,55081481,"coloring book page, the space station habitat:...",341b852326218ffc95f1f44581a0b16c,644
