In [None]:
from tqdm import tqdm
from pathlib import Path

import os
from dotenv import load_dotenv
load_dotenv()


data_folder = Path( os.environ["DATA_FOLDER"] )

imgs_folder = data_folder / "images"
all_tags_folder = data_folder / "tags"
new_tags_folder = data_folder

## Change all names to be 6 digits

example: `0.txt` -> `000000.txt`

### renaming

In [42]:
eq_len_parent = 2
eq_len_file = 6

for folder, suf in zip([imgs_folder, all_tags_folder],
                       [".jpg", ".txt"]):
    for file in tqdm(folder.glob(f"**/*{suf}"), total=1e6):
        parent = file.parent
        folder_num = parent.stem.zfill(eq_len_parent)  # 2
        file_num = file.stem.zfill(eq_len_file)        # 6

        new_parent = Path(f"{parent.parent}/{folder_num}")
        if not new_parent.exists():
            new_parent.mkdir()

        file.rename(new_parent / f"{file_num}{suf}")

1000000it [00:25, 39677.93it/s]
1000000it [00:25, 39808.22it/s]


### remove empty folders

In [None]:
for folder in [imgs_folder, all_tags_folder]:
    for path in folder.iterdir():
        if path.is_dir() and (not list(path.iterdir())):
            print(path)
            path.rmdir()

### check

In [None]:
print( list(all_tags_folder.glob("**/*.txt"))[:10] )
print( sorted(all_tags_folder.glob("**/*.txt"))[:10] )

In [None]:
st = sorted(all_tags_folder.glob("**/*.txt"))

st[19999:20010]

# All tags alltogether

In [3]:
all_tags_folder_sorted = sorted(all_tags_folder.glob("**/*.txt"))

In [4]:
with open( all_tags_folder_sorted[0], "r" ) as f:
    text = f.read().split()

text

['governorsisland',
 'cigarette',
 'tattoos',
 'smoke',
 'red',
 'lipstick',
 'dress',
 'sunglasses',
 'shades',
 'belt']

In [5]:
vocab = set()

for file_name in all_tags_folder_sorted:
    with open(file_name, "r", encoding="utf-8") as f:
        vocab.update( f.read().split() )

vocab = list(vocab)
print( f"{len(vocab):,}" )

861,995


In [6]:
all_tags_txt = open(new_tags_folder / "all_tags.txt", 'w', encoding="utf-8")
temp_txt = ''

for file_name in all_tags_folder_sorted:
    with open(file_name, "r", encoding="utf-8") as f:
        temp_txt = f.read().replace('\n', ' ')[:-1]
    
    all_tags_txt.write(temp_txt)
    all_tags_txt.write('\n')


all_tags_txt.close()

In [7]:
all_tags = []

with open(new_tags_folder / 'all_tags.txt', 'r', encoding='utf-8') as f:
    # for i in range(10):
    #     all_tags.append( f.readline()[:-1] )
    all_tags = f.read().split(sep="\n")


In [8]:
all_tags[490:500]

['vancouver britishcolumbia canada denmanstreet westend cupcake cake originalcupcakes buttercream buttercreamfrosting chocolate chocolatefrosting chocolatebuttercreamfrosting sprinkles rainbowsprinkles',
 'rainbow circle sanfrancisco 22ºradiushalo 22ºhalo',
 'rigs',
 '',
 'osaka doutonbori dotonbori dotombori arcade japan japon nihon ebisu girl fille pretty japanese musume kawaii kawai cute mignonne jolie schoolgirl uniform lyceenne candid street auntie light àlasauvette trash salace triste sad lumiere explore 268',
 'somewhereinmaine maine',
 'whereedin unguessed art sun faces edinburgh scotland square',
 'smile gabba ilford 125 olympus om1 athens street',
 'sunset sole sun romagna cesena tramonto goldstaraward fiveflickrfavs',
 'kentucky locust flowers macro fabaceae tree bloom blossom']

In [9]:
vocab[:20]

['oeshiki',
 'thimbleisland',
 'maslach',
 'butnowitsvoracious',
 'californiaparade4thfourthjuly',
 'vieuw',
 'montgomerycounty',
 'astro:name=cometholmes',
 'elfrage',
 'rubbishbins',
 'brightdaysareahead',
 'boasvindas',
 'geo:lon=100530667',
 'dritte',
 'rek',
 'ilvaloredeisoldi',
 'coupure',
 'kongminglantern',
 'marinestructures',
 'duenbuggy']

In [10]:
## max num of tags for pic
max( map(lambda x: len(x.split()), all_tags) )

75

In [11]:
## avg num of tags for pic
sum( map(lambda x: len(x.split()), all_tags) ) / len(all_tags)

11.35933164066836

In [12]:
## large number of tags targeted
print(
    len(all_tags[2809].split()),
    len(all_tags[8867].split()),
    len(all_tags[5371].split()),
    len(all_tags[6315].split()),
    sep="\n"
)

58
66
74
75


In [13]:
## max len of singular tag
max( map(len, text) )

15

In [14]:
max_tag_len = 20

vocab = set( map(lambda x: x if len(x) < max_tag_len else '', text) )

print( f"{len(vocab):,}" )

10


# Removing hieroglyphs

In [15]:
all_tags_txt = open(new_tags_folder / "text_wout_utf.txt", 'w')
temp_txt = ''

max_tag_len = 20
print_first_n_glyphs = 40

for file_name in all_tags_folder_sorted:
    with open(file_name, "r", encoding="utf-8") as f:
        temp_txt = f.read()[:-1]
    
    for word in temp_txt.split():
        try:
            if len(word) < max_tag_len:
                all_tags_txt.write(word)
                all_tags_txt.write(' ')
                
        except:
            if print_first_n_glyphs > 0:
                print( word )
                print_first_n_glyphs -= 1

    all_tags_txt.write('\n')


all_tags_txt.close()

In [16]:
vocab = set()

with open(new_tags_folder / "text_wout_utf.txt", "r") as f:
     for line in f:
        vocab.update( line.split() )

vocab = list(vocab)
len(vocab)

761278

In [17]:
vocab[:20]

['oeshiki',
 'thimbleisland',
 'maslach',
 'butnowitsvoracious',
 'vieuw',
 'montgomerycounty',
 'elfrage',
 'rubbishbins',
 'brightdaysareahead',
 'boasvindas',
 'geo:lon=100530667',
 'dritte',
 'rek',
 'ilvaloredeisoldi',
 'coupure',
 'kongminglantern',
 'marinestructures',
 'duenbuggy',
 'funnies',
 'virility']

In [18]:
try:
    display( vocab.index('plate') )
except:
    print('netu')
    
try:
    display( vocab.index(' ') )
except:
    print('netu')
try:
    display( vocab.index('') )
except:
    print('netu')

16180

netu
netu


In [19]:
vocab = {''}

with open(new_tags_folder / "text_wout_utf.txt", "r") as f:
     for line in f:
        vocab.update( line.split() )

vocab = list(vocab)
len(vocab)

761279

In [20]:
vocab[:20]

['',
 'oeshiki',
 'thimbleisland',
 'maslach',
 'butnowitsvoracious',
 'vieuw',
 'montgomerycounty',
 'elfrage',
 'rubbishbins',
 'brightdaysareahead',
 'boasvindas',
 'geo:lon=100530667',
 'dritte',
 'rek',
 'ilvaloredeisoldi',
 'coupure',
 'kongminglantern',
 'marinestructures',
 'duenbuggy',
 'funnies']

In [21]:
all_tags = ''
with open(new_tags_folder / 'text_wout_utf.txt', 'r') as f:
    all_tags = f.read()

all_tags_split = all_tags.split(sep='\n')

In [22]:
all_tags_split[490:500]

['vancouver britishcolumbia canada denmanstreet westend cupcake cake originalcupcakes buttercream buttercreamfrosting chocolate chocolatefrosting sprinkles rainbowsprinkles ',
 'rainbow circle sanfrancisco 22ºradiushalo 22ºhalo ',
 'rigs ',
 '',
 'osaka doutonbori dotonbori dotombori arcade japan japon nihon ebisu girl fille pretty japanese musume kawaii kawai cute mignonne jolie schoolgirl uniform lyceenne candid street auntie light àlasauvette trash salace triste sad lumiere explore 268 ',
 'somewhereinmaine maine ',
 'whereedin unguessed art sun faces edinburgh scotland square ',
 'smile gabba ilford 125 olympus om1 athens street ',
 'sunset sole sun romagna cesena tramonto goldstaraward fiveflickrfavs ',
 'kentucky locust flowers macro fabaceae tree bloom blossom ']

In [23]:
max( map(lambda x: len(x.split()), all_tags_split) )

75

In [24]:
## avg num of tags
sum( map(lambda x: len(x.split()), all_tags_split) ) / len(all_tags_split)

11.045443954556045

# Exporting to .npy file

In [25]:
import numpy as np

In [26]:
word_to_token = {word : ind for ind, word in enumerate(vocab)}
word_to_token['']

0

In [84]:
np.save(new_tags_folder / 'words_to_inds.npy', word_to_token)

In [35]:
## imgs are batched by 10,000 examples into separate `npy` files
## tags are much lighter so instead of 100 files, make first axis represent
##  the file number corresponding to the selected "batch" of tags

all_tags_split_np = np.array(all_tags_split[:-1])  # last one is empty string `""`
all_tags_split_np = all_tags_split_np.reshape(100, -1)
print(all_tags_split_np.shape)

np.save(new_tags_folder / 'tags.npy', all_tags_split_np)

(100, 10000)


In [36]:
all_tags_split_np[11]

array(['icebergs ', '',
       'lomolca fujivenus400 filmshots toycamera copenhagen ', ...,
       'telaviv israel kinnernet kinnernet2009 kinnernet09 ', '',
       'mbk stlouis missouri march spring shadow art wall northgrand purgepool p168 savedbydbb1 '],
      shape=(10000,), dtype='<U909')

In [37]:
loaded_tags = np.load(new_tags_folder / 'tags.npy')
loaded_tags.shape

(100, 10000)

In [38]:
loaded_tags[-1]

array(['dead bose pacia art gallery chelsea jhm manhattan nyc ',
       'b3co barcelona lugarbarcelona españa paisespaña cataluña catalunya spain travel viaje suelo siluetas color sombras silhouette contraluz contraluces ',
       'air sign sanfrancisco foundinsf ', ...,
       'nikon d200 laos lao boy children hmong hilltribe phapon luangprabang tamron a16 1750mm f28 苗族 maewmong 苗 miao ',
       'sky white blue orange bleu ',
       'foodporn coffee baileys irish creme glass mugs whipped globalworldawards artcafe '],
      shape=(10000,), dtype='<U909')

In [None]:
read_wtt = np.load(new_tags_folder / 'words_to_inds.npy', allow_pickle='TRUE').item()

In [89]:
display( (read_wtt[''], word_to_token['']) )
read_wtt['plate'] == word_to_token['plate']

(0, 0)

True

In [90]:
read_wtt['belt'], word_to_token['belt']

(97181, 97181)

In [91]:
print( f"{len(read_wtt):,}" )

761,279
