In [1]:
try:
    from google.colab import drive
    #mount drive, set base path
    drive.mount("/gdrive")
    base_path = '/gdrive/MyDrive/nma_dl_metamorphs/data/multi_dsprites'
except:
    base_path = './'
from pathlib import Path
import random
from random import randint, choice

import time

from tqdm.auto import tqdm
import numpy as np



In [2]:
#Maximum categories to bin the data
MAXCATEGORIES = 10
train_obj_prefix = 'training_objareas'
val_obj_prefix = 'validation_objareas'
test_obj_prefix = 'test_objareas'
train_img_prefix = 'training_imgareas'
val_img_prefix = 'validation_imgareas'
test_img_prefix = 'test_imgareas'

suffix = '_rand4_unique.npy'
catstr = f'_cat{MAXCATEGORIES}'
data_path = Path(base_path).joinpath('processed/')

In [3]:
#load the object areas

train_obj_area_path = Path(data_path).joinpath(train_obj_prefix + suffix)
val_obj_area_path = Path(data_path).joinpath(val_obj_prefix + suffix)
test_obj_area_path = Path(data_path).joinpath(test_obj_prefix + suffix)

train_img_area_path = Path(data_path).joinpath(train_img_prefix + suffix)
val_img_area_path = Path(data_path).joinpath(val_img_prefix + suffix)
test_img_area_path = Path(data_path).joinpath(test_img_prefix + suffix)


training_obj_area = np.load(train_obj_area_path)
val_obj_area = np.load(val_obj_area_path)
test_obj_area = np.load(test_obj_area_path)

In [4]:
#collect the min and max ranges of each split of the data (and mean just to have it)
training_min_area = training_obj_area.min()
training_max_area = training_obj_area.max()
training_mean_area = training_obj_area.mean()

val_min_area = val_obj_area.min()
val_max_area = val_obj_area.max()
val_mean_area = val_obj_area.mean()

test_min_area = test_obj_area.min()
test_max_area = test_obj_area.max()
test_mean_area = test_obj_area.mean()

In [5]:
#print them out
print(f'train min area {training_min_area}, max area {training_max_area}, mean area {training_mean_area}')
print(f'val min area {val_min_area}, max area {val_max_area}, mean area {val_mean_area}')
print(f'test min area {test_min_area}, max area {test_max_area}, mean area {test_mean_area}')

train min area 0.0126953125, max area 0.345947265625, mean area 0.10607466796875
val min area 0.0126953125, max area 0.323486328125, mean area 0.105957373046875
test min area 0.012939453125, max area 0.3251953125, mean area 0.1065801025390625


In [7]:
#use the min across all data and the max accross all data for the range to make the bins
min_bin = np.min([training_min_area, val_min_area, test_min_area])
max_bin = np.max([training_max_area, val_max_area, test_max_area])
bins = np.linspace(min_bin, max_bin, MAXCATEGORIES)

In [8]:
#map the real numbers to categories 1..MAX_CATEGORIES
train_cat = np.digitize(training_obj_area, bins)-1
val_cat = np.digitize(val_obj_area, bins)-1
test_cat = np.digitize(test_obj_area, bins)-1

In [9]:
#have a lot at what we got
print(bins)
print(training_obj_area[0:10])
print(train_cat[0:10])
print("\n")
print(bins)
print(val_obj_area[0:10])
print(val_cat[0:10])
print("\n")
print(bins)
print(test_obj_area[0:10])
print(test_cat[0:10])

[0.01269531 0.04972331 0.0867513  0.1237793  0.16080729 0.19783529
 0.23486328 0.27189128 0.30891927 0.34594727]
[0.14355469 0.02734375 0.23510742 0.03393555 0.03173828 0.12646484
 0.09960938 0.05322266 0.11230469 0.0769043 ]
[3 0 6 0 0 3 2 1 2 1]


[0.01269531 0.04972331 0.0867513  0.1237793  0.16080729 0.19783529
 0.23486328 0.27189128 0.30891927 0.34594727]
[0.13525391 0.22021484 0.1340332  0.0546875  0.203125   0.16723633
 0.13476562 0.05664062 0.1809082  0.02270508]
[3 5 3 1 5 4 3 1 4 0]


[0.01269531 0.04972331 0.0867513  0.1237793  0.16080729 0.19783529
 0.23486328 0.27189128 0.30891927 0.34594727]
[0.17700195 0.11547852 0.05566406 0.03369141 0.18115234 0.04223633
 0.08374023 0.14208984 0.1159668  0.07958984]
[4 2 1 0 4 0 1 3 2 1]


In [10]:
#write out the categories.
save_path = Path(data_path).joinpath(train_obj_prefix + catstr + suffix)
np.save(save_path, train_cat)

save_path = Path(data_path).joinpath(val_obj_prefix + catstr + suffix)
np.save(save_path, val_cat)

save_path = Path(data_path).joinpath(test_obj_prefix + catstr + suffix)
np.save(save_path, test_cat)

In [12]:
#can we read it back
readback_test_cat = np.load(save_path)
print(readback_test_cat[0:10]==test_cat[0:10])

[ True  True  True  True  True  True  True  True  True  True]


In [13]:
#repeat for img areas

training_img_area = np.load(train_img_area_path)
val_img_area = np.load(val_img_area_path)
test_img_area = np.load(test_img_area_path)


In [14]:
#collect the min and max ranges of each split of the data (and mean just to have it)
training_min_area = training_img_area.min()
training_max_area = training_img_area.max()
training_mean_area = training_img_area.mean()

val_min_area = val_img_area.min()
val_max_area = val_img_area.max()
val_mean_area = val_img_area.mean()

test_min_area = test_img_area.min()
test_max_area = test_img_area.max()
test_mean_area = test_img_area.mean()

In [15]:
#print them out
print(f'train min area {training_min_area}, max area {training_max_area}, mean area {training_mean_area}')
print(f'val min area {val_min_area}, max area {val_max_area}, mean area {val_mean_area}')
print(f'test min area {test_min_area}, max area {test_max_area}, mean area {test_mean_area}')

train min area 0.0126953125, max area 0.29345703125, mean area 0.09464724609375
val min area 0.0126953125, max area 0.28271484375, mean area 0.094794189453125
test min area 0.012939453125, max area 0.271240234375, mean area 0.0952012939453125


In [16]:
#use the min across all data and the max accross all data for the range to make the bins
min_bin = np.min([training_min_area, val_min_area, test_min_area])
max_bin = np.max([training_max_area, val_max_area, test_max_area])
bins = np.linspace(min_bin, max_bin, MAXCATEGORIES)

In [17]:
#map the real numbers to categories 1..MAX_CATEGORIES
train_cat = np.digitize(training_img_area, bins)-1
val_cat = np.digitize(val_img_area, bins)-1
test_cat = np.digitize(test_img_area, bins)-1

In [18]:
#have a lot at what we got
print(bins)
print(training_img_area[0:10])
print(train_cat[0:10])
print("\n")
print(bins)
print(val_img_area[0:10])
print(val_cat[0:10])
print("\n")
print(bins)
print(test_img_area[0:10])
print(test_cat[0:10])

[0.01269531 0.04389106 0.07508681 0.10628255 0.1374783  0.16867405
 0.19986979 0.23106554 0.26226128 0.29345703]
[0.14355469 0.02734375 0.20263672 0.03393555 0.03173828 0.12548828
 0.09887695 0.05322266 0.10644531 0.06152344]
[4 0 6 0 0 3 2 1 3 1]


[0.01269531 0.04389106 0.07508681 0.10628255 0.1374783  0.16867405
 0.19986979 0.23106554 0.26226128 0.29345703]
[0.1340332  0.17407227 0.12890625 0.0546875  0.203125   0.13208008
 0.11328125 0.05664062 0.17480469 0.02270508]
[3 5 3 1 6 3 3 1 5 0]


[0.01269531 0.04389106 0.07508681 0.10628255 0.1374783  0.16867405
 0.19986979 0.23106554 0.26226128 0.29345703]
[0.13452148 0.09643555 0.05566406 0.03369141 0.16308594 0.04223633
 0.0769043  0.13134766 0.10693359 0.06762695]
[3 2 1 0 4 0 2 3 3 1]


In [19]:
#write out the categories.
save_path = Path(data_path).joinpath(train_img_prefix + catstr + suffix)
np.save(save_path, train_cat)

save_path = Path(data_path).joinpath(val_img_prefix + catstr + suffix)
np.save(save_path, val_cat)

save_path = Path(data_path).joinpath(test_img_prefix + catstr + suffix)
np.save(save_path, test_cat)

In [20]:
#can we read it back
readback_test_cat = np.load(save_path)
print(readback_test_cat[0:10]==test_cat[0:10])

[ True  True  True  True  True  True  True  True  True  True]


In [21]:
!ls -lat processed

total 11205068
-rw-r--r-- 1 root root      80128 Aug 13 14:53 test_imgareas_10_rand4_unique.npy
-rw-r--r-- 1 root root     400128 Aug 13 14:53 training_imgareas_10_rand4_unique.npy
-rw-r--r-- 1 root root      80128 Aug 13 14:53 validation_imgareas_10_rand4_unique.npy
-rw-r--r-- 1 root root      80128 Aug 13 14:51 test_objareas_10_rand4_unique.npy
-rw-r--r-- 1 root root     400128 Aug 13 14:51 training_objareas_10_rand4_unique.npy
-rw-r--r-- 1 root root      80128 Aug 13 14:51 validation_objareas_10_rand4_unique.npy
drwxrwxr-x 6 1000 1000       4096 Aug 13 14:49 ..
drwxrwxr-x 2 1000 1000       4096 Aug 13 14:47 .
-rw-r--r-- 1 root root      80128 Aug 12 21:32 test_objareas_cat10_rand4_unique.npy
-rw-r--r-- 1 root root     400128 Aug 12 21:32 training_objareas_cat10_rand4_unique.npy
-rw-r--r-- 1 root root      80128 Aug 12 21:32 validation_objareas_cat10_rand4_unique.npy
-rw-rw-r-- 1 1000 1000      80080 Aug  9 02:15 test_imgareas_rand4_unique.npy
-rw-rw-r-- 1 1000 1000     