## Play around datasets and create one with hierarchical prior


In [1]:
import os
import sys
from tensorflow.compat.v1 import gfile

sys.path.append(os.path.abspath("/home/mwu34/disentanglement_lib"))
import numpy as np
import pandas as pd

DSPRITES_PATH = os.path.join(
    os.environ.get("DISENTANGLEMENT_LIB_DATA", "."), "dsprites",
    "dsprites_ndarray_co1sh3sc6or40x32y32_64x64.npz"
)

In [7]:
with gfile.Open(DSPRITES_PATH, "rb") as data_file:
    # Data was saved originally using python2, so we need to set the encoding.
    data = np.load(data_file, encoding="latin1", allow_pickle=True)
    images = np.array(data["imgs"])  # has shape 737280 * 64 * 64
    factor_sizes = np.array(data["metadata"][()]["latents_sizes"], dtype=np.int64)

In [14]:
from disentanglement_lib.data.ground_truth.util import CorrelatedSplitDiscreteStateSpace


correlated_space = CorrelatedSplitDiscreteStateSpace(factor_sizes, [3, 4])

RuntimeError: Required bindings for `correlation_details` not provided in config: ['corr_indices', 'corr_type']

In [39]:
# trying to get mapping from specific parameter combinations to model num
from disentanglement_lib.config.unsupervised_study_v1.sweep import get_config
import pandas as pd

configs = get_config()
df = pd.DataFrame(configs)

In [3]:
df['evaluate_every_n_steps']

0     1000
1     1000
2     1000
3     1000
4     1000
      ... 
67    1000
68    1000
69    1000
70    1000
71    1000
Name: evaluate_every_n_steps, Length: 72, dtype: int64

In [2]:
# trying to get mapping from specific parameter combinations to model num
# from disentanglement_lib.config.unsupervised_study_v1.sweep import get_config
from disentanglement_lib.config.double_descent_study_v8.sweep import get_config
import pandas as pd

configs = get_config()
df = pd.DataFrame(configs)

# Get the beta_tc_vae models using dsprites and a beta param of 4.
model = df['model.name'] == "beta_vae"
dataset = df['dataset.name'] == "dsprites_full"
# beta = df["vae.beta"] == 1.0

# Give all the possible model configs unused parameters excluded (e.g., annealed_vae.gamma)
# print(df[model & dataset & beta].dropna(axis=1))

# Give the list of model ids with this configuration
df[model & dataset].head(200)


Unnamed: 0,correlation_details.corr_indices,dataset.name,correlation_hyperparameter.line_width,correlation_details.corr_type,encoder.encoder_fn,decoder.decoder_fn,correlation.active_correlation,model.name,vae.beta,model.model,conv_encoder.num_parameters_scale,deconv_decoder.num_parameters_scale,encoder.num_latent,dataset.num_training_data,evaluate_every_n_steps,model.random_seed
0,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,0.125,@vae(),1.0,1.0,1,1000,1000,0
1,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,0.125,@vae(),1.0,1.0,1,3000,1000,0
2,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,0.125,@vae(),1.0,1.0,1,5000,1000,0
3,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,0.125,@vae(),1.0,1.0,1,7000,1000,0
4,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,0.125,@vae(),1.0,1.0,2,1000,1000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,1.000,@vae(),1.0,1.0,8,7000,1000,0
68,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,1.000,@vae(),1.0,1.0,10,1000,1000,0
69,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,1.000,@vae(),1.0,1.0,10,3000,1000,0
70,"[3, 4]",dsprites_full,10.0,line,@conv_encoder,@deconv_decoder,True,beta_vae,1.000,@vae(),1.0,1.0,10,5000,1000,0


In [4]:
from disentanglement_lib.data.ground_truth import named_data
dsprites = named_data.get_named_ground_truth_data('dsprites_full')

In [2]:
from disentanglement_lib.data.ground_truth import named_data


dsprites_subset = named_data.get_named_ground_truth_data('dsprites_full', False, 'valid', 700000)

In [4]:
random_state = np.random.RandomState(0)
factors, images = dsprites_subset.sample(5, random_state)

In [6]:
images.shape

(5, 64, 64, 1)

In [14]:
gg = [1,2,3,4,5,6]
for i in gg[:0:-1]:
    print(i)

6
5
4
3
2


In [42]:
num_factors = 5

def index_to_factors(index_array, factor_bases):
    factor_bases = factor_bases.astype(int)
    print(factor_bases)
    factors = np.zeros((len(index_array), num_factors))
    for factor_idx, factor_base in enumerate(factor_bases[1:]):
        factors[:, factor_idx], index_array = np.divmod(index_array, factor_base)
    return factors

def factors_to_index(factor_array, factor_bases):
    indices = np.array(np.dot(factor_array, factor_bases), dtype=np.int64)
    return indices

In [40]:
gg = np.array([10000, 20000, 30000, 700000])
res = index_to_factors(gg, dsprites.factor_bases)

[737280 245760  40960   1024     32      1]


In [45]:
back = factors_to_index(res, dsprites.factor_bases[1:])
back

array([ 10000,  20000,  30000, 700000])

In [32]:
dsprites.factor_bases

array([7.3728e+05, 2.4576e+05, 4.0960e+04, 1.0240e+03, 3.2000e+01,
       1.0000e+00])

In [7]:

a, b = dsprites.sample(10, random_state)

In [10]:
a.shape

(10, 5)

In [17]:
# split dsprites dataset into two parts

full_size = np.prod(dsprites.factor_sizes)
train_dataset_size = 7000
train_indices = np.random.choice(full_size, train_dataset_size, replace=False)
train_indices = np.sort(train_indices)
valid_indices = np.delete(np.arange(full_size), train_indices)

train_indices_path = os.path.join(
    os.environ.get("DISENTANGLEMENT_LIB_DATA", "."), "dsprites", f"train{train_dataset_size}.npy"
)
valid_indices_path = os.path.join(
    os.environ.get("DISENTANGLEMENT_LIB_DATA", "."), "dsprites", f"valid{train_dataset_size}.npy"
)
np.save(train_indices_path, train_indices)
np.save(valid_indices_path, valid_indices)

In [21]:
gg = np.random.choice([10, 50, 3], 3, replace=True)
gg

array([ 3, 50, 50])

In [62]:
4688 in valid_indices

True

In [23]:
len(gg)

3