In [0]:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import time

from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense

from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM

# Datasets
##Cat_versus_dog

In [7]:
# Build the cats-vs-dogs dataset (download_and_prepare fetches/prepares it).
catdog_builder = tfds.builder("cats_vs_dogs")
catdog_builder.download_and_prepare()

# The 'train' split is cut 80/20 into train/test; every image is resized to
# 64x64 and divided by 255.
catdog_train_dataset, catdog_test_dataset = (
    subset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))
    for subset in catdog_builder.as_dataset(
        as_supervised=True,
        split=['train[:80%]', 'train[80%:]'],
    )
)

catdog_num_classes = catdog_builder.info.features['label'].num_classes
# A dataset counts as 'text' only when it exposes a 'text' feature.
catdog_data_type = 'image' if 'text' not in catdog_builder.info.features else 'text'

print(
    tf.data.experimental.cardinality(catdog_train_dataset),
    tf.data.experimental.cardinality(catdog_test_dataset),
    catdog_num_classes,
    catdog_data_type,
    sep='\n',
)

tf.Tensor(18610, shape=(), dtype=int64)
tf.Tensor(4652, shape=(), dtype=int64)
2
image


##Flowers

In [0]:
# Build the tf_flowers dataset (download_and_prepare fetches/prepares it).
flower_builder = tfds.builder("tf_flowers")
flower_builder.download_and_prepare()

# The 'train' split is cut 80/20 into train/test; every image is resized to
# 64x64 and divided by 255.
flower_train_dataset, flower_test_dataset = (
    subset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))
    for subset in flower_builder.as_dataset(
        as_supervised=True,
        split=['train[:80%]', 'train[80%:]'],
    )
)

flower_num_classes = flower_builder.info.features['label'].num_classes
# A dataset counts as 'text' only when it exposes a 'text' feature.
flower_data_type = 'image' if 'text' not in flower_builder.info.features else 'text'

print(
    tf.data.experimental.cardinality(flower_train_dataset),
    tf.data.experimental.cardinality(flower_test_dataset),
    flower_num_classes,
    flower_data_type,
    sep='\n',
)

tf.Tensor(2936, shape=(), dtype=int64)
tf.Tensor(734, shape=(), dtype=int64)
5
image


##Malaria

In [0]:
# Build the malaria dataset (download_and_prepare fetches/prepares it).
malaria_builder = tfds.builder("malaria")
malaria_builder.download_and_prepare()

# The 'train' split is cut 80/20 into train/test; every image is resized to
# 64x64 and divided by 255.
malaria_train_dataset, malaria_test_dataset = (
    subset.map(lambda img, label: (tf.image.resize(img, [64, 64]) / 255.0, label))
    for subset in malaria_builder.as_dataset(
        as_supervised=True,
        split=['train[:80%]', 'train[80%:]'],
    )
)

malaria_num_classes = malaria_builder.info.features['label'].num_classes
# A dataset counts as 'text' only when it exposes a 'text' feature.
malaria_data_type = 'image' if 'text' not in malaria_builder.info.features else 'text'

print(
    tf.data.experimental.cardinality(malaria_train_dataset),
    tf.data.experimental.cardinality(malaria_test_dataset),
    malaria_num_classes,
    malaria_data_type,
    sep='\n',
)

tf.Tensor(22046, shape=(), dtype=int64)
tf.Tensor(5512, shape=(), dtype=int64)
2
image


##Imdb

In [5]:
# Build the IMDB reviews dataset using the 'subwords8k' config.
imdb_builder = tfds.builder("imdb_reviews/subwords8k")
imdb_builder.download_and_prepare()

# The 'train' split is cut 80/20 into train/test.
imdb_train_dataset, imdb_test_dataset = imdb_builder.as_dataset(
    as_supervised=True,
    split=['train[:80%]', 'train[80%:]'],
)

imdb_num_classes = imdb_builder.info.features['label'].num_classes
# The encoder attached to the 'text' feature is passed to rnn_model later on.
imdb_encoder = imdb_builder.info.features['text'].encoder
# A dataset counts as 'text' only when it exposes a 'text' feature.
imdb_data_type = 'image' if 'text' not in imdb_builder.info.features else 'text'

print(
    tf.data.experimental.cardinality(imdb_train_dataset),
    tf.data.experimental.cardinality(imdb_test_dataset),
    imdb_num_classes,
    imdb_data_type,
    sep='\n',
)

tf.Tensor(20000, shape=(), dtype=int64)
tf.Tensor(5000, shape=(), dtype=int64)
2
text


##Yelp
Large Yelp Review Dataset. This is a dataset for binary sentiment classification.

In [6]:
# Build the Yelp polarity reviews dataset using the 'subwords8k' config.
yelp_builder = tfds.builder("yelp_polarity_reviews/subwords8k")
yelp_builder.download_and_prepare()

# Only half of the 'train' split is used: the first 40% for training and the
# following 10% for testing.
yelp_train_dataset, yelp_test_dataset = yelp_builder.as_dataset(
    as_supervised=True,
    split=['train[:40%]', 'train[40%:50%]'],
)

yelp_num_classes = yelp_builder.info.features['label'].num_classes
# The encoder attached to the 'text' feature is passed to rnn_model later on.
yelp_encoder = yelp_builder.info.features['text'].encoder
# A dataset counts as 'text' only when it exposes a 'text' feature.
yelp_data_type = 'image' if 'text' not in yelp_builder.info.features else 'text'

print(
    tf.data.experimental.cardinality(yelp_train_dataset),
    tf.data.experimental.cardinality(yelp_test_dataset),
    yelp_num_classes,
    yelp_data_type,
    sep='\n',
)

tf.Tensor(224000, shape=(), dtype=int64)
tf.Tensor(56000, shape=(), dtype=int64)
2
text


# Model
Define a CNN model.

In [0]:
def cnn_model(num_classes, input_shape=(64, 64, 3)):
  """Build and compile the image-classification CNN.

  The network is three Conv2D -> MaxPooling2D -> Dropout stages (32, 64 and
  128 filters, L2-regularised kernels) followed by two hidden Dense layers and
  a Dense output layer that emits one logit per class.

  Args:
    num_classes: number of output logits (one per label class).
    input_shape: shape of a single input image, without the batch axis.
      Defaults to (64, 64, 3), the size the notebook resizes every image to.

  Returns:
    A compiled tf.keras.Sequential model (Adam optimizer, sparse categorical
    cross-entropy on logits, accuracy metric).
  """
  model = tf.keras.Sequential([
      Conv2D(32, (3, 3), activation='relu',
             kernel_regularizer=tf.keras.regularizers.l2(0.001),
             input_shape=input_shape),
      MaxPooling2D((2, 2)),
      Dropout(0.2),
      Conv2D(64, (3, 3), activation='relu',
             kernel_regularizer=tf.keras.regularizers.l2(0.001)),
      MaxPooling2D((2, 2)),
      Dropout(0.2),
      Conv2D(128, (3, 3), activation='relu',
             kernel_regularizer=tf.keras.regularizers.l2(0.001)),
      MaxPooling2D((2, 2)),
      Dropout(0.2),
      Flatten(),
      Dense(128, activation='relu'),
      Dense(64, activation='relu'),
      # No activation: the loss below is configured with from_logits=True.
      Dense(num_classes)
  ])

  model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
  return model

Define an RNN model.

In [0]:
def rnn_model(encoder, num_classes):
  """Build and compile the text-classification bidirectional LSTM.

  Args:
    encoder: text encoder; only its ``vocab_size`` attribute is read, to size
      the embedding table.
    num_classes: number of label classes. With at most two classes the model
      ends in a single logit trained with binary cross-entropy (the original
      behaviour); with more classes it ends in ``num_classes`` logits trained
      with sparse categorical cross-entropy, mirroring cnn_model.

  Returns:
    A compiled tf.keras.Sequential model (Adam with learning rate 1e-4).
  """
  if num_classes <= 2:
    output_units = 1
    loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    # The model emits raw logits, so the decision boundary is 0, not the
    # default probability threshold of 0.5. The metric keeps the name
    # 'accuracy' so training logs look the same.
    metrics = [tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')]
  else:
    output_units = num_classes
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = ['accuracy']

  model = tf.keras.Sequential([
      Embedding(encoder.vocab_size, 64),
      Bidirectional(LSTM(64, return_sequences=True)),
      Bidirectional(LSTM(32)),
      Dense(64, activation='relu'),
      Dropout(0.5),
      Dense(output_units)
  ])

  model.compile(loss=loss,
                optimizer=tf.keras.optimizers.Adam(1e-4),
                metrics=metrics)

  return model

# Shuffling

In [0]:
def shuffle_test(train_dataset, test_dataset, batch_size = 50, epochs = 10, num_classes = 2, data_type = 'image', encoder = None):
  """Train on batches from the tf.data shuffle pipeline and print timings.

  Note that the "shuffling takes" timer only wraps the construction of the
  shuffle/batch/repeat pipeline; the data itself is shuffled while ``fit``
  consumes the pipeline, so that cost is part of the training time.

  Args:
    train_dataset: dataset used for training; it is shuffled with a buffer of
      1000 elements, batched and repeated.
    test_dataset: dataset used for validation; it is batched and repeated.
    batch_size: number of examples per batch.
    epochs: number of training epochs.
    num_classes: number of label classes, forwarded to the model builder.
    data_type: 'image' (plain batching, cnn_model) or 'text' (padded batching,
      rnn_model).
    encoder: forwarded to rnn_model; only used when data_type is 'text'.

  Raises:
    ValueError: if data_type is neither 'image' nor 'text'.
  """
  # Fail early instead of hitting an UnboundLocalError further down.
  if data_type not in ('image', 'text'):
    raise ValueError("data_type must be 'image' or 'text', got {!r}".format(data_type))

  train_dataset_size = int(tf.data.experimental.cardinality(train_dataset))
  test_dataset_size = int(tf.data.experimental.cardinality(test_dataset))

  # Text examples have different lengths, so they need padded batching.
  if data_type == 'image':
    def to_batches(dataset):
      return dataset.batch(batch_size)
  else:
    def to_batches(dataset):
      return dataset.padded_batch(batch_size)

  shuffle_test_dataset = to_batches(test_dataset).repeat()

  t1_shuffle = time.perf_counter()
  shuffle_train_dataset = to_batches(train_dataset.shuffle(1000)).repeat()
  t2_shuffle = time.perf_counter()

  if data_type == 'image':
    model_shuffle = cnn_model(num_classes)
  else:
    model_shuffle = rnn_model(encoder, num_classes)

  print("shuffling takes", t2_shuffle-t1_shuffle, "seconds")

  t3_shuffle = time.perf_counter()
  model_shuffle.fit(
    shuffle_train_dataset,
    # The datasets repeat forever, so epoch lengths must be given explicitly.
    steps_per_epoch = train_dataset_size // batch_size,
    validation_data = shuffle_test_dataset,
    validation_steps = test_dataset_size // batch_size,
    epochs = epochs
  )
  t4_shuffle = time.perf_counter()
  print("training based on shuffling takes", t4_shuffle-t3_shuffle, "seconds")

##Cat_versus_dog

In [0]:
# Shuffle baseline on cats-vs-dogs, using the default batch size and epochs.
shuffle_test(
    catdog_train_dataset,
    catdog_test_dataset,
    data_type=catdog_data_type,
    num_classes=catdog_num_classes,
)

shuffling takes 0.0012943744659423828 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on shuffling takes 170.430073261261 seconds


##Flowers

In [0]:
# Shuffle baseline on tf_flowers, using the default batch size and epochs.
shuffle_test(
    flower_train_dataset,
    flower_test_dataset,
    data_type=flower_data_type,
    num_classes=flower_num_classes,
)

shuffling takes 0.0015208721160888672 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on shuffling takes 25.36191463470459 seconds


##Malaria

In [0]:
# Shuffle baseline on malaria, using the default batch size and epochs.
shuffle_test(
    malaria_train_dataset,
    malaria_test_dataset,
    data_type=malaria_data_type,
    num_classes=malaria_num_classes,
)

shuffling takes 0.0013654232025146484 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on shuffling takes 162.58422899246216 seconds


##Imdb

In [0]:
# Shuffle baseline on IMDB: batches of 200 reviews for 5 epochs.
shuffle_test(
    imdb_train_dataset,
    imdb_test_dataset,
    data_type=imdb_data_type,
    encoder=imdb_encoder,
    num_classes=imdb_num_classes,
    batch_size=200,
    epochs=5,
)

shuffling takes 0.002113819122314453 seconds
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
training based on shuffling takes 433.5717399120331 seconds


##Yelp

In [0]:
# Shuffle baseline on Yelp: batches of 500 reviews for 3 epochs.
shuffle_test(
    yelp_train_dataset,
    yelp_test_dataset,
    data_type=yelp_data_type,
    encoder=yelp_encoder,
    num_classes=yelp_num_classes,
    batch_size=500,
    epochs=3,
)

shuffling takes 0.0020787715911865234 seconds
Epoch 1/3
Epoch 2/3
Epoch 3/3
training based on shuffling takes 1401.9833824634552 seconds


# SWO

In [0]:
def generate_batches_by_swo(dataset, batch_size, data_type = 'image', test_mode = False):
  """Pre-sample fixed-size batches without replacement and wrap them in a dataset.

  ``dataset_size // batch_size`` batches are drawn. Each batch holds
  ``batch_size`` distinct element indices chosen with
  ``np.random.choice(..., replace=False)``; different batches are sampled
  independently, so an element may appear in several batches or in none.
  The dataset is then read once and every element is copied into each batch
  that selected it, so all batches are held in memory.

  For text, every sequence is zero-padded to the longest sampled sequence
  across all batches (not per batch).

  Args:
    dataset: finite tf.data.Dataset yielding (features, label) elements.
    batch_size: number of examples per batch.
    data_type: 'image' or 'text'.
    test_mode: when True, also return the sampled indices.

  Returns:
    A tf.data.Dataset of (features, labels) batches, both float32. When
    test_mode is True, a tuple ``(dataset, indexes)`` where ``indexes`` is the
    list of per-batch index arrays.

  Raises:
    ValueError: if data_type is unsupported or the dataset cardinality is
      unknown or infinite.
  """
  if data_type not in ('image', 'text'):
    raise ValueError("data_type must be 'image' or 'text', got {!r}".format(data_type))

  dataset_size = int(tf.data.experimental.cardinality(dataset))
  # Negative values are TensorFlow's markers for infinite/unknown cardinality.
  if dataset_size < 0:
    raise ValueError("dataset must have a known, finite cardinality")

  steps = dataset_size // batch_size
  indexes = [np.random.choice(dataset_size, size=batch_size, replace=False) for _ in range(steps)]

  # Invert the sampling (element index -> batches that drew it) so the
  # dataset only has to be traversed once.
  batch_group_by_index = {}
  for batch_index, batch_indexes in enumerate(indexes):
    for element_index in batch_indexes:
      batch_group_by_index.setdefault(int(element_index), []).append(batch_index)

  batch_features = [[] for _ in range(steps)]
  batch_labels = [[] for _ in range(steps)]
  is_text = data_type == 'text'
  max_text_len = 0

  for element_index, element in enumerate(dataset.as_numpy_iterator()):
    for batch_index in batch_group_by_index.get(element_index, ()):
      batch_features[batch_index].append(element[0])
      batch_labels[batch_index].append(element[1])
      if is_text:
        max_text_len = max(len(element[0]), max_text_len)

  if is_text:
    # Right-pad every sequence with zeros up to the longest sampled sequence.
    for bf in batch_features:
      for i, tokens in enumerate(bf):
        bf[i] = np.pad(
            tokens,
            pad_width=(0, max_text_len - len(tokens)),
            mode='constant',
            constant_values=0
        )
    features_shape = [batch_size, max_text_len]
  else:
    # Use the per-image shape declared by the dataset instead of hard-coding it.
    features_shape = [batch_size] + dataset.element_spec[0].shape.as_list()

  print("transforming data to dataset object")

  labels_shape = [batch_size, ]

  batched_dataset = tf.data.Dataset.from_generator(
      lambda: zip(batch_features, batch_labels),
      (tf.float32, tf.float32),
      (features_shape, labels_shape)
  )

  if test_mode:
    return batched_dataset, list(indexes)
  return batched_dataset

In [0]:
def swo_test(train_dataset, test_dataset, batch_size = 50, epochs = 10, data_type = 'image', num_classes = 2, encoder = None):
  """Train on batches pre-sampled by generate_batches_by_swo and print timings.

  Args:
    train_dataset: dataset the SWO training batches are sampled from.
    test_dataset: dataset used for validation; it is batched and repeated.
    batch_size: number of examples per batch.
    epochs: number of training epochs.
    data_type: 'image' (plain batching, cnn_model) or 'text' (padded batching,
      rnn_model).
    num_classes: number of label classes, forwarded to the model builder.
    encoder: forwarded to rnn_model; only used when data_type is 'text'.

  Raises:
    ValueError: if data_type is neither 'image' nor 'text'.
  """
  # Fail early instead of hitting an UnboundLocalError further down.
  if data_type not in ('image', 'text'):
    raise ValueError("data_type must be 'image' or 'text', got {!r}".format(data_type))

  train_dataset_size = int(tf.data.experimental.cardinality(train_dataset))
  test_dataset_size = int(tf.data.experimental.cardinality(test_dataset))
  train_steps = train_dataset_size // batch_size
  test_steps = test_dataset_size // batch_size

  if data_type == 'image':
    swo_test_dataset = test_dataset.batch(batch_size).repeat()
    model_swo = cnn_model(num_classes)
  else:
    # Text examples have different lengths, so they need padded batching.
    swo_test_dataset = test_dataset.padded_batch(batch_size).repeat()
    model_swo = rnn_model(encoder, num_classes)

  # Unlike the shuffle pipeline, the batches are materialised here, so this
  # interval covers a full pass over the training data.
  t1_swo = time.perf_counter()
  swo_train_dataset = generate_batches_by_swo(train_dataset, batch_size=batch_size, data_type=data_type).repeat()
  t2_swo = time.perf_counter()
  print("swo takes", t2_swo-t1_swo, "seconds")

  t3_swo = time.perf_counter()
  model_swo.fit(
    swo_train_dataset,
    # The datasets repeat forever, so epoch lengths must be given explicitly.
    steps_per_epoch = train_steps,
    validation_data = swo_test_dataset,
    validation_steps = test_steps,
    epochs=epochs
  )
  t4_swo = time.perf_counter()
  print("training based on swo takes", t4_swo-t3_swo, "seconds")

##Cat_versus_dog

In [0]:
# SWO run on cats-vs-dogs, using the default batch size and epochs.
swo_test(
    catdog_train_dataset,
    catdog_test_dataset,
    data_type=catdog_data_type,
    num_classes=catdog_num_classes,
)

transforming data to dataset object
swo takes 10.514053583145142 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on swo takes 54.860501527786255 seconds


##Flowers

In [0]:
# SWO run on tf_flowers, using the default batch size and epochs.
swo_test(
    flower_train_dataset,
    flower_test_dataset,
    data_type=flower_data_type,
    num_classes=flower_num_classes,
)

transforming data to dataset object
swo takes 2.175234317779541 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on swo takes 10.462100744247437 seconds


##Malaria

In [0]:
# SWO run on malaria, using the default batch size and epochs.
swo_test(
    malaria_train_dataset,
    malaria_test_dataset,
    data_type=malaria_data_type,
    num_classes=malaria_num_classes,
)

transforming data to dataset object
swo takes 14.002123832702637 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on swo takes 99.83212280273438 seconds


##Imdb

In [0]:
# SWO run on IMDB: batches of 200 reviews for 5 epochs.
swo_test(
    imdb_train_dataset,
    imdb_test_dataset,
    data_type=imdb_data_type,
    encoder=imdb_encoder,
    num_classes=imdb_num_classes,
    batch_size=200,
    epochs=5,
)

transforming data to dataset object
swo takes 7.309563159942627 seconds
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
training based on swo takes 925.4301273822784 seconds


##Yelp

In [0]:
# SWO run on Yelp: batches of 500 reviews for 3 epochs.
swo_test(
    yelp_train_dataset,
    yelp_test_dataset,
    data_type=yelp_data_type,
    encoder=yelp_encoder,
    num_classes=yelp_num_classes,
    batch_size=500,
    epochs=3,
)

transforming data to dataset object
swo takes 82.04675245285034 seconds
Epoch 1/3
Epoch 2/3
Epoch 3/3

# Poisson

In [0]:
def generate_batches_by_poisson(dataset, batch_size, data_type = 'image', test_mode = False):
  """Pre-sample variable-size batches by Poisson sampling and wrap them in a dataset.

  ``dataset_size // batch_size`` batches are drawn. For each batch, every
  element is included independently with probability
  ``batch_size / dataset_size``, so batch sizes vary around ``batch_size``.
  The dataset is then read once and every element is copied into each batch
  that selected it, so all batches are held in memory.

  For text, every sequence is zero-padded to the longest sampled sequence
  across all batches (not per batch).

  Args:
    dataset: finite tf.data.Dataset yielding (features, label) elements.
    batch_size: expected number of examples per batch.
    data_type: 'image' or 'text'.
    test_mode: when True, also return the sampled indices.

  Returns:
    A tf.data.Dataset of (features, labels) batches, both float32, with a
    variable leading dimension. When test_mode is True, a tuple
    ``(dataset, indexes)`` where ``indexes`` is a list holding, per batch, the
    ascending list of selected element indices.

  Raises:
    ValueError: if data_type is unsupported or the dataset cardinality is
      unknown or infinite.
  """
  if data_type not in ('image', 'text'):
    raise ValueError("data_type must be 'image' or 'text', got {!r}".format(data_type))

  dataset_size = int(tf.data.experimental.cardinality(dataset))
  # Negative values are TensorFlow's markers for infinite/unknown cardinality.
  if dataset_size < 0:
    raise ValueError("dataset must have a known, finite cardinality")

  steps = dataset_size // batch_size
  # An empty dataset yields zero steps; avoid dividing by zero on the way.
  ratio = batch_size / dataset_size if dataset_size else 0.0

  # One uniform draw per element and batch; an element joins the batch when
  # its draw falls below the inclusion probability. The comparison is
  # vectorised, which draws the same random numbers as a per-element loop.
  indexes = [
      np.flatnonzero(np.random.uniform(0, 1, dataset_size) < ratio).tolist()
      for _ in range(steps)
  ]

  # Invert the sampling (element index -> batches that drew it) so the
  # dataset only has to be traversed once.
  batch_group_by_index = {}
  for batch_index, batch_indexes in enumerate(indexes):
    for element_index in batch_indexes:
      batch_group_by_index.setdefault(element_index, []).append(batch_index)

  batch_features = [[] for _ in range(steps)]
  batch_labels = [[] for _ in range(steps)]
  is_text = data_type == 'text'
  max_text_len = 0

  for element_index, element in enumerate(dataset.as_numpy_iterator()):
    for batch_index in batch_group_by_index.get(element_index, ()):
      batch_features[batch_index].append(element[0])
      batch_labels[batch_index].append(element[1])
      if is_text:
        max_text_len = max(len(element[0]), max_text_len)

  if is_text:
    # Right-pad every sequence with zeros up to the longest sampled sequence.
    for bf in batch_features:
      for i, tokens in enumerate(bf):
        bf[i] = np.pad(
            tokens,
            pad_width=(0, max_text_len - len(tokens)),
            mode='constant',
            constant_values=0
        )
    features_shape = [None, max_text_len]
  else:
    # Use the per-image shape declared by the dataset instead of hard-coding it.
    features_shape = [None] + dataset.element_spec[0].shape.as_list()

  print("transforming data to dataset object")

  # The batch dimension is left unspecified because batch sizes vary.
  labels_shape = [None, ]

  batched_dataset = tf.data.Dataset.from_generator(
      lambda: zip(batch_features, batch_labels),
      (tf.float32, tf.float32),
      (features_shape, labels_shape)
  )

  if test_mode:
    return batched_dataset, list(indexes)
  return batched_dataset

In [0]:
def poisson_test(train_dataset, test_dataset, batch_size = 50, epochs = 10, data_type = 'image', num_classes = 2, encoder = None):
  """Train on batches pre-sampled by generate_batches_by_poisson and print timings.

  Args:
    train_dataset: dataset the Poisson training batches are sampled from.
    test_dataset: dataset used for validation; it is batched and repeated.
    batch_size: expected number of examples per training batch (and the exact
      validation batch size).
    epochs: number of training epochs.
    data_type: 'image' (plain batching, cnn_model) or 'text' (padded batching,
      rnn_model).
    num_classes: number of label classes, forwarded to the model builder.
    encoder: forwarded to rnn_model; only used when data_type is 'text'.

  Raises:
    ValueError: if data_type is neither 'image' nor 'text'.
  """
  # Fail early instead of hitting an UnboundLocalError further down.
  if data_type not in ('image', 'text'):
    raise ValueError("data_type must be 'image' or 'text', got {!r}".format(data_type))

  train_dataset_size = int(tf.data.experimental.cardinality(train_dataset))
  test_dataset_size = int(tf.data.experimental.cardinality(test_dataset))
  train_steps = train_dataset_size // batch_size
  test_steps = test_dataset_size // batch_size

  if data_type == 'image':
    poisson_test_dataset = test_dataset.batch(batch_size).repeat()
    model_poisson = cnn_model(num_classes)
  else:
    # Text examples have different lengths, so they need padded batching.
    poisson_test_dataset = test_dataset.padded_batch(batch_size).repeat()
    model_poisson = rnn_model(encoder, num_classes)

  # Unlike the shuffle pipeline, the batches are materialised here, so this
  # interval covers a full pass over the training data.
  t1_poisson = time.perf_counter()
  poisson_train_dataset = generate_batches_by_poisson(train_dataset, batch_size=batch_size, data_type=data_type).repeat()
  t2_poisson = time.perf_counter()
  print("poisson takes", t2_poisson-t1_poisson, "seconds")

  t3_poisson = time.perf_counter()
  model_poisson.fit(
    poisson_train_dataset,
    # The datasets repeat forever, so epoch lengths must be given explicitly.
    steps_per_epoch=train_steps,
    validation_data = poisson_test_dataset,
    validation_steps = test_steps,
    epochs=epochs
  )
  t4_poisson = time.perf_counter()
  print("training based on poisson takes", t4_poisson-t3_poisson, "seconds")

##Cat_versus_dog

In [0]:
# Poisson run on cats-vs-dogs, using the default batch size and epochs.
poisson_test(
    catdog_train_dataset,
    catdog_test_dataset,
    data_type=catdog_data_type,
    num_classes=catdog_num_classes,
)

transforming data to dataset object
poisson takes 15.386031866073608 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on poisson takes 99.72939395904541 seconds


##Flowers

In [0]:
# Poisson run on tf_flowers, using the default batch size and epochs.
poisson_test(
    flower_train_dataset,
    flower_test_dataset,
    data_type=flower_data_type,
    num_classes=flower_num_classes,
)

transforming data to dataset object
poisson takes 2.613868474960327 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on poisson takes 17.675917148590088 seconds


##Malaria

In [0]:
# Poisson run on malaria, using the default batch size and epochs.
poisson_test(
    malaria_train_dataset,
    malaria_test_dataset,
    data_type=malaria_data_type,
    num_classes=malaria_num_classes,
)

transforming data to dataset object
poisson takes 16.633229732513428 seconds
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on poisson takes 99.04749417304993 seconds


##Imdb

In [0]:
# Poisson run on IMDB: batches of about 1000 reviews for 5 epochs.
# NOTE(review): batch_size is 1000 here, while the shuffle and SWO IMDB runs
# use 200 - confirm this is intentional before comparing the timings.
poisson_test(
    imdb_train_dataset,
    imdb_test_dataset,
    data_type=imdb_data_type,
    encoder=imdb_encoder,
    num_classes=imdb_num_classes,
    batch_size=1000,
    epochs=5,
)

transforming data to dataset object
poisson takes 7.836404085159302 seconds
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
training based on poisson takes 639.0689625740051 seconds


##Yelp

In [0]:
# Poisson run on Yelp: batches of about 500 reviews for 3 epochs.
poisson_test(
    yelp_train_dataset,
    yelp_test_dataset,
    data_type=yelp_data_type,
    encoder=yelp_encoder,
    num_classes=yelp_num_classes,
    batch_size=500,
    epochs=3,
)

transforming data to dataset object
poisson takes 94.86305785179138 seconds
Epoch 1/3
Epoch 2/3
Epoch 3/3
training based on poisson takes 1817.2364497184753 seconds


# **Test**
Test whether the SWO and Poisson sampling methods behave as expected:
*   whether SWO and Poisson generate the expected number of batches, 372 in this case (18610 training examples // batch size of 50)
*   whether SWO and Poisson generate batches of the correct shapes
  *   SWO batches have shape (50, 64, 64, 3)
  *   Poisson batches have shape (variable size around 50, 64, 64, 3)
*   whether SWO and Poisson appropriately sample examples from the dataset







##swo test

In [0]:
# Sample SWO batches of 50 from the cats-vs-dogs training split and keep the
# sampled indices (test_mode=True) for the checks below.
swo_train_dataset_2, swo_indexes = generate_batches_by_swo(
    catdog_train_dataset,
    batch_size=50,
    test_mode=True,
)
swo_train_dataset_2

transforming data to dataset object


<FlatMapDataset shapes: ((50, 64, 64, 3), (50,)), types: (tf.float32, tf.float32)>

In [0]:
# Count the generated batches by walking the whole dataset once.
batch_num = 0
for batch_num, _ in enumerate(swo_train_dataset_2.as_numpy_iterator(), start=1):
  pass
print(batch_num)

372


In [0]:
# Show the feature/label shapes of the first three batches.
for s in swo_train_dataset_2.take(3).as_numpy_iterator():
  print(
      "batch features shape:", s[0].shape,
      "batch labels shape:", s[1].shape,
  )

batch features shape: (50, 64, 64, 3) batch labels shape: (50,)
batch features shape: (50, 64, 64, 3) batch labels shape: (50,)
batch features shape: (50, 64, 64, 3) batch labels shape: (50,)


In [0]:
# Print the sampled indices of the first three batches, one blank line apart.
for ind in swo_indexes[:3]:
  print(ind, end="\n\n")

[13796  1790  8050  8811 16246  6228  7534  7708 13424  2269  6260  5984
  1543  4552  9910  1433 13618  6401  2161 15890  2616 17588  4106  7839
  6198  7871  8469  1193  1002  9803 18171 18196  7030 18010  7505  6201
 10630 14166 15310 16373 15203  7890 16796 15359 11673 16090  6770 16268
 16364 12855]

[11893 14620 17294 13886  7415 14286  7145   528 18392  2040  9569 16079
 17323  4061 15995  8305 11762 15201  9292  2202 16028 17930  1898 15857
 11122  9540  5683 18235   462 14076 13722   545  2071  4055 10426  3179
  8283  5362 17200 17902   339 14898 12611 11927  3161 10116 10169 10713
 13377  7958]

[16845 18496 10580 11019 11136 13526 11309  3603 15019 13786  5729  7208
  6767 11638   130  2290  6519 14560  4056  4462   538  4814  2834 17256
 14947 14592   230 13234 11048  8851  4208  7173  7531 18201 11679 16642
 13879  1784 16099  3540  4842  6914  9967 14082  6693 13219 12171  9924
  6533  8133]



so, each batch has 50 examples

##poisson test

In [0]:
# Sample Poisson batches (expected size 50) from the cats-vs-dogs training
# split and keep the sampled indices (test_mode=True) for the checks below.
poisson_train_dataset_2, poisson_indexes = generate_batches_by_poisson(
    catdog_train_dataset,
    batch_size=50,
    test_mode=True,
)
poisson_train_dataset_2

transforming data to dataset object


<FlatMapDataset shapes: ((None, 64, 64, 3), (None,)), types: (tf.float32, tf.float32)>

In [0]:
# Count the generated batches by walking the whole dataset once.
batch_num = 0
for batch_num, p in enumerate(poisson_train_dataset_2.as_numpy_iterator(), start=1):
  pass
print(batch_num)

372


In [0]:
# Show the feature/label shapes of the first five batches; sizes should vary.
for p in poisson_train_dataset_2.take(5).as_numpy_iterator():
  print(
      "batch features shape:", p[0].shape,
      "batch labels shape:", p[1].shape,
  )

batch features shape: (48, 64, 64, 3) batch labels shape: (48,)
batch features shape: (46, 64, 64, 3) batch labels shape: (46,)
batch features shape: (48, 64, 64, 3) batch labels shape: (48,)
batch features shape: (45, 64, 64, 3) batch labels shape: (45,)
batch features shape: (42, 64, 64, 3) batch labels shape: (42,)


In [0]:
# Print the sampled indices of the first three batches, one blank line apart.
for ind in poisson_indexes[:3]:
  print(np.array(ind), end="\n\n")

[  446   680   914  1088  1246  1436  2131  2332  2443  2904  3006  3833
  3835  4037  4331  4602  4844  5788  5872  6599  6679  7860  8253  8371
  8932  9800 10446 10489 10506 10669 10852 11196 12167 13852 14147 14446
 15203 16060 16626 16734 16854 17286 17399 17423 18301 18365 18483 18536]

[  346  1310  1448  3356  3595  3734  3764  4480  5276  6237  6421  6636
  6909  7008  7213  7590  7613  7888  8560  8592  8596  8776  9044  9180
  9204  9285  9724  9806  9887 10564 11547 12338 12406 12410 12687 13489
 13936 14152 14648 15499 15684 15818 15828 17391 17901 18349]

[  347   982  1661  1709  1862  1949  2517  2929  2967  3008  4451  5814
  6171  6574  6963  7492  7810  7887  8507  8532  8585  9595  9706 10013
 11383 11778 11893 12025 12875 13578 13871 14408 14417 14617 14640 14755
 15102 15280 15297 15463 15473 16062 16241 16448 17252 17338 17759 18079]

