<a href="https://colab.research.google.com/github/jason-jiankai/sampling/blob/master/sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import tensorflow as tf
import numpy as np
import time
# import ssl
# ssl._create_default_https_context = ssl._create_unverified_context

# Dataset
cifar10, with 50000 training examples, 10000 test examples.

In [0]:
cifar10 = tf.keras.datasets.cifar10
train, test = cifar10.load_data()
# 50000, 10000

train = train[0] / 255.0, train[1]
test = test[0] / 255.0, test[1]

train_dataset = tf.data.Dataset.from_tensor_slices(train)
test_dataset = tf.data.Dataset.from_tensor_slices(test)

# Model
define a CNN model.

In [0]:
def model():
  model = tf.keras.Sequential([
      tf.keras.layers.Conv2D(32, (3, 3), activation='relu',
                             kernel_regularizer=tf.keras.regularizers.l2(0.001),
                             input_shape=(32, 32, 3)),
      tf.keras.layers.MaxPooling2D((2, 2)),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Conv2D(64, (3, 3), activation='relu',
                             kernel_regularizer=tf.keras.regularizers.l2(0.001)),
      tf.keras.layers.MaxPooling2D((2, 2)),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Conv2D(128, (3, 3), activation='relu',
                             kernel_regularizer=tf.keras.regularizers.l2(0.001)),
      tf.keras.layers.MaxPooling2D((2, 2)),
      tf.keras.layers.Dropout(0.2),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(128, activation='relu'),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(10)
  ])

  model.compile(optimizer='adam',
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                metrics=['accuracy'])
  return model

# Shuffling

In [4]:
shuffle_test_dataset = test_dataset.batch(50).repeat()

t1_shuffle = time.time()
shuffle_train_dataset = train_dataset.shuffle(10000).batch(50).repeat()
t2_shuffle = time.time()
print("shuffling takes", t2_shuffle-t1_shuffle, "seconds")

shuffling takes 0.0012438297271728516 seconds


In [5]:
model_shuffle = model()

t3_shuffle = time.time()
model_shuffle.fit(
  shuffle_train_dataset,
  steps_per_epoch = 1000,
  validation_data = shuffle_test_dataset,
  validation_steps = 200,
  epochs = 10
)
t4_shuffle = time.time()
print("training based on shuffling takes", t4_shuffle-t3_shuffle, "seconds")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on shuffling takes 53.32881045341492 seconds


# SWO

In [0]:
def generate_batches_by_swo(dataset, dataset_size, batch_size, steps, test_mode = False):
    indexes = []
    for s in range(steps):
        indexes.append(np.sort(np.random.choice(dataset_size, size=batch_size, replace=False)))

    returned_indexes = indexes.copy()

    i = 0
    batch_features = [[] for _ in range(steps)]
    batch_labels = [[] for _ in range(steps)]
    for e in dataset.as_numpy_iterator():
      for s in range(steps):
          if indexes[s].size > 0 and i == indexes[s][0]:
              indexes[s] = indexes[s][1:]
              batch_features[s].append(e[0])
              batch_labels[s].append(e[1])
      i += 1
      if i % 5000 == 0: print('{:.2f}%'.format(float(i)/50000*100))
    print("transforming data to dataset object")
    dataset = tf.data.Dataset.from_tensor_slices((batch_features, batch_labels))
    if not test_mode:
      return dataset
    else:
      return dataset, returned_indexes

In [7]:
swo_test_dataset = test_dataset.batch(50).repeat()

t1_swo = time.time()
swo_train_dataset = generate_batches_by_swo(train_dataset, 50000, 50, 1000).repeat()
t2_swo = time.time()
print("swo takes", t2_swo-t1_swo, "seconds")

10.00%
20.00%
30.00%
40.00%
50.00%
60.00%
70.00%
80.00%
90.00%
100.00%
transforming data to dataset object
swo takes 80.2788462638855 seconds


In [8]:
model_swo = model()

t3_swo = time.time()
model_swo.fit(
    swo_train_dataset,
    steps_per_epoch=1000,
    validation_data = swo_test_dataset,
    validation_steps = 200,
    epochs=10
)
t4_swo = time.time()
print("training based on swo takes", t4_swo-t3_swo, "seconds")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on swo takes 39.21904110908508 seconds


# Poisson

In [0]:
def generate_batches_by_poisson(dataset, dataset_size, batch_size, steps, test_mode = False):
    ratio = float(batch_size) / dataset_size
    
    indexes = []
    for _ in range(steps):
      pros = np.random.uniform(0,1,dataset_size)
      chosen = [i for i in range(dataset_size) if pros[i] < ratio]
      indexes.append(chosen)
    
    returned_indexes = indexes.copy()

    i = 0
    batch_features = [[] for _ in range(steps)]
    batch_labels = [[] for _ in range(steps)]
    for e in dataset.as_numpy_iterator():
      for s in range(steps):
          if indexes[s] != [] and i == indexes[s][0]:
              indexes[s] = indexes[s][1:]
              batch_features[s].append(e[0])
              batch_labels[s].append(e[1])
      i += 1
      if i % 5000 == 0: print('{:.2f}%'.format(float(i)/50000*100))
    
    batch = [(features, labels) for features, labels in zip(batch_features,batch_labels)]

    print("transforming data to dataset object")

    features_shape = [None, 32, 32, 3]
    labels_shape = [None, 1]
    dataset = tf.data.Dataset.from_generator(
        lambda: batch,
        (tf.float32, tf.float32),
        (features_shape, labels_shape)
    )

    if not test_mode:
      return dataset
    else:
      return dataset, returned_indexes

In [10]:
poisson_test_dataset = test_dataset.batch(50).repeat()

t1_poisson = time.time()
poisson_train_dataset = generate_batches_by_poisson(train_dataset, 50000, 50, 1000).repeat()
t2_poisson = time.time()
print("swo takes", t2_poisson-t1_poisson, "seconds")

10.00%
20.00%
30.00%
40.00%
50.00%
60.00%
70.00%
80.00%
90.00%
100.00%
transforming data to dataset object
swo takes 25.269413709640503 seconds


In [11]:
model_poisson = model()

t3_poisson = time.time()
model_poisson.fit_generator(
    poisson_train_dataset,
    steps_per_epoch=1000,
    validation_data = poisson_test_dataset,
    validation_steps = 200,
    epochs=10
)
t4_poisson = time.time()
print("training based on poisson takes", t4_poisson-t3_poisson, "seconds")

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
training based on poisson takes 50.22898983955383 seconds


# Time Comparison

In [12]:
shuffle_time = t2_shuffle-t1_shuffle + t4_shuffle-t3_shuffle
swo_time = t2_swo-t1_swo + t4_swo-t3_swo
poisson_time = t2_poisson-t1_poisson + t4_poisson-t3_poisson

print(
    "training based on shuffling takes", shuffle_time, "seconds: \nincluding preparing dataset time",
    t2_shuffle-t1_shuffle, "seconds, and training time", t4_shuffle-t3_shuffle, "seconds.\n"
)
print(
    "training based on swo takes", swo_time, "seconds: \nincluding preparing dataset time",
    t2_swo-t1_swo, "seconds, and training time", t4_swo-t3_swo, "seconds.\n"
)
print(
    "training based on poisson takes", poisson_time, "seconds: \nincluding preparing dataset time",
    t2_poisson-t1_poisson, "seconds, and training time", t4_poisson-t3_poisson, "seconds.\n"
)

training based on shuffling takes 53.33005428314209 seconds: 
including preparing dataset time 0.0012438297271728516 seconds, and training time 53.32881045341492 seconds.

training based on swo takes 119.49788737297058 seconds: 
including preparing dataset time 80.2788462638855 seconds, and training time 39.21904110908508 seconds.

training based on poisson takes 75.49840354919434 seconds: 
including preparing dataset time 25.269413709640503 seconds, and training time 50.22898983955383 seconds.



Basically, training based on shuffling and sampling methods have similar accuracies, about 70%.

For a dataset of 50000, it takes "swo" 86 seconds to prepare the dataset, and takes "poisson" 25 seconds to prepare the dataset. As a comparison, shuffling only takes 0.001 seconds.

Poisson takes less time because it uses a generator, which generate batches during training and takes less space.

# **Test**
test whether swo and poisson sampling methods perform normal:
*   whether swo and poisson generate enough batches, 1000 in this case
*   whether swo and poisson generate batches of correct shapes
  *   swo has shape of (50, 32, 32, 3)
  *   poisson has shape of (viriable size around 50, 32, 32, 3)
*   whether swo and poisson can appropriately sample examples from the dataset







##swo test

In [13]:
swo_train_dataset_2, swo_indexes = generate_batches_by_swo(train_dataset, 50000, 50, 1000, test_mode=True)
swo_train_dataset_2

10.00%
20.00%
30.00%
40.00%
50.00%
60.00%
70.00%
80.00%
90.00%
100.00%
transforming data to dataset object


<TensorSliceDataset shapes: ((50, 32, 32, 3), (50, 1)), types: (tf.float64, tf.int32)>

In [14]:
batch_num = 0
for _ in swo_train_dataset_2.as_numpy_iterator():
  batch_num += 1
print(batch_num)

1000


In [15]:
for s in swo_train_dataset_2.take(3).as_numpy_iterator():
  print("batch features shape:", np.shape(s[0]), "batch labels shape:", np.shape(s[1]))

batch features shape: (50, 32, 32, 3) batch labels shape: (50, 1)
batch features shape: (50, 32, 32, 3) batch labels shape: (50, 1)
batch features shape: (50, 32, 32, 3) batch labels shape: (50, 1)


In [16]:
for ind in swo_indexes[:3]:
  print(ind)
  print()

[ 1101  1242  2983  3095  3262  3466  5958  6850  6912  7258  9966 10680
 11946 12450 13435 13985 14453 17849 19762 20126 20380 23634 24844 25650
 25809 26342 27983 31742 32042 33214 33405 33978 34388 34954 37519 38839
 42529 42999 44870 46059 46089 46225 47282 47730 49137 49259 49326 49341
 49603 49846]

[  916  1676  3987  4044  4172  4569  6827  7537  7721  8250 11195 11544
 12546 12771 15223 16146 16451 17333 18671 19095 19917 20609 21778 22603
 23929 24008 24286 26123 27261 31304 31760 31766 33264 33599 35116 36503
 38045 40159 40202 41206 41555 45777 45995 46788 47129 47203 48812 48893
 48943 49667]

[ 2558  2842  4606  5310  5844  8399  8424  8494  8586 10731 11442 14133
 15015 15397 17295 18656 18666 20009 21724 26281 26498 27333 27728 29479
 29719 29872 30825 31493 31587 31645 33215 33752 34333 34590 40282 40296
 40359 41504 41523 41821 42212 43496 44323 45450 45898 46461 48604 49040
 49728 49849]



so, each batch has 50 examples

##poisson test

In [17]:
poisson_train_dataset_2, poisson_indexes = generate_batches_by_poisson(train_dataset, 50000, 50, 1000, test_mode=True)
poisson_train_dataset_2

10.00%
20.00%
30.00%
40.00%
50.00%
60.00%
70.00%
80.00%
90.00%
100.00%
transforming data to dataset object


<FlatMapDataset shapes: ((None, 32, 32, 3), (None, 1)), types: (tf.float32, tf.float32)>

In [18]:
batch_num = 0
for p in poisson_train_dataset_2.as_numpy_iterator():
  batch_num += 1
print(batch_num)

1000


In [19]:
for p in poisson_train_dataset_2.take(5).as_numpy_iterator():
  print("batch features shape:", np.shape(p[0]), "batch labels shape:", np.shape(p[1]))

batch features shape: (53, 32, 32, 3) batch labels shape: (53, 1)
batch features shape: (56, 32, 32, 3) batch labels shape: (56, 1)
batch features shape: (52, 32, 32, 3) batch labels shape: (52, 1)
batch features shape: (58, 32, 32, 3) batch labels shape: (58, 1)
batch features shape: (51, 32, 32, 3) batch labels shape: (51, 1)


In [20]:
for ind in poisson_indexes[:3]:
  print(np.array(ind))
  print()

[  260  3051  3060  3511  6225  7340  8629  8658  9662 11777 11828 12332
 13033 13351 13888 14814 14975 16707 17518 18186 18327 18457 18912 19918
 23041 23158 24729 25082 25112 26425 28395 28525 31923 32684 32716 34575
 34596 37157 37705 38507 39045 41119 41525 42353 42547 44102 44111 44959
 45284 45955 46674 48709 49938]

[  120  1227  3352  4073  6329  8218  9701 10331 10809 12942 13288 13337
 16513 16787 16961 17536 17673 19012 19315 19381 21008 21544 21783 22281
 22376 23710 25367 26164 26669 26833 28332 28550 28930 29701 30621 31449
 31560 32811 34474 35087 35133 37049 37056 37650 42115 42508 43548 44218
 45484 45638 45940 46649 47149 47467 48507 49196]

[  394   599  2223  2249  2297  2533  3570  3746  6794  6856  7351  8089
  8629  8868 10657 10790 13666 19423 19547 19579 22063 22236 22332 23658
 23865 23906 25528 28767 30390 30413 30856 31386 32624 33544 34015 34767
 34808 35055 35706 36945 36976 38798 40214 43899 45676 45680 46328 46500
 46613 46794 47750 47830]

