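Rejection resampling test: balancing a heavily imbalanced binary-label dataset (creditcard.csv) with tf.data, first by sampling from per-class datasets and then by rejection resampling. See https://www.tensorflow.org/guide/data#resampling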

In [6]:
import tensorflow as tf
import numpy as np

In [2]:
# Download the credit-card archive through the Keras file cache;
# `extract=True` asks Keras to unpack the archive as well.
zip_path = tf.keras.utils.get_file(
    fname="creditcard.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/creditcard.zip",
    extract=True,
)

# Derive the CSV path by swapping ".zip" for ".csv" in the returned path.
csv_path = zip_path.replace(".zip", ".csv")

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/creditcard.zip


In [3]:
creditcard_ds = tf.data.experimental.make_csv_dataset(
    csv_path, batch_size=1024, label_name="Class",
    # Set the column types: 30 floats and an int.
    column_defaults=[float()]*30+[int()])

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.
Instructions for updating:
Use `tf.data.Dataset.shuffle(buffer_size, seed)` followed by `tf.data.Dataset.repeat(count)`. Static tf.data optimizations will take care of using the fused implementation.


In [4]:
def count(counts, batch):
  """Add one batch's per-class label totals to the running counts.

  Args:
    counts: Dict holding the running totals under 'class_0' and 'class_1';
      it is updated in place.
    batch: A `(features, labels)` pair. Only `labels` is read.

  Returns:
    The same `counts` dict, with the number of labels equal to 0 added to
    'class_0' and the number of labels equal to 1 added to 'class_1'.
  """
  _, labels = batch

  # Cast the boolean masks to int32 so that they can be summed.
  zeros_in_batch = tf.reduce_sum(tf.cast(labels == 0, tf.int32))
  ones_in_batch = tf.reduce_sum(tf.cast(labels == 1, tf.int32))

  counts['class_0'] += zeros_in_batch
  counts['class_1'] += ones_in_batch
  return counts

In [23]:
# Tally the labels of the first 10 batches to estimate the class balance.
counts = creditcard_ds.take(10).reduce(
    reduce_func=count,
    initial_state={'class_0': 0, 'class_1': 0})

# Pull both totals out of their tensors into a float32 vector [class_0, class_1].
counts = np.array(
    [counts[key].numpy() for key in ('class_0', 'class_1')]).astype(np.float32)

# Normalise the totals so that the two entries sum to 1.
fractions = counts / counts.sum()
print(fractions)

[0.9952148  0.00478516]


### Resampling by sampling from two per-class `tf.data` datasets

In [9]:
negative_ds = (
  creditcard_ds
    .unbatch()
    .filter(lambda features, label: label==0)
    .repeat())
positive_ds = (
  creditcard_ds
    .unbatch()
    .filter(lambda features, label: label==1)
    .repeat())



In [12]:
# Sanity check: print the labels of a single batch of 10 examples drawn from
# the positive stream.
for features, label_batch in positive_ds.batch(10).take(1):
  print(label_batch.numpy())

[1 1 1 1 1 1 1 1 1 1]


In [13]:
balanced_ds = tf.data.experimental.sample_from_datasets(
    [negative_ds, positive_ds], [0.5, 0.5]).batch(10)

In [17]:
# Print the labels of the first 10 batches to eyeball the class mix.
for features, label_batch in balanced_ds.take(10):
  print(label_batch.numpy())

[0 0 0 1 0 0 1 0 0 1]
[1 1 0 0 1 0 0 1 0 0]
[0 1 1 0 0 0 0 0 1 0]
[1 1 1 0 0 1 0 1 0 1]
[1 1 0 1 1 1 0 1 1 0]
[0 0 1 0 0 0 0 1 0 0]
[0 0 1 1 0 1 1 1 1 0]
[0 1 1 0 0 0 1 0 0 1]
[0 1 0 1 1 1 1 0 1 0]
[1 1 1 1 0 0 0 0 0 0]


In [16]:
counts = balanced_ds.take(10).reduce(
    initial_state={'class_0': 0, 'class_1': 0},
    reduce_func = count)

counts = np.array([counts['class_0'].numpy(),
                   counts['class_1'].numpy()]).astype(np.float32)

print(counts/counts.sum())

[0.55 0.45]


### Rejection resampling

In [18]:
def class_func(features, label):
  return label

In [24]:
# Build a dataset transformation that rejects examples so that the class mix
# moves from the measured `fractions` to an even 50/50 split.
resampler = tf.data.experimental.rejection_resample(
    class_func=class_func,
    target_dist=[0.5, 0.5],
    initial_dist=fractions,
)

In [25]:
resample_ds = creditcard_ds.unbatch().apply(resampler).batch(10)

In [26]:
balanced_ds = resample_ds.map(lambda extra_label, features_and_label: features_and_label)

In [27]:
# Print the labels of the first 10 resampled batches to eyeball the class mix.
for features, label_batch in balanced_ds.take(10):
  print(label_batch.numpy())

[1 1 1 0 1 0 0 1 1 1]
[0 1 1 1 1 0 1 1 1 0]
[0 1 0 1 0 0 1 1 0 1]
[1 0 0 0 0 0 0 0 1 0]
[0 1 1 0 0 1 0 0 0 1]
[1 1 0 1 1 0 0 1 0 1]
[0 1 0 1 1 1 1 1 1 1]
[0 0 0 0 0 0 1 0 0 1]
[0 1 1 0 0 0 0 1 0 1]
[1 0 1 0 0 1 1 0 1 0]


In [28]:
counts = balanced_ds.take(10).reduce(
    initial_state={'class_0': 0, 'class_1': 0},
    reduce_func = count)

counts = np.array([counts['class_0'].numpy(),
                   counts['class_1'].numpy()]).astype(np.float32)

print(counts/counts.sum())

[0.5 0.5]
