In [0]:
import random
import pandas as pd
import tensorflow as tf
import numpy as np
import time

from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras import metrics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [0]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
experiment_num = 5

#Prepare the imbalanced dataset

##Credit card

In [3]:
credit_card = pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')

credit_card.pop('Time')
eps=0.001
credit_card['Log Ammount'] = np.log(credit_card.pop('Amount')+eps)

credit_card_train, credit_card_test = train_test_split(credit_card, test_size = 0.2)

credit_card_train_labels = credit_card_train.pop('Class')
credit_card_test_labels = credit_card_test.pop('Class')

# scaler = StandardScaler()
# credit_card_train = scaler.fit_transform(credit_card_train)
# credit_card_test = scaler.transform(credit_card_test)

credit_card_train_dataset = tf.data.Dataset.from_tensor_slices((credit_card_train, credit_card_train_labels))
credit_card_test_dataset = tf.data.Dataset.from_tensor_slices((credit_card_test, credit_card_test_labels))

credit_num_class = len(set(credit_card['Class']))

print(tf.data.experimental.cardinality(credit_card_train_dataset))
print(tf.data.experimental.cardinality(credit_card_test_dataset))

neg, pos = np.bincount(credit_card['Class'])
print("classes:", *set(credit_card['Class']))
print("negative: {:.2f}%".format(neg/(neg+pos)*100))
print("positive: {:.2f}%".format(pos/(neg+pos)*100))

tf.Tensor(227845, shape=(), dtype=int64)
tf.Tensor(56962, shape=(), dtype=int64)
classes: 0 1
negative: 99.83%
positive: 0.17%


##Thyroid

In [4]:
from sklearn import preprocessing

thyroid = pd.read_csv('https://www.openml.org/data/get_csv/16787449/phpgtTkZa')
le = preprocessing.LabelEncoder()
thyroid['Target'] = le.fit_transform(thyroid['Target'])

thyroid_train, thyroid_test = train_test_split(thyroid, test_size = 0.1)
thyroid_train_labels = thyroid_train.pop('Target')
thyroid_test_labels = thyroid_test.pop('Target')

# scaler_3 = StandardScaler()
# thyroid_train = scaler_3.fit_transform(thyroid_train)
# thyroid_test = scaler_3.transform(thyroid_test)

thyroid_train_dataset = tf.data.Dataset.from_tensor_slices((thyroid_train, thyroid_train_labels))
thyroid_test_dataset = tf.data.Dataset.from_tensor_slices((thyroid_test, thyroid_test_labels))

thyroid_num_class = len(set(thyroid['Target']))

print(tf.data.experimental.cardinality(thyroid_train_dataset))
print(tf.data.experimental.cardinality(thyroid_test_dataset))
print("classes:", *set(thyroid['Target']))
neg, pos = np.bincount(list(thyroid_train_labels)+list(thyroid_test_labels))
print("negative: {:.2f}%".format(neg/(neg+pos)*100))
print("positive: {:.2f}%".format(pos/(neg+pos)*100))

tf.Tensor(6480, shape=(), dtype=int64)
tf.Tensor(720, shape=(), dtype=int64)
classes: 0 1
negative: 7.42%
positive: 92.58%


##Pulsar

In [5]:
pulsar = pd.read_csv('https://github.com/alexandrehsd/Predicting-Pulsar-Stars/raw/master/pulsar_stars.csv')

pulsar_train, pulsar_test = train_test_split(pulsar, test_size = 0.1)
pulsar_train_labels = pulsar_train.pop('target_class')
pulsar_test_labels = pulsar_test.pop('target_class')

# scaler_4 = StandardScaler()
# pulsar_train = scaler_4.fit_transform(pulsar_train)
# pulsar_test = scaler_4.transform(pulsar_test)

pulsar_train_dataset = tf.data.Dataset.from_tensor_slices((pulsar_train, pulsar_train_labels))
pulsar_test_dataset = tf.data.Dataset.from_tensor_slices((pulsar_test, pulsar_test_labels))

pulsar_num_class = len(set(pulsar['target_class']))

print(tf.data.experimental.cardinality(pulsar_train_dataset))
print(tf.data.experimental.cardinality(pulsar_test_dataset))
print("classes:", *set(pulsar['target_class']))
neg, pos = np.bincount(pulsar['target_class'])
print("negative: {:.2f}%".format(neg/(neg+pos)*100))
print("positive: {:.2f}%".format(pos/(neg+pos)*100))

tf.Tensor(16108, shape=(), dtype=int64)
tf.Tensor(1790, shape=(), dtype=int64)
classes: 0 1
negative: 90.84%
positive: 9.16%


#Define mode and metrics

In [0]:
def model(num_class):
  ms = [
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.AUC(name='auc')
  ]

  model = tf.keras.Sequential([
      Dense(16, activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.001),),
      Dropout(0.4),
      Dense(1, activation='sigmoid')
  ])

  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=1e-3),
      loss=tf.keras.losses.BinaryCrossentropy(),
      # loss=tf.keras.metrics.SparseCategoricalCrossentropy,
      metrics=ms
  )
  
  return model

#Shuffle test

In [0]:
def shuffle_test(train_ds, test_ds, num_class, batch_size=100, epochs=5):
  train_size = int(tf.data.experimental.cardinality(train_ds))
  test_size = int(tf.data.experimental.cardinality(test_ds))

  train_dataset = train_ds.shuffle(10000).batch(batch_size).repeat()
  test_dataset = test_ds.shuffle(10000).batch(batch_size).repeat()

  model_shuffle = model(num_class)
  train_his = model_shuffle.fit(
      train_dataset,
      steps_per_epoch = train_size // batch_size,
      validation_data = test_dataset,
      validation_steps = test_size // batch_size,
      epochs = epochs,
      verbose=0
  )
  return (train_his.history['val_accuracy'][-1], train_his.history['val_auc'][-1])

##Credit card shuffle test

In [44]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = shuffle_test(credit_card_train_dataset, credit_card_test_dataset, credit_num_class)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.9989982366561889
auc:  0.9376248598098755


##Thyroid shuffle test

In [17]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = shuffle_test(thyroid_train_dataset, thyroid_test_dataset, thyroid_num_class, epochs=10)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.924571442604065
auc:  0.7716545701026917


##Pulsar

In [18]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = shuffle_test(pulsar_train_dataset, pulsar_test_dataset, pulsar_num_class)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.968117642402649
auc:  0.9400691628456116


#Random Over-sampling

In [0]:
def generate_balanced_batches_random(dataset):
    element_index = 0
    features_group_by_class = {}
    labels_group_by_class = {}
    dataset_class = {}

    for element in dataset.as_numpy_iterator():
      if element[1] not in features_group_by_class:
        features_group_by_class[element[1]] = [element[0]]
      else:
        features_group_by_class[element[1]].append(element[0])
      element_index += 1
    
    class_num = [(c, len(features)) for c, features in features_group_by_class.items()]
    max_class, max_class_num = max(class_num, key=lambda x: x[1])

    for c in features_group_by_class:
      dataset_class[c] = tf.data.Dataset.from_tensor_slices((features_group_by_class[c], [c]*len(features_group_by_class[c])))

    ds = dataset_class[max_class]

    for c, num in class_num:
      if c == max_class:
        continue
      times = int(np.round(max_class_num // num))
      for i in range(times):
        ds = ds.concatenate(dataset_class[c])
    return ds.shuffle(10000)

#Random over-sampling test

In [0]:
def random_over_sample_test(train_ds, test_ds, num_class, batch_size=100, epochs=5):
  balanced_train_ds= generate_balanced_batches_random(train_ds)

  test_size = int(tf.data.experimental.cardinality(test_ds))
  train_size = int(tf.data.experimental.cardinality(balanced_train_ds))

  test_dataset = test_ds.shuffle(10000).batch(batch_size).repeat()
  train_dataset = balanced_train_ds.shuffle(10000).batch(batch_size).repeat()

  model_balanced = model(num_class)
  train_his=model_balanced.fit(
      train_dataset,
      steps_per_epoch = train_size // batch_size,
      validation_data = test_dataset,
      validation_steps = test_size // batch_size,
      epochs = epochs,
      verbose=0
  )
  return (train_his.history['val_accuracy'][-1], train_his.history['val_auc'][-1])

##Credit card random oversample test

In [42]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = random_over_sample_test(credit_card_train_dataset, credit_card_test_dataset, credit_num_class)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.3457926154136658
auc:  0.9414769887924195


##Thyroid random oversample test

In [26]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = random_over_sample_test(thyroid_train_dataset, thyroid_test_dataset, thyroid_num_class, epochs=10)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.8085714340209961
auc:  0.8424130439758301


##Plusar

In [47]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = random_over_sample_test(pulsar_train_dataset, pulsar_test_dataset, pulsar_num_class)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.9452941298484803
auc:  0.9728948593139648


#Dynamic sampling
oversampling + boost

##Initialization

In [0]:
def initialize_dataset(dataset):
    features_group_by_class = {}
    labels_group_by_class = {}
    dataset_class = {}

    for element in dataset.as_numpy_iterator():
      if element[1] not in features_group_by_class:
        features_group_by_class[element[1]] = [element[0]]
      else:
        features_group_by_class[element[1]].append(element[0])
    
    class_num = [(c, len(features)) for c, features in features_group_by_class.items()]
    max_class, max_class_num = max(class_num, key=lambda x: x[1])

    for c in features_group_by_class:
      dataset_class[c] = tf.data.Dataset.from_tensor_slices((features_group_by_class[c], [c]*len(features_group_by_class[c])))

    ds = dataset_class[max_class]

    for c, num in class_num:
      if c == max_class:
        continue
      times = int(np.round(np.sqrt(max_class_num // num)))
      for i in range(times):
        ds = ds.concatenate(dataset_class[c])

    return ds.shuffle(10000)

##Dynamic sampling integrated with training

In [0]:
def dynamic_sampling(dataset, dataset_size, steps, batch_size, wrong = None, correct = None, wrong_ratio = 0.7):
    if not wrong:
      indexes = [np.random.choice(dataset_size, size=batch_size, replace=False) for _ in range(steps)]
    else:
      wrong_length = len(wrong)
      max_wrong_sample_len = int(batch_size*wrong_ratio)
      wrong_sample_len = min(wrong_length, max_wrong_sample_len)
      correct_sample_len = batch_size - wrong_sample_len
      correct_indexes = [np.random.choice(correct, size=correct_sample_len, replace=False) for _ in range(steps)]
      wrong_indexes = [np.random.choice(wrong, size=wrong_sample_len, replace=False) for _ in range(steps)]
      indexes = [np.concatenate((c,m), axis=0) for c,m in zip(correct_indexes, wrong_indexes)]

    batch_group_by_index = {}
    for batch_index in range(steps):
      for element_index in indexes[batch_index]:
        if element_index not in batch_group_by_index:
          batch_group_by_index[element_index] = [batch_index]
        else:
          batch_group_by_index[element_index].append(batch_index)

    element_index = 0
    batch_features = [[] for _ in range(steps)]
    batch_labels = [[] for _ in range(steps)]
    for element in dataset.as_numpy_iterator():
      if element_index in batch_group_by_index:
        for batch_index in batch_group_by_index[element_index]:
          batch_features[batch_index].append(element[0])
          batch_labels[batch_index].append(element[1])
      element_index += 1

    features_shape = [batch_size, np.shape(batch_features)[-1]]
    labels_shape = [batch_size, ]
    
    dataset = tf.data.Dataset.from_generator(
        lambda: zip(batch_features, batch_labels),
        (tf.float32, tf.float32),
        (features_shape, labels_shape)
    )

    return dataset

In [0]:
def dynamic_training(train_ds, test_ds, num_class, batch_size=100, epochs=5, wrong_ratio=0.7):
    test_size = int(tf.data.experimental.cardinality(test_ds))
    test_steps = test_size // batch_size
    test_ds = test_ds.batch(batch_size)

    initial_dataset = initialize_dataset(train_ds)
    train_size = int(tf.data.experimental.cardinality(initial_dataset))
    train_steps = train_size // batch_size
    wrong, correct = None, None

    train_features = initial_dataset.map(lambda features, labels: features).batch(batch_size)
    real_labels = np.array(list(initial_dataset.map(lambda features, labels: labels).as_numpy_iterator()))
    real_labels = np.reshape(real_labels, (len(real_labels), ))

    dt_model = model(num_class)

    for epoch_num in range(epochs):
      # print('epoch:', epoch_num+1)
      current_train = dynamic_sampling(
          initial_dataset,
          train_size,
          train_steps,
          batch_size,
          wrong,
          correct,
          wrong_ratio=wrong_ratio
      )
      train_his=dt_model.fit(
          current_train,
          steps_per_epoch = train_steps,
          validation_data = test_ds,
          validation_steps = test_steps,
          epochs=1,
          verbose=0
      )

      if epoch_num == epochs-1: break
      predictions = dt_model.predict(train_features)
      predicted_labels = np.argmax(predictions, axis=1)
      wrong = [i for i in range(len(real_labels)) if predicted_labels[i]!=real_labels[i]]
      correct = [i for i in range(len(real_labels)) if predicted_labels[i]==real_labels[i]]
      # wrong_ratio += 0.05
      # wrong_ratio = min(1, wrong_ratio)
    return (train_his.history['val_accuracy'][-1], train_his.history['val_auc'][-1])

##Credit card dynamic sample test
dynamic sample = oversample + boost

In [43]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = dynamic_training(credit_card_train_dataset, credit_card_test_dataset, credit_num_class)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.9908119559288024
auc:  0.9744839906692505


##Thyroid

In [39]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = dynamic_training(thyroid_train_dataset, thyroid_test_dataset, thyroid_num_class, epochs=10)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.9291428327560425
auc:  0.8548848509788514


##Plusar

In [48]:
accs, aucs = [], []
for _ in range(experiment_num):
  acc, auc = dynamic_training(pulsar_train_dataset, pulsar_test_dataset, pulsar_num_class)
  accs.append(acc)
  aucs.append(auc)
average_acc = np.mean(accs)
average_auc = np.mean(aucs)
print('acc: ', average_acc)
print('auc: ', average_auc)

acc:  0.9671764731407165
auc:  0.9718711495399475
