In [0]:
import random
import pandas as pd
import tensorflow as tf
import numpy as np
import time
import tensorflow_addons as tfa
import keras
import keras.backend as K

from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras import metrics

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [0]:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

experiment_num = 5

#Prepare the imbalanced dataset

##White wine quality
11 attributes

In [153]:
white_wine_quality = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv', sep=';')
white_wine_quality['quality'] -= 3
ww_quality_train, ww_quality_test = train_test_split(white_wine_quality, test_size = 0.2)

ww_quality_train_labels = ww_quality_train.pop('quality')
ww_quality_test_labels = ww_quality_test.pop('quality')

scaler_1 = StandardScaler()
ww_quality_train = scaler_1.fit_transform(ww_quality_train)
ww_quality_test = scaler_1.transform(ww_quality_test)

ww_quality_train_dataset = tf.data.Dataset.from_tensor_slices((ww_quality_train, ww_quality_train_labels))
ww_quality_test_dataset = tf.data.Dataset.from_tensor_slices((ww_quality_test, ww_quality_test_labels))

wwq_num_class = len(set(white_wine_quality['quality']))

print(tf.data.experimental.cardinality(ww_quality_train_dataset))
print(tf.data.experimental.cardinality(ww_quality_test_dataset))
print("classes:", *set(white_wine_quality['quality']))
print("num of each class:", *np.bincount(white_wine_quality['quality']))

tf.Tensor(3918, shape=(), dtype=int64)
tf.Tensor(980, shape=(), dtype=int64)
classes: 0 1 2 3 4 5 6
num of each class: 20 163 1457 2198 880 175 5


##Page
10 attributes

In [154]:
page = pd.read_csv('https://www.openml.org/data/get_csv/30/dataset_30_page-blocks.arff')
page['class'] -= 1

page_train, page_test = train_test_split(page, test_size = 0.2)

page_train_labels = page_train.pop('class')
page_test_labels = page_test.pop('class')

scaler_2 = StandardScaler()
page_train = scaler_2.fit_transform(page_train)
page_test = scaler_2.transform(page_test)

page_train_dataset = tf.data.Dataset.from_tensor_slices((page_train, page_train_labels))
page_test_dataset = tf.data.Dataset.from_tensor_slices((page_test, page_test_labels))

page_num_class = len(set(page['class']))

print(tf.data.experimental.cardinality(page_train_dataset))
print(tf.data.experimental.cardinality(page_test_dataset))
print("classes:", *set(page['class']))
print("num of each class:", *np.bincount(page['class']))

tf.Tensor(4378, shape=(), dtype=int64)
tf.Tensor(1095, shape=(), dtype=int64)
classes: 0 1 2 3 4
num of each class: 4913 329 28 88 115


##Thyroid-allhypo

In [155]:
thyroid = pd.read_csv('https://www.openml.org/data/get_csv/4533694/phpqJqmHb')
thyroid['Class'] -= 1

thyroid_train, thyroid_test = train_test_split(thyroid, test_size = 0.1)

thyroid_train_labels = thyroid_train.pop('Class')
thyroid_test_labels = thyroid_test.pop('Class')

scaler_3 = StandardScaler()
thyroid_train = scaler_3.fit_transform(thyroid_train)
thyroid_test = scaler_3.transform(thyroid_test)

thyroid_train_dataset = tf.data.Dataset.from_tensor_slices((thyroid_train, thyroid_train_labels))
thyroid_test_dataset = tf.data.Dataset.from_tensor_slices((thyroid_test, thyroid_test_labels))

thyroid_num_class = len(set(thyroid['Class']))

print(tf.data.experimental.cardinality(thyroid_train_dataset))
print(tf.data.experimental.cardinality(thyroid_test_dataset))
print("classes:", *set(thyroid['Class']))
print("num of each class:", *np.bincount(thyroid['Class']))

tf.Tensor(2520, shape=(), dtype=int64)
tf.Tensor(280, shape=(), dtype=int64)
classes: 0 1 2 3 4
num of each class: 1632 91 275 31 771


##Abalone

In [156]:
abalone = pd.read_csv('https://www.openml.org/data/get_csv/3620/dataset_187_abalone.arff')
abalone.pop('Sex')

delete = [1, 2, 3, 21, 22, 23, 24, 25, 26, 27, 29]
for d in delete:
  indexNames = abalone[abalone['Class_number_of_rings']==d].index
  abalone.drop(indexNames , inplace=True)
abalone['Class_number_of_rings'] -= 4

abalone_train, abalone_test = train_test_split(abalone, test_size = 0.1)

abalone_train_labels = abalone_train.pop('Class_number_of_rings')
abalone_test_labels = abalone_test.pop('Class_number_of_rings')

scaler_4 = StandardScaler()
abalone_train = scaler_4.fit_transform(abalone_train)
abalone_test = scaler_4.transform(abalone_test)

abalone_train_dataset = tf.data.Dataset.from_tensor_slices((abalone_train, abalone_train_labels))
abalone_test_dataset = tf.data.Dataset.from_tensor_slices((abalone_test, abalone_test_labels))

abalone_num_class = len(set(abalone['Class_number_of_rings']))

print(tf.data.experimental.cardinality(abalone_train_dataset))
print(tf.data.experimental.cardinality(abalone_test_dataset))
print("classes:", *set(abalone['Class_number_of_rings']))
print("num of each class:", *np.bincount(abalone['Class_number_of_rings']))

tf.Tensor(3711, shape=(), dtype=int64)
tf.Tensor(413, shape=(), dtype=int64)
classes: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
num of each class: 57 115 259 391 568 689 634 487 267 203 126 103 67 58 42 32 26


#Define mode and metrics

In [0]:
# class CategoricalTruePositives(tf.keras.metrics.Metric):

#     def __init__(self, num_classes, batch_size,
#                  name="categorical_true_positives", **kwargs):
#         super(CategoricalTruePositives, self).__init__(name=name, **kwargs)

#         self.batch_size = batch_size
#         self.num_classes = num_classes    

#         self.cat_true_positives = self.add_weight(name="ctp", initializer="zeros")

#     def update_state(self, y_true, y_pred, sample_weight=None):     
#         y_true = K.flatten(y_true)
#         y_pred = tf.argmax(y_pred, axis=-1)

#         y_true = tf.cast(y_true, tf.float32)
#         y_pred = tf.cast(y_pred, tf.float32)

#         true_poss = K.sum(K.cast((K.equal(y_true, y_pred)), dtype=tf.float32))

#         self.cat_true_positives.assign_add(true_poss)

#     def result(self):
#         return self.cat_true_positives

In [0]:
def model(num_class):
  ms = [
    metrics.SparseCategoricalAccuracy(name='accuracy')
  ]

  model = tf.keras.Sequential([
      Dense(12, activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.01),),
      Dropout(0.2),
      Dense(8, activation='relu',
            kernel_regularizer=tf.keras.regularizers.l2(0.01),),
      Dropout(0.2),
      Dense(num_class, activation='softmax')
  ])

  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=1e-2),
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      metrics=ms
  )
  
  return model

In [0]:
def recall_each_class(num_class, preds, labels):
  res_tp, res_tn, res_fp, res_fn = [], [], [], []

  for i in range(num_class):
    tp = tf.keras.metrics.TruePositives(name='tp')    
    tp.update_state(labels[:, i], preds[:, i])
    res_tp.append(int(tp.result()))

    tn = tf.keras.metrics.TrueNegatives(name='tn')
    tn.update_state(labels[:, i], preds[:, i])
    res_tn.append(int(tn.result()))

    fp = tf.keras.metrics.FalsePositives(name='fp')
    fp.update_state(labels[:, i], preds[:, i])
    res_fp.append(int(fp.result()))

    fn = tf.keras.metrics.FalseNegatives(name='fp')
    fn.update_state(labels[:, i], preds[:, i])
    res_fn.append(int(fn.result()))
  
  recall = []
  for i in range(num_class):
    recall.append(res_tp[i]/(res_tp[i]+res_fn[i]))
    
  return recall

#Shuffle test

In [0]:
def shuffle_test(train_ds, test_ds, num_class, batch_size=100, epochs=5):
  train_size = int(tf.data.experimental.cardinality(train_ds))
  test_size = int(tf.data.experimental.cardinality(test_ds))

  train_dataset = train_ds.shuffle(10000).batch(batch_size)
  test_dataset = test_ds.shuffle(10000).batch(batch_size)

  model_shuffle = model(num_class)
  train_his = model_shuffle.fit(
      train_dataset.repeat(),
      steps_per_epoch = train_size // batch_size,
      validation_data = test_dataset.repeat(),
      validation_steps = test_size // batch_size,
      epochs = epochs,
      verbose=0
  )

  labels = np.array(list(train_dataset.unbatch().map(lambda features, labels: labels).as_numpy_iterator()))
  labels = tf.compat.v1.one_hot(labels, depth=num_class)
  preds = model_shuffle.predict(train_dataset)
  preds = K.argmax(preds, axis=-1)
  preds = K.one_hot(preds, num_class)

  recalls = np.round(recall_each_class(num_class, preds, labels), decimals=4)
  return (train_his.history['val_accuracy'][-1], recalls)

##White wine quality shuffle test

In [193]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = shuffle_test(ww_quality_train_dataset, ww_quality_test_dataset, wwq_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.5437777876853943
min recall: 0.0
median recall: 0.0
max recall: 0.6700200000000001


##Page shuffle test

In [194]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = shuffle_test(page_train_dataset, page_test_dataset, page_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.9370000123977661
min recall: 0.0
median recall: 0.006900000000000001
max recall: 0.9410399999999999


##Thyroid shuffle test

In [195]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = shuffle_test(thyroid_train_dataset, thyroid_test_dataset, thyroid_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.6830000042915344
min recall: 0.0
median recall: 0.046200000000000005
max recall: 0.6639


##Abalone shuffle test

In [196]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = shuffle_test(abalone_train_dataset, abalone_test_dataset, abalone_num_class, batch_size=30, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.24410257041454314
min recall: 0.0
median recall: 0.0
max recall: 0.43196


#Random Over-sampling

In [0]:
def generate_balanced_batches_random(dataset):
    element_index = 0
    features_group_by_class = {}
    labels_group_by_class = {}
    dataset_class = {}

    for element in dataset.as_numpy_iterator():
      if element[1] not in features_group_by_class:
        features_group_by_class[element[1]] = [element[0]]
      else:
        features_group_by_class[element[1]].append(element[0])
      element_index += 1
    
    class_num = [(c, len(features)) for c, features in features_group_by_class.items()]
    max_class, max_class_num = max(class_num, key=lambda x: x[1])

    for c in features_group_by_class:
      dataset_class[c] = tf.data.Dataset.from_tensor_slices((features_group_by_class[c], [c]*len(features_group_by_class[c])))

    ds = dataset_class[max_class]

    for c, num in class_num:
      if c == max_class:
        continue
      times = int(np.round(max_class_num // num))
      for i in range(times):
        ds = ds.concatenate(dataset_class[c])

    return ds.shuffle(10000)

#Random over-sampling test

In [0]:
def random_over_sample_test(train_ds, test_ds, num_class, batch_size=100, epochs=5):
  balanced_train_ds= generate_balanced_batches_random(train_ds)

  test_size = int(tf.data.experimental.cardinality(test_ds))
  train_size = int(tf.data.experimental.cardinality(balanced_train_ds))

  test_dataset = test_ds.shuffle(10000).batch(batch_size)
  train_dataset = balanced_train_ds.batch(batch_size)

  model_balanced = model(num_class)
  train_his = model_balanced.fit(
      train_dataset.repeat(),
      steps_per_epoch = train_size // batch_size,
      validation_data = test_dataset.repeat(),
      validation_steps = test_size // batch_size,
      epochs = epochs,
      verbose=0
  )

  labels = np.array(list(train_dataset.unbatch().map(lambda features, labels: labels).as_numpy_iterator()))
  labels = tf.compat.v1.one_hot(labels, depth=num_class)
  preds = model_balanced.predict(train_dataset)
  preds = K.argmax(preds, axis=-1)
  preds = K.one_hot(preds, num_class)

  recalls = np.round(recall_each_class(num_class, preds, labels), decimals=4)
  return (train_his.history['val_accuracy'][-1], recalls)

##White wine quality random oversample test

In [182]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = random_over_sample_test(ww_quality_train_dataset, ww_quality_test_dataset, wwq_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.29333332777023313
min recall: 0.0
median recall: 0.16502
max recall: 0.26566


##Page random oversample test

In [183]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = random_over_sample_test(page_train_dataset, page_test_dataset, page_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.8804000020027161
min recall: 0.20021999999999998
median recall: 0.22382
max recall: 0.31208


##Thyroid random oversample test

In [184]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = random_over_sample_test(thyroid_train_dataset, thyroid_test_dataset, thyroid_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.5740000009536743
min recall: 0.12844
median recall: 0.23516000000000004
max recall: 0.24813999999999997


##Adalone random oversample test

In [185]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = random_over_sample_test(abalone_train_dataset, abalone_test_dataset, abalone_num_class, batch_size=30, epochs=8)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.20153846442699433
min recall: 0.0004
median recall: 0.0439
max recall: 0.2976


#Dynamic sampling
oversampling + boost

##Initialization

In [0]:
def initialize_dataset(dataset):
    features_group_by_class = {}
    labels_group_by_class = {}
    dataset_class = {}

    for element in dataset.as_numpy_iterator():
      if element[1] not in features_group_by_class:
        features_group_by_class[element[1]] = [element[0]]
      else:
        features_group_by_class[element[1]].append(element[0])
    
    class_num = [(c, len(features)) for c, features in features_group_by_class.items()]
    max_class, max_class_num = max(class_num, key=lambda x: x[1])

    for c in features_group_by_class:
      dataset_class[c] = tf.data.Dataset.from_tensor_slices((features_group_by_class[c], [c]*len(features_group_by_class[c])))

    ds = dataset_class[max_class]

    for c, num in class_num:
      if c == max_class:
        continue
      times = int(np.round(np.sqrt(max_class_num // num)))
      for i in range(times):
        ds = ds.concatenate(dataset_class[c])

    return ds.shuffle(10000)

##Dynamic sampling integrated with training

In [0]:
def dynamic_sampling(dataset, dataset_size, steps, batch_size, wrong=None, correct=None, wrong_ratio=0.7):
    if not wrong:
      indexes = [np.random.choice(dataset_size, size=batch_size, replace=False) for _ in range(steps)]
    else:
      wrong_length = len(wrong)
      max_wrong_sample_len = int(batch_size*wrong_ratio)
      wrong_sample_len = min(wrong_length, max_wrong_sample_len)
      correct_sample_len = batch_size - wrong_sample_len
      correct_indexes = [np.random.choice(correct, size=correct_sample_len, replace=False) for _ in range(steps)]
      wrong_indexes = [np.random.choice(wrong, size=wrong_sample_len, replace=False) for _ in range(steps)]
      indexes = [np.concatenate((c,m), axis=0) for c,m in zip(correct_indexes, wrong_indexes)]

    batch_group_by_index = {}
    for batch_index in range(steps):
      for element_index in indexes[batch_index]:
        if element_index not in batch_group_by_index:
          batch_group_by_index[element_index] = [batch_index]
        else:
          batch_group_by_index[element_index].append(batch_index)

    element_index = 0
    batch_features = [[] for _ in range(steps)]
    batch_labels = [[] for _ in range(steps)]
    for element in dataset.as_numpy_iterator():
      if element_index in batch_group_by_index:
        for batch_index in batch_group_by_index[element_index]:
          batch_features[batch_index].append(element[0])
          batch_labels[batch_index].append(element[1])
      element_index += 1

    features_shape = [batch_size, np.shape(batch_features)[-1]]
    labels_shape = [batch_size, ]
    
    dataset = tf.data.Dataset.from_generator(
        lambda: zip(batch_features, batch_labels),
        (tf.float32, tf.float32),
        (features_shape, labels_shape)
    )

    return dataset

In [0]:
def dynamic_training(train_ds, test_ds, num_class, batch_size=100, epochs=5, wrong_ratio=0.7):
    test_size = int(tf.data.experimental.cardinality(test_ds))
    test_steps = test_size // batch_size
    test_ds = test_ds.batch(batch_size)

    initial_dataset = initialize_dataset(train_ds)
    train_size = int(tf.data.experimental.cardinality(initial_dataset))
    train_steps = train_size // batch_size
    wrong, correct = None, None

    real_labels = np.array(list(initial_dataset.map(lambda features, labels: labels).as_numpy_iterator()), dtype=np.int32)
    dt_model = model(num_class)

    for epoch_num in range(epochs):
      # print('epoch:', epoch_num+1)
      current_train = dynamic_sampling(
          initial_dataset,
          train_size,
          train_steps,
          batch_size,
          wrong,
          correct,
          wrong_ratio
      )
      train_his=dt_model.fit(
          current_train,
          steps_per_epoch = train_steps,
          validation_data = test_ds,
          validation_steps = test_steps,
          epochs=1,
          verbose=0
      )

      if epoch_num == epochs-1: break
      predictions = dt_model.predict(initial_dataset.batch(batch_size))
      predicted_labels = np.argmax(predictions, axis=-1).astype(np.int32)
      wrong = [i for i in range(len(real_labels)) if predicted_labels[i]!=real_labels[i]]
      correct = [i for i in range(len(real_labels)) if predicted_labels[i]==real_labels[i]]
      # print('predict acc:', len(correct)/ (len(correct)+len(wrong)))
      # wrong_ratio += 0.05
      # wrong_ratio = min(1, wrong_ratio)
    # dt_model.evaluate(initial_dataset.batch(batch_size))
    labels = tf.compat.v1.one_hot(real_labels, depth=num_class)
    preds = dt_model.predict(initial_dataset.batch(batch_size))
    preds = K.argmax(preds, axis=-1)
    preds = K.one_hot(preds, num_class)

    recalls = np.round(recall_each_class(num_class, preds, labels), decimals=4)
    return (train_his.history['val_accuracy'], recalls)

##White wine quality dynamic sample test

In [188]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = dynamic_training(ww_quality_train_dataset, ww_quality_test_dataset, wwq_num_class, epochs = 10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.5333333373069763
min recall: 0.0
median recall: 0.038740000000000004
max recall: 0.66512


##Page dynamic sample test

In [189]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = dynamic_training(page_train_dataset, page_test_dataset, page_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.9527999997138977
min recall: 0.0327
median recall: 0.08038000000000001
max recall: 0.65334


##Thyroid dynamic sample test

In [190]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = dynamic_training(thyroid_train_dataset, thyroid_test_dataset, thyroid_num_class, epochs=10)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.6919999957084656
min recall: 0.0077
median recall: 0.08286
max recall: 0.63412


##Abalone dynamic sample test

In [191]:
accs, recalls_10 = [], []
for _ in range(experiment_num):
  acc, recalls = dynamic_training(abalone_train_dataset, abalone_test_dataset, abalone_num_class, batch_size=30, epochs=8)
  accs.append(acc)
  recalls_10.append(recalls)
average_acc = np.mean(accs)
average_recalls = np.mean(recalls_10, axis=0)
print('acc: ', average_acc)
print('min recall:', np.min(average_recalls))
print('median recall:', np.median(average_recalls))
print('max recall:', np.max(average_recalls))

acc:  0.23538461923599244
min recall: 0.0
median recall: 0.029560000000000003
max recall: 0.3054
