# Dealing with imbalanced datasets, combining oversampling with VAE and undersampling to improve model recognition over all classes.  

Import packages, classifiers, metrics, and resamplers.

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

from sklearn.metrics import confusion_matrix, make_scorer
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score
from collections import Counter

from imblearn.over_sampling import SMOTE, RandomOverSampler

Import VAEOversampler.

In [2]:
from VAEOversampler import VAEOversampler

2023-07-10 16:39:31.852189: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-07-10 16:39:31.852212: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data  
You can load a dataset from the imbalanced-learn collection (https://imbalanced-learn.org/stable/datasets/index.html) or use your own data.  


In [3]:
from imblearn.datasets import fetch_datasets

# Key of the benchmark dataset to pull out of the dict returned by fetch_datasets().
dset_name = 'sick_euthyroid'
dset = fetch_datasets()[dset_name]

# NOTE(review): the scaler is fitted on the full dataset here, before the
# train/test split performed further down, so test-set statistics influence the
# training features. Consider fitting the scaler on the training partition only.
X, y = StandardScaler().fit_transform(dset.data), dset.target

In [4]:
X

array([[ 1.04145908, -1.49139564,  1.57590614, ..., -0.4373301 ,
         0.29926998, -0.29926998],
       [-0.27007446,  0.67051289, -0.63455556, ..., -0.1040866 ,
         0.29926998, -0.29926998],
       [ 0.65285655,  0.67051289, -0.63455556, ...,  0.04254054,
         0.29926998, -0.29926998],
       ...,
       [ 0.36140465,  0.67051289, -0.63455556, ..., -0.33069218,
         0.29926998, -0.29926998],
       [-1.04727952,  0.67051289, -0.63455556, ..., -0.29070296,
         0.29926998, -0.29926998],
       [ 0.26425402,  0.67051289, -0.63455556, ...,  0.30913534,
         0.29926998, -0.29926998]])

In [5]:
# Recode the -1 labels as 0 so the classes are 0/1; the evaluation code below
# sums predictions for voting and indexes a 2x2 confusion matrix on that basis.
# This assigns in place, so dset.target (the same array object as y) changes too.
y[y == -1] = 0

We split data into train and test partitions.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

This is a simple helper for random undersampling (RUS): it keeps a chosen fraction of the majority-class rows and all of the minority-class rows.  

In [7]:
# RUS

def RUS(X_res, y_res, frac=1, minority_class_id=1, random_state=42):
    """Randomly undersample the non-minority rows, keeping every minority row.

    Rows and labels are paired by position (row ``i`` of ``X_res`` belongs to
    element ``i`` of ``y_res``), independent of any pandas index either one
    carries. The input objects are never modified.

    Parameters
    ----------
    X_res : array-like or DataFrame of shape (n_samples, n_features)
        Feature matrix.
    y_res : array-like of shape (n_samples,)
        Class labels, one per row of ``X_res``.
    frac : float, default=1
        Fraction of the non-minority rows to keep (passed to
        ``DataFrame.sample``).
    minority_class_id : label, default=1
        Label identifying the minority class; all other labels are treated as
        majority and are subject to undersampling.
    random_state : int, default=42
        Seed passed to both ``DataFrame.sample`` calls.

    Returns
    -------
    X_eq : pandas.DataFrame
        Sampled majority rows followed by the (shuffled) minority rows, with a
        fresh ``0..n-1`` index.
    y_eq : pandas.Series
        Labels matching ``X_eq`` row for row, named ``'Class'``.

    Raises
    ------
    ValueError
        If ``X_res`` and ``y_res`` do not have the same number of samples.
    """
    # reset_index returns a new frame with a 0..n-1 index, so the caller's
    # object is left untouched and index labels double as row positions.
    features = pd.DataFrame(X_res).reset_index(drop=True)
    # A plain array drops any pandas index on the labels; this prevents the
    # silent label/row misalignment that index-based assignment would cause.
    labels = np.asarray(y_res).ravel()
    if len(labels) != len(features):
        raise ValueError(
            "X_res and y_res must contain the same number of samples "
            f"(got {len(features)} and {len(labels)})"
        )

    is_minority = labels == minority_class_id
    majority_rows = features[~is_minority].sample(frac=frac, random_state=random_state)
    minority_rows = features[is_minority].sample(frac=1, random_state=random_state)

    X_eq = pd.concat([majority_rows, minority_rows], ignore_index=True)

    # The surviving index values are positions into `labels` (see reset_index
    # above); use them to pull the labels out in the same order as the rows.
    kept_positions = np.concatenate(
        [majority_rows.index.to_numpy(), minority_rows.index.to_numpy()]
    )
    y_eq = pd.Series(labels[kept_positions], name='Class')

    return X_eq, y_eq

In [8]:
def _print_binary_metrics(y_true, y_pred):
    """Print the 2x2 confusion matrix for 0/1 labels plus TNR, TPR, their product and G-mean."""
    # labels=[0, 1] pins the matrix to 2x2 even if one class is missing from
    # y_true / y_pred, so the four-way unpacking below always succeeds.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    print(cm)
    print('')
    tn, fp, fn, tp = cm.ravel()
    tnr = tn / (tn + fp)
    tpr = tp / (tp + fn)
    print('TNR:', round(tnr, 5))
    print('TPR:', round(tpr, 5))
    print('TNRxTPR:', round(tnr * tpr, 5))
    print('G-mean:', round(np.sqrt(tnr * tpr), 5))


def train_val(X, y, Xt, yt, random_state=42):
    """Fit several classifiers on (X, y) and report their scores on (Xt, yt).

    For each model the confusion matrix, true-negative rate, true-positive
    rate, their product and the G-mean are printed. Afterwards the same report
    is printed for a majority vote over all models.

    Labels are expected to be binary and coded as 0 (negative) / 1 (positive):
    the vote counts how many models predicted 1 for each sample.

    Parameters
    ----------
    X, y : training features and labels.
    Xt, yt : evaluation features and labels.
    random_state : int, default=42
        Seed handed to every classifier.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    classifiers = {
        "CatBoostClassifier": CatBoostClassifier(verbose=False, random_seed=random_state),
        "LGBMClassifier": LGBMClassifier(random_state=random_state),
        "XGBClassifier": XGBClassifier(random_state=random_state),
        "BaggingClassifier": BaggingClassifier(random_state=random_state),
        "RandomForestClassifier": RandomForestClassifier(random_state=random_state),
    }

    per_model_predictions = []
    for name, classifier in classifiers.items():
        print('_' * 50)
        classifier.fit(X, y)
        print("Classifier: ", name)
        # Flatten and cast so every model contributes a 1-D vector of 0/1
        # integers, whatever shape or dtype its predict() happens to return.
        y_pred = np.asarray(classifier.predict(Xt)).ravel().astype(int)
        _print_binary_metrics(yt, y_pred)
        per_model_predictions.append(y_pred)

    print('_' * 50)
    print('Ensemble predictions (majority voting):')
    votes = np.sum(per_model_predictions, axis=0)
    # Strict majority, derived from the number of models instead of a literal.
    votes_needed = len(classifiers) // 2 + 1
    ensemble_pred = (votes >= votes_needed).astype(int)
    _print_binary_metrics(yt, ensemble_pred)
    

## Without resampling (baseline)
What is the starting score?


In [9]:
print('Original dataset shape %s' % Counter(y))
print('Ratio->', round(Counter(y)[0]/Counter(y)[1], 1), ': 1')

Original dataset shape Counter({0: 2870, 1: 293})
Ratio-> 9.8 : 1


In [10]:
train_val(X_train, y_train, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[564  10]
 [  9  50]]

TNR: 0.98258
TPR: 0.84746
TNRxTPR: 0.83269
G-mean: 0.91252
__________________________________________________
Classifier:  LGBMClassifier
[[561  13]
 [  9  50]]

TNR: 0.97735
TPR: 0.84746
TNRxTPR: 0.82826
G-mean: 0.91009
__________________________________________________
Classifier:  XGBClassifier
[[562  12]
 [  9  50]]

TNR: 0.97909
TPR: 0.84746
TNRxTPR: 0.82974
G-mean: 0.9109
__________________________________________________
Classifier:  BaggingClassifier
[[562  12]
 [  9  50]]

TNR: 0.97909
TPR: 0.84746
TNRxTPR: 0.82974
G-mean: 0.9109
__________________________________________________
Classifier:  RandomForestClassifier
[[561  13]
 [  9  50]]

TNR: 0.97735
TPR: 0.84746
TNRxTPR: 0.82826
G-mean: 0.91009
__________________________________________________
Ensemble predictions (majority voting):
[[564  10]
 [  9  50]]

TNR: 0.98258
TPR: 0.84746
TNRxTPR: 0.83269
G-mean: 0.91252


## Ratio 1:1  
Let's see the classifiers' scores when the dataset is balanced.  


In [10]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f80ac4e8040>
Traceback (most recent call last):
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


In [11]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 2296, 1: 2296})
Ratio->  1 : 1.0


In [12]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[557  17]
 [  7  52]]

TNR: 0.97038
TPR: 0.88136
TNRxTPR: 0.85525
G-mean: 0.9248
__________________________________________________
Classifier:  LGBMClassifier
[[557  17]
 [  8  51]]

TNR: 0.97038
TPR: 0.86441
TNRxTPR: 0.83881
G-mean: 0.91586
__________________________________________________
Classifier:  XGBClassifier
[[556  18]
 [  9  50]]

TNR: 0.96864
TPR: 0.84746
TNRxTPR: 0.82088
G-mean: 0.90603
__________________________________________________
Classifier:  BaggingClassifier
[[559  15]
 [ 10  49]]

TNR: 0.97387
TPR: 0.83051
TNRxTPR: 0.80881
G-mean: 0.89934
__________________________________________________
Classifier:  RandomForestClassifier
[[555  19]
 [  8  51]]

TNR: 0.9669
TPR: 0.86441
TNRxTPR: 0.83579
G-mean: 0.91422
__________________________________________________
Ensemble predictions (majority voting):
[[556  18]
 [  8  51]]

TNR: 0.96864
TPR: 0.86441
TNRxTPR: 0.8373
G-mean: 0.91504


In [13]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [14]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0: 2296, 1: 2296})
Ratio->  1 : 1.0


In [15]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[559  15]
 [  8  51]]

TNR: 0.97387
TPR: 0.86441
TNRxTPR: 0.84182
G-mean: 0.91751
__________________________________________________
Classifier:  LGBMClassifier
[[561  13]
 [  8  51]]

TNR: 0.97735
TPR: 0.86441
TNRxTPR: 0.84483
G-mean: 0.91915
__________________________________________________
Classifier:  XGBClassifier
[[560  14]
 [  8  51]]

TNR: 0.97561
TPR: 0.86441
TNRxTPR: 0.84332
G-mean: 0.91833
__________________________________________________
Classifier:  BaggingClassifier
[[559  15]
 [  8  51]]

TNR: 0.97387
TPR: 0.86441
TNRxTPR: 0.84182
G-mean: 0.91751
__________________________________________________
Classifier:  RandomForestClassifier
[[561  13]
 [ 10  49]]

TNR: 0.97735
TPR: 0.83051
TNRxTPR: 0.8117
G-mean: 0.90094
__________________________________________________
Ensemble predictions (majority voting):
[[560  14]
 [  8  51]]

TNR: 0.97561
TPR: 0.86441
TNRxTPR: 0.84332
G-mean: 0.91833


In [11]:
# VAEOversampler

vae_sampler = VAEOversampler(epochs=500,
                              intermediate_dim=512,
                              batch_size=64,
                              random_state=42,
                              verbose=False)
X_res, y_res = vae_sampler.fit_resample(X_train, y_train)

2023-07-10 16:40:19.812609: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-07-10 16:40:19.812640: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-07-10 16:40:19.812664: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (a-Modern-15-A5M): /proc/driver/nvidia/version does not exist
2023-07-10 16:40:19.812971: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
print('Resampled dataset shape %s' % Counter(y_res))
print('Ratio->  1 :', round(Counter(y_res)[1]/Counter(y_res)[0], 1))

Resampled dataset shape Counter({0.0: 2296, 1.0: 2296})
Ratio->  1 : 1.0


In [13]:
train_val(X_res, y_res, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[562  12]
 [  9  50]]

TNR: 0.97909
TPR: 0.84746
TNRxTPR: 0.82974
G-mean: 0.9109
__________________________________________________
Classifier:  LGBMClassifier
[[561  13]
 [  9  50]]

TNR: 0.97735
TPR: 0.84746
TNRxTPR: 0.82826
G-mean: 0.91009
__________________________________________________
Classifier:  XGBClassifier
[[561  13]
 [  9  50]]

TNR: 0.97735
TPR: 0.84746
TNRxTPR: 0.82826
G-mean: 0.91009
__________________________________________________
Classifier:  BaggingClassifier
[[562  12]
 [ 12  47]]

TNR: 0.97909
TPR: 0.79661
TNRxTPR: 0.77996
G-mean: 0.88315
__________________________________________________
Classifier:  RandomForestClassifier
[[564  10]
 [  9  50]]

TNR: 0.98258
TPR: 0.84746
TNRxTPR: 0.83269
G-mean: 0.91252
__________________________________________________
Ensemble predictions (majority voting):
[[562  12]
 [  9  50]]

TNR: 0.97909
TPR: 0.84746
TNRxTPR: 0.82974
G-mean: 0.9109


## Under/Oversampling combination  
Now we can tune the number of instances in each class to optimize the metric.  


In [25]:
# SMOTE

sm = SMOTE(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = sm.fit_resample(X_train, y_train)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7f80eacef1f0>
Traceback (most recent call last):
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
  File "/home/a/anaconda3/lib/python3.9/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'


In [28]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.2)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 2296, 0: 459})
Ratio->  1 : 5.0


In [29]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[534  40]
 [  4  55]]

TNR: 0.93031
TPR: 0.9322
TNRxTPR: 0.86724
G-mean: 0.93126
__________________________________________________
Classifier:  LGBMClassifier
[[530  44]
 [  3  56]]

TNR: 0.92334
TPR: 0.94915
TNRxTPR: 0.8764
G-mean: 0.93616
__________________________________________________
Classifier:  XGBClassifier
[[527  47]
 [  4  55]]

TNR: 0.91812
TPR: 0.9322
TNRxTPR: 0.85587
G-mean: 0.92513
__________________________________________________
Classifier:  BaggingClassifier
[[534  40]
 [  7  52]]

TNR: 0.93031
TPR: 0.88136
TNRxTPR: 0.81994
G-mean: 0.9055
__________________________________________________
Classifier:  RandomForestClassifier
[[529  45]
 [  5  54]]

TNR: 0.9216
TPR: 0.91525
TNRxTPR: 0.8435
G-mean: 0.91842
__________________________________________________
Ensemble predictions (majority voting):
[[536  38]
 [  4  55]]

TNR: 0.9338
TPR: 0.9322
TNRxTPR: 0.87049
G-mean: 0.933


In [22]:
# ROS

ros = RandomOverSampler(random_state=42, sampling_strategy=1) # N_rm / N_M
X_res, y_res = ros.fit_resample(X_train, y_train)

In [23]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.3)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1: 2296, 0: 689})
Ratio->  1 : 3.3


In [24]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[544  30]
 [  6  53]]

TNR: 0.94774
TPR: 0.89831
TNRxTPR: 0.85136
G-mean: 0.92269
__________________________________________________
Classifier:  LGBMClassifier
[[548  26]
 [  6  53]]

TNR: 0.9547
TPR: 0.89831
TNRxTPR: 0.85762
G-mean: 0.92608
__________________________________________________
Classifier:  XGBClassifier
[[541  33]
 [  2  57]]

TNR: 0.94251
TPR: 0.9661
TNRxTPR: 0.91056
G-mean: 0.95423
__________________________________________________
Classifier:  BaggingClassifier
[[544  30]
 [  4  55]]

TNR: 0.94774
TPR: 0.9322
TNRxTPR: 0.88348
G-mean: 0.93994
__________________________________________________
Classifier:  RandomForestClassifier
[[548  26]
 [  7  52]]

TNR: 0.9547
TPR: 0.88136
TNRxTPR: 0.84143
G-mean: 0.9173
__________________________________________________
Ensemble predictions (majority voting):
[[547  27]
 [  5  54]]

TNR: 0.95296
TPR: 0.91525
TNRxTPR: 0.8722
G-mean: 0.93392


In [14]:
# VAEOversampler

X_res, y_res = vae_sampler.resample(X_train, y_train, sampling_strategy=1)

In [15]:
# RUS

X_eq, y_eq = RUS(X_res, y_res, frac=.14)

print('Resampled dataset shape %s' % Counter(y_eq))
print('Ratio->  1 :', round(Counter(y_eq)[1]/Counter(y_eq)[0], 1))

Resampled dataset shape Counter({1.0: 2296, 0.0: 321})
Ratio->  1 : 7.2


In [16]:
train_val(X_eq, y_eq, X_test, y_test)

__________________________________________________
Classifier:  CatBoostClassifier
[[541  33]
 [  4  55]]

TNR: 0.94251
TPR: 0.9322
TNRxTPR: 0.87861
G-mean: 0.93734
__________________________________________________
Classifier:  LGBMClassifier
[[538  36]
 [  5  54]]

TNR: 0.93728
TPR: 0.91525
TNRxTPR: 0.85785
G-mean: 0.9262
__________________________________________________
Classifier:  XGBClassifier
[[537  37]
 [  3  56]]

TNR: 0.93554
TPR: 0.94915
TNRxTPR: 0.88797
G-mean: 0.94232
__________________________________________________
Classifier:  BaggingClassifier
[[534  40]
 [  6  53]]

TNR: 0.93031
TPR: 0.89831
TNRxTPR: 0.83571
G-mean: 0.91417
__________________________________________________
Classifier:  RandomForestClassifier
[[544  30]
 [  5  54]]

TNR: 0.94774
TPR: 0.91525
TNRxTPR: 0.86742
G-mean: 0.93135
__________________________________________________
Ensemble predictions (majority voting):
[[542  32]
 [  4  55]]

TNR: 0.94425
TPR: 0.9322
TNRxTPR: 0.88023
G-mean: 0.93821


## References  

  - Classification with Imbalanced Datasets:  
    https://sci2s.ugr.es/imbalanced  
  - Computer Vision:  Models, Learning, and Inference (Simon J.D. Prince):  
    http://www.computervisionmodels.com/  
  - Oversampling with VAEs:  
    https://towardsdatascience.com/oversampling-with-vaes-e410887fe51  
