# Evaluation of oversamplers with a set of classifiers on one database

In this notebook, we give an example of optimizing oversamplers and classifiers for a given dataset.

In [1]:
import ast
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import smote_variants as sv

import imbalanced_databases as imbd

In [2]:
# The evaluation procedure uses a directory for caching; it is placed in the
# user's home directory.

cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')

# exist_ok=True keeps the cell idempotent on re-runs and avoids the race
# between a separate existence check and the directory creation.
os.makedirs(cache_path, exist_ok=True)

In [3]:
# Dataset used throughout the notebook: 'glass0' from the imbalanced_databases
# package. Later cells read its 'data' and 'target' entries.

dataset = imbd.load_glass0()

In [4]:
# Classifiers to be evaluated on the oversampled data, both created with their
# default hyperparameters.

knn_classifier, dt_classifier = KNeighborsClassifier(), DecisionTreeClassifier()

In [5]:
# Run the evaluation of the 5 quickest oversamplers combined with both
# classifiers on the dataset, using 5 parallel jobs and at most 35 random but
# meaningful parameter combinations. Intermediate results are cached under
# cache_path.

results = sv.evaluate_oversamplers(
    datasets=[dataset],
    samplers=sv.get_n_quickest_oversamplers(5),
    classifiers=[knn_classifier, dt_classifier],
    cache_path=cache_path,
    max_samp_par_comb=35,
    n_jobs=5,
)

2022-03-27 16:02:06,725:INFO:creating cache directory
2022-03-27 16:02:06,728:INFO:dataset: glass0, samplings_available: False, evaluations_available: False
2022-03-27 16:02:06,728:INFO:doing the folding
2022-03-27 16:02:06,729:INFO:Folding doing folding folding_glass0.pickle
2022-03-27 16:02:06,734:INFO:Folding dumping to file folding_glass0.pickle
2022-03-27 16:02:06,738:INFO:do the samplings
2022-03-27 16:02:06,738:INFO:create sampling objects, random_state: 
2022-03-27 16:02:06,739:INFO:samplers: [<class 'smote_variants._smote_variants.SPY'>, <class 'smote_variants._smote_variants.OUPS'>, <class 'smote_variants._smote_variants.SMOTE_D'>, <class 'smote_variants._smote_variants.NT_SMOTE'>, <class 'smote_variants._smote_variants.Gazzah'>]
2022-03-27 16:02:06,740:INFO:[{'n_neighbors': 3, 'threshold': 0.3}, {'n_neighbors': 3, 'threshold': 0.5}, {'n_neighbors': 3, 'threshold': 0.7}, {'n_neighbors': 5, 'threshold': 0.3}, {'n_neighbors': 5, 'threshold': 0.5}, {'n_neighbors': 5, 'threshold'

["('glass0', OrderedDict([('n_neighbors', 7), ('random_state', None), ('threshold', 0.3)]), 'KNeighborsClassifier', OrderedDict([('algorithm', 'auto'), ('leaf_size', 30), ('metric', 'minkowski'), ('metric_params', None), ('n_jobs', None), ('n_neighbors', 5), ('p', 2), ('weights', 'uniform')]))", "('glass0', OrderedDict([('n_neighbors', 7), ('random_state', None), ('threshold', 0.3)]), 'DecisionTreeClassifier', OrderedDict([('ccp_alpha', 0.0), ('class_weight', None), ('criterion', 'gini'), ('max_depth', None), ('max_features', None), ('max_leaf_nodes', None), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 2), ('min_weight_fraction_leaf', 0.0), ('random_state', None), ('splitter', 'best')]))"]
["('glass0', OrderedDict([('n_neighbors', 3), ('random_state', None), ('threshold', 0.3)]), 'KNeighborsClassifier', OrderedDict([('algorithm', 'auto'), ('leaf_size', 30), ('metric', 'minkowski'), ('metric_params', None), ('n_jobs', None), ('n_neighbors', 5), ('p', 2)

2022-03-27 16:02:10,733:INFO:Sampling sampler parameter string OrderedDict([('proportion', 0.25), ('random_state', None)])
2022-03-27 16:02:10,735:INFO:{'k': 5, 'proportion': 0.25, 'random_state': None}
2022-03-27 16:02:10,736:INFO:Sampling sampler parameter string OrderedDict([('k', 5), ('proportion', 0.25), ('random_state', None)])
2022-03-27 16:02:10,738:INFO:{'k': 3, 'proportion': 0.25, 'random_state': None}
2022-03-27 16:02:10,739:INFO:Sampling sampler parameter string OrderedDict([('k', 3), ('proportion', 0.25), ('random_state', None)])
2022-03-27 16:02:10,741:INFO:{'k': 7, 'proportion': 0.25, 'random_state': None}
2022-03-27 16:02:10,742:INFO:Sampling sampler parameter string OrderedDict([('k', 7), ('proportion', 0.25), ('random_state', None)])
2022-03-27 16:02:10,744:INFO:{'proportion': 0.25, 'random_state': None}
2022-03-27 16:02:10,745:INFO:Sampling sampler parameter string OrderedDict([('proportion', 0.25), ('random_state', None)])
2022-03-27 16:02:10,746:INFO:{'n_components

["('glass0', OrderedDict([('proportion', 0.25), ('random_state', None)]), 'KNeighborsClassifier', OrderedDict([('algorithm', 'auto'), ('leaf_size', 30), ('metric', 'minkowski'), ('metric_params', None), ('n_jobs', None), ('n_neighbors', 5), ('p', 2), ('weights', 'uniform')]))", "('glass0', OrderedDict([('proportion', 0.25), ('random_state', None)]), 'DecisionTreeClassifier', OrderedDict([('ccp_alpha', 0.0), ('class_weight', None), ('criterion', 'gini'), ('max_depth', None), ('max_features', None), ('max_leaf_nodes', None), ('min_impurity_decrease', 0.0), ('min_samples_leaf', 1), ('min_samples_split', 2), ('min_weight_fraction_leaf', 0.0), ('random_state', None), ('splitter', 'best')]))"]
["('glass0', OrderedDict([('k', 5), ('proportion', 0.25), ('random_state', None)]), 'KNeighborsClassifier', OrderedDict([('algorithm', 'auto'), ('leaf_size', 30), ('metric', 'minkowski'), ('metric_params', None), ('n_jobs', None), ('n_neighbors', 5), ('p', 2), ('weights', 'uniform')]))", "('glass0', Or

2022-03-27 16:02:13,347:INFO:concatenating the results
2022-03-27 16:02:13,496:INFO:aggregating the results


In [6]:
# Locate the oversampler and classifier combination with the highest AUC score.
# Note: idxmax() yields the index label of that row in `results`, not the AUC
# value itself.

highest_auc_score = results['auc'].idxmax()

In [7]:
# Read the classifier and the oversampler, together with their parameters,
# from the row of `results` that has the highest AUC score. Row and columns
# are selected in a single .loc lookup.

cl, cl_par, samp, samp_par = results.loc[
    highest_auc_score,
    ['classifier', 'classifier_parameters_auc', 'sampler', 'sampler_parameters_auc'],
]

In [8]:
# Instantiate the oversampler and the classifier that provided the highest AUC
# score. `samp` and `cl` hold class names; `samp_par` and `cl_par` hold the
# string form of the corresponding keyword-argument dictionaries.

# ast.literal_eval only parses Python literals, so -- unlike eval -- it cannot
# execute arbitrary code contained in strings read back from the results.
samp_obj = getattr(sv, samp)(**ast.literal_eval(samp_par))

# Resolve the classifier class through an explicit lookup over the classifiers
# that were evaluated, instead of eval-ing the class name.
classifier_classes = {type(clf).__name__: type(clf)
                      for clf in (knn_classifier, dt_classifier)}
cl_obj = classifier_classes[cl](**ast.literal_eval(cl_par))

In [9]:
# Oversample the complete dataset with the selected oversampler, then fit the
# selected classifier on the oversampled data. The fit call stays the last
# expression, so the fitted classifier is displayed as the cell output.

X_samp, y_samp = samp_obj.sample(dataset['data'], dataset['target'])

cl_obj.fit(X_samp, y_samp)

2022-03-27 16:02:35,149:INFO:OUPS: Running sampling via ('OUPS', "{'proportion': 2.0, 'n_jobs': 1, 'random_state': None}")
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


KNeighborsClassifier()