From 7e34701daab1f4b9a9599156ce9f49520776d422 Mon Sep 17 00:00:00 2001 From: joaopfonseca Date: Fri, 17 Dec 2021 13:10:00 +0000 Subject: [PATCH] ENH add GeometricSMOTE implementation (#881) --- imblearn/over_sampling/__init__.py | 2 + imblearn/over_sampling/_smote/__init__.py | 3 + imblearn/over_sampling/_smote/geometric.py | 324 ++++++++++++++++++ .../_smote/tests/test_geometric_smote.py | 209 +++++++++++ 4 files changed, 538 insertions(+) create mode 100644 imblearn/over_sampling/_smote/geometric.py create mode 100644 imblearn/over_sampling/_smote/tests/test_geometric_smote.py diff --git a/imblearn/over_sampling/__init__.py b/imblearn/over_sampling/__init__.py index a959cbb43..36504d3d4 100644 --- a/imblearn/over_sampling/__init__.py +++ b/imblearn/over_sampling/__init__.py @@ -8,6 +8,7 @@ from ._smote import SMOTE from ._smote import BorderlineSMOTE from ._smote import KMeansSMOTE +from ._smote import GeometricSMOTE from ._smote import SVMSMOTE from ._smote import SMOTENC from ._smote import SMOTEN @@ -16,6 +17,7 @@ "ADASYN", "RandomOverSampler", "KMeansSMOTE", + "GeometricSMOTE", "SMOTE", "BorderlineSMOTE", "SVMSMOTE", diff --git a/imblearn/over_sampling/_smote/__init__.py b/imblearn/over_sampling/_smote/__init__.py index aaf4dd348..42cd89ce8 100644 --- a/imblearn/over_sampling/_smote/__init__.py +++ b/imblearn/over_sampling/_smote/__init__.py @@ -4,6 +4,8 @@ from .cluster import KMeansSMOTE +from .geometric import GeometricSMOTE + from .filter import BorderlineSMOTE from .filter import SVMSMOTE @@ -12,6 +14,7 @@ "SMOTEN", "SMOTENC", "KMeansSMOTE", + "GeometricSMOTE", "BorderlineSMOTE", "SVMSMOTE", ] diff --git a/imblearn/over_sampling/_smote/geometric.py b/imblearn/over_sampling/_smote/geometric.py new file mode 100644 index 000000000..a7601a01e --- /dev/null +++ b/imblearn/over_sampling/_smote/geometric.py @@ -0,0 +1,324 @@ +"""Class to perform over-sampling using Geometric SMOTE.""" + +# Author: Georgios Douzas +# License: BSD 3 clause + +import numpy as np 
+from numpy.linalg import norm
+from sklearn.utils import check_random_state
+from imblearn.over_sampling.base import BaseOverSampler
+from imblearn.utils import check_neighbors_object, Substitution
+from imblearn.utils._docstring import _random_state_docstring
+
+SELECTION_STRATEGY = ('combined', 'majority', 'minority')
+
+
+def _make_geometric_sample(
+    center, surface_point, truncation_factor, deformation_factor, random_state
+):
+    """A support function that returns an artificial point inside
+    the geometric region defined by the center and surface points.
+
+    Parameters
+    ----------
+    center : ndarray, shape (n_features, )
+        Center point of the geometric region.
+
+    surface_point : ndarray, shape (n_features, )
+        Surface point of the geometric region.
+
+    truncation_factor : float
+        The type of truncation. The values should be in the [-1.0, 1.0] range.
+
+    deformation_factor : float
+        The type of geometry. The values should be in the [0.0, 1.0] range.
+
+    random_state : RandomState instance
+        Random number generator used to draw the sample.
+
+    Returns
+    -------
+    point : ndarray, shape (n_features, )
+        Synthetically generated sample.
+
+    """
+
+    # Zero radius case
+    if np.array_equal(center, surface_point):
+        return center
+
+    # Generate a point on the surface of a unit hyper-sphere
+    radius = norm(center - surface_point)
+    normal_samples = random_state.normal(size=center.size)
+    point_on_unit_sphere = normal_samples / norm(normal_samples)
+    point = (random_state.uniform(size=1) ** (1 / center.size)) * point_on_unit_sphere
+
+    # Parallel unit vector
+    parallel_unit_vector = (surface_point - center) / norm(surface_point - center)
+
+    # Truncation
+    close_to_opposite_boundary = (
+        truncation_factor > 0
+        and np.dot(point, parallel_unit_vector) < truncation_factor - 1
+    )
+    close_to_boundary = (
+        truncation_factor < 0
+        and np.dot(point, parallel_unit_vector) > truncation_factor + 1
+    )
+    if close_to_opposite_boundary or close_to_boundary:
+        point -= 2 * np.dot(point, parallel_unit_vector) * parallel_unit_vector
+
+    # Deformation
+    parallel_point_position = np.dot(point, parallel_unit_vector) * parallel_unit_vector
+    perpendicular_point_position = point - parallel_point_position
+    point = (
+        parallel_point_position
+        + (1 - deformation_factor) * perpendicular_point_position
+    )
+
+    # Translation
+    point = center + radius * point
+
+    return point
+
+
+@Substitution(
+    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
+    random_state=_random_state_docstring,
+)
+class GeometricSMOTE(BaseOverSampler):
+    """Class to perform over-sampling using Geometric SMOTE.
+
+    This algorithm is an implementation of Geometric SMOTE, a geometrically
+    enhanced drop-in replacement for SMOTE as presented in [1]_.
+
+    Read more in the User Guide.
+
+    Parameters
+    ----------
+    {sampling_strategy}
+
+    {random_state}
+
+    truncation_factor : float, optional (default=1.0)
+        The type of truncation. The values should be in the [-1.0, 1.0] range.
+
+    deformation_factor : float, optional (default=0.0)
+        The type of geometry. The values should be in the [0.0, 1.0] range.
+
+    selection_strategy : str, optional (default='combined')
+        The type of Geometric SMOTE algorithm with the following options:
+        ``'combined'``, ``'majority'``, ``'minority'``.
+
+    k_neighbors : int or object, optional (default=5)
+        If ``int``, number of nearest neighbours to use when synthetic
+        samples are constructed for the minority method. If object, an estimator
+        that inherits from :class:`sklearn.neighbors.base.KNeighborsMixin` that
+        will be used to find the k_neighbors.
+
+    n_jobs : int, optional (default=1)
+        The number of threads to open if possible.
+
+    Notes
+    -----
+    See the original paper: [1]_ for more details.
+
+    Supports multi-class resampling. A one-vs.-rest scheme is used as
+    originally proposed in [2]_.
+
+    References
+    ----------
+
+    .. [1] G. Douzas, F. Bacao, "Geometric SMOTE:
+       a geometrically enhanced drop-in replacement for SMOTE",
+       Information Sciences, vol. 501, pp. 118-135, 2019.
+
+    .. [2] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, "SMOTE:
+       synthetic minority over-sampling technique", Journal of Artificial
+       Intelligence Research, vol. 16, pp. 321-357, 2002.
+
+    Examples
+    --------
+
+    >>> from collections import Counter
+    >>> from sklearn.datasets import make_classification
+    >>> from imblearn.over_sampling import GeometricSMOTE # doctest: +NORMALIZE_WHITESPACE
+    >>> X, y = make_classification(n_classes=2, class_sep=2,
+    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
+    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
+    >>> print('Original dataset shape %s' % Counter(y))
+    Original dataset shape Counter({{1: 900, 0: 100}})
+    >>> gsmote = GeometricSMOTE(random_state=1)
+    >>> X_res, y_res = gsmote.fit_resample(X, y)
+    >>> print('Resampled dataset shape %s' % Counter(y_res))
+    Resampled dataset shape Counter({{0: 900, 1: 900}})
+
+    """
+
+    def __init__(
+        self,
+        sampling_strategy='auto',
+        random_state=None,
+        truncation_factor=1.0,
+        deformation_factor=0.0,
+        selection_strategy='combined',
+        k_neighbors=5,
+        n_jobs=1,
+    ):
+        super().__init__(sampling_strategy=sampling_strategy)
+        self.random_state = random_state
+        self.truncation_factor = truncation_factor
+        self.deformation_factor = deformation_factor
+        self.selection_strategy = selection_strategy
+        self.k_neighbors = k_neighbors
+        self.n_jobs = n_jobs
+
+    def _validate_estimator(self):
+        """Create the necessary attributes for Geometric SMOTE."""
+
+        # Check random state
+        self.random_state_ = check_random_state(self.random_state)
+
+        # Validate strategy
+        if self.selection_strategy not in SELECTION_STRATEGY:
+            error_msg = (
+                'Unknown selection_strategy for Geometric SMOTE algorithm. '
+                'Choices are {}. Got {} instead.'
+            )
+            raise ValueError(
+                error_msg.format(SELECTION_STRATEGY, self.selection_strategy)
+            )
+
+        # Create nearest neighbors object for positive class. Always needed:
+        # 'majority' falls back to 'minority' when no negative samples exist.
+        self.nns_pos_ = check_neighbors_object(
+            'nns_positive', self.k_neighbors, additional_neighbor=1
+        )
+        self.nns_pos_.set_params(n_jobs=self.n_jobs)
+
+        # Create nearest neighbors object for negative class
+        if self.selection_strategy in ('majority', 'combined'):
+            self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
+            self.nn_neg_.set_params(n_jobs=self.n_jobs)
+
+    def _make_geometric_samples(self, X, y, pos_class_label, n_samples):
+        """A support function that returns artificial samples inside
+        the geometric region defined by nearest neighbors.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Matrix containing the data which have to be sampled.
+        y : array-like, shape (n_samples, )
+            Corresponding label for each sample in X.
+        pos_class_label : str or int
+            The minority class (positive class) target value.
+        n_samples : int
+            The number of samples to generate.
+
+        Returns
+        -------
+        X_new : ndarray, shape (n_samples_new, n_features)
+            Synthetically generated samples.
+        y_new : ndarray, shape (n_samples_new, )
+            Target values for synthetic samples.
+
+        """
+
+        # Return zero new samples
+        if n_samples == 0:
+            return (
+                np.array([], dtype=X.dtype).reshape(0, X.shape[1]),
+                np.array([], dtype=y.dtype),
+            )
+
+        # Select positive class samples
+        X_pos = X[y == pos_class_label]
+
+        # Force minority strategy if no negative class samples are present
+        self.selection_strategy_ = (
+            'minority' if len(X) == len(X_pos) else self.selection_strategy
+        )
+
+        # Minority or combined strategy
+        if self.selection_strategy_ in ('minority', 'combined'):
+            self.nns_pos_.fit(X_pos)
+            points_pos = self.nns_pos_.kneighbors(X_pos)[1][:, 1:]
+            samples_indices = self.random_state_.randint(
+                low=0, high=points_pos.size, size=n_samples
+            )
+            rows = np.floor_divide(samples_indices, points_pos.shape[1])
+            cols = np.mod(samples_indices, points_pos.shape[1])
+
+        # Majority or combined strategy
+        if self.selection_strategy_ in ('majority', 'combined'):
+            X_neg = X[y != pos_class_label]
+            self.nn_neg_.fit(X_neg)
+            points_neg = self.nn_neg_.kneighbors(X_pos)[1]
+            if self.selection_strategy_ == 'majority':
+                samples_indices = self.random_state_.randint(
+                    low=0, high=points_neg.size, size=n_samples
+                )
+                rows = np.floor_divide(samples_indices, points_neg.shape[1])
+                cols = np.mod(samples_indices, points_neg.shape[1])
+
+        # Generate new samples
+        X_new = np.zeros((n_samples, X.shape[1]))
+        for ind, (row, col) in enumerate(zip(rows, cols)):
+
+            # Define center point
+            center = X_pos[row]
+
+            # Minority strategy
+            if self.selection_strategy_ == 'minority':
+                surface_point = X_pos[points_pos[row, col]]
+
+            # Majority strategy
+            elif self.selection_strategy_ == 'majority':
+                surface_point = X_neg[points_neg[row, col]]
+
+            # Combined strategy
+            else:
+                surface_point_pos = X_pos[points_pos[row, col]]
+                surface_point_neg = X_neg[points_neg[row, 0]]
+                radius_pos = norm(center - surface_point_pos)
+                radius_neg = norm(center - surface_point_neg)
+                surface_point = (
+                    surface_point_neg if radius_pos > radius_neg else surface_point_pos
+                )
+
+            # Append new sample
+            X_new[ind] = _make_geometric_sample(
+                center,
+                surface_point,
+                self.truncation_factor,
+                self.deformation_factor,
+                self.random_state_,
+            )
+
+        # Create new samples for target variable
+        y_new = np.array([pos_class_label] * len(samples_indices))
+
+        return X_new, y_new
+
+    def _fit_resample(self, X, y):
+        """Append the synthetic samples of each resampled class to the data."""
+        # Validate estimator's parameters
+        self._validate_estimator()
+
+        # Copy data
+        X_resampled, y_resampled = X.copy(), y.copy()
+
+        # Resample data
+        for class_label, n_samples in self.sampling_strategy_.items():
+
+            # Apply gsmote mechanism
+            X_new, y_new = self._make_geometric_samples(X, y, class_label, n_samples)
+
+            # Append new data
+            X_resampled, y_resampled = (
+                np.vstack((X_resampled, X_new)),
+                np.hstack((y_resampled, y_new)),
+            )
+
+        return X_resampled, y_resampled
diff --git a/imblearn/over_sampling/_smote/tests/test_geometric_smote.py b/imblearn/over_sampling/_smote/tests/test_geometric_smote.py
new file mode 100644
index 000000000..1af600f94
--- /dev/null
+++ b/imblearn/over_sampling/_smote/tests/test_geometric_smote.py
@@ -0,0 +1,209 @@
+"""
+Test the geometric_smote module.
+"""
+
+from collections import Counter
+
+import pytest
+import numpy as np
+from numpy.linalg import norm
+from sklearn.utils import check_random_state
+from sklearn.datasets import make_classification
+
+from ..geometric import _make_geometric_sample, GeometricSMOTE, SELECTION_STRATEGY
+
+RND_SEED = 0
+RANDOM_STATE = check_random_state(RND_SEED)
+CENTERS = [
+    RANDOM_STATE.random_sample((2,)),
+    2.6 * RANDOM_STATE.random_sample((4,)),
+    3.2 * RANDOM_STATE.random_sample((10,)),
+    -0.5 * RANDOM_STATE.random_sample((1,)),
+]
+SURFACE_POINTS = [
+    RANDOM_STATE.random_sample((2,)),
+    5.2 * RANDOM_STATE.random_sample((4,)),
+    -3.5 * RANDOM_STATE.random_sample((10,)),
+    -10.9 * RANDOM_STATE.random_sample((1,)),
+]
+TRUNCATION_FACTORS = [-1.0, -0.5, 0.0, 0.5, 1.0]
+DEFORMATION_FACTORS = [0.0, 0.25, 0.5, 0.75, 1.0]
+
+
+@pytest.mark.parametrize(
+    'center,surface_point',
+    [
+        (CENTERS[0], SURFACE_POINTS[0]),
+        (CENTERS[1], SURFACE_POINTS[1]),
+        (CENTERS[2], SURFACE_POINTS[2]),
+        (CENTERS[3], SURFACE_POINTS[3]),
+    ],
+)
+def test_make_geometric_sample_hypersphere(center, surface_point):
+    """Test the generation of points inside a hypersphere."""
+    point = _make_geometric_sample(center, surface_point, 0.0, 0.0, RANDOM_STATE)
+    rel_point = point - center
+    rel_surface_point = surface_point - center
+    np.testing.assert_array_less(0.0, norm(rel_surface_point) - norm(rel_point))
+
+
+@pytest.mark.parametrize(
+    'surface_point,deformation_factor',
+    [
+        (np.array([1.0, 0.0]), 0.0),
+        (2.6 * np.array([0.0, 1.0]), 0.25),
+        (3.2 * np.array([0.0, 1.0, 0.0, 0.0]), 0.50),
+        (0.5 * np.array([0.0, 0.0, 1.0]), 0.75),
+        (6.7 * np.array([0.0, 0.0, 1.0, 0.0, 0.0]), 1.0),
+    ],
+)
+def test_make_geometric_sample_half_hypersphere(surface_point, deformation_factor):
+    """Test the generation of points inside a half-hypersphere."""
+    center = np.zeros(surface_point.shape)
+    point = _make_geometric_sample(
+        center, surface_point, 1.0, deformation_factor, RANDOM_STATE
+    )
+    np.testing.assert_array_less(0.0, norm(surface_point) - norm(point))
+    np.testing.assert_array_less(0.0, np.dot(point, surface_point))
+
+
+@pytest.mark.parametrize(
+    'center,surface_point,truncation_factor',
+    [
+        (center, surface_point, truncation_factor)
+        for center, surface_point in zip(CENTERS, SURFACE_POINTS)
+        for truncation_factor in TRUNCATION_FACTORS
+    ],
+)
+def test_make_geometric_sample_line_segment(center, surface_point, truncation_factor):
+    """Test the generation of points on a line segment."""
+    point = _make_geometric_sample(
+        center, surface_point, truncation_factor, 1.0, RANDOM_STATE
+    )
+    rel_point = point - center
+    rel_surface_point = surface_point - center
+    dot_product = np.dot(rel_point, rel_surface_point)
+    norms_product = norm(rel_point) * norm(rel_surface_point)
+    np.testing.assert_array_less(0.0, norm(rel_surface_point) - norm(rel_point))
+    dot_product = (
+        np.abs(dot_product) if truncation_factor == 0.0 else (-1) * dot_product
+    )
+    np.testing.assert_allclose(np.abs(dot_product) / norms_product, 1.0)
+
+
+def test_gsmote_default_init():
+    """Test the initialization with default parameters."""
+    gsmote = GeometricSMOTE()
+    assert gsmote.sampling_strategy == 'auto'
+    assert gsmote.random_state is None
+    assert gsmote.truncation_factor == 1.0
+    assert gsmote.deformation_factor == 0.0
+    assert gsmote.selection_strategy == 'combined'
+    assert gsmote.k_neighbors == 5
+    assert gsmote.n_jobs == 1
+
+
+def test_gsmote_fit():
+    """Test fit method."""
+    n_samples, weights = 200, [0.6, 0.4]
+    X, y = make_classification(
+        random_state=RND_SEED, n_samples=n_samples, weights=weights
+    )
+    gsmote = GeometricSMOTE(random_state=RANDOM_STATE).fit(X, y)
+    assert gsmote.sampling_strategy_ == {1: 40}
+
+
+def test_gsmote_invalid_selection_strategy():
+    """Test invalid selection strategy."""
+    n_samples, weights = 200, [0.6, 0.4]
+    X, y = make_classification(
+        random_state=RND_SEED, n_samples=n_samples, weights=weights
+    )
+    gsmote = GeometricSMOTE(random_state=RANDOM_STATE, selection_strategy='Minority')
+    with pytest.raises(ValueError):
+        gsmote.fit_resample(X, y)
+
+
+@pytest.mark.parametrize('selection_strategy', ['combined', 'minority', 'majority'])
+def test_gsmote_nn(selection_strategy):
+    """Test nearest neighbors object."""
+    n_samples, weights = 200, [0.6, 0.4]
+    X, y = make_classification(
+        random_state=RND_SEED, n_samples=n_samples, weights=weights
+    )
+    gsmote = GeometricSMOTE(
+        random_state=RANDOM_STATE, selection_strategy=selection_strategy
+    )
+    _ = gsmote.fit_resample(X, y)
+    if selection_strategy in ('minority', 'combined'):
+        assert gsmote.nns_pos_.n_neighbors == gsmote.k_neighbors + 1
+    if selection_strategy in ('majority', 'combined'):
+        assert gsmote.nn_neg_.n_neighbors == 1
+
+
+@pytest.mark.parametrize(
+    'selection_strategy, truncation_factor, deformation_factor',
+    [
+        (selection_strategy, truncation_factor, deformation_factor)
+        for selection_strategy in SELECTION_STRATEGY
+        for truncation_factor in TRUNCATION_FACTORS
+        for deformation_factor in DEFORMATION_FACTORS
+    ],
+)
+def test_gsmote_fit_resample_binary(
+    selection_strategy, truncation_factor, deformation_factor
+):
+    """Test fit and sample for binary class case."""
+    n_maj, n_min, step, min_coor, max_coor = 12, 5, 0.5, 0.0, 8.5
+    X = np.repeat(np.arange(min_coor, max_coor, step), 2).reshape(-1, 2)
+    y = np.concatenate([np.repeat(0, n_maj), np.repeat(1, n_min)])
+    radius = np.sqrt(0.5) * step
+    k_neighbors = 1
+    gsmote = GeometricSMOTE(
+        'auto',
+        RANDOM_STATE,
+        truncation_factor,
+        deformation_factor,
+        selection_strategy,
+        k_neighbors,
+    )
+    X_resampled, y_resampled = gsmote.fit_resample(X, y)
+    assert gsmote.sampling_strategy_ == {1: (n_maj - n_min)}
+    assert y_resampled.sum() == n_maj
+    np.testing.assert_array_less(X[n_maj - 1] - radius, X_resampled[n_maj + n_min])
+
+
+@pytest.mark.parametrize(
+    'selection_strategy, truncation_factor, deformation_factor',
+    [
+        (selection_strategy, truncation_factor, deformation_factor)
+        for selection_strategy in SELECTION_STRATEGY
+        for truncation_factor in TRUNCATION_FACTORS
+        for deformation_factor in DEFORMATION_FACTORS
+    ],
+)
+def test_gsmote_fit_resample_multiclass(
+    selection_strategy, truncation_factor, deformation_factor
+):
+    """Test fit and sample for multiclass case."""
+    n_samples, weights = 100, [0.75, 0.15, 0.10]
+    X, y = make_classification(
+        random_state=RND_SEED,
+        n_samples=n_samples,
+        weights=weights,
+        n_classes=3,
+        n_informative=5,
+    )
+    k_neighbors, majority_label = 1, 0
+    gsmote = GeometricSMOTE(
+        'auto',
+        RANDOM_STATE,
+        truncation_factor,
+        deformation_factor,
+        selection_strategy,
+        k_neighbors,
+    )
+    _, y_resampled = gsmote.fit_resample(X, y)
+    assert majority_label not in gsmote.sampling_strategy_.keys()
+    np.testing.assert_array_equal(np.unique(y), np.unique(y_resampled))
+    assert len(set(Counter(y_resampled).values())) == 1