Skip to content

Commit

Permalink
ENH add GeometricSMOTE implementation (scikit-learn-contrib#881)
Browse files Browse the repository at this point in the history
  • Loading branch information
joaopfonseca committed Dec 17, 2021
1 parent f1abf75 commit 7e34701
Show file tree
Hide file tree
Showing 4 changed files with 538 additions and 0 deletions.
2 changes: 2 additions & 0 deletions imblearn/over_sampling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ._smote import SMOTE
from ._smote import BorderlineSMOTE
from ._smote import KMeansSMOTE
from ._smote import GeometricSMOTE
from ._smote import SVMSMOTE
from ._smote import SMOTENC
from ._smote import SMOTEN
Expand All @@ -16,6 +17,7 @@
"ADASYN",
"RandomOverSampler",
"KMeansSMOTE",
"GeometricSMOTE",
"SMOTE",
"BorderlineSMOTE",
"SVMSMOTE",
Expand Down
3 changes: 3 additions & 0 deletions imblearn/over_sampling/_smote/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from .cluster import KMeansSMOTE

from .geometric import GeometricSMOTE

from .filter import BorderlineSMOTE
from .filter import SVMSMOTE

Expand All @@ -12,6 +14,7 @@
"SMOTEN",
"SMOTENC",
"KMeansSMOTE",
"GeometricSMOTE",
"BorderlineSMOTE",
"SVMSMOTE",
]
324 changes: 324 additions & 0 deletions imblearn/over_sampling/_smote/geometric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
"""Class to perform over-sampling using Geometric SMOTE."""

# Author: Georgios Douzas <gdouzas@icloud.com>
# License: BSD 3 clause

import numpy as np
from numpy.linalg import norm
from sklearn.utils import check_random_state
from imblearn.over_sampling.base import BaseOverSampler
from imblearn.utils import check_neighbors_object, Substitution
from imblearn.utils._docstring import _random_state_docstring

# Accepted values for the ``selection_strategy`` parameter of GeometricSMOTE;
# ``GeometricSMOTE._validate_estimator`` rejects anything not listed here.
SELECTION_STRATEGY = ('combined', 'majority', 'minority')


def _make_geometric_sample(
center, surface_point, truncation_factor, deformation_factor, random_state
):
"""A support function that returns an artificial point inside
the geometric region defined by the center and surface points.
Parameters
----------
center : ndarray, shape (n_features, )
Center point of the geometric region.
surface_point : ndarray, shape (n_features, )
Surface point of the geometric region.
truncation_factor : float, optional (default=0.0)
The type of truncation. The values should be in the [-1.0, 1.0] range.
deformation_factor : float, optional (default=0.0)
The type of geometry. The values should be in the [0.0, 1.0] range.
random_state : int, RandomState instance or None
Control the randomization of the algorithm.
Returns
-------
point : ndarray, shape (n_features, )
Synthetically generated sample.
"""

# Zero radius case
if np.array_equal(center, surface_point):
return center

# Generate a point on the surface of a unit hyper-sphere
radius = norm(center - surface_point)
normal_samples = random_state.normal(size=center.size)
point_on_unit_sphere = normal_samples / norm(normal_samples)
point = (random_state.uniform(size=1) ** (1 / center.size)) * point_on_unit_sphere

# Parallel unit vector
parallel_unit_vector = (surface_point - center) / norm(surface_point - center)

# Truncation
close_to_opposite_boundary = (
truncation_factor > 0
and np.dot(point, parallel_unit_vector) < truncation_factor - 1
)
close_to_boundary = (
truncation_factor < 0
and np.dot(point, parallel_unit_vector) > truncation_factor + 1
)
if close_to_opposite_boundary or close_to_boundary:
point -= 2 * np.dot(point, parallel_unit_vector) * parallel_unit_vector

# Deformation
parallel_point_position = np.dot(point, parallel_unit_vector) * parallel_unit_vector
perpendicular_point_position = point - parallel_point_position
point = (
parallel_point_position
+ (1 - deformation_factor) * perpendicular_point_position
)

# Translation
point = center + radius * point

return point


@Substitution(
    sampling_strategy=BaseOverSampler._sampling_strategy_docstring,
    random_state=_random_state_docstring,
)
class GeometricSMOTE(BaseOverSampler):
    """Class to perform over-sampling using Geometric SMOTE.

    This algorithm is an implementation of Geometric SMOTE, a geometrically
    enhanced drop-in replacement for SMOTE as presented in [1]_.

    Read more in the :ref:`User Guide <user_guide>`.

    Parameters
    ----------
    {sampling_strategy}

    {random_state}

    truncation_factor : float, optional (default=1.0)
        The type of truncation. The values should be in the [-1.0, 1.0] range.

    deformation_factor : float, optional (default=0.0)
        The type of geometry. The values should be in the [0.0, 1.0] range.

    selection_strategy : str, optional (default='combined')
        The type of Geometric SMOTE algorithm with the following options:
        ``'combined'``, ``'majority'``, ``'minority'``.

    k_neighbors : int or object, optional (default=5)
        If ``int``, number of nearest neighbours to use when synthetic
        samples are constructed from the minority class (``'minority'`` and
        ``'combined'`` strategies). If object, an estimator that inherits
        from :class:`sklearn.neighbors.base.KNeighborsMixin` that will be
        used to find the k_neighbors.

    n_jobs : int, optional (default=1)
        The number of threads to open if possible.

    Attributes
    ----------
    random_state_ : RandomState instance
        Random number generator built from ``random_state`` during
        resampling.

    nns_pos_ : object
        Nearest neighbors estimator fitted on the positive class samples.

    nn_neg_ : object
        Nearest neighbor estimator fitted on the negative class samples.
        Only created for the ``'majority'`` and ``'combined'`` strategies.

    selection_strategy_ : str
        Strategy effectively applied to the last resampled class.

    Notes
    -----
    See the original paper: [1]_ for more details.

    Supports multi-class resampling. A one-vs.-rest scheme is used as
    originally proposed in [2]_.

    References
    ----------
    .. [1] G. Douzas, F. Bacao, "Geometric SMOTE:
       a geometrically enhanced drop-in replacement for SMOTE",
       Information Sciences, vol. 501, pp. 118-135, 2019.

    .. [2] N. V. Chawla, K. W. Bowyer, L. O. Hall, W. P. Kegelmeyer, "SMOTE:
       synthetic minority over-sampling technique", Journal of Artificial
       Intelligence Research, vol. 16, pp. 321-357, 2002.

    Examples
    --------
    >>> from collections import Counter
    >>> from sklearn.datasets import make_classification
    >>> from imblearn.over_sampling import \
GeometricSMOTE # doctest: +NORMALIZE_WHITESPACE
    >>> X, y = make_classification(n_classes=2, class_sep=2,
    ... weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 900, 0: 100}})
    >>> gsmote = GeometricSMOTE(random_state=1)
    >>> X_res, y_res = gsmote.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{0: 900, 1: 900}})
    """

    def __init__(
        self,
        sampling_strategy='auto',
        random_state=None,
        truncation_factor=1.0,
        deformation_factor=0.0,
        selection_strategy='combined',
        k_neighbors=5,
        n_jobs=1,
    ):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state
        self.truncation_factor = truncation_factor
        self.deformation_factor = deformation_factor
        self.selection_strategy = selection_strategy
        self.k_neighbors = k_neighbors
        self.n_jobs = n_jobs

    def _validate_estimator(self):
        """Create the necessary attributes for Geometric SMOTE.

        Raises
        ------
        ValueError
            If ``selection_strategy`` is not one of ``SELECTION_STRATEGY``.
        """

        # Check random state
        self.random_state_ = check_random_state(self.random_state)

        # Validate strategy
        if self.selection_strategy not in SELECTION_STRATEGY:
            error_msg = (
                'Unknown selection_strategy for Geometric SMOTE algorithm. '
                'Choices are {}. Got {} instead.'
            )
            raise ValueError(
                error_msg.format(SELECTION_STRATEGY, self.selection_strategy)
            )

        # Create nearest neighbors object for positive class. It is built
        # for every strategy because `_make_geometric_samples` falls back to
        # the 'minority' strategy when no negative samples are present, even
        # if 'majority' was requested.
        self.nns_pos_ = check_neighbors_object(
            'nns_positive', self.k_neighbors, additional_neighbor=1
        )
        self.nns_pos_.set_params(n_jobs=self.n_jobs)

        # Create nearest neighbors object for negative class
        if self.selection_strategy in ('majority', 'combined'):
            self.nn_neg_ = check_neighbors_object('nn_negative', nn_object=1)
            self.nn_neg_.set_params(n_jobs=self.n_jobs)

    def _make_geometric_samples(self, X, y, pos_class_label, n_samples):
        """Return artificial samples inside the geometric regions defined
        by nearest neighbors.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        y : array-like, shape (n_samples, )
            Corresponding label for each sample in X.
        pos_class_label : str or int
            The minority class (positive class) target value.
        n_samples : int
            The number of samples to generate.

        Returns
        -------
        X_new : ndarray, shape (n_samples_new, n_features)
            Synthetically generated samples.
        y_new : ndarray, shape (n_samples_new, )
            Target values for synthetic samples.
        """

        # Return zero new samples
        if n_samples == 0:
            return (
                np.array([], dtype=X.dtype).reshape(0, X.shape[1]),
                np.array([], dtype=y.dtype),
            )

        # Select positive class samples
        X_pos = X[y == pos_class_label]

        # Force minority strategy if no negative class samples are present
        self.selection_strategy_ = (
            'minority' if len(X) == len(X_pos) else self.selection_strategy
        )
        strategy = self.selection_strategy_

        # Minority or combined strategy: pick random (sample, neighbor)
        # pairs among the positive class neighbors
        if strategy in ('minority', 'combined'):
            self.nns_pos_.fit(X_pos)
            # Drop the first neighbor column, which is the sample itself
            points_pos = self.nns_pos_.kneighbors(X_pos)[1][:, 1:]
            samples_indices = self.random_state_.randint(
                low=0, high=points_pos.size, size=n_samples
            )
            rows = np.floor_divide(samples_indices, points_pos.shape[1])
            cols = np.mod(samples_indices, points_pos.shape[1])

        # Majority or combined strategy: nearest negative neighbor of each
        # positive sample
        if strategy in ('majority', 'combined'):
            X_neg = X[y != pos_class_label]
            self.nn_neg_.fit(X_neg)
            points_neg = self.nn_neg_.kneighbors(X_pos)[1]
            if strategy == 'majority':
                samples_indices = self.random_state_.randint(
                    low=0, high=points_neg.size, size=n_samples
                )
                rows = np.floor_divide(samples_indices, points_neg.shape[1])
                cols = np.mod(samples_indices, points_neg.shape[1])

        # Generate new samples
        X_new = np.zeros((n_samples, X.shape[1]))
        for ind, (row, col) in enumerate(zip(rows, cols)):

            # Define center point
            center = X_pos[row]

            # Minority strategy
            if strategy == 'minority':
                surface_point = X_pos[points_pos[row, col]]

            # Majority strategy
            elif strategy == 'majority':
                surface_point = X_neg[points_neg[row, col]]

            # Combined strategy: keep whichever of the selected positive
            # neighbor and the nearest negative neighbor is closer
            else:
                surface_point_pos = X_pos[points_pos[row, col]]
                surface_point_neg = X_neg[points_neg[row, 0]]
                radius_pos = norm(center - surface_point_pos)
                radius_neg = norm(center - surface_point_neg)
                surface_point = (
                    surface_point_neg if radius_pos > radius_neg else surface_point_pos
                )

            # Append new sample
            X_new[ind] = _make_geometric_sample(
                center,
                surface_point,
                self.truncation_factor,
                self.deformation_factor,
                self.random_state_,
            )

        # Create new samples for target variable
        y_new = np.full(n_samples, pos_class_label)

        return X_new, y_new

    def _fit_resample(self, X, y):
        """Resample the dataset by appending Geometric SMOTE samples.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Matrix containing the data which have to be sampled.
        y : ndarray, shape (n_samples, )
            Corresponding label for each sample in X.

        Returns
        -------
        X_resampled : ndarray, shape (n_samples_new, n_features)
            The original samples followed by the synthetic ones.
        y_resampled : ndarray, shape (n_samples_new, )
            The corresponding labels.
        """

        # Validate estimator's parameters
        self._validate_estimator()

        # Collect the original data and every batch of new samples, then
        # stack once at the end instead of re-copying the arrays per class
        X_parts, y_parts = [X], [y]

        # Resample data
        for class_label, n_samples in self.sampling_strategy_.items():

            # Apply gsmote mechanism
            X_new, y_new = self._make_geometric_samples(X, y, class_label, n_samples)

            # Append new data
            X_parts.append(X_new)
            y_parts.append(y_new)

        return np.vstack(X_parts), np.hstack(y_parts)

0 comments on commit 7e34701

Please sign in to comment.