
#### 环境配置:

* 依赖包
    - numpy
    - scipy
    - scikit-learn
    - keras
    - tensorflow

* 安装包

```shell
pip install -U imbalanced-learn
# or 
conda install -c conda-forge imbalanced-learn
# or 
git clone https://github.com/scikit-learn-contrib/imbalanced-learn.git
cd imbalanced-learn
pip install .
```




# 1. Over-Sampling(过采样)

## 1.1 Naive random over-sampling with replacement

In [15]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Original data: an imbalanced three-class toy problem whose class weights
# are 1%, 5% and 94%.
dataset_options = dict(
    n_samples=5000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=3,
    n_clusters_per_class=1,
    weights=[0.01, 0.05, 0.94],
    class_sep=0.8,
    random_state=0,
)
X, y = make_classification(**dataset_options)

# Show the array shapes and the number of samples in each class.
print(X.shape, y.shape)
original_counts = Counter(y)
print(sorted(original_counts.items()))

(5000, 2) (5000,)
[(0, 64), (1, 262), (2, 4674)]


In [16]:
# Resample with RandomOverSampler (fixed seed) and show the new shapes
# together with the per-class sample counts.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
print(X_resampled.shape, y_resampled.shape)
resampled_counts = Counter(y_resampled)
print(sorted(resampled_counts.items()))

(14022, 2) (14022,)
[(0, 4674), (1, 4674), (2, 4674)]


### 异构数据过采样

In [3]:
# Original data: a small heterogeneous table whose columns hold a string,
# an integer and a float, with two samples of class 0 and one of class 1.
# The builtin `object` is used as the dtype because the `np.object` alias
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
X_hetero = np.array([
    ['xxx', 1, 1.0],
    ['yyy', 2, 2.0],
    ['zzz', 3, 3.0]
], dtype = object)
y_hetero = np.array([0, 0, 1])
print(X_hetero)
print(y_hetero)

# Over-sample the heterogeneous array with the existing `ros` sampler, then
# print a separator line followed by the resampled features and labels.
hetero_resampled = ros.fit_resample(X_hetero, y_hetero)
X_resampled, y_resampled = hetero_resampled
print('=' * 50)
print(X_resampled)
print(y_resampled)

[['xxx' 1 1.0]
 ['yyy' 2 2.0]
 ['zzz' 3 3.0]]
[0 0 1]
[['xxx' 1 1.0]
 ['yyy' 2 2.0]
 ['zzz' 3 3.0]
 ['zzz' 3 3.0]]
[0 0 1 1]


## 1.2 SMOTE and ADASYN

* SMOTE(Synthetic Minority Oversampling Technique)
* ADASYN(Adaptive Synthetic Sampling)

In [43]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from collections import Counter

# Original data for the SMOTE / ADASYN examples: three classes weighted
# 1%, 5% and 94%, two informative features, fixed seed.
smote_dataset_options = {
    'n_samples': 5000,
    'n_features': 2,
    'n_informative': 2,
    'n_redundant': 0,
    'n_repeated': 0,
    'n_classes': 3,
    'n_clusters_per_class': 1,
    'weights': [0.01, 0.05, 0.94],
    'class_sep': 0.8,
    'random_state': 0,
}
X, y = make_classification(**smote_dataset_options)

# Array shapes first, then the number of samples in each class.
print(X.shape, y.shape)
print(sorted(Counter(y).items()))

(5000, 2) (5000,)
[(0, 64), (1, 262), (2, 4674)]


### 1.2.1 SMOTE

#### 1.2.1.1 SMOTE

In [44]:
# smote resampled data
from imblearn.over_sampling import SMOTE, ADASYN

# SMOTE with its default settings; report shapes and per-class counts.
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)
print(X_resampled.shape, y_resampled.shape)
smote_counts = Counter(y_resampled)
print(sorted(smote_counts.items()))

(14022, 2) (14022,)
[(0, 4674), (1, 4674), (2, 4674)]


#### 1.2.1.2 BorderlineSMOTE

In [42]:
from imblearn.over_sampling import BorderlineSMOTE

# BorderlineSMOTE with its default settings; only the class counts are shown.
borderline_smote = BorderlineSMOTE()
X_resampled, y_resampled = borderline_smote.fit_resample(X, y)
borderline_counts = Counter(y_resampled)
print(sorted(borderline_counts.items()))

[(0, 4674), (1, 4674), (2, 4674)]


#### 1.2.1.3 SMOTENC

In [29]:
from imblearn.over_sampling import SMOTENC

# Build a 50-row object array with mixed column types: column 0 holds a
# letter drawn from A/B/C, column 1 a standard-normal draw and column 2 an
# integer in {0, 1, 2}. The order of the random draws is kept as-is so the
# seeded generator yields the same values.
rng = np.random.RandomState(42)
n_samples = 50
X = np.empty((n_samples, 3), dtype=object)
letters = rng.choice(['A', 'B', 'C'], size=n_samples)
X[:, 0] = letters.astype(object)
X[:, 1] = rng.randn(n_samples)
X[:, 2] = rng.randint(3, size=n_samples)

# The first 20 rows belong to class 0 and the remaining 30 to class 1.
y = np.array([0] * 20 + [1] * 30)
print(X, y)

# Columns 0 and 2 are passed to SMOTENC as the categorical features;
# the sampler is seeded so the run is repeatable.
smote_nc = SMOTENC(categorical_features=[0, 2], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)
print(X_resampled, y_resampled)
smote_nc_counts = Counter(y_resampled)
print(sorted(smote_nc_counts.items()))

[['C' -0.14021849735700803 2]
 ['A' -0.033193400066544886 2]
 ['C' -0.7490765234433554 1]
 ['C' -0.7783820070908942 2]
 ['A' 0.948842857719016 2]
 ['A' 1.580850586039026 0]
 ['C' -0.3681709413435413 2]
 ['B' 0.375564633840694 2]
 ['C' -1.1931582294890923 1]
 ['C' -0.4090519017239313 1]
 ['C' -0.44674147097314604 0]
 ['C' 1.5242416328552062 2]
 ['A' 0.3229997972782976 2]
 ['C' -1.3934169408727037 2]
 ['B' -2.1783342468237086 0]
 ['A' -1.043896408234835 0]
 ['B' 0.17269370652551178 1]
 ['B' 0.32419876755273175 0]
 ['B' 0.7458595418254313 2]
 ['B' -1.8365832393175525 2]
 ['A' 0.5644642385551357 0]
 ['A' 0.025500668081348373 2]
 ['B' 0.4731932494600233 2]
 ['B' 0.6591905984289843 0]
 ['A' 2.3407463341030463 0]
 ['A' 1.070985187017369 2]
 ['A' 0.09641647903025843 2]
 ['C' 0.41910210559764305 2]
 ['C' -0.9530277892165022 1]
 ['C' -1.0478706025818139 1]
 ['B' -1.875676772206517 1]
 ['C' -1.3667821388862977 0]
 ['B' 0.6363051083451703 1]
 ['B' -0.9067206685799298 0]
 ['C' 0.4760425874269718 0]

#### 1.2.1.4 KMeansSMOTE

In [30]:
from imblearn.over_sampling import KMeansSMOTE

### 1.2.2 ADASYN

In [45]:
# ADASYN resampled data
from imblearn.over_sampling import SMOTE, ADASYN

# ADASYN with its default settings; report shapes and per-class counts.
adasyn = ADASYN()
X_resampled, y_resampled = adasyn.fit_resample(X, y)
print(X_resampled.shape, y_resampled.shape)
adasyn_counts = Counter(y_resampled)
print(sorted(adasyn_counts.items()))

(14009, 2) (14009,)
[(0, 4673), (1, 4662), (2, 4674)]


# 2. Under-Sampling(欠采样)

In [46]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.datasets import make_classification

# Original data for the under-sampling examples: three classes weighted
# 1%, 5% and 94%, generated with a fixed seed.
undersampling_dataset_options = dict(
    n_samples=5000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=3,
    n_clusters_per_class=1,
    weights=[0.01, 0.05, 0.94],
    class_sep=0.8,
    random_state=0,
)
X, y = make_classification(**undersampling_dataset_options)

# Number of samples in each class before any resampling.
class_counts = Counter(y)
print(sorted(class_counts.items()))

[(0, 64), (1, 262), (2, 4674)]


### 2.1 Prototype generation

In [40]:
from imblearn.under_sampling import ClusterCentroids

# Under-sample with ClusterCentroids (fixed seed) and show the class counts.
cc = ClusterCentroids(random_state=0)
X_resampled, y_resampled = cc.fit_resample(X, y)

centroid_counts = Counter(y_resampled)
print(sorted(centroid_counts.items()))

[(0, 64), (1, 64), (2, 64)]


### 2.2 Prototype selection

#### 2.2.1 Controlled under-sampling

In [59]:
from imblearn.under_sampling import RandomUnderSampler

# Under-sample with RandomUnderSampler (fixed seed) and show the class counts.
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(X, y)

undersampled_counts = Counter(y_resampled)
print(sorted(undersampled_counts.items()))

[(0, 64), (1, 64), (2, 64)]


In [62]:
from imblearn.under_sampling import RandomUnderSampler

# Shape of X_resampled as left by the cell that ran before this one.
stacked_rows = np.vstack([tuple(row) for row in X_resampled])
print(stacked_rows.shape)

# Under-sample again, this time with replacement, and show the shape of the
# array of distinct rows.
rus = RandomUnderSampler(random_state=0, replacement=True)
X_resampled, y_resampled = rus.fit_resample(X, y)
distinct_rows = np.unique([tuple(row) for row in X_resampled], axis=0)
print(np.vstack(distinct_rows).shape)

(192, 2)
(181, 2)


In [65]:
# A small heterogeneous table whose columns hold a string, an integer and a
# float, with two samples of class 0 and one of class 1.
# The builtin `object` is used as the dtype because the `np.object` alias
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
X_hetero = np.array([
    ['xxx', 1, 1.0],
    ['yyy', 2, 2.0],
    ['zzz', 3, 3.0]
], dtype = object)
y_hetero = np.array([0, 0, 1])
print(X_hetero)
print(y_hetero)

# Under-sample the heterogeneous array with the existing `rus` sampler, then
# print a separator line followed by the resampled features and labels.
hetero_undersampled = rus.fit_resample(X_hetero, y_hetero)
X_resampled, y_resampled = hetero_undersampled
print('=' * 30)
print(X_resampled)
print(y_resampled)

[['xxx' 1 1.0]
 ['yyy' 2 2.0]
 ['zzz' 3 3.0]]
[0 0 1]
[['xxx' 1 1.0]
 ['zzz' 3 3.0]]
[0 1]


#### 2.2.2 Cleaning under-sampling