In [117]:
import random
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

def SMOTE(X_min, y_min, xx=1.0, n_sampling=0, noise_weight=0.1,
          n_neighbors=5, random_state=None):
    """
    Oversample a dataset with SMOTE-style nearest-neighbour interpolation.

    Every synthetic row is produced by picking a random row of ``X_min``,
    picking one of its nearest neighbours, and moving from the row *towards*
    that neighbour by a random fraction in ``[0, noise_weight)`` of the gap
    between them. The synthetic label is the majority vote of the 0/1 labels
    found in the picked row's neighbourhood (the row itself included).

    :param X_min: pandas.DataFrame, numeric features to oversample
    :param y_min: pandas.Series, 0/1 labels aligned row-by-row with X_min
    :param xx: float >= 1 or None, size of the result as a multiple of
        len(X_min); a value greater than 1 overrides n_sampling, while 1.0
        (the default) or None means "use n_sampling as given"
    :param n_sampling: int >= 0, number of synthetic rows to generate
    :param noise_weight: float in 0-1, upper bound of the interpolation fraction
    :param n_neighbors: int >= 2, neighbourhood size (the row itself counts);
        capped at len(X_min)
    :param random_state: optional seed for a private random.Random; when None
        the module-level ``random`` generator is used, so ``random.seed`` applies
    :return:
    X_res:  pandas.DataFrame, the rows of X_min followed by the synthetic rows
    y_res:  pandas.Series, the labels of y_min followed by the synthetic labels
    Both are re-indexed 0..n-1 and cast back to the dtypes of the inputs.
    :raises ValueError: if xx < 1, n_sampling < 0, the inputs differ in length,
        or synthetic rows are requested with fewer than 2 usable neighbours
    """
    n_rows = len(X_min)
    if len(y_min) != n_rows:
        raise ValueError("X_min and y_min must have the same number of rows")

    # xx == 1.0 is the signature default and means "no multiplier": fall back
    # to n_sampling instead of rejecting the default call.
    if xx is not None and xx != 1.0:
        if xx < 1.0:
            raise ValueError("xx must be >= 1.0 (or None to use n_sampling)")
        # The tiny epsilon keeps float representation error in (xx - 1.0)
        # from truncating the product to one row fewer than intended.
        n_sampling = int(n_rows * (xx - 1.0) + 1e-9)
    if n_sampling < 0:
        raise ValueError("n_sampling must be >= 0")

    # Nothing to synthesise: return re-indexed copies without fitting anything.
    if n_sampling == 0:
        return X_min.reset_index(drop=True), y_min.reset_index(drop=True)

    k = min(n_neighbors, n_rows)
    if k < 2:
        raise ValueError("need at least 2 rows and n_neighbors >= 2 to interpolate")

    rng = random if random_state is None else random.Random(random_state)

    # Work on a plain array: kneighbors() returns *positional* row numbers,
    # which must not be mixed up with the labels of X_min.index / y_min.index.
    features = X_min.to_numpy(dtype=float)
    neighbour_idx = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='kd_tree')\
        .fit(features).kneighbors(features)[1]

    X_res = np.zeros((n_sampling, features.shape[1]))
    y_res = np.zeros(n_sampling)
    for i in range(n_sampling):
        # Pick the row to synthesise from, together with its neighbourhood.
        reference = rng.randint(0, n_rows - 1)
        neighbourhood = neighbour_idx[reference]

        # Majority vote over the neighbourhood: positive only when more than
        # half of the k labels are 1.
        votes = y_min.iloc[neighbourhood].sum(skipna=True)
        y_res[i] = 1 if 2 * votes > k else 0

        # Exclude the reference row itself wherever it appears in the list
        # (with duplicated points it is not guaranteed to be in column 0).
        candidates = neighbourhood[neighbourhood != reference]
        neighbour = candidates[rng.randrange(len(candidates))]

        # Move from the reference towards the neighbour, so the new point
        # lies on the segment joining the two.
        fraction = rng.random() * noise_weight
        X_res[i] = features[reference] + fraction * (features[neighbour] - features[reference])

    X_res = pd.DataFrame(X_res, columns=X_min.columns)
    y_res = pd.Series(y_res, name=y_min.name, dtype=int)
    X_con = pd.concat([X_min, X_res], axis=0, ignore_index=True)
    y_con = pd.concat([y_min, y_res], axis=0, ignore_index=True)
    return X_con.astype(X_min.dtypes), y_con.astype(y_min.dtypes)


In [131]:
# Seed a dedicated generator so the toy dataset is identical on every
# Restart & Run All instead of changing with each execution.
SEED = 42
rng = np.random.default_rng(SEED)

# 100 samples x 4 features drawn uniformly from [0, 1), plus a random 0/1 label per sample.
X = pd.DataFrame(rng.random(size=(100, 4)), columns=['f0', 'f1', 'f2', 'f3'])
y = pd.Series(rng.choice([0, 1], size=100), name='class')
X.shape, y.shape

((100, 4), (100,))

In [132]:
# SMOTE draws its random picks from the stdlib `random` module, so seed it
# here to make the synthetic rows reproducible when the cell is re-run.
random.seed(42)
# xx=1.6 grows the 100-row dataset to 160 rows (60 synthetic rows appended).
X_new, y_new = SMOTE(X_min=X, y_min=y, xx=1.6)

In [133]:
# Peek at the last five feature rows; these are synthetic rows appended by SMOTE.
X_new.tail(n=5)

Unnamed: 0,f0,f1,f2,f3
155,0.226072,0.2727,0.318518,0.451901
156,0.406734,0.666264,0.170311,0.97815
157,0.076741,0.11683,0.989552,0.479542
158,0.253476,0.706161,0.738644,0.396206
159,0.296319,0.921743,0.47461,0.57247


In [134]:
# Labels for the same last five rows, assigned by the neighbourhood majority vote.
y_new.tail(n=5)

155    0
156    1
157    0
158    1
159    0
Name: class, dtype: int64