# Synthetic Minority Oversampling Technique (SMOTE) for Imbalanced classification
1. [Blog: SMOTE for Imbalanced Classification with Python](https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/)

## 预处理
### 装载数据

In [None]:
from others import load_all_dataset, rename_dataset
X_train, y_train, X_test, y_test = load_all_dataset(show=False)
import numpy as np
np.set_printoptions(edgeitems=5,
                    linewidth=1000,
                    formatter={"float":lambda x: "{:.3f}".format(x)})

In [None]:
# 去除NaN
from numpy import newaxis
class FeatureExtractor:
    def transform(self, X):
        '''
        Parameters
        ----------
        `X`: ndarray of (sample, 672, 10)
            3D input dataset(sample, time, features)
        
        Returns
        -------
        `X`: ndarray of (sample, 6720)
            The filtered dataset
        '''
        np.nan_to_num(X, copy=False)
        X = X.reshape(X.shape[0], -1)
        return X

fe = FeatureExtractor()
[X_source, X_source_bkg, X_target, X_target_unlabeled, X_target_bkg,
    y_source, y_target, X_test] = rename_dataset(
    fe, X_train, y_train, X_test, y_test, show_imbalance=True)

### 整理数据（Normalization, Oversampling, ...)

In [None]:
import imblearn as il
from collections import Counter
over = il.over_sampling.RandomOverSampler(sampling_strategy=self.sampling_strategy,) # 0.5 is the best for random state 44, 0.3 is generally the best for all
# over = il.over_sampling.ADASYN(sampling_strategy=self.sampling_strategy) # 0.2 is the best
# over = il.over_sampling.BorderlineSMOTE(sampling_strategy=self.sampling_strategy) # 0.2 is the best
# over = il.over_sampling.KMeansSMOTE(sampling_strategy=self.sampling_strategy) # error
# over = il.over_sampling.SMOTE(sampling_strategy=self.sampling_strategy) # 0.2
# over = il.over_sampling.SMOTENC((0,0,0,0,0,0,0,0,0,0), sampling_strategy=self.sampling_strategy) #
# over = il.over_sampling.SVMSMOTE() # abandon, long training time
X_source, y_source = over.fit_resample(X_source, y_source)

# under = il.under_sampling.RandomUnderSampler(sampling_strategy=1.0)
# X_source, y_source = under.fit_resample(X_source, y_source)

# over = il.over_sampling.SVMSMOTE() # abandon, long training time
over = il.over_sampling.BorderlineSMOTE()
X_source, y_source = over.fit_resample(X_source, y_source)

print(X_source.shape, y_source.shape)
print(Counter(y_source))

## 搭建模型

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
model_DT = DecisionTreeClassifier(max_depth=2, random_state=44,)
model_RF = RandomForestClassifier(
    n_estimators=2, max_depth=2, random_state=44, n_jobs=-1)

# define pipeline
# over = il.over_sampling.SMOTE(sampling_strategy=0.1)
# under = il.under_sampling.RandomUnderSampler(sampling_strategy=0.5)
# steps = [('over', over), ('under', under), ('model', model_DT)]
# pipeline_DT = il.pipeline.Pipeline(steps=steps)

# steps = [('over', over), ('under', under), ('model', model_RF)]
# pipeline_RF = il.pipeline.Pipeline(steps=steps)

In [None]:
model_DT.fit(X_source, y_source)
model_RF.fit(X_source, y_source)

## 预测概率

In [None]:
print("X_test.target.shape:", X_test.target.shape)
y_pred = model_DT.predict(X_test.target)
print("[Target] Decision Tree:", model_DT.score(X_target, y_target))
print("[Target] Decision Tree:", model_DT.score(X_test.target, y_test.target))
print("Predicted:", Counter(y_pred), y_pred.shape)
print("True:      ", Counter(y_test.target), y_test.target.shape)

print("Random Forest:", model_RF.score(X_target, y_target))
print("Random Forest:", model_RF.score(X_test.target, y_test.target))

# 调试超参
直接使用华为提供的比赛工具包来评价模型

In [1]:
import rampwf as rw
import numpy as np
from others import cd

hp_range = [i/10 for i in range(2, 10, 1)]
# hp_range = ['not minority', 'not majority', 'all', 'majority',]
ap_bagged_test = []
for ss1 in hp_range:
    for ss2 in hp_range:
        # 载入数据
        with cd("~/Codes/HuaweiRAMP"):
            problem = rw.utils.assert_read_problem()
            X_train, y_train = problem.get_train_data(show=False)
            X_test, y_test = problem.get_test_data(show=False)
        # 导入评价函数
        ap    = problem.score_types[0]
        # 设置crossvalidation
        splits = problem.get_cv(X_train, y_train, n_splits=10) # 默认10
        # 开始实验
        ap_train, ap_valid, ap_test, = [], [], []
        y_test_preds = []
        for fold_i, (train_is, valid_is) in enumerate(splits):
            with cd("~/Codes/HuaweiRAMP"):
                X_test, y_test = problem.get_test_data(show=False)
            trained_workflow = problem.workflow.train_submission(
                '.', X_train, y_train, train_is,
                sampling_strategy_over=ss1,
                sampling_strategy_under=ss2,
                )
            X_fold_train = X_train.slice(train_is)
            X_fold_valid = X_train.slice(valid_is)
            
            y_train_pred = problem.workflow.test_submission(trained_workflow, X_fold_train)
            y_valid_pred = problem.workflow.test_submission(trained_workflow, X_fold_valid)
            y_test_pred = problem.workflow.test_submission(trained_workflow, X_test)
            ap_train.append( ap(y_train.slice(train_is).target, y_train_pred[:,1]) )
            ap_valid.append( ap(y_train.slice(valid_is).target, y_valid_pred[:,1]) )
            ap_test.append( ap(y_test.target, y_test_pred[:,1]) )
            # print('-------------------------------------')
            # print('training ap on fold {} = {:.3f}'.format(fold_i, ap_train[-1]))
            # print('validation ap on fold {} = {:.3f}'.format(fold_i, ap_valid[-1]))
            # print('test ap on fold {} = {:.3f}'.format(fold_i, ap_test[-1]))
            
            y_test_preds.append(y_test_pred)

        # 计算排名指标: bagged average precision on test dataset
        score = ap(y_test.target, np.array([y_test_pred for y_test_pred in y_test_preds]).mean(axis=0)[:,1])
        ap_bagged_test.append({"over":ss1, "under":ss2, "score":score,
            "ap_train":ap_train, "ap_valid":ap_valid, "ap_test":ap_test})
        print('over {}, under {}: Bagged ap score = {}'.format(ss1, ss2, score))
        del problem, X_train, y_train, X_test, y_test, ap, splits, y_test_preds

Train data
Test data
over 0.2, under 0.2: Bagged ap score = 0.18577221956242598
Train data
Test data
over 0.2, under 0.3: Bagged ap score = 0.18272019978403817
Train data
Test data
over 0.2, under 0.4: Bagged ap score = 0.18195265719671538
Train data
Test data
over 0.2, under 0.5: Bagged ap score = 0.17747303981613757
Train data
Test data
over 0.2, under 0.6: Bagged ap score = 0.17256657035777528
Train data
Test data
over 0.2, under 0.7: Bagged ap score = 0.17258657703842828
Train data
Test data
over 0.2, under 0.8: Bagged ap score = 0.17475288065806577
Train data
Test data
over 0.2, under 0.9: Bagged ap score = 0.16778444152133087
Train data
Test data


ValueError: The specified ratio required to generate new sample in the majority class while trying to remove samples. Please increase the ratio.

## 画图

In [None]:
import json
with open("RandomOverUnder.json", "w") as f:
    json.dump(ap_bagged_test, f)

import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots()
fig.add_trace(go.Scatter(x=hp_range, y=ap_bagged_test, mode="lines",
                         name="sampling_strategy", showlegend=True))
fig.update_layout(
    title="RandomOverSampler+RandomUnderSampler: s",
    xaxis_title="Hyperparameter",
    yaxis_title="Bagged ap",
    )
fig.show()