# 使用数据集的统计特征（statistics of dataset）作为输入
## 预处理
### 装载数据

In [2]:
# np.seterr(all='warn')
import warnings
from scipy import stats

class FeatureExtractor:
    '''
    Summarise every sample's time series by per-feature statistics.
    '''

    # Percentiles reported as the first three statistic rows.
    _QUARTILES = [25, 50, 75]

    def transform(self, X):
        '''
        Replace the time axis of every sample by 11 summary statistics.

        Parameters
        ----------
        `X`: ndarray of (sample, time, features)
            3D input dataset; entries may be NaN/inf. The array passed in is
            left untouched because the float64 conversion works on a copy.

        Returns
        -------
        `X`: ndarray of (sample, 11, features)
            Statistics computed along the time axis, in this row order:
            25th percentile, median, 75th percentile, mean, standard
            deviation, maximum, minimum, mode, kurtosis (Pearson definition),
            skewness, fraction of finite values.
            The result is NOT flattened; callers reshape it themselves.
        '''
        # float64 avoids the overflow RuntimeWarning seen in the kurtosis
        # computation; `astype` also copies, so in-place edits below are safe.
        X = X.astype(np.float64)
        return np.array([self._sample_statistics(x) for x in X])

    @classmethod
    def _sample_statistics(cls, x):
        '''Return the (11, features) statistics table of one (time, features) sample.'''
        # Columns without a single finite value are filled with 0.
        x[:, ~np.isfinite(x).any(axis=0)] = 0

        # Recomputed after the fill: zero-filled columns count as fully valid.
        finite = np.isfinite(x)
        quartiles = cls._quartiles(x, finite)

        # Older SciPy returns the mode with a leading length-1 axis, newer
        # SciPy returns it already squeezed; flattening handles both layouts
        # and always yields one value per feature.
        mode = np.asarray(
            stats.mode(x, axis=0, nan_policy="omit")[0]).reshape(-1)

        # NOTE(review): the nan* reductions skip NaN only, so a column that
        # mixes finite values with +/-inf still yields inf here, while the
        # quartiles above ignore inf as well -- confirm this is intended.
        rows = [
            quartiles[0],                                   # 25th percentile
            quartiles[1],                                   # median
            quartiles[2],                                   # 75th percentile
            np.nanmean(x, axis=0),                          # mean
            np.nanstd(x, axis=0),                           # standard deviation
            np.nanmax(x, axis=0),                           # maximum
            np.nanmin(x, axis=0),                           # minimum
            mode,                                           # mode
            stats.kurtosis(x, axis=0, nan_policy="omit", fisher=False),  # kurtosis
            stats.skew(x, axis=0, nan_policy="omit"),       # skewness
            finite.sum(axis=0) / x.shape[0],                # share of finite values
        ]
        return np.array(rows)

    @classmethod
    def _quartiles(cls, x, finite):
        '''Return the (3, features) quartiles of `x`, ignoring non-finite entries.'''
        if finite.all():
            return np.percentile(x, cls._QUARTILES, axis=0)
        # `np.nanpercentile` was found to be very slow, so non-finite entries
        # are dropped column by column before calling `np.percentile`.
        per_column = [
            np.percentile(column[keep], cls._QUARTILES)
            for column, keep in zip(x.T, finite.T)
        ]
        return np.array(per_column).T


==== TRAIN SET ====
  | X_source: (46110, 11, 10) ; y_source: (46110,)
  | 	imbalance: 4769(10.3%) failure, 41341(89.7%) weak
A | X_source_bkg: (50862, 11, 10)
----
  | X_target: (438, 11, 10) ; y_target: (438,)
  | 	imbalance: 89(20.3%) failure, 349(79.7%) weak
B | X_target_bkg: (29592, 11, 10)
  | X_target_unlabeled: (8202, 11, 10)
==== TEST SET ====
  | X_test.target: (17758, 11, 10) ; y_test.target: (17758,)
  | 	imbalance: 2294(12.9%) failure, 15464(87.1%) weak
B | X_test.target_bkg: (47275, 11, 10)
  | X_test.target_unlabeled: None



1. 读取原始数据

In [2]:
# Read the raw data sets, then let the feature extractor summarise each split.
from others import load_all_dataset, rename_dataset
import numpy as np

X_train, y_train, X_test, y_test = load_all_dataset(show=False)

# Print floats with three decimals on wide, lightly truncated lines.
np.set_printoptions(
    edgeitems=5,
    linewidth=1000,
    formatter={"float": "{:.3f}".format},
)

fe = FeatureExtractor()
# Development check that was used here:
#   X_source = fe.transform(X_train.source)
#   np.all(np.isfinite(X_source))  # expected True
(
    X_source,
    X_source_bkg,
    X_target,
    X_target_unlabeled,
    X_target_bkg,
    y_source,
    y_target,
    X_test,
) = rename_dataset(fe, X_train, y_train, X_test, y_test, show_imbalance=True)

Train data
Test data


NameError: name 'FeatureExtractor' is not defined

2. 读取统计数据

In [23]:
# 把统计数值保存到硬盘，已检查所有array里面的数值都是finite
# np.save("./data_stats/X_source", X_source)
# np.save("./data_stats/X_source_bkg", X_source_bkg)
# np.save("./data_stats/X_target", X_target)
# np.save("./data_stats/X_target_unlabeled", X_target_unlabeled)
# np.save("./data_stats/X_target_bkg", X_target_bkg)
# np.save("./data_stats/y_source", y_source)
# np.save("./data_stats/y_target", y_target)

True
True
True
True
True
True
True


In [19]:
# Restore the statistics arrays that were cached on disk.
(
    X_source,
    X_source_bkg,
    X_target,
    X_target_unlabeled,
    X_target_bkg,
    y_source,
    y_target,
) = map(np.load, (
    "./data_stats/X_source.npy",
    "./data_stats/X_source_bkg.npy",
    "./data_stats/X_target.npy",
    "./data_stats/X_target_unlabeled.npy",
    "./data_stats/X_target_bkg.npy",
    "./data_stats/y_source.npy",
    "./data_stats/y_target.npy",
))

# The test splits are not cached, so their statistics are computed here.
fe = FeatureExtractor()
for _split in ("target", "target_bkg"):
    setattr(X_test, _split, fe.transform(getattr(X_test, _split)))

### 整理数据（Normalization, Oversampling, ...)

In [27]:
def _flatten(arr):
    """Keep the first axis and merge all remaining axes into one."""
    return arr.reshape(arr.shape[0], -1)


# Every split becomes a 2D (sample, values) matrix.
X_source, X_source_bkg, X_target, X_target_unlabeled, X_target_bkg = map(
    _flatten,
    (X_source, X_source_bkg, X_target, X_target_unlabeled, X_target_bkg),
)
X_test.target = _flatten(X_test.target)
X_test.target_bkg = _flatten(X_test.target_bkg)

## 搭建模型

In [24]:
from sklearn.ensemble import RandomForestClassifier

# A deliberately tiny forest: two shallow trees, fixed seed, all CPU cores.
rf_params = dict(n_estimators=2, max_depth=2, random_state=44, n_jobs=-1)
model_RF = RandomForestClassifier(**rf_params)

In [28]:
# Fit the forest on the source-domain statistics.
# model_DT.fit(X_source, y_source)
model_RF.fit(X=X_source, y=y_source)

RandomForestClassifier(max_depth=2, n_estimators=2, n_jobs=-1, random_state=44)

## 使用模型

In [29]:
from collections import Counter
# Report the model's score on the labelled target split and on the test
# split, then compare predicted and true class counts on the test split.
print("X_test.target.shape:", X_test.target.shape)

for _label, (_X_eval, _y_eval) in (
    ("[train: B] Random Forest:", (X_target, y_target)),
    ("[test:  B] Random Forest:", (X_test.target, y_test.target)),
):
    print(_label, model_RF.score(_X_eval, _y_eval))

y_pred = model_RF.predict(X_test.target)
for _label, _classes in (("Predicted:", y_pred), ("True:      ", y_test.target)):
    print(_label, Counter(_classes), _classes.shape)

# Terminate the session at this point.
exit()

X_test.target.shape: (17758, 110)
[train: B] Random Forest: 0.7922374429223744
[test:  B] Random Forest: 0.8645117693433946
Predicted: Counter({0.0: 17542, 1.0: 216}) (17758,)
True:       Counter({0.0: 15464, 1.0: 2294}) (17758,)


# 检验模型

In [None]:
import rampwf as rw
import numpy as np
from others import cd

# Load the data
with cd("~/Codes/HuaweiRAMP"):
    problem = rw.utils.assert_read_problem()
    X_train, y_train = problem.get_train_data(show=False)
    X_test, y_test = problem.get_test_data(show=False)

# Scoring function
ap    = problem.score_types[0]
# Cross-validation splits
splits = problem.get_cv(X_train, y_train, n_splits=10) # default is 10

# `ss` is forwarded to the submission as `sampling_strategy`. It used to be
# undefined in this cell, so running the cell on its own raised NameError.
# NOTE(review): 0.5 is a placeholder taken from inside the range swept by the
# tuning cell below -- confirm the value that should be validated here.
ss = 0.5
# Bagged scores are collected in a list; it was previously never initialised
# before the `append` at the end of this cell.
ap_bagged_test = []

# Run the experiment: train on each fold and score train/validation/test.
ap_train, ap_valid, ap_test, = [], [], []
y_test_preds = []
for fold_i, (train_is, valid_is) in enumerate(splits):
    trained_workflow = problem.workflow.train_submission(
        '.', X_train, y_train, train_is, sampling_strategy=ss)
    X_fold_train = X_train.slice(train_is)
    X_fold_valid = X_train.slice(valid_is)
    
    y_train_pred = problem.workflow.test_submission(trained_workflow, X_fold_train)
    y_valid_pred = problem.workflow.test_submission(trained_workflow, X_fold_valid)
    y_test_pred = problem.workflow.test_submission(trained_workflow, X_test)
    # Column 1 of each prediction array is the one handed to the scorer.
    ap_train.append( ap(y_train.slice(train_is).target, y_train_pred[:,1]) )
    ap_valid.append( ap(y_train.slice(valid_is).target, y_valid_pred[:,1]) )
    ap_test.append( ap(y_test.target, y_test_pred[:,1]) )
    print('-------------------------------------')
    print('training ap on fold {} = {:.3f}'.format(fold_i, ap_train[-1]))
    print('validation ap on fold {} = {:.3f}'.format(fold_i, ap_valid[-1]))
    print('test ap on fold {} = {:.3f}'.format(fold_i, ap_test[-1]))
    
    y_test_preds.append(y_test_pred)

# Ranking metric: bagged average precision on the test dataset, i.e. the
# score of the fold-averaged test predictions.
ap_bagged_test.append(
    ap(y_test.target, np.mean(y_test_preds, axis=0)[:,1]))
print('{}: Bagged ap score = {}'.format(ss, ap_bagged_test[-1]))

# 调试超参
直接使用华为提供的比赛工具包来评价模型

In [None]:
import rampwf as rw
import numpy as np
from others import cd

# Sweep `sampling_strategy` over `hp_range` and keep one bagged test score
# per value in `ap_bagged_test`.
hp_range = np.arange(0.2, 1.1, 0.1)
ap_bagged_test = []

for ss in hp_range:
    # Load the data again for this setting
    with cd("~/Codes/HuaweiRAMP"):
        problem = rw.utils.assert_read_problem()
        X_train, y_train = problem.get_train_data(show=False)
        X_test, y_test = problem.get_test_data(show=False)

    # Scoring function and cross-validation splits (10 is the default)
    ap = problem.score_types[0]
    splits = problem.get_cv(X_train, y_train, n_splits=10)

    # Per-fold scores and per-fold test predictions
    ap_train, ap_valid, ap_test = [], [], []
    y_test_preds = []
    for fold_i, (train_is, valid_is) in enumerate(splits):
        trained_workflow = problem.workflow.train_submission(
            '.', X_train, y_train, train_is, sampling_strategy=ss)
        X_fold_train, X_fold_valid = (
            X_train.slice(train_is), X_train.slice(valid_is))

        y_train_pred = problem.workflow.test_submission(
            trained_workflow, X_fold_train)
        y_valid_pred = problem.workflow.test_submission(
            trained_workflow, X_fold_valid)
        y_test_pred = problem.workflow.test_submission(
            trained_workflow, X_test)

        # Score column 1 of every prediction array (per-fold printing is off).
        ap_train.append(ap(y_train.slice(train_is).target, y_train_pred[:, 1]))
        ap_valid.append(ap(y_train.slice(valid_is).target, y_valid_pred[:, 1]))
        ap_test.append(ap(y_test.target, y_test_pred[:, 1]))

        y_test_preds.append(y_test_pred)

    # Ranking metric: bagged average precision on the test dataset
    bagged_pred = np.array(y_test_preds).mean(axis=0)
    ap_bagged_test.append(ap(y_test.target, bagged_pred[:, 1]))
    print('{}: Bagged ap score = {}'.format(ss, ap_bagged_test[-1]))

    # Drop this setting's objects before the next reload
    del problem, X_train, y_train, X_test, y_test, ap, splits, y_test_preds

## 画图

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Line plot of the bagged score against the swept hyper-parameter values.
fig = make_subplots()
score_curve = go.Scatter(
    x=hp_range,
    y=ap_bagged_test,
    mode="lines",
    name="sampling_strategy",
)
fig.add_trace(score_curve)
fig.update_layout(
    xaxis_title="Hyperparameter",
    yaxis_title="Bagged ap",
)
fig.show()