# 预处理探究
## 装载数据

In [1]:
from others import load_all_dataset, rename_dataset
X_train, y_train, X_test, y_test = load_all_dataset(show=False)
import numpy as np
np.set_printoptions(edgeitems=5,
                    linewidth=1000,
                    formatter={"float":lambda x: "{:.3f}".format(x)})

Train data
Test data


## NaN值处理

In [2]:
class FeatureExtractor:
    def transform(self, X):
        '''
        Parameters
        ----------
        `X`: ndarray of (sample, 672, 10)
            3D input dataset(sample, time, features)
        
        Returns
        -------
        `X`: ndarray of (sample, 6720)
            The filtered dataset
        '''
        #! ATTENTION
        # The idea is supposed to eliminate the common columns filled entirely 
        # by NaN. But in this competition, since we don't have access to
        # `OpticalDataset` object, it's impossible to communicate informations
        # between datasets. So, here it deletes columns that are found on public
        # dataset.
        X = np.delete(X, [3,], axis=2)

        new_X = []
        # Deal with NaN
        all_nan_col, part_nan = {}, {}
        for i in range(X.shape[0]):
            x = X[i]
            
            # Columns with full Nan
            col_is_nan = np.all(~np.isfinite(x), axis=0)
            if (col_is_nan == True).any():
                col = np.where(col_is_nan == True)[0]
                all_nan_col[i] = col[0]
            
            # Rows with full Nan
            row_is_nan = np.all(~np.isfinite(x), axis=1)
            if (row_is_nan == True).any():
                row = np.where(row_is_nan == True)[0]
                # x = np.delete(x, row, axis=0) # leads to diff dim
                if len(row) >= x.shape[0]/4: # drop sample
                    continue
            
            # TODO move to 2nd loop
            # Columns with partial NaN
            part_is_nan = np.any(~np.isfinite(x), axis=0)
            if (part_is_nan == True).any():
                col = np.where(part_is_nan == True)[0]
                part_nan[i] = col[0]

            
            # Construct new array
            new_X.append(x)
        
        print("\t{}({:.2f}%) samples have all NaN, "
              "{}({:.2f}%) samples have partial NaN.".format(
              len(all_nan_col), len(all_nan_col)/X.shape[0]*100,
              len(part_nan), len(part_nan)/X.shape[0]*100))
        diff = X.shape[0]-len(new_X)
        print("All NaN columns:", set(all_nan_col.values()),
              "Partial NaN columns:", set(part_nan.values()),
              "Dropped samples: {}({:.2f}%)".format(diff, diff/X.shape[0]*100,)
             )
        
        XX = np.array(new_X)
        return XX

fe = FeatureExtractor()
X_target = fe.transform(X_train.target)
np.all(np.isfinite(X_target)) # expected True
[X_source, X_source_bkg, X_target, X_target_unlabeled, X_target_bkg,
    y_source, y_target, X_test] = rename_dataset(
    fe, X_train, y_train, X_test, y_test, show_imbalance=0)

	0(0.00%) samples have all NaN, 341(77.85%) samples have partial NaN.
All NaN columns: set() Partial NaN columns: {0, 1} Dropped samples: 97(22.15%)
==== TRAIN SET ====
	0(0.00%) samples have all NaN, 18200(39.47%) samples have partial NaN.
All NaN columns: set() Partial NaN columns: {0} Dropped samples: 7783(16.88%)
  | X_source: (38327, 672, 9) ; y_source: (46110,)
	79(0.16%) samples have all NaN, 17969(35.33%) samples have partial NaN.
All NaN columns: {0} Partial NaN columns: {0} Dropped samples: 14660(28.82%)
A | X_source_bkg: (36202, 672, 9)
	0(0.00%) samples have all NaN, 341(77.85%) samples have partial NaN.
All NaN columns: set() Partial NaN columns: {0, 1} Dropped samples: 97(22.15%)
----
  | X_target: (341, 672, 9) ; y_target: (438,)
	1094(3.70%) samples have all NaN, 21663(73.21%) samples have partial NaN.
All NaN columns: {0, 1} Partial NaN columns: {0, 1} Dropped samples: 7897(26.69%)
B | X_target_bkg: (21695, 672, 9)
	11(0.13%) samples have all NaN, 6220(75.84%) samples 

In [3]:
print(X_target.shape)

(341, 672, 9)


## Outlier detection

Requirement:
- input array must be 2D, i.e. (n_samples, n_features)
- input array must not contain NaN, Inf

In [4]:
from copy import deepcopy
X = deepcopy(X_source_bkg)
np.nan_to_num(X, copy=False)
# LocalOutlierFactor: 204s X_source_bkg
lof = LocalOutlierFactor(contamination=0.01, novelty=False)
flag = lof.fit_predict(X)
print(flag.shape)
X[flag==True].shape
# IsolationForest: 296s
from sklearn.ensemble import IsolationForest
IF = IsolationForest(contamination=0.01)
flag = IF.fit_predict(X)
print(flag.shape)
X[flag==True].shape

NameError: name 'LocalOutlierFactor' is not defined

## 整理数据（Normalization, Oversampling, ...)

In [None]:
stop
X_source = X_source.reshape(X_source.shape[0], -1)
X_source_bkg = X_source_bkg.reshape(X_source_bkg.shape[0], -1)
X_target = X_target.reshape(X_target.shape[0], -1)
X_target_unlabeled = X_target_unlabeled.reshape(X_target_unlabeled.shape[0], -1)
X_target_bkg = X_target_bkg.reshape(X_target_bkg.shape[0], -1)
X_test.target = X_test.target.reshape(X_test.target.shape[0], -1)
np.all(np.isfinite(X_source))

## 搭建模型

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

model_DT = DecisionTreeClassifier(max_depth=2, random_state=44,)

model_RF = RandomForestClassifier(
    n_estimators=2, max_depth=2, random_state=44, n_jobs=-1)

In [None]:
# model_DT.fit(X_source, y_source)
model_RF.fit(X_source, y_source)

## 预测概率

In [None]:
from collections import Counter
print("X_test.target.shape:", X_test.target.shape)

print("[train: B] Random Forest:", model_RF.score(X_target, y_target))
print("[test:  B] Random Forest:", model_RF.score(X_test.target, y_test.target))
y_pred = model_RF.predict(X_test.target)
print("Predicted:", Counter(y_pred), y_pred.shape)
print("True:      ", Counter(y_test.target), y_test.target.shape)
exit()

# 调试超参
直接使用华为提供的比赛工具包来评价模型

In [None]:
import rampwf as rw
import numpy as np
from others import cd

hp_range = np.arange(0.2, 1.1, 0.1)
ap_bagged_test = []

for ss in hp_range:
    # 载入数据
    with cd("~/Codes/HuaweiRAMP"):
        problem = rw.utils.assert_read_problem()
        X_train, y_train = problem.get_train_data(show=False)
        X_test, y_test = problem.get_test_data(show=False)
    # 导入评价函数
    ap    = problem.score_types[0]
    # 设置crossvalidation
    splits = problem.get_cv(X_train, y_train, n_splits=10) # 默认10
    # 开始实验
    ap_train, ap_valid, ap_test, = [], [], []
    y_test_preds = []
    for fold_i, (train_is, valid_is) in enumerate(splits):
        trained_workflow = problem.workflow.train_submission(
            '.', X_train, y_train, train_is, sampling_strategy=ss)
        X_fold_train = X_train.slice(train_is)
        X_fold_valid = X_train.slice(valid_is)
        
        y_train_pred = problem.workflow.test_submission(trained_workflow, X_fold_train)
        y_valid_pred = problem.workflow.test_submission(trained_workflow, X_fold_valid)
        y_test_pred = problem.workflow.test_submission(trained_workflow, X_test)
        ap_train.append( ap(y_train.slice(train_is).target, y_train_pred[:,1]) )
        ap_valid.append( ap(y_train.slice(valid_is).target, y_valid_pred[:,1]) )
        ap_test.append( ap(y_test.target, y_test_pred[:,1]) )
        # print('-------------------------------------')
        # print('training ap on fold {} = {:.3f}'.format(fold_i, ap_train[-1]))
        # print('validation ap on fold {} = {:.3f}'.format(fold_i, ap_valid[-1]))
        # print('test ap on fold {} = {:.3f}'.format(fold_i, ap_test[-1]))
        
        y_test_preds.append(y_test_pred)

    # 计算排名指标: bagged average precision on test dataset
    ap_bagged_test.append(
        ap(y_test.target, np.array([y_test_pred for y_test_pred in y_test_preds]).mean(axis=0)[:,1]))
    print('{}: Bagged ap score = {}'.format(ss, ap_bagged_test[-1]))
    del problem, X_train, y_train, X_test, y_test, ap, splits, y_test_preds

## 画图

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots()
fig.add_trace(go.Scatter(x=hp_range, y=ap_bagged_test, mode="lines",
                         name="sampling_strategy"))
fig.update_layout(xaxis_title="Hyperparameter", yaxis_title="Bagged ap")
fig.show()