# 预处理探究
## 装载数据

In [1]:
from others import load_all_dataset, rename_dataset
X_train, y_train, X_test, y_test = load_all_dataset(show=False)
import numpy as np
np.set_printoptions(edgeitems=5,
                    linewidth=1000,
                    formatter={"float":lambda x: "{:.3f}".format(x)})

Train data
Test data


## NaN值处理

In [2]:
import warnings
warnings.filterwarnings("ignore")

class FeatureExtractor:
    def transform(self, X):
        '''
        Parameters
        ----------
        `X`: ndarray of (sample, 672, 10)
            3D input dataset(sample, time, features)
        
        Returns
        -------
        `X`: ndarray of (sample, 6720)
            The filtered dataset
        '''
        X = np.delete(X, [3,], axis=2)
        X = X.astype(np.float64)
        
        ## 1st round
        X1, nanmean = [], []
        for i in range(X.shape[0]):
            x = X[i]
            indice = ~np.isfinite(x)
            nanmean.append(np.nanmean(x, axis=0))

            # Columns with full Nan
            col_is_nan = np.all(indice, axis=0)
            if (col_is_nan == True).any():
                X1.append(x) # deal later
                continue
            
            # Rows with full Nan
            # Unachievable. Cause we don't have access to manipulate on labels
            # row_is_nan = np.all(indice, axis=1)
            # if (row_is_nan == True).any():
            #     row = np.where(row_is_nan == True)[0]
            #     if len(row) >= x.shape[0]/4: # drop sample, /2=85%+, /4=75%+
            #         continue
            
            # Columns with partial NaN
            part_is_nan = np.any(indice, axis=0)
            if (part_is_nan == True).any():
                col = np.where(part_is_nan == True)[0]
                # part_nan[i] = col[0]
                for c in col:
                    this = x[:,c]
                    finite = this[np.isfinite(this)]
                    fill = np.repeat(finite, np.ceil(len(this)/len(finite)))[:len(this)]
                    x[:,c] = np.where(np.isfinite(this), this, fill)
            
            # Construct new array
            X1.append(x)
        X1, nanmean = np.array(X1), np.array(nanmean)
        
        ## 2nd round
        candidate_mean = []
        for i in range(nanmean.shape[1]):
            col = nanmean[i]
            finite = col[np.isfinite(col)]
            candidate_mean.append(finite)

        X2 = []
        for i in range(X1.shape[0]):
            x = X[i]
            indice = ~np.isfinite(x)
            # Columns with full Nan
            col_is_nan = np.all(indice, axis=0)
            if (col_is_nan == True).any():
                col = np.where(col_is_nan == True)[0]
                for c in col:
                    value = np.random.choice(candidate_mean[c])
                    x = np.nan_to_num(x, nan=value)
            X2.append(x)
        
        X = np.array(X2)

        ## Final
        X = X.reshape(X.shape[0], -1) # Flatten
        print("Expected True:", np.all(np.isfinite(X))) # expected True
        return X


fe = FeatureExtractor()
# X_target = fe.transform(X_train.target)
# np.all(np.isfinite(X_target)) # expected True
[X_source, X_source_bkg, X_target, X_target_unlabeled, X_target_bkg,
    y_source, y_target, X_test] = rename_dataset(
    fe, X_train, y_train, X_test, y_test, show_imbalance=0)

==== TRAIN SET ====
Expected True: True
  | X_source: (46110, 6048) ; y_source: (46110,)
Expected True: True
A | X_source_bkg: (50862, 6048)
Expected True: True
----
  | X_target: (438, 6048) ; y_target: (438,)
Expected True: True
B | X_target_bkg: (29592, 6048)
Expected True: True
  | X_target_unlabeled: (8202, 6048)
==== TEST SET ====
Expected True: True
  | X_test.target: (17758, 6048) ; y_test.target: (17758,)
Expected True: True
B | X_test.target_bkg: (47275, 6048)
  | X_test.target_unlabeled: None


In [3]:
print(X_target.shape)

(438, 6048)


## Outlier detection

Requirement:
- input array must be 2D, i.e. (n_samples, n_features)
- input array must not contain NaN, Inf

In [4]:
from copy import deepcopy
X = deepcopy(X_source_bkg)
np.nan_to_num(X, copy=False)
# LocalOutlierFactor: 204s X_source_bkg
lof = LocalOutlierFactor(contamination=0.01, novelty=False)
flag = lof.fit_predict(X)
print(flag.shape)
X[flag==True].shape
# IsolationForest: 296s
from sklearn.ensemble import IsolationForest
IF = IsolationForest(contamination=0.01)
flag = IF.fit_predict(X)
print(flag.shape)
X[flag==True].shape

NameError: name 'LocalOutlierFactor' is not defined

## 整理数据（Normalization, Oversampling, ...)

In [None]:
stop
X_source = X_source.reshape(X_source.shape[0], -1)
X_source_bkg = X_source_bkg.reshape(X_source_bkg.shape[0], -1)
X_target = X_target.reshape(X_target.shape[0], -1)
X_target_unlabeled = X_target_unlabeled.reshape(X_target_unlabeled.shape[0], -1)
X_target_bkg = X_target_bkg.reshape(X_target_bkg.shape[0], -1)
X_test.target = X_test.target.reshape(X_test.target.shape[0], -1)
np.all(np.isfinite(X_source))

## 搭建模型

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

model_DT = DecisionTreeClassifier(max_depth=2, random_state=44,)

model_RF = RandomForestClassifier(
    n_estimators=2, max_depth=2, random_state=44, n_jobs=-1)

In [None]:
# model_DT.fit(X_source, y_source)
model_RF.fit(X_source, y_source)

## 预测概率

In [None]:
from collections import Counter
print("X_test.target.shape:", X_test.target.shape)

print("[train: B] Random Forest:", model_RF.score(X_target, y_target))
print("[test:  B] Random Forest:", model_RF.score(X_test.target, y_test.target))
y_pred = model_RF.predict(X_test.target)
print("Predicted:", Counter(y_pred), y_pred.shape)
print("True:      ", Counter(y_test.target), y_test.target.shape)
exit()

# 检查bug
直接使用华为提供的比赛工具包来评价模型

In [1]:
import rampwf as rw
import numpy as np
from others import cd

ap_bagged_test = []


# 载入数据
with cd("~/Codes/HuaweiRAMP"):
    problem = rw.utils.assert_read_problem()
    X_train, y_train = problem.get_train_data(show=False)
    X_test, y_test = problem.get_test_data(show=False)
# 导入评价函数
ap    = problem.score_types[0]
# 设置crossvalidation
splits = problem.get_cv(X_train, y_train, n_splits=10) # 默认10
# 开始实验
ap_train, ap_valid, ap_test, = [], [], []
y_test_preds = []
for fold_i, (train_is, valid_is) in enumerate(splits):
    trained_workflow = problem.workflow.train_submission(
        '.', X_train, y_train, train_is,)
    X_fold_train = X_train.slice(train_is)
    X_fold_valid = X_train.slice(valid_is)
    
    y_train_pred = problem.workflow.test_submission(trained_workflow, X_fold_train)
    y_valid_pred = problem.workflow.test_submission(trained_workflow, X_fold_valid)
    y_test_pred = problem.workflow.test_submission(trained_workflow, X_test)
    ap_train.append( ap(y_train.slice(train_is).target, y_train_pred[:,1]) )
    ap_valid.append( ap(y_train.slice(valid_is).target, y_valid_pred[:,1]) )
    ap_test.append( ap(y_test.target, y_test_pred[:,1]) )
    print('-------------------------------------')
    print('training ap on fold {} = {:.3f}'.format(fold_i, ap_train[-1]))
    print('validation ap on fold {} = {:.3f}'.format(fold_i, ap_valid[-1]))
    print('test ap on fold {} = {:.3f}'.format(fold_i, ap_test[-1]))
    
    y_test_preds.append(y_test_pred)

# 计算排名指标: bagged average precision on test dataset
score = ap(y_test.target, np.array([y_test_pred for y_test_pred in y_test_preds]).mean(axis=0)[:,1])
ap_bagged_test.append(score)
del problem, X_train, y_train, X_test, y_test, ap, splits, y_test_preds

Train data
Test data
-------------------------------------
training ap on fold 0 = 0.350
validation ap on fold 0 = 0.287
test ap on fold 0 = 0.167
-------------------------------------
training ap on fold 1 = 0.389
validation ap on fold 1 = 0.276
test ap on fold 1 = 0.167
-------------------------------------
training ap on fold 2 = 0.186
validation ap on fold 2 = 0.293
test ap on fold 2 = 0.189
-------------------------------------
training ap on fold 3 = 0.533
validation ap on fold 3 = 0.281
test ap on fold 3 = 0.171
-------------------------------------
training ap on fold 4 = 0.287
validation ap on fold 4 = 0.296
test ap on fold 4 = 0.166
-------------------------------------
training ap on fold 5 = 0.275
validation ap on fold 5 = 0.311
test ap on fold 5 = 0.172
-------------------------------------
training ap on fold 6 = 0.213
validation ap on fold 6 = 0.318
test ap on fold 6 = 0.187
-------------------------------------
training ap on fold 7 = 0.233
validation ap on fold 7 = 0.2

In [None]:
ap_bagged_test[-1]

## 画图

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots()
fig.add_trace(go.Scatter(x=hp_range, y=ap_bagged_test, mode="lines",
                         name="sampling_strategy"))
fig.update_layout(xaxis_title="Hyperparameter", yaxis_title="Bagged ap")
fig.show()