## Load Libraries

In [1]:
import json
from random import randint, sample
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import xgboost
import treelite


sns.set(style="ticks")
np.set_printoptions(suppress=True)

pd.set_option('display.max_rows', 150)

## Load Data

In [2]:
df_feats = pd.read_csv("/data/data/NonwearCheck/450/Results/df_feat_ppg_g.csv", index_col=None)
df_feats = df_feats.iloc[shuffle(range(len(df_feats)), random_state=0), :]

df_objects = pd.read_csv("/data/data/NonwearCheck/450/Results/df_object_ppg_g.csv", index_col=None)

In [3]:
feats_cols = [c for c in df_feats.columns if "ppg" in c]
target_col  = "wear_category_id"

## Feature Candidates

In [4]:
X_cols = feats_cols
y_col  = target_col

X_train, y_train = df_feats.loc[:, X_cols].values, df_feats.loc[:, y_col].values
X_test,  y_test  = df_feats.loc[:, X_cols].values, df_feats.loc[:, y_col].values

shuffle_index = shuffle(range(len(X_train)), random_state=0)
X_train = X_train[shuffle_index, :]
y_train = y_train[shuffle_index]

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)

D_train = xgboost.DMatrix(X_train, label=y_train)
D_test = xgboost.DMatrix(X_test, label=y_test)

params = {'max_depth': 5, 'objective':'binary:logistic'}
num_iter = 15
bst_all = xgboost.train(params, D_train, num_iter, [(D_train, 'train')], verbose_eval=False)

probs = bst_all.predict(D_test)
preds = np.array(probs) > 0.7

print("MAE: ", accuracy_score(y_test, preds))

MAE:  0.9973353762115711


In [5]:
bst_all.feature_names = X_cols
key_value = bst_all.get_score(importance_type="gain")
key_value = sorted(key_value.items(), key = lambda kv:(kv[1], kv[0]))[::-1][:50]
top_feats = [kv[0] for kv in key_value]
# top_feats

### xgboost

In [12]:
X_cols_candidates = top_feats
y_col = target_col

params = {'max_depth': 5, 'objective':'binary:logistic'}
num_iter = 15

segment_ids = df_feats['segment_id'].unique()

accs = []
segment_accs = []
append_feats = []
feats_tobe_combined = []
corrcoefs = []

df_results = pd.DataFrame({})

for f in X_cols_candidates:
    combined_feats = []
    if f not in combined_feats:
        combined_feats.append(f)
    else:
        continue
        
    y_probs_list, y_preds_list, y_test_list = [], [], []
    
    for segment_id in segment_ids:
        test_index = df_feats['segment_id'] == segment_id
        X_train, y_train = df_feats.loc[~test_index, combined_feats].values, df_feats.loc[~test_index, y_col].values
        X_test,  y_test  = df_feats.loc[test_index, combined_feats].values, df_feats.loc[test_index, y_col].values
        
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test  = scaler.transform(X_test)
        
        D_train = xgboost.DMatrix(X_train, label=y_train)
        D_test = xgboost.DMatrix(X_test, label=y_test)
        
        bst = xgboost.train(params, D_train, num_iter, [(D_train, 'train')], verbose_eval=False)
        probs = bst.predict(D_test)
        preds = np.array(probs) > 0.7
        acc = accuracy_score(preds, list(y_test))
        
        segment_accs.append(acc)
        
        y_probs_list.extend(probs)
        y_preds_list.extend(preds)
        y_test_list.extend(list(y_test))
        print(segment_id)
    
    acc = accuracy_score(y_test_list, y_preds_list)
    accs.append(acc)
    
    append_feats.append(f)
    
    if not feats_tobe_combined:
        for ftc in feats_tobe_combined:
            corrcoef = np.corrcoef(df_feats[f], df_feats[ftc])[0][1]
            corrcoefs.append(corrcoef)
        
segment_accs = np.array(segment_accs).reshape(-1, len(segment_ids))
accs = np.array(accs).reshape(-1, 1)
append_feats = np.array(append_feats).reshape(-1, 1)

if not feats_tobe_combined:
    corrcoefs = np.array(corrcoefs).reshape(-1, len(feats_tobe_combined))
    df_results = np.concatenate([append_feats, corrcoefs, accs, segment_accs], axis=1)
    columns = ["append_feats"] + feats_tobe_combined + ["accs"] + list(segment_ids)
    df_results = pd.DataFrame(df_results, columns=columns)
else:
    df_results = np.concatenate([append_feats, accs, segment_accs], axis=1)
    columns = ["append_feats"] + ["accs"] + list(segment_ids)
    df_results = pd.DataFrame(df_results, columns=columns)

df_results.to_csv("df_results_1feat.csv", index=None)

32772480813553015
32772480813553082
32772480813553019
32772480813553015
32772480813553082
32772480813553019


ValueError: cannot reshape array of size 6 into shape (203)

In [14]:
segment_accs = np.array(segment_accs).reshape(-1, 3)
accs = np.array(accs).reshape(-1, 1)
append_feats = np.array(append_feats).reshape(-1, 1)

if not feats_tobe_combined:
    corrcoefs = np.array(corrcoefs).reshape(-1, len(feats_tobe_combined))
    df_results = np.concatenate([append_feats, corrcoefs, accs, segment_accs], axis=1)
    columns = ["append_feats"] + feats_tobe_combined + ["accs"] + list(segment_ids)
    df_results = pd.DataFrame(df_results, columns=columns)
else:
    df_results = np.concatenate([append_feats, accs, segment_accs], axis=1)
    columns = ["append_feats"] + ["accs"] + list(segment_ids)
    df_results = pd.DataFrame(df_results, columns=columns)

df_results.to_csv("df_results_1feat.csv", index=None)

ValueError: cannot reshape array of size 0 into shape (0)

In [21]:
if feats_tobe_combined:
    corrcoefs = np.array(corrcoefs).reshape(-1, len(feats_tobe_combined))
    df_results = np.concatenate([append_feats, corrcoefs, accs, segment_accs], axis=1)
    columns = ["append_feats"] + feats_tobe_combined + ["accs"] + list(segment_ids[:3])
    print(len(columns))
    df_results = pd.DataFrame(df_results, columns=columns)
else:
    df_results = np.concatenate([append_feats, accs, segment_accs], axis=1)
    columns = ["append_feats"] + ["accs"] + list(segment_ids[:3])
    df_results = pd.DataFrame(df_results, columns=columns)

df_results.to_csv("df_results_1feat.csv", index=None)

In [22]:
df_results

Unnamed: 0,append_feats,accs,32772480813553015,32772480813553082,32772480813553019
0,ppg_g__ratio_value_number_to_time_series_length,0.997872340425532,1.0,0.993103448275862,1.0
1,"ppg_g__change_quantiles__f_agg_""mean""__isabs_T...",0.9957446808510638,1.0,0.986206896551724,1.0


In [28]:
# 逐个查看特征的分类性能
params = {'max_depth': 5, 'objective':'binary:logistic'}
num_iter = 15
X_cols_candidates = ['ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"']

for f in X_cols_candidates:
    combined_feats = [
        'ppg__autocorrelation__lag_1'
                      ]
    if f not in combined_feats:
        combined_feats.append(f)
    else:
        continue
    segment_ids = df_features['segment_id'].unique()
    y_probs_list, y_preds_list, y_test_list, segment_score = [], [], [], {}
    for segment_id in segment_ids:
        test_index = df_features['segment_id'] == segment_id
        X_train, y_train = df_features.loc[~test_index, combined_feats].values, df_features.loc[~test_index, 'wear_category_id'].values
        X_test,  y_test  = df_features.loc[test_index, combined_feats].values, df_features.loc[test_index, 'wear_category_id'].values
        
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test  = scaler.transform(X_test)
        
        D_train = xgboost.DMatrix(X_train, label=y_train)
        D_test = xgboost.DMatrix(X_test, label=y_test)
        
        bst = xgboost.train(params, D_train, num_iter, [(D_train, 'train')], verbose_eval=False)
        probs = bst.predict(D_test)
        preds = np.array(probs) > 0.7
        
        y_probs_list.extend(probs)
        y_preds_list.extend(preds)
        y_test_list.extend(list(y_test))
        print(segment_id, accuracy_score(preds, list(y_test)))
    print("=================================================")
#     print(f, ' -- ', accuracy_score(y_test_list, y_preds_list))
    print(f, ' -- ', accuracy_score(y_test_list, y_preds_list), ' -- ', np.corrcoef(df_features[f], df_features['ppg__autocorrelation__lag_1'])[0][1])  # , ' -- ', np.corrcoef(df_features[f], df_features['Resultant__ar_0'])[0][1])
#     print(f, ' -- ', accuracy_score(y_test_list, y_preds_list), ' -- ', np.corrcoef(df_features[f], df_features['ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"'])[0][1])
#     print(f, ' -- ', accuracy_score(y_test_list, y_preds_list), ' -- ', np.corrcoef(df_features[f], df_features['ppg__cid_ce__normalize_True'])[0][1])# , ' -- ', np.corrcoef(df_features[f], df_features['Resultant__ar_0'])[0][1])
#     print(f, ' -- ', accuracy_score(y_test_list, y_preds_list), ' -- ', np.corrcoef(df_features[f], df_features['ppg__change_quantiles__f_agg_"mean"__isabs_True__qh_0.6__ql_0.2'])[0][1])
#     print(f, ' -- ', accuracy_score(y_test_list, y_preds_list), ' -- ', np.corrcoef(df_features[f], df_features['ppg__fft_aggregated__aggtype_"centroid"'])[0][1])

1595313013 0.994572591587517
1588257455 1.0
1588257312 0.9459459459459459
1595383255 1.0
1595388059 1.0
1586841055 1.0
1588257368 1.0
1586843082 1.0
1595313028 0.9433465085638999
1595387992 1.0
1588257385 1.0
1586843123 1.0
1595313107 1.0
1595383319 1.0
1586841282 1.0
1595303876 0.0
1595303902 1.0
1595388039 1.0
1595313000 1.0
1595303970 1.0
1586843703 1.0
1588257400 1.0
1595388133 1.0
1586843360 0.0
1586841154 0.883495145631068
1586843044 1.0
1586842986 1.0
1588257418 1.0
1588257437 1.0
1595313066 1.0
1586843333 1.0
1586841443 1.0
1586843057 1.0
1595383285 1.0
1588257346 1.0
1595313094 1.0
1595303946 0.9914529914529915
1595383302 1.0
1595388118 1.0
1586842998 1.0
1595303958 1.0
1586843686 1.0
1588257483 1.0
1586843727 1.0
1586843580 0.0
1586843071 1.0
1586843345 1.0
1586841392 1.0
1586843478 1.0
1586843563 0.0
1586843527 0.0
1586843501 1.0
1586843548 0.0
1586843675 1.0
ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"  --  0.9657731501993797  --  0.296755808620785
