## Load Libraries

In [18]:
import json
from random import randint, sample
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import xgboost
import treelite


# Plot styling: use seaborn's "ticks" style for every figure in the notebook.
sns.set(style="ticks")
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# Let pandas display up to 150 rows before truncating the output.
pd.set_option('display.max_rows', 150)

## Load Data

In [19]:
def _index_annotations_by_id(annotation_list):
    """Group a list of annotation dicts into ``{id: merged_fields}``.

    Each annotation must carry an ``'id'`` key (``KeyError`` otherwise).
    Annotations that share an id are merged in list order, so later entries
    overwrite earlier ones on conflicting keys. The ``'id'`` key itself is
    not stored in the merged value.

    The input dicts are left untouched, and the builtin ``id`` is not
    shadowed at notebook scope.
    """
    index = {}
    for annotation in annotation_list:
        annotation_id = annotation['id']
        fields = {key: value for key, value in annotation.items() if key != 'id'}
        index.setdefault(annotation_id, {}).update(fields)
    return index


with open('/data-temp/data/nonwear-check/O/results/annotations.json', 'r') as f:
    annotations = json.load(f)

# Lookup tables keyed by annotation id, one per annotation level.
record_annotation_index = _index_annotations_by_id(annotations['record_annotations'])
segment_annotation_index = _index_annotations_by_id(annotations['segment_annotations'])

In [20]:
# Feature table: read from CSV, then reorder the rows with a fixed-seed
# permutation (random_state=0) so the order is reproducible between runs.
features_csv = "/data-temp/data/nonwear-check/O/results/features__ppg-g__object_length_36__cut_500.csv"
df_features = pd.read_csv(features_csv, index_col=None)
row_order = shuffle(range(len(df_features)), random_state=0)
df_features = df_features.iloc[row_order, :]

# Companion "objects" table, kept in file order.
objects_csv = "/data-temp/data/nonwear-check/O/results/objects__ppg-g__object_length_36__cut_500.csv"
df_objects = pd.read_csv(objects_csv, index_col=None)

In [21]:
# Name of the label column.
target_col = "wear_category_id"
# Every column of the feature table whose name contains "ppg".
feat_cols = [column for column in df_features.columns if "ppg" in column]

## Select features

In [22]:
# Candidate feature column names to evaluate one by one. Each entry is used
# to index df_features, so it must match a column name there exactly
# (including the embedded quotes and parentheses).
X_cols_candidates = [
'ppg__cid_ce__normalize_True',
 'ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"',
 'ppg__cwt_coefficients__coeff_7__w_2__widths_(2, 5, 10, 20)',
 'ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"max"',
 'ppg__cwt_coefficients__coeff_14__w_5__widths_(2, 5, 10, 20)',
 'ppg__c3__lag_1',
 'ppg__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)',
 'ppg__variation_coefficient',
 'ppg__sum_of_reoccurring_data_points',
 'ppg__number_peaks__n_1',
 'ppg__ar_coefficient__coeff_1__k_10',
 'ppg__abs_energy',
 'ppg__autocorrelation__lag_1',
 'ppg__max_langevin_fixed_point__m_3__r_30',
 'ppg__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.6',
 'ppg__ar_coefficient__coeff_2__k_10',
 'ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"min"',
 'ppg__fft_coefficient__attr_"abs"__coeff_3',
 'ppg__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"max"',
 'ppg__fft_coefficient__attr_"abs"__coeff_2',
 'ppg__change_quantiles__f_agg_"mean"__isabs_True__qh_1.0__ql_0.8',
 'ppg__spkt_welch_density__coeff_2',
 'ppg__fft_aggregated__aggtype_"variance"',
 'ppg__autocorrelation__lag_2',
 'ppg__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.4',
 'ppg__cwt_coefficients__coeff_4__w_2__widths_(2, 5, 10, 20)',
 'ppg__cwt_coefficients__coeff_5__w_2__widths_(2, 5, 10, 20)',
 'ppg__cwt_coefficients__coeff_13__w_5__widths_(2, 5, 10, 20)',
 'ppg__change_quantiles__f_agg_"var"__isabs_True__qh_0.4__ql_0.0',
 'ppg__change_quantiles__f_agg_"mean"__isabs_False__qh_0.2__ql_0.0',
 'ppg__fft_aggregated__aggtype_"centroid"',
 'ppg__longest_strike_above_mean',
 'ppg__fft_aggregated__aggtype_"skew"',
 'ppg__change_quantiles__f_agg_"var"__isabs_False__qh_1.0__ql_0.6',
 'ppg__change_quantiles__f_agg_"var"__isabs_False__qh_0.2__ql_0.0',
 'ppg__partial_autocorrelation__lag_9',
 'ppg__fft_coefficient__attr_"angle"__coeff_5',
 'ppg__autocorrelation__lag_0',
 'ppg__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.6',
 'ppg__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"min"',
 'ppg__approximate_entropy__m_2__r_0.7',
 'ppg__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.2',
 'ppg__change_quantiles__f_agg_"mean"__isabs_True__qh_0.8__ql_0.6',
 'ppg__partial_autocorrelation__lag_3',
 'ppg__kurtosis'
]

### xgboost

In [28]:
# Examine each candidate feature's classification performance one at a time:
# pair it with a fixed base feature and score the pair with a
# leave-one-segment-out evaluation (every segment_id is held out once).
params = {'max_depth': 3, 'objective':'binary:logistic'}
num_iter = 5  # number of boosting rounds passed to xgboost.train
base_feat = 'ppg__autocorrelation__lag_1'  # included in every feature pair
prob_threshold = 0.7  # predicted probability above this counts as a positive prediction

# NOTE(review): this rebinds X_cols_candidates to a single feature, so only
# that one feature is evaluated below. It looks like a leftover from
# interactive narrowing of the candidate list; confirm before relying on it.
X_cols_candidates = ['ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"']

# Loop-invariant, so it is computed once instead of once per feature.
segment_ids = df_features['segment_id'].unique()

for f in X_cols_candidates:
    combined_feats = [base_feat]
    if f in combined_feats:
        # Pairing the base feature with itself adds nothing; skip it.
        continue
    combined_feats.append(f)

    y_probs_list, y_preds_list, y_test_list, segment_score = [], [], [], {}
    for segment_id in segment_ids:
        # Boolean row mask: True for rows of the held-out segment.
        test_index = df_features['segment_id'] == segment_id
        X_train = df_features.loc[~test_index, combined_feats].values
        y_train = df_features.loc[~test_index, 'wear_category_id'].values
        X_test = df_features.loc[test_index, combined_feats].values
        y_test = df_features.loc[test_index, 'wear_category_id'].values

        # Fit the scaler on the training rows only, so the held-out segment
        # does not influence the scaling.
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        D_train = xgboost.DMatrix(X_train, label=y_train)
        D_test = xgboost.DMatrix(X_test, label=y_test)

        bst = xgboost.train(params, D_train, num_iter, [(D_train, 'train')], verbose_eval=False)
        probs = bst.predict(D_test)
        preds = np.asarray(probs) > prob_threshold

        # Pool the per-segment results for the overall score printed below.
        y_probs_list.extend(probs)
        y_preds_list.extend(preds)
        y_test_list.extend(list(y_test))
        # Accuracy on the held-out segment alone (arguments are y_true, y_pred).
        print(segment_id, accuracy_score(y_test, preds))
    print("=================================================")
    # Pooled accuracy over all held-out segments, followed by the Pearson
    # correlation between this feature and the base feature.
    print(f, ' -- ', accuracy_score(y_test_list, y_preds_list), ' -- ', np.corrcoef(df_features[f], df_features[base_feat])[0][1])

1595313013 0.994572591587517
1588257455 1.0
1588257312 0.9459459459459459
1595383255 1.0
1595388059 1.0
1586841055 1.0
1588257368 1.0
1586843082 1.0
1595313028 0.9433465085638999
1595387992 1.0
1588257385 1.0
1586843123 1.0
1595313107 1.0
1595383319 1.0
1586841282 1.0
1595303876 0.0
1595303902 1.0
1595388039 1.0
1595313000 1.0
1595303970 1.0
1586843703 1.0
1588257400 1.0
1595388133 1.0
1586843360 0.0
1586841154 0.883495145631068
1586843044 1.0
1586842986 1.0
1588257418 1.0
1588257437 1.0
1595313066 1.0
1586843333 1.0
1586841443 1.0
1586843057 1.0
1595383285 1.0
1588257346 1.0
1595313094 1.0
1595303946 0.9914529914529915
1595383302 1.0
1595388118 1.0
1586842998 1.0
1595303958 1.0
1586843686 1.0
1588257483 1.0
1586843727 1.0
1586843580 0.0
1586843071 1.0
1586843345 1.0
1586841392 1.0
1586843478 1.0
1586843563 0.0
1586843527 0.0
1586843501 1.0
1586843548 0.0
1586843675 1.0
ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"  --  0.9657731501993797  --  0.296755808620785
