## Load Libraries

In [1]:
import json
import logging
import pickle
from random import randint, sample

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import treelite
import xgboost
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle


# Seaborn theme: use the "ticks" style for figures.
sns.set(style="ticks")

# Print small floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)

# Allow pandas to display up to 150 rows of a frame.
pd.set_option("display.max_rows", 150)

## Load Data

In [2]:
def _index_annotations_by_id(entries):
    """Group annotation dicts by their 'id' and merge entries sharing an id.

    The 'id' key is popped from every entry, so the entries themselves are
    modified in place. The remaining key/value pairs are merged into the dict
    stored under that id; when several entries share an id, later entries
    overwrite earlier ones key by key.

    Parameters
    ----------
    entries : iterable of dict
        Annotation records, each holding an 'id' key.

    Returns
    -------
    dict
        Mapping of id -> merged annotation fields (without the 'id' key).

    Raises
    ------
    KeyError
        If an entry has no 'id' key.
    """
    index = {}
    for entry in entries:
        # A local name is used instead of `id` so the builtin `id()` is not
        # shadowed for the rest of the notebook session.
        entry_id = entry.pop('id')
        index.setdefault(entry_id, {}).update(entry)
    return index


with open('/data/data/NonwearCheck/450/Results/annotations.json', 'r') as f:
    annotations = json.load(f)

# One lookup table per annotation list found in the JSON file.
record_annotation_index = _index_annotations_by_id(annotations['record_annotations'])
segment_annotation_index = _index_annotations_by_id(annotations['segment_annotations'])

In [42]:
# Object table, read as-is; later cells use its 'id' and 'ppg_ir' columns.
df_objects = pd.read_csv("/data/data/NonwearCheck/450/Results/df_object_ppg_ir.csv", index_col=None)

# Feature table. Rows are reordered with a fixed seed (0); each row keeps the
# index label it had when it was read from the CSV.
df_feats = pd.read_csv("/data/data/NonwearCheck/450/Results/df_feat_ppg_ir.csv", index_col=None)
row_order = shuffle(range(len(df_feats)), random_state=0)
df_feats = df_feats.iloc[row_order, :]

In [43]:
# Name of the label column in the feature table.
target_col = "wear_category_id"

# Every column whose name contains "ppg" counts as a feature column.
feats_cols = list(filter(lambda name: "ppg" in name, df_feats.columns))

## Train the model on the full dataset (全数据构建模型)

In [44]:
# Model input columns. (Original note: use xgboost to select the top 50
# features for analysis; the list below contains 10 columns.)
X_cols = [
    'ppg_ir__cid_ce__normalize_True',
    'ppg_ir__number_peaks__n_1',
    'ppg_ir__number_peaks__n_3',
    'ppg_ir__agg_linear_trend__attr_"intercept"__chunk_len_50__f_agg_"max"',
    'ppg_ir__ratio_beyond_r_sigma__r_0.5',
    'ppg_ir__autocorrelation__lag_2',
    'ppg_ir__autocorrelation__lag_6',
    'ppg_ir__binned_entropy__max_bins_10',
    'ppg_ir__change_quantiles__f_agg_"var"__isabs_False__qh_0.6__ql_0.4',
    'ppg_ir__percentage_of_reoccurring_values_to_all_values',
]
y_col = target_col

# Booster settings and the number of boosting rounds.
params = {'max_depth': 6, 'objective': 'binary:logistic', "n_jobs": -1}
num_iter = 20

X = df_feats.loc[:, X_cols].values
y = df_feats.loc[:, y_col].values

# Feature scaling is left disabled:
# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)

# Both matrices are built from the same rows, so the score at the end of this
# cell is accuracy on the training data.
D_train = xgboost.DMatrix(X, label=y)
D_test = xgboost.DMatrix(X, label=y)

bst = xgboost.train(
    params,
    D_train,
    num_boost_round=num_iter,
    evals=[(D_train, 'train')],
    verbose_eval=False,
)

# A row is labelled positive when the predicted probability exceeds 0.7.
predicted_labels = bst.predict(D_test) > 0.7
accuracy_score(y, predicted_labels)

0.9789966207434364

In [45]:
# Wrap the trained xgboost booster as a treelite model.
model = treelite.Model.from_xgboost(bst)

# Export the model as a zipped source package (the log below shows recipe.json,
# header.h and main.c being written); `quantize` is handed to the compiler.
model.export_srcpkg(
    platform='unix',
    toolchain='gcc',
    pkgpath='./model_20201231_003.zip',
    libname='mymodel.so',
    params={'quantize': 1},
    verbose=True,
)

[08:32:09] /workspace/src/compiler/ast_native.cc:44: Using ASTNativeCompiler
[08:32:09] /workspace/src/compiler/ast/split.cc:24: Parallel compilation disabled; all member trees will be dumped to a single source file. This may increase compilation time and memory usage.
[08:32:09] /workspace/src/c_api/c_api.cc:286: Code generation finished. Writing code to files...
[08:32:09] /workspace/src/c_api/c_api.cc:291: Writing file recipe.json...
[08:32:09] /workspace/src/c_api/c_api.cc:291: Writing file header.h...
[08:32:09] /workspace/src/c_api/c_api.cc:291: Writing file main.c...


## Verify consistency with the C implementation (验证和C一致性)

In [53]:
# Object id(s) used for the consistency check.
# Alternative selection: list(range(3000, 3004))
selected_object_ids = [15000]

### Data to verify (待验证数据)

In [58]:
# Object id(s) whose raw signal is shown for verification.
selected_object_ids = [15000]

# Boolean flag per row of df_objects: True where the row's 'id' is selected.
is_selected = np.isin(df_objects['id'], selected_object_ids)

# `mask` holds the integer *positions* of the selected rows.
mask = np.where(is_selected)[0]

# Positions need the positional indexer (.iloc). Passing them to the
# label-based .loc is only correct while df_objects has a default RangeIndex.
df_objects['ppg_ir'].iloc[mask].values

array([21054, 21049, 21035, 21030, 21051, 21046, 21045, 21038, 21052,
       21050, 21061, 21038, 21041, 21056, 21046, 21045, 21047, 21050,
       21043, 21043, 21053, 21049, 21046, 21042, 21046, 21049, 21049,
       21048, 21055, 21065, 21053, 21038, 21051, 21044, 21060, 21051,
       21055, 21047, 21055, 21062, 21045, 21052, 21049, 21059, 21042,
       21057, 21058, 21052, 21051, 21049, 21058, 21058, 21071, 21057,
       21041, 21050, 21061, 21053, 21044, 21050, 21049, 21041, 21060,
       21047, 21050, 21041, 21049, 21058, 21052, 21059, 21042, 21054,
       21050, 21048, 21058, 21052, 21057, 21050, 21046, 21056, 21052,
       21057, 21051, 21060, 21058, 21060, 21057, 21053, 21060, 21051,
       21070, 21064, 21039, 21050, 21057, 21064, 21056, 21043, 21048,
       21040, 21052, 21060, 21065, 21052, 21053, 21045, 21043, 21050,
       21057, 21057, 21055, 21055, 21056, 21066, 21069, 21061, 21055,
       21070, 21053, 21061, 21048, 21071, 21059, 21066, 21046, 21061,
       21063, 21063])

### Feature consistency (特征一致性)

In [59]:
# Pick the rows whose index label is in selected_object_ids, then keep only the
# model input columns.
# NOTE(review): this selects by index label, not by an id column; it assumes a
# row's label in df_feats equals its object id -- confirm.
feats = df_feats.loc[selected_object_ids][X_cols]
feats

Unnamed: 0,ppg_ir__cid_ce__normalize_True,ppg_ir__number_peaks__n_1,ppg_ir__number_peaks__n_3,"ppg_ir__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""max""",ppg_ir__ratio_beyond_r_sigma__r_0.5,ppg_ir__autocorrelation__lag_2,ppg_ir__autocorrelation__lag_6,ppg_ir__binned_entropy__max_bins_10,"ppg_ir__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.4",ppg_ir__percentage_of_reoccurring_values_to_all_values
15000,13.807082,39.0,17.0,21066.0,0.617188,0.145667,0.18626,2.074416,6.222222,0.953125


### Model consistency (模型一致性)

In [60]:
# Score the selected feature row(s) with the trained booster; the displayed
# value is the reference for the comparison with the exported model.
D = xgboost.DMatrix(feats.to_numpy())
bst.predict(D)

array([0.996917], dtype=float32)

In [66]:
# Logger registered under the name 'nni'.
_logger = logging.getLogger('nni')

In [67]:
# Emit an INFO-level record through the 'nni' logger.
_logger.info("Creating graph json, writing to. Visualization enabled.")