## Load Libraries

In [1]:
import json
from random import randint, sample
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler

import xgboost
import treelite


# Notebook-wide display settings.

# Let pandas render up to 150 rows before truncating a displayed frame.
pd.set_option('display.max_rows', 150)

# Print NumPy floats in fixed-point form instead of scientific notation.
np.set_printoptions(suppress=True)

# Seaborn "ticks" style for all figures.
sns.set(style="ticks")

## Load Data

In [2]:
# Load the annotation file and build two lookup tables keyed by annotation id.
with open('/data-temp/data/nonwear-check/O/results/annotations.json', 'r') as f:
    annotations = json.load(f)


def _index_annotations_by_id(entries):
    """Group annotation dicts by their 'id' value.

    Parameters
    ----------
    entries : iterable of dict
        Annotation records; each one must contain an 'id' key.

    Returns
    -------
    dict
        Maps each id to a dict holding the remaining key/value pairs of every
        entry with that id. When several entries share an id, values from
        later entries overwrite earlier ones for the same key.

    Raises
    ------
    KeyError
        If an entry has no 'id' key.

    Notes
    -----
    The 'id' key is removed from each entry in place, so the dicts inside
    `annotations` no longer carry their id after this call.
    """
    index = {}
    for entry in entries:
        # A dedicated name is used here so the builtin `id` is not shadowed.
        entry_id = entry.pop('id')
        index.setdefault(entry_id, {}).update(entry)
    return index


record_annotation_index = _index_annotations_by_id(annotations['record_annotations'])
segment_annotation_index = _index_annotations_by_id(annotations['segment_annotations'])

In [3]:
# Feature table: read the CSV, then put its rows in a fixed pseudo-random
# order (seed 0). `.iloc` only reorders the rows, so every row keeps the index
# label it had when the file was read.
features_csv = "/data-temp/data/nonwear-check/O/results/features__ppg-g__object_length_36__cut_500.csv"
df_features = pd.read_csv(features_csv, index_col=None)
row_order = shuffle(range(len(df_features)), random_state=0)
df_features = df_features.iloc[row_order, :]

# Object table from the same results directory, kept in file order.
objects_csv = "/data-temp/data/nonwear-check/O/results/objects__ppg-g__object_length_36__cut_500.csv"
df_objects = pd.read_csv(objects_csv, index_col=None)

In [4]:
# Candidate feature columns: every column of df_features whose name contains "ppg".
feat_cols = list(filter(lambda name: "ppg" in name, df_features.columns))

# Column that holds the label used as the training target.
target_col = "wear_category_id"

## 全数据构建模型

In [52]:
# Fit a small XGBoost binary classifier on the complete data set using two
# hand-picked PPG features, then score it on that same data.
X_cols = [
    'ppg__autocorrelation__lag_1',
    'ppg__agg_linear_trend__attr_"intercept"__chunk_len_10__f_agg_"mean"',
]
y_col = target_col

params = {'max_depth': 3, 'objective': 'binary:logistic'}
num_iter = 5  # number of boosting rounds

X = df_features.loc[:, X_cols].values
y = df_features.loc[:, y_col].values

# No feature scaling is applied: the raw feature values go straight into the
# DMatrix (presumably so the exported C model can consume unscaled inputs —
# TODO confirm).
D_train = xgboost.DMatrix(X, label=y)
# D_test is built from the same rows as D_train, so the accuracy reported
# below is a training-set score, not a held-out estimate.
D_test = xgboost.DMatrix(X, label=y)

bst = xgboost.train(
    params,
    D_train,
    num_boost_round=num_iter,
    evals=[(D_train, 'train')],
    verbose_eval=False,
)

# Predictions above 0.7 are counted as the positive class.
predicted_positive = bst.predict(D_test) > 0.7
accuracy_score(y, predicted_positive)

0.9840496233938857

In [53]:
# Convert the trained booster into a treelite model and write it out as a
# source package (a zip archive with generated C code) for a unix/gcc build.
model = treelite.Model.from_xgboost(bst)

model.export_srcpkg(
    platform='unix',
    toolchain='gcc',
    pkgpath='./mymodel.zip',   # archive written to the working directory
    libname='mymodel.so',      # name of the shared library the package builds
    verbose=True,
)

[14:55:47] /io/treelite/src/frontend/xgboost.cc:359: Global bias of the model: 0.5
[14:55:47] /io/treelite/src/frontend/xgboost.cc:397: gbm_param_.num_feature = 0
[14:55:47] /io/treelite/src/frontend/xgboost.cc:398: gbm_param_.num_output_group = 0
[14:55:47] /io/treelite/src/compiler/ast_native.cc:22: Using ASTNativeCompiler
[14:55:47] /io/treelite/src/compiler/ast/split.cc:10: Parallel compilation disabled; all member trees will be dumped to a single source file. This may increase compilation time and memory usage.
[14:55:47] /io/treelite/src/c_api/c_api.cc:297: Code generation finished. Writing code to files...
[14:55:47] /io/treelite/src/c_api/c_api.cc:314: Writing file recipe.json...
[14:55:47] /io/treelite/src/c_api/c_api.cc:314: Writing file main.c...
[14:55:47] /io/treelite/src/c_api/c_api.cc:314: Writing file header.h...


## 验证和C一致性

In [41]:
# Ids of the four consecutive objects used for the Python-vs-C consistency check.
first_object_id, n_objects = 50000, 4
selected_object_ids = [first_object_id + k for k in range(n_objects)]

### 待验证数据

In [42]:
# Raw 'ppg' samples of the selected objects, in the row order of df_objects.
# The boolean mask is passed to `.loc` directly. Converting it to positional
# indices with np.where and then using them as `.loc` labels only gives the
# right rows while df_objects carries a default 0..n-1 index.
mask = np.isin(df_objects['id'], selected_object_ids)
df_objects.loc[mask, 'ppg'].values

array([1099232., 1098752., 1098944., 1099328., 1099680., 1097216.,
       1095264., 1093984., 1093952., 1095296., 1096256., 1097600.,
       1098176., 1099040., 1100224., 1099520., 1088288., 1078272.,
       1074176., 1073056., 1073600., 1074720., 1074464., 1072992.,
       1071040., 1070400., 1070752., 1072000., 1073344., 1075360.,
       1077024., 1078304., 1079424., 1080928., 1082368., 1074880.,
       1063584., 1058368., 1056864., 1057760., 1059136., 1059968.,
       1060000., 1059008., 1058144., 1058272., 1059392., 1061440.,
       1062976., 1064960., 1066624., 1068000., 1069792., 1071680.,
       1067648., 1057280., 1051296., 1050112., 1051488., 1053024.,
       1054464., 1055392., 1053792., 1053664., 1053920., 1055456.,
       1057088., 1059392., 1061760., 1063488., 1066080., 1067968.,
       1069952., 1066240., 1056736., 1051008., 1049664., 1051168.,
       1053056., 1055232., 1056384., 1055424., 1055264., 1055488.,
       1056992., 1059104., 1061344., 1063904., 1066304., 10688

### 特征一致性

In [46]:
# Model-input columns for the objects under test.
# The rows are looked up by index label on the shuffled feature table, not by
# an explicit id column.
# NOTE(review): this assumes the index labels of df_features equal the object
# ids — confirm against how the features CSV was written.
# NOTE(review): the saved output of this cell lists labels 5000–5003, while
# selected_object_ids is set to 50000–50003 above; the cell looks like it was
# last run with a different selection — re-run from a fresh kernel to confirm.
features = df_features[X_cols].loc[selected_object_ids]
features

Unnamed: 0,ppg__autocorrelation__lag_1,"ppg__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""mean"""
5000,0.957756,-3904.277637
5001,0.866809,-3939.830713
5002,0.822571,-3942.463708
5003,0.847003,-3932.429968


### 模型一致性

In [49]:
# Score the selected feature rows with the trained booster; these raw model
# outputs are the reference values for the comparison with the exported C code.
feature_matrix = features.values
D = xgboost.DMatrix(feature_matrix)
bst.predict(D)

array([0.12708697, 0.12708697, 0.12708697, 0.12708697], dtype=float32)