You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
When a PMML model file is loaded with sklearn_pmml_model.ensemble.PMMLForestClassifier and predict_proba is called in a loop, memory usage grows continuously.
Steps/Code to Reproduce
from sklearn_pmml_model.ensemble import PMMLForestClassifier
import numpy as np
import logging
from random import random
from tqdm import tqdm
import gc
import time
# Root logger: INFO and above, formatted as "<timestamp> <LEVEL> <message>".
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=logging.INFO,
)

# Dedicated logger for this script, set to the level looked up by name.
level = logging.getLevelName('INFO')
mlog = logging.getLogger('myLogger')
mlog.setLevel(level)
class MergeClassifier(object):
    """Base class for merge classifiers.

    Stores the configuration object and provides a value-clamping helper.
    Subclasses are expected to override ``__call__``; the base
    implementation always raises ``NotImplementedError``.
    """

    def __init__(self, config):
        """Keep a reference to ``config`` on the instance.

        Args:
            config: Configuration object; stored as ``self._config`` without
                being copied or validated.
        """
        self._config = config

    @staticmethod
    def _clip_data(x, max_val=10000.0):
        """Clamp every element of ``x`` to ``[-max_val, max_val]`` in place.

        Args:
            x: Mutable, indexable sequence of numbers. It is modified in
                place.
            max_val: Upper bound; ``-max_val`` is used as the lower bound.

        Returns:
            The same object ``x`` that was passed in, after clamping.
        """
        for j, value in enumerate(x):
            # Apply the upper bound first, then the lower bound.
            x[j] = max(-max_val, min(max_val, value))
        return x

    def __call__(self, feature, threshold=None):
        """Score ``feature``; must be implemented by subclasses.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError
class RFMergeClassifier(MergeClassifier):
    """Merge classifier backed by a forest model loaded from a PMML file.

    The model is loaded once in ``__init__`` via ``PMMLForestClassifier``.
    Calling the instance scores one feature mapping and compares the score
    against two thresholds.
    """

    # Thresholds used when the config does not override them.
    _DEFAULT_THRESHOLD = 0.10799543425155324
    _DEFAULT_KEEP_POOL_THRESHOLD = 0.06698819140047929

    def __init__(self, config):
        """Load the model and read optional thresholds from ``config``.

        Args:
            config: Container that must provide ``"merge_model"`` (passed to
                ``PMMLForestClassifier`` as ``pmml``). If present,
                ``"merge_model_threshold"`` and
                ``"merge_model_keep_threshold"`` override the default
                thresholds.
        """
        super().__init__(config)
        model_file = config["merge_model"]
        self._classifier = PMMLForestClassifier(pmml=model_file, n_jobs=1)
        self._threshold = self._DEFAULT_THRESHOLD
        self._keep_pool_threshold = self._DEFAULT_KEEP_POOL_THRESHOLD
        # Order matters: it fixes the order of values in the vector handed
        # to the model in _get_score.
        self._feature_names = ['reid_dist_p10', 'reid_dist_p20', 'reid_dist_p25',
                               'reid_dist_p30', 'reid_dist_p50', 'region_in_out_match',
                               'has_shared_companion', 'tpid_overlap_duration',
                               'min_tpid_duration', 'max_tpid_duration',
                               'min_spatial_distance', 'max_spatial_distance',
                               'spatial_match_count', 'confident_spatial_match_count',
                               'spatial_match_candidate_count', 'static_area_match',
                               'exact_door_match', 'other_door_match', 'end_door_distance',
                               'start_door_distance', 'min_staff_prob', 'max_staff_prob']
        if "merge_model_threshold" in config:
            self._threshold = config["merge_model_threshold"]
        if "merge_model_keep_threshold" in config:
            self._keep_pool_threshold = config["merge_model_keep_threshold"]

    def build_feature(self):
        """Return a dict mapping every feature name to ``random() * 10``."""
        return {key: random() * 10 for key in self._feature_names}

    @property
    def feature_keys(self):
        """The list of feature names, in the order fed to the model."""
        return self._feature_names

    def _get_score(self, feature, t1, t2, threshold=None):
        """Compute the score for one feature mapping.

        Args:
            feature: Mapping that contains every name in
                ``self._feature_names``; may also contain
                ``"face_match_score"``.
            t1: Unused here.
            t2: Unused here.
            threshold: Unused here.

        Returns:
            Element ``[0, 1]`` of ``predict_proba`` for the single clamped
            input row (presumably the positive-class probability; confirm
            against the model's class order), plus
            ``feature["face_match_score"]`` when that key is present.

        Raises:
            KeyError: If ``feature`` lacks one of the feature names.
        """
        x = self._clip_data([feature[key] for key in self._feature_names])
        # The model receives a single-row 2-D float32 array.
        row = np.asarray([x], dtype=np.float32)
        score = self._classifier.predict_proba(row)[0, 1]
        if "face_match_score" in feature:
            # NOTE(review): this bonus is added without any bound, so the
            # result may exceed 1 and make ``1 - score`` negative in
            # __call__ -- confirm this is intended.
            score += feature["face_match_score"]
        return score

    def __call__(self, feature, t1, t2, threshold=None):
        """Score ``feature`` and compare the score against both thresholds.

        Args:
            feature: Feature mapping, see ``_get_score``.
            t1: Forwarded to ``_get_score``.
            t2: Forwarded to ``_get_score``.
            threshold: Match threshold; ``self._threshold`` is used when
                this is ``None``.

        Returns:
            A 4-tuple ``(score > threshold, 1 - score,
            score > self._keep_pool_threshold, message)`` where ``message``
            is ``"match prob <score with 3 decimals>"``.
        """
        if threshold is None:
            threshold = self._threshold
        score = self._get_score(feature, t1, t2)
        return (
            score > threshold,
            1 - score,
            score > self._keep_pool_threshold,
            f"match prob {score:0.3f}",
        )
if __name__ == "__main__":
    # Reproduction driver: load the model once, then score the same feature
    # mapping repeatedly so memory growth across predict_proba calls can be
    # observed with an external profiler.
    config = {
        "merge_model": "./models/random_forest.pmml",
        "merge_model_threshold": 0.5,
        # This key was previously "_keep_pool_threshold", which
        # RFMergeClassifier never reads; the class looks up
        # "merge_model_keep_threshold", so the value was silently ignored.
        "merge_model_keep_threshold": 2,
    }
    classifier = RFMergeClassifier(config)
    feature = classifier.build_feature()
    for _ in tqdm(range(10000)):
        # The result is discarded on purpose; only the repeated call matters.
        classifier(feature, 100, 100, 0.5)
Thanks for the detailed report! If it's not too much effort and you still have this setup ready, could you check whether scikit-learn's RandomForestClassifier exhibits the same leak? I don't expect it to, but it's useful to rule that out early on.
I managed to reproduce the error and trace it back to a typo in the originally forked cython code. I released a new version 1.0.3 with the fix. Let me know if this resolved your issue :)
Description
When a PMML model file is loaded with sklearn_pmml_model.ensemble.PMMLForestClassifier and predict_proba is called in a loop, memory usage grows continuously.
Steps/Code to Reproduce
Actual Results
Here is the memray report for the code above.
Versions
macOS-10.16-x86_64-i386-64bit
Python 3.9.7 (default, Sep 16 2021, 08:50:36)
[Clang 10.0.0 ]
NumPy 1.23.0
SciPy 1.8.1
Scikit-Learn 1.3.0
sklearn-pmml-model 1.0.2
The text was updated successfully, but these errors were encountered: