-
Notifications
You must be signed in to change notification settings - Fork 7
/
_table_row_classifier.py
274 lines (222 loc) · 10.9 KB
/
_table_row_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
from typing import Dict, List, Optional, Tuple, Union

import numpy as np
import pandas as pd

from ._pixel_classifier import PixelClassifier
class TableRowClassifier:
    """Classify the rows of a feature table with a wrapped ``PixelClassifier``.

    Every table column is handed to the wrapped classifier as one feature
    array. The order of the columns is remembered in
    ``ordered_feature_names`` so that prediction uses the same order as
    training.
    """

    def __init__(
        self,
        opencl_filename="temp_table_row_classifier.cl",
        max_depth: int = 2,
        num_ensembles: int = 100,
        overwrite_classname: Optional[str] = None,
    ):
        """A RandomForestClassifier for classifying rows of a table that converts itself to OpenCL after training.

        Parameters
        ----------
        opencl_filename : str (optional)
            The path to which the openCL classifier will be saved.
        max_depth : int (optional)
            The maximum depth of the tree.
        num_ensembles : int (optional)
            The number of trees in the random forest.
        overwrite_classname : str (optional)
            The class name forwarded to the wrapped classifier. If None, the
            name of this class is used.

        See Also
        --------
        https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        """
        self._ordered_feature_names = []

        if overwrite_classname is None:
            overwrite_classname = self.__class__.__name__
        self._classifier_classname = overwrite_classname

        self.classifier = PixelClassifier(
            opencl_filename=opencl_filename,
            max_depth=max_depth,
            num_ensembles=num_ensembles,
            overwrite_classname=self.classifier_classname,
        )

        # in case file existed, read feature specification from classifier
        from os.path import exists
        if exists(opencl_filename):
            self.feature_specification = self.classifier.feature_specification
            self._ordered_feature_names = self._split_feature_specification(
                self.feature_specification
            )

    @staticmethod
    def _split_feature_specification(feature_specification: Optional[str]) -> List[str]:
        """Split a feature specification string into individual feature names.

        Commas and any run of whitespace act as separators, so ``"a b"``,
        ``"a,b"`` and ``"a, b"`` all give ``["a", "b"]``. An empty or missing
        specification gives an empty list instead of a list holding an empty
        name.
        """
        if not feature_specification:
            return []
        # split() without an argument collapses consecutive separators,
        # which split(" ") would turn into empty feature names
        return feature_specification.replace(",", " ").split()

    def __str__(self) -> str:
        """Return classifier information as string."""
        return str(self.classifier)

    def __repr__(self) -> str:
        """Return classifier information as string."""
        return str(self.classifier)

    @property
    def ordered_feature_names(self) -> List[str]:
        """The feature names used in the order they are used by the classifier.

        This is set by self._prepare_feature_table() during training, or read
        from the feature specification of the wrapped classifier in the
        __init__ if the OpenCL file already existed.

        Returns
        -------
        ordered_feature_names : List[str]
            A list of the feature names in the order they are expected by the classifier.
        """
        return self._ordered_feature_names

    @property
    def classifier_classname(self) -> str:
        """The name used for the classifier class.

        This is set in the __init__ by the overwrite_classname kwarg.

        Returns
        -------
        classifier_classname : str
            The name used for the classifier class.
        """
        return self._classifier_classname

    def train(
        self,
        feature_table: Dict[str, Union[List[float], np.ndarray]],
        ground_truth: np.ndarray,
        continue_training: bool = False
    ):
        """Train a classifier that can differentiate classes from rows of pre-calculated features.

        After training, the wrapped classifier is written to its OpenCL file
        and its feature specification is copied to this object.

        Parameters
        ----------
        feature_table : Dict[str, Union[List[float], np.ndarray]]
            The table used for training. Each row of the table is one
            training sample. The table can either be a pandas DataFrame or a
            Dict with string keys (column names) and numpy array columns.
        ground_truth : np.array
            The array containing the ground truth class for each row in feature_table
        continue_training : bool
            Flag set to true if training is to be continued from an existing classifier.
            The default value is False.
        """
        ordered_features, ground_truth = self._prepare_feature_table(feature_table, ground_truth)

        self.classifier.train(ordered_features, ground_truth, continue_training=continue_training)
        self.classifier.to_opencl_file(
            self.classifier.opencl_file,
            overwrite_classname=self.classifier_classname,
        )
        self.feature_specification = self.classifier.feature_specification

    def predict(
        self,
        feature_table: Dict[str, Union[List[float], np.ndarray]]
    ) -> np.ndarray:
        """Predict row class from a table.

        Rows in which any feature is NaN are reported as class 0.

        Parameters
        ----------
        feature_table : Dict[str, Union[List[float], np.ndarray]]
            The table from which to make the prediction. Each row of the table
            will be classified. The table can either be a pandas DataFrame or a
            Dict with string keys (column names) and numpy array columns.

        Returns
        -------
        output : np.ndarray
            An array containing the predicted class for each row.
        """
        # imported here so that the module can be imported without it
        import pyclesperanto_prototype as cle

        ordered_features = self.order_feature_table(feature_table)

        # allocate the result
        if len(ordered_features[0].shape) > 1:
            output = cle.create_like(ordered_features[0].shape)
        else:
            # make sure it's at least 2D
            output = cle.create((1, len(ordered_features[0])))

        # push the features; the kernel parameters are named in0, in1, ... and out
        parameters = {}
        for i, f in enumerate(ordered_features):
            parameters['in' + str(i)] = cle.push(f)
        parameters['out'] = output

        # run the "predict" kernel stored in the classifier's OpenCL file
        cle.execute(None, self.classifier.opencl_file, "predict", ordered_features[0].shape, parameters)

        # determine rows which contained NaN values
        # (a DataFrame built from a list of arrays holds one feature per row,
        # so reducing over axis 0 gives one flag per row of the input table)
        table = pd.DataFrame(ordered_features)
        was_not_nan = cle.asarray([1 - np.max(table.isnull().values, axis=0) * 1])

        # mask output: if there was NaN before, classification becomes 0
        output = was_not_nan * output

        return np.asarray(output[0]).astype(np.uint32)

    def to_opencl_file(
        self,
        filename,
        extra_information: Optional[str] = None,
        overwrite_classname: Optional[str] = None,
    ):
        """Save the classifier to an OpenCL-file.

        Parameters
        ----------
        filename : str
            The path the wrapped classifier is asked to write to.
        extra_information : str (optional)
            Forwarded unchanged to the wrapped classifier.
        overwrite_classname : str (optional)
            The class name forwarded to the wrapped classifier. If None,
            self.classifier_classname is used, which is the same name train()
            uses when it writes the file.

        See Also
        --------
        .. PixelClassifier.to_opencl_file()
        """
        if overwrite_classname is None:
            overwrite_classname = self.classifier_classname
        return self.classifier.to_opencl_file(
            filename=filename,
            extra_information=extra_information,
            overwrite_classname=overwrite_classname,
        )

    def _prepare_feature_table(
        self,
        feature_table: Dict[str, Union[List[float], np.ndarray]],
        ground_truth: np.ndarray
    ) -> Tuple[List[np.ndarray], np.ndarray]:
        """Prepare a feature table for training.

        This coerces the feature table into the form expected by the classifier
        (list of numpy array) and stores the order of the features, both in
        self.ordered_feature_names and, joined by spaces, in the
        feature_specification of the wrapped classifier.
        Table entries where any column == NaN or ground_truth <= 0 are dropped.

        Parameters
        ----------
        feature_table : Dict[str, Union[List[float], np.ndarray]]
            The table used for training. The table can either be a pandas
            DataFrame or a Dict with string keys (column names) and numpy
            array columns.
        ground_truth : np.array
            The array containing the ground truth class for each row in feature_table

        Returns
        -------
        ordered_features : List[np.ndarray]
            The features stored in a list. The order of the features is
            specified by self.ordered_feature_names
        updated_ground_truth : np.ndarray
            The ground truth without the entries where ground_truth <= 0 or any
            column was NaN

        Raises
        ------
        ValueError
            If feature_table already contains a column named 'ground_truth'.
        """
        # make sure it's a DataFrame and we can modify it
        feature_table = pd.DataFrame(feature_table).copy()

        # store original keys
        original_keys = feature_table.keys().copy()

        # add ground_truth to table so that we can filter it with the other columns
        if "ground_truth" in feature_table.keys():
            raise ValueError("feature_table must not contain column named 'ground_truth'")
        feature_table['ground_truth'] = ground_truth

        # drop rows with NaN values
        feature_table = feature_table.dropna(how="any")

        # drop rows with ground_truth == 0
        feature_table = feature_table.loc[feature_table['ground_truth'] > 0]

        # get ground_truth back and a table with the original columns
        updated_ground_truth = np.asarray(feature_table['ground_truth'])
        feature_table = feature_table[original_keys]

        self._ordered_feature_names = list(feature_table.keys())
        self.classifier.feature_specification = " ".join(self.ordered_feature_names)

        return self.order_feature_table(feature_table), updated_ground_truth

    def order_feature_table(self, feature_table: Dict[str, np.ndarray]) -> List[np.ndarray]:
        """Coerce a feature table into the format required by the classifier.

        Parameters
        ----------
        feature_table : Dict[str, Union[List[float], np.ndarray]]
            The table to convert. The table can either be a pandas DataFrame
            or a Dict with string keys (column names) and numpy array columns.
            It must contain every name in self.ordered_feature_names.

        Returns
        -------
        ordered_features : List[np.ndarray]
            The features stored in a list. The order of the features is
            specified by self.ordered_feature_names
        """
        return [np.asarray(feature_table[feature]) for feature in self.ordered_feature_names]

    def feature_importances(self):
        """Provide feature importances about the trained Random Forest Classifier

        The call is forwarded to the wrapped classifier.
        The values are provided as dictionary {feature_name:portion_importance}.

        See also
        --------
        ..[0] https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
        """
        return self.classifier.feature_importances()

    def statistics(self):
        """Provide statistics about the trained Random Forest Classifier

        The call is forwarded to the wrapped classifier.

        After training or loading a model, this function reads out the decision trees and generates
        statistics from it. It counts for each decision depth how often given features are taken into
        account. It returns two dictionaries. Both dictionaries contain the feature names used during
        training as keys. The values are lists with numbers: The first 'ratios' dictionary contains
        numbers between 0 and 1. The higher that number, the more often is the given feature taken
        into account on the given decision level. The second 'count' dictionary contains the count of
        decisions taking a given feature into account. For example in case the result looks like this:

        A: [0.3, 0.1], [30, 20]
        B: [0.7, 0.9], [70, 180]

        Feature A was taken into account 30% of the decision trees on the first level and in 10% on
        the second level. In case of 100 trees, these are 30 trees on the first level and can be up
        to 20 trees on the second level. Each level doubles the number of available decisions in these
        binary decision trees.

        Returns
        -------
        ratios: dict
        counts: dict
        """
        return self.classifier.statistics()