-
Notifications
You must be signed in to change notification settings - Fork 17
/
api.py
403 lines (349 loc) · 14.2 KB
/
api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
"""API for julearn."""
# Authors: Federico Raimondo <f.raimondo@fz-juelich.de>
# Sami Hamdan <s.hamdan@fz-juelich.de>
# License: AGPL
from typing import List, Optional, Union, Dict
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.model_selection import check_cv, cross_validate
from sklearn.model_selection._search import BaseSearchCV
from sklearn.pipeline import Pipeline
from .pipeline import PipelineCreator
from .pipeline.merger import merge_pipelines
from .prepare import check_consistency, prepare_input_data
from .scoring import check_scoring
from .utils import logger, raise_error, _compute_cvmdsum
from .inspect import Inspector
def run_cross_validation(
    X: List[str],
    y: str,
    model: Union[str, PipelineCreator, BaseEstimator, List[PipelineCreator]],
    X_types: Optional[Dict] = None,
    data: Optional[pd.DataFrame] = None,
    problem_type: Optional[str] = None,
    preprocess: Union[None, str, List[str]] = None,
    return_estimator: Optional[str] = None,
    return_inspector: bool = False,
    return_train_score: bool = False,
    cv: Union[None, int] = None,
    groups: Optional[str] = None,
    scoring: Union[str, List[str], None] = None,
    pos_labels: Union[str, List[str], None] = None,
    model_params: Optional[Dict] = None,
    search_params: Optional[Dict] = None,
    seed: Optional[int] = None,
    n_jobs: Optional[int] = None,
    verbose: Optional[int] = 0,
):
    """Run cross validation and score.

    Parameters
    ----------
    X : str, list(str) or numpy.array
        The features to use.
        See :ref:`data_usage` for details.
    y : str or numpy.array
        The targets to predict.
        See :ref:`data_usage` for details.
    model : str or scikit-learn compatible model.
        If string, it will use one of the available models.
    X_types : dict[str, list of str]
        A dictionary containing keys with column type as a str and the
        columns of this column type as a list of str.
    data : pandas.DataFrame | None
        DataFrame with the data (optional).
        See :ref:`data_usage` for details.
    problem_type : str
        The kind of problem to model.

        Options are:

        * "classification": Perform a classification
          in which the target (y) has categorical classes (default).
          The parameter pos_labels can be used to convert a target with
          multiple_classes into binary.
        * "regression". Perform a regression. The target (y) has to be
          ordinal at least.

    preprocess : str, TransformerLike or list or PipelineCreator | None
        Transformer to apply to the features. If string, use one of the
        available transformers. If list, each element can be a string or
        scikit-learn compatible transformer. If None (default), no
        transformation is applied.

        See documentation for details.

    return_estimator : str | None
        Return the fitted estimator(s).
        Options are:

        * 'final': Return the estimator fitted on all the data.
        * 'cv': Return the all the estimator from each CV split, fitted on the
          training data.
        * 'all': Return all the estimators (final and cv).

    return_inspector : bool
        Whether to return the inspector object (default is False)
    return_train_score : bool
        Whether to return the training score with the test scores
        (default is False).
    cv : int, str or cross-validation generator | None
        Cross-validation splitting strategy to use for model evaluation.

        Options are:

        * None: defaults to 5-fold
        * int: the number of folds in a `(Stratified)KFold`
        * CV Splitter (see scikit-learn documentation on CV)
        * An iterable yielding (train, test) splits as arrays of indices.

    groups : str or numpy.array | None
        The grouping labels in case a Group CV is used.
        See :ref:`data_usage` for details.
    scoring : ScorerLike, optional
        The scoring metric to use.
        See https://scikit-learn.org/stable/modules/model_evaluation.html for
        a comprehensive list of options. If None, use the model's default
        scorer.
    pos_labels : str, int, float or list | None
        The labels to interpret as positive. If not None, every element from y
        will be converted to 1 if is equal or in pos_labels and to 0 if not.
    model_params : dict | None
        If not None, this dictionary specifies the model parameters to use

        The dictionary can define the following keys:

        * 'STEP__PARAMETER': A value (or several) to be used as PARAMETER for
          STEP in the pipeline. Example: 'svm__probability': True will set
          the parameter 'probability' of the 'svm' model. If more than option
          is provided for at least one hyperparameter, a search will be
          performed.

    search_params : dict | None
        Additional parameters in case Hyperparameter Tuning is performed, with
        the following keys:

        * 'kind': The kind of search algorithm to use, e.g.:
          'grid' or 'random'. Can be any valid julearn searcher name or
          scikit-learn compatible searcher.
        * 'cv': If a searcher is going to be used, the cross-validation
          splitting strategy to use. Defaults to same CV as for the model
          evaluation.
        * 'scoring': If a searcher is going to be used, the scoring metric to
          evaluate the performance.

        See :ref:`hp_tuning` for details.
    seed : int | None
        If not None, set the random seed before any operation. Useful for
        reproducibility.
    verbose: int
        Verbosity level of outer cross-validation.
        Follows scikit-learn/joblib converntions.
        0 means no additional information is printed.
        Larger number genereally mean more information is printed.
        Note: verbosity up to 50 will print into standard error,
        while larger than 50 will print in standrad output.

    Returns
    -------
    scores : pd.DataFrame
        The resulting scores (one column for each score specified).
        Additionally, a 'fit_time' column will be added.
        And, if ``return_estimator='all'`` or
        ``return_estimator='cv'``, an 'estimator' columns with the
        corresponding estimators fitted for each CV split.
    final_estimator : object
        The final estimator, fitted on all the data (only if
        ``return_estimator='all'`` or ``return_estimator='final'``)
    inspector : Inspector | None
        The inspector object (only if ``return_inspector=True``)
    """
    # Validate parameters
    if return_estimator not in [None, "final", "cv", "all"]:
        raise_error(
            f"return_estimator must be one of None, 'final', 'cv', 'all'. "
            f"Got {return_estimator}."
        )

    # The inspector needs both the CV estimators and the final estimator,
    # so return_estimator must end up as "all".
    if return_inspector:
        if return_estimator is None:
            logger.info("Inspector requested: setting return_estimator='all'")
            return_estimator = "all"
        if return_estimator != "all":
            raise_error(
                "return_inspector=True requires return_estimator to be `all`."
            )

    X_types = {} if X_types is None else X_types

    if seed is not None:
        # If a seed is passed, use it, otherwise do not do anything. User
        # might have set the seed outside of the library
        logger.info(f"Setting random seed to {seed}")
        np.random.seed(seed)

    # Interpret the input data and prepare it to be used with the library.
    # After this call, df_X is a DataFrame, y a Series and df_groups a
    # Series (or None).
    df_X, y, df_groups, X_types = prepare_input_data(
        X=X,
        y=y,
        df=data,
        pos_labels=pos_labels,
        groups=groups,
        X_types=X_types,
    )

    if model_params is None:
        model_params = {}

    if search_params is None:
        search_params = {}

    # Deal with model and preprocess
    if isinstance(model, Pipeline):
        raise_error(
            "Currently we do not allow to pass a scikit-learn pipeline as "
            "model, use PipelineCreator instead"
        )

    # wrap_score tells the scorer that the pipeline wraps a target
    # transformer (set below once the creator/pipeline is known).
    wrap_score = False
    if isinstance(model, (PipelineCreator, list)):
        # Case 1: one or several PipelineCreator objects. The creator(s)
        # fully define the pipeline, so the legacy-style parameters
        # (preprocess, problem_type, model_params) must not be used.
        if preprocess is not None:
            raise_error(
                "If model is a PipelineCreator (or list of), "
                "preprocess should be None"
            )
        if problem_type is not None:
            raise_error("Problem type should be set in the PipelineCreator")
        if len(model_params) > 0:
            raise_error(
                "If model is a PipelineCreator (or list of), model_params must"
                f" be None. Currently, it contains {model_params.keys()}"
            )
        if isinstance(model, list):
            if any(not isinstance(m, PipelineCreator) for m in model):
                raise_error(
                    "If model is a list, all elements must be PipelineCreator"
                )
        else:
            # Normalize the single creator to a one-element list.
            model = [model]

        # All creators must agree on the problem type.
        problem_types = set([m.problem_type for m in model])
        if len(problem_types) > 1:
            raise_error(
                "If model is a list of PipelineCreator, all elements must have"
                " the same problem_type"
            )

        # Each creator may expand into several candidate pipelines
        # (via split()).
        expanded_models = []
        for m in model:
            expanded_models.extend(m.split())

        wrap_score = expanded_models[-1]._added_target_transformer

        all_pipelines = [
            model.to_pipeline(X_types=X_types, search_params=search_params)
            for model in expanded_models
        ]

        if len(all_pipelines) > 1:
            # Several candidates: merge them into one searcher pipeline.
            pipeline = merge_pipelines(
                *all_pipelines, search_params=search_params
            )
        else:
            pipeline = all_pipelines[0]

        problem_type = model[0].problem_type

    elif not isinstance(model, (str, BaseEstimator)):
        raise_error(
            "Model has to be a PipelineCreator, a string or a "
            "scikit-learn compatible model."
        )
    else:
        # Case 2: model given by name or as an estimator object. Build a
        # PipelineCreator from the legacy-style parameters.
        if problem_type is None:
            raise_error(
                "If model is not a PipelineCreator, then `problem_type` "
                "must be specified in run_cross_validation."
            )
        if isinstance(preprocess, str):
            preprocess = [preprocess]
        if isinstance(preprocess, list):
            pipeline_creator = PipelineCreator.from_list(
                preprocess, model_params, problem_type=problem_type
            )
        elif preprocess is None:
            pipeline_creator = PipelineCreator(problem_type=problem_type)
        else:
            raise_error(
                "preprocess has to be a string or a " "list of strings."
            )

        # Add the model to the pipeline creator
        t_params = {}
        if isinstance(model, str):
            # Extract the "<model>__<param>" entries and strip the prefix.
            t_params = {
                x.replace(f"{model}__", ""): y
                for x, y in model_params.items()
                if x.startswith(f"{model}__")
            }
        else:
            # Parameters for an estimator object must be set on the object
            # itself, not through model_params.
            model_name = model.__class__.__name__.lower()
            if any(x.startswith(f"{model_name}__") for x in model_params):
                raise_error(
                    "Cannot use model_params with a model object. Use either "
                    "a string or a PipelineCreator"
                )
        pipeline_creator.add(step=model, **t_params)

        # Check for extra model_params that are not used
        unused_params = []
        for t_param in model_params:
            used = False
            for step in pipeline_creator.steps:
                if t_param.startswith(f"{step.name}__"):
                    used = True
                    break
            if not used:
                unused_params.append(t_param)
        if len(unused_params) > 0:
            raise_error(
                "The following model_params are incorrect: "
                f"{unused_params}"
            )
        wrap_score = pipeline_creator._added_target_transformer
        pipeline = pipeline_creator.to_pipeline(
            X_types=X_types, search_params=search_params
        )

    # Log some information
    logger.info("= Data Information =")
    logger.info(f"\tProblem type: {problem_type}")
    logger.info(f"\tNumber of samples: {len(df_X)}")
    logger.info(f"\tNumber of features: {len(df_X.columns)}")
    logger.info("====================")
    logger.info("")

    if problem_type == "classification":
        logger.info(f"\tNumber of classes: {len(np.unique(y))}")
        logger.info(f"\tTarget type: {y.dtype}")
        logger.info(f"\tClass distributions: {y.value_counts()}")
    elif problem_type == "regression":
        logger.info(f"\tTarget type: {y.dtype}")

    # Prepare cross validation: normalize cv (None/int/splitter) into a
    # concrete splitter object.
    cv_outer = check_cv(cv, classifier=problem_type == "classification")
    logger.info(f"Using outer CV scheme {cv_outer}")

    check_consistency(y, cv, groups, problem_type)

    cv_return_estimator = return_estimator in ["cv", "all"]
    scoring = check_scoring(pipeline, scoring,
                            wrap_score=wrap_score
                            )

    # Checksum of the CV scheme, stored with the scores so they can later
    # be matched to the exact CV configuration (e.g. by the inspector).
    cv_mdsum = _compute_cvmdsum(cv_outer)
    fit_params = {}
    if df_groups is not None:
        if isinstance(pipeline, BaseSearchCV):
            # Forward the groups to the inner (search) CV as a fit param.
            fit_params["groups"] = df_groups.values

    scores = cross_validate(
        pipeline,
        df_X,
        y,
        cv=cv_outer,
        scoring=scoring,
        groups=df_groups,
        return_estimator=cv_return_estimator,
        n_jobs=n_jobs,
        return_train_score=return_train_score,
        verbose=verbose,
        fit_params=fit_params,
    )

    # Annotate each row of the scores with repeat/fold indices and split
    # sizes. n_repeats defaults to 1 for non-repeated CV schemes.
    n_repeats = getattr(cv_outer, "n_repeats", 1)
    n_folds = len(scores["fit_time"]) // n_repeats

    repeats = np.repeat(np.arange(n_repeats), n_folds)
    folds = np.tile(np.arange(n_folds), n_repeats)

    # Re-run the split to obtain (n_train, n_test) per fold. Assumes the
    # splitter yields the same splits as during cross_validate (true for
    # seeded/deterministic splitters).
    fold_sizes = np.array(
        [list(map(len, x)) for x in cv_outer.split(df_X, y, groups=df_groups)]
    )

    scores["n_train"] = fold_sizes[:, 0]
    scores["n_test"] = fold_sizes[:, 1]
    scores["repeat"] = repeats
    scores["fold"] = folds
    scores["cv_mdsum"] = cv_mdsum

    scores_df = pd.DataFrame(scores)

    out = scores_df
    if return_estimator in ["final", "all"]:
        # Fit the final estimator on all the data.
        pipeline.fit(df_X, y, **fit_params)
        out = scores_df, pipeline

    if return_inspector:
        inspector = Inspector(
            scores=scores_df,
            model=pipeline,
            X=df_X,
            y=y,
            groups=df_groups,
            cv=cv_outer,
        )
        out = scores_df, pipeline, inspector

    return out