From a19f0df0c4b570a7558d389ba953387cd76d723e Mon Sep 17 00:00:00 2001 From: "(Venkat) Raghav, Rajagopalan" Date: Sun, 16 Jul 2017 17:58:59 -0500 Subject: [PATCH] [MRG] Add few more tests + Documentation for re-entrant cross-validation estimators (#7823) * DOC Add NOTE that unless random_state is set, split will not be identical * TST use np.testing.assert_equal for nested lists/arrays * TST Make sure cv param can be a generator * DOC rank_ becomes a link when rendered * Use test_... * Remove blank line; Add if shuffle is True * Fix tests * Explicitly test for GeneratorType * TST Add the else clause * TST Add comment on usage of np.testing.assert_array_equal * TYPO * MNT Remove if ; * Address Joel's comments * merge the identical points in doc * DOC address Andy's comments * Move comment to before the check for generator type --- doc/modules/cross_validation.rst | 3 +- sklearn/model_selection/_search.py | 2 +- sklearn/model_selection/_split.py | 41 +++++++++++++++++-- sklearn/model_selection/tests/test_search.py | 42 +++++++++++++++----- sklearn/model_selection/tests/test_split.py | 7 +++- 5 files changed, 76 insertions(+), 19 deletions(-) diff --git a/doc/modules/cross_validation.rst b/doc/modules/cross_validation.rst index a242cf9ab62bb..a43c5cf675cb8 100644 --- a/doc/modules/cross_validation.rst +++ b/doc/modules/cross_validation.rst @@ -723,8 +723,7 @@ to shuffle the data indices before splitting them. Note that: shuffling will be different every time ``KFold(..., shuffle=True)`` is iterated. However, ``GridSearchCV`` will use the same shuffling for each set of parameters validated by a single call to its ``fit`` method. -* To ensure results are repeatable (*on the same platform*), use a fixed value - for ``random_state``. +* To get identical results for each split, set ``random_state`` to an integer. Cross validation and model selection ==================================== diff --git a/sklearn/model_selection/_search.py b/sklearn/model_selection/_search.py index 17c588c293eda..db41c19218fa7 100644 --- a/sklearn/model_selection/_search.py +++ b/sklearn/model_selection/_search.py @@ -924,7 +924,7 @@ class GridSearchCV(BaseSearchCV): For instance the below given table +------------+-----------+------------+-----------------+---+---------+ - |param_kernel|param_gamma|param_degree|split0_test_score|...|..rank...| + |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...| +============+===========+============+=================+===+=========+ | 'poly' | -- | 2 | 0.8 |...| 2 | +------------+-----------+------------+-----------------+---+---------+ diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py index 4bcc0ae1c5349..386d439184117 100644 --- a/sklearn/model_selection/_split.py +++ b/sklearn/model_selection/_split.py @@ -83,6 +83,12 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) indices = np.arange(_num_samples(X)) @@ -308,6 +314,12 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -567,10 +579,7 @@ def __init__(self, n_splits=3, shuffle=False, random_state=None): super(StratifiedKFold, self).__init__(n_splits, shuffle, random_state) def _make_test_folds(self, X, y=None): - if self.shuffle: - rng = check_random_state(self.random_state) - else: - rng = self.random_state + rng = self.random_state y = np.asarray(y) n_samples = y.shape[0] unique_y, y_inversed = np.unique(y, return_inverse=True) @@ -645,6 +654,12 @@ def split(self, X, y, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. """ y = check_array(y, ensure_2d=False, dtype=None) return super(StratifiedKFold, self).split(X, y, groups) @@ -726,6 +741,12 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) n_samples = _num_samples(X) @@ -1164,6 +1185,12 @@ def split(self, X, y=None, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. """ X, y, groups = indexable(X, y, groups) for train, test in self._iter_indices(X, y, groups): @@ -1578,6 +1605,12 @@ def split(self, X, y, groups=None): test : ndarray The testing set indices for that split. + + Note + ---- + Randomized CV splitters may return different results for each call of + split. You can make the results identical by setting ``random_state`` + to an integer. """ y = check_array(y, ensure_2d=False, dtype=None) return super(StratifiedShuffleSplit, self).split(X, y, groups) diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py index 9dfd49714ee08..5e667727d9dda 100644 --- a/sklearn/model_selection/tests/test_search.py +++ b/sklearn/model_selection/tests/test_search.py @@ -7,6 +7,7 @@ from itertools import chain, product import pickle import sys +from types import GeneratorType import re import numpy as np @@ -1070,16 +1071,10 @@ def test_search_cv_results_rank_tie_breaking(): cv_results['mean_test_score'][1]) assert_almost_equal(cv_results['mean_train_score'][0], cv_results['mean_train_score'][1]) - try: - assert_almost_equal(cv_results['mean_test_score'][1], - cv_results['mean_test_score'][2]) - except AssertionError: - pass - try: - assert_almost_equal(cv_results['mean_train_score'][1], - cv_results['mean_train_score'][2]) - except AssertionError: - pass + assert_false(np.allclose(cv_results['mean_test_score'][1], + cv_results['mean_test_score'][2])) + assert_false(np.allclose(cv_results['mean_train_score'][1], + cv_results['mean_train_score'][2])) # 'min' rank should be assigned to the tied candidates assert_almost_equal(search.cv_results_['rank_test_score'], [1, 1, 3]) @@ -1421,6 +1416,33 @@ def test_grid_search_cv_splits_consistency(): cv=KFold(n_splits=n_splits)) gs2.fit(X, y) + # Give generator as a cv parameter + assert_true(isinstance(KFold(n_splits=n_splits, + shuffle=True, random_state=0).split(X, y), + GeneratorType)) + gs3 = GridSearchCV(LinearSVC(random_state=0), + param_grid={'C': [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, + random_state=0).split(X, y)) + gs3.fit(X, y) + + gs4 = GridSearchCV(LinearSVC(random_state=0), + param_grid={'C': [0.1, 0.2, 0.3]}, + cv=KFold(n_splits=n_splits, shuffle=True, + random_state=0)) + gs4.fit(X, y) + + def _pop_time_keys(cv_results): + for key in ('mean_fit_time', 'std_fit_time', + 'mean_score_time', 'std_score_time'): + cv_results.pop(key) + return cv_results + + # Check if generators are supported as cv and + # that the splits are consistent + np.testing.assert_equal(_pop_time_keys(gs3.cv_results_), + _pop_time_keys(gs4.cv_results_)) + # OneTimeSplitter is a non-re-entrant cv where split can be called only # once if ``cv.split`` is called once per param setting in GridSearchCV.fit # the 2nd and 3rd parameter will not be evaluated as no train/test indices diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py index b1bb44efe59c2..300bb8953efae 100644 --- a/sklearn/model_selection/tests/test_split.py +++ b/sklearn/model_selection/tests/test_split.py @@ -446,9 +446,11 @@ def test_shuffle_kfold_stratifiedkfold_reproducibility(): for cv in (kf, skf): for data in zip((X, X2), (y, y2)): + # Test if the two splits are different + # numpy's assert_equal properly compares nested lists try: - np.testing.assert_equal(list(cv.split(*data)), - list(cv.split(*data))) + np.testing.assert_array_equal(list(cv.split(*data)), + list(cv.split(*data))) except AssertionError: pass else: @@ -1188,6 +1190,7 @@ def test_cv_iterable_wrapper(): # results kf_randomized_iter = KFold(n_splits=5, shuffle=True).split(X, y) kf_randomized_iter_wrapped = check_cv(kf_randomized_iter) + # numpy's assert_array_equal properly compares nested lists np.testing.assert_equal(list(kf_randomized_iter_wrapped.split(X, y)), list(kf_randomized_iter_wrapped.split(X, y)))