In [1]:
import numpy as np
from collections import Counter

In [2]:
def kfold(x, y, n_splits=3, shuffle=True, random_state=42):
    """Lazily yield ``(train_idx, test_idx)`` index arrays for k-fold cross-validation.

    The sample indices ``0 .. len(y) - 1`` are (optionally) shuffled, cut into
    ``n_splits`` contiguous folds, and each fold is used once as the test set
    while the remaining folds form the training set. The split is NOT
    stratified: only ``len(y)`` is read, never the label values.

    Parameters
    ----------
    x : any
        Accepted so the call signature mirrors ``(features, labels)``;
        it is not read by this function.
    y : sized
        Labels (or any object supporting ``len``); determines the number of samples.
    n_splits : int, default 3
        Number of folds. Must satisfy ``2 <= n_splits <= len(y)``.
    shuffle : bool, default True
        Whether to shuffle the indices before cutting them into folds.
    random_state : int, array-like or None, default 42
        Seed for the shuffle. Ignored when ``shuffle`` is false.

    Yields
    ------
    train_idx : numpy.ndarray
        Indices of the training samples for the current fold.
    test_idx : numpy.ndarray
        Indices of the held-out samples for the current fold.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 2 or larger than the number of samples.
        Because this is a generator, the error surfaces on the first iteration,
        not at call time.
    """
    n_samples = len(y)
    if n_splits < 2:
        raise ValueError(f"n_splits must be at least 2, got {n_splits}")
    if n_splits > n_samples:
        raise ValueError(
            f"n_splits={n_splits} cannot be greater than the number of samples ({n_samples})"
        )

    indices = np.arange(n_samples)
    if shuffle:
        # A private RandomState produces the same permutation that
        # np.random.seed(random_state) + np.random.choice(...) would, but does
        # not reseed the process-wide NumPy generator as a side effect.
        rng = np.random.RandomState(random_state)
        indices = rng.choice(indices, n_samples, replace=False)

    # n_splits + 1 evenly spaced boundaries, truncated to integer positions.
    bounds = np.linspace(0, n_samples, n_splits + 1).astype("int")

    # Folds are built one at a time so only a single train/test pair is
    # materialised at any moment.
    for start, stop in zip(bounds[:-1], bounds[1:]):
        test_idx = indices[start:stop]
        train_idx = np.concatenate((indices[:start], indices[stop:]))
        yield train_idx, test_idx

### TESTING ON BREAST CANCER DATA

In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
# Load the dataset once and unpack features / labels together
# (the bunch was previously loaded twice, once per key).
x, y = load_breast_cancer(return_X_y=True)

In [4]:
# TESTING Y DISTRIBUTION IN TRAIN TEST SPLITS
# Prints the normalized class frequencies of the full label vector first, then
# of the train and test labels of every fold, so they can be compared by eye.
# kfold only reads len(y), so any agreement between the distributions comes
# from the shuffle, not from stratification.
print(f'Y DISTRIBUTION\n{pd.Series(y).value_counts(normalize=True)}\n')
cv_samples = kfold(x, y)  # default arguments: 3 folds, shuffled, random_state=42
for n, (train_idx, test_idx) in enumerate(cv_samples):
    print(f'TRAIN SPLIT {n}\n{pd.Series(y[train_idx]).value_counts(normalize=True)}\n')
    print(f'TEST SPLIT {n}\n{pd.Series(y[test_idx]).value_counts(normalize=True)}\n')

Y DISTRIBUTION
1    0.627417
0    0.372583
dtype: float64

TRAIN SPLIT 0
1    0.621053
0    0.378947
dtype: float64

TEST SPLIT 0
1    0.640212
0    0.359788
dtype: float64

TRAIN SPLIT 1
1    0.620053
0    0.379947
dtype: float64

TEST SPLIT 1
1    0.642105
0    0.357895
dtype: float64

TRAIN SPLIT 2
1    0.641161
0    0.358839
dtype: float64

TEST SPLIT 2
1    0.6
0    0.4
dtype: float64



In [5]:
# CHECK THAT NO INDEX APPEARS ON BOTH THE TRAIN AND THE TEST SIDE OF A SPLIT
cv_samples = kfold(x, y)
for train_idx, test_idx in cv_samples:
    # disjoint index sets mean no sample leaks from train into test
    assert set(test_idx).isdisjoint(train_idx)
print("No issues found")

No issues found


In [6]:
# CHECK THAT EVERY SPLIT ACCOUNTS FOR ALL SAMPLES (TRAIN + TEST == len(y))
len_ = 0
cv_samples = kfold(x, y)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    if n != 0:
        # later splits must match the total recorded for the first split
        assert len_ == len(train_idx) + len(test_idx)
        continue
    # first split: record the total and compare it with the number of labels
    len_ = len(train_idx) + len(test_idx)
    assert len_ == len(y)
print("No issues found")

No issues found


In [7]:
# CHECK THAT NO INDEX IS REPEATED WITHIN THE TRAIN PART OR WITHIN THE TEST PART
cv_samples = kfold(x, y)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    # a part is duplicate-free when de-duplicating it does not shrink it
    assert all(len(set(part)) == len(part) for part in (train_idx, test_idx))
print("No issues found")

No issues found


In [8]:
%%timeit
# Times one complete pass over the generator; the loop body is empty, so only
# the work done inside kfold (shuffle + building the index arrays) is measured.
# NOTE(review): this uses n_splits=5 while the other cells in this section call
# kfold with its default n_splits -- confirm that is intended.
cv_samples = kfold(x, y, n_splits=5)
for train_idx, test_idx in cv_samples:
    pass

189 µs ± 48.4 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### TESTING ON IRIS DATA

In [9]:
import pandas as pd
from sklearn.datasets import load_iris
# Load the dataset once and unpack features / labels together
# (the bunch was previously loaded twice, once per key).
x, y = load_iris(return_X_y=True)

In [10]:
# TESTING Y DISTRIBUTION IN TRAIN TEST SPLITS
# Prints the normalized class frequencies of the full label vector first, then
# of the train and test labels of each of the 5 folds.
# kfold only reads len(y), so the per-fold class balance is whatever the
# shuffle happens to produce; it is not stratified.
print(f'Y DISTRIBUTION\n{pd.Series(y).value_counts(normalize=True)}\n')
cv_samples = kfold(x, y, n_splits=5)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    print(f'TRAIN SPLIT {n}\n{pd.Series(y[train_idx]).value_counts(normalize=True)}\n')
    print(f'TEST SPLIT {n}\n{pd.Series(y[test_idx]).value_counts(normalize=True)}\n')

Y DISTRIBUTION
0    0.333333
1    0.333333
2    0.333333
dtype: float64

TRAIN SPLIT 0
1    0.341667
0    0.333333
2    0.325000
dtype: float64

TEST SPLIT 0
2    0.366667
0    0.333333
1    0.300000
dtype: float64

TRAIN SPLIT 1
2    0.358333
1    0.333333
0    0.308333
dtype: float64

TEST SPLIT 1
0    0.433333
1    0.333333
2    0.233333
dtype: float64

TRAIN SPLIT 2
2    0.350000
1    0.333333
0    0.316667
dtype: float64

TEST SPLIT 2
0    0.400000
1    0.333333
2    0.266667
dtype: float64

TRAIN SPLIT 3
0    0.350000
1    0.333333
2    0.316667
dtype: float64

TEST SPLIT 3
2    0.400000
1    0.333333
0    0.266667
dtype: float64

TRAIN SPLIT 4
0    0.358333
1    0.325000
2    0.316667
dtype: float64

TEST SPLIT 4
2    0.400000
1    0.366667
0    0.233333
dtype: float64



In [11]:
# CHECK THAT NO INDEX APPEARS ON BOTH THE TRAIN AND THE TEST SIDE OF A SPLIT
cv_samples = kfold(x, y, n_splits=5)
for train_idx, test_idx in cv_samples:
    # disjoint index sets mean no sample leaks from train into test
    assert set(test_idx).isdisjoint(train_idx)
print("No issues found")

No issues found


In [12]:
# CHECK THAT EVERY SPLIT ACCOUNTS FOR ALL SAMPLES (TRAIN + TEST == len(y))
len_ = 0
cv_samples = kfold(x, y, n_splits=5)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    if n != 0:
        # later splits must match the total recorded for the first split
        assert len_ == len(train_idx) + len(test_idx)
        continue
    # first split: record the total and compare it with the number of labels
    len_ = len(train_idx) + len(test_idx)
    assert len_ == len(y)
print("No issues found")

No issues found


In [13]:
# CHECK THAT NO INDEX IS REPEATED WITHIN THE TRAIN PART OR WITHIN THE TEST PART
cv_samples = kfold(x, y, n_splits=5)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    # a part is duplicate-free when de-duplicating it does not shrink it
    assert all(len(set(part)) == len(part) for part in (train_idx, test_idx))
print("No issues found")

No issues found


In [14]:
%%timeit
# Times one complete pass over a 5-fold split; the loop body is empty, so only
# the work done inside kfold (shuffle + building the index arrays) is measured.
cv_samples = kfold(x, y, n_splits=5)
for train_idx, test_idx in cv_samples:
    pass

161 µs ± 29.5 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### TESTING ON SYNTHETIC DATASET WITH 10M SAMPLES & 10 FEATURES

In [15]:
from sklearn.datasets import make_classification

In [16]:
# Synthetic imbalanced 10-class problem: 10M samples, 10 features (8 informative).
# random_state is pinned so the generated data, and every result derived from
# it, is reproducible after a kernel restart.
x, y = make_classification(
    n_samples=10_000_000,
    n_features=10,
    n_classes=10,
    n_informative=8,
    weights=[0.1, 0.3, 0.2, 0.05, 0.02, 0.03, 0.1, 0.05, 0.02, 0.13],
    random_state=42,
)

In [17]:
# TESTING Y DISTRIBUTION IN TRAIN TEST SPLITS
# Prints the normalized class frequencies of the full label vector first, then
# of the train and test labels of each of the 5 folds.
# kfold only reads len(y), so the per-fold class balance is whatever the
# shuffle happens to produce; it is not stratified.
print(f'Y DISTRIBUTION\n{pd.Series(y).value_counts(normalize=True)}\n')
cv_samples = kfold(x, y, n_splits=5)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    print(f'TRAIN SPLIT {n}\n{pd.Series(y[train_idx]).value_counts(normalize=True)}\n')
    print(f'TEST SPLIT {n}\n{pd.Series(y[test_idx]).value_counts(normalize=True)}\n')

Y DISTRIBUTION
1    0.297990
2    0.199017
9    0.129694
6    0.100014
0    0.100011
7    0.050496
3    0.050488
5    0.030703
4    0.020804
8    0.020783
dtype: float64

TRAIN SPLIT 0
1    0.297958
2    0.199063
9    0.129652
6    0.100054
0    0.099964
3    0.050554
7    0.050484
5    0.030698
4    0.020800
8    0.020774
dtype: float64

TEST SPLIT 0
1    0.298116
2    0.198833
9    0.129864
0    0.100199
6    0.099858
7    0.050549
3    0.050223
5    0.030721
8    0.020820
4    0.020817
dtype: float64

TRAIN SPLIT 1
1    0.298061
2    0.198980
9    0.129691
0    0.100021
6    0.099925
7    0.050510
3    0.050500
5    0.030719
4    0.020810
8    0.020781
dtype: float64

TEST SPLIT 1
1    0.297702
2    0.199164
9    0.129706
6    0.100372
0    0.099973
7    0.050442
3    0.050438
5    0.030635
8    0.020790
4    0.020777
dtype: float64

TRAIN SPLIT 2
1    0.297955
2    0.198942
9    0.129756
0    0.100048
6    0.100042
3    0.050472
7    0.050472
5    0.030692
4    0.020834
8    0.0207

In [18]:
# CHECK THAT NO INDEX APPEARS ON BOTH THE TRAIN AND THE TEST SIDE OF A SPLIT
cv_samples = kfold(x, y, n_splits=5)
for train_idx, test_idx in cv_samples:
    # disjoint index sets mean no sample leaks from train into test
    assert set(test_idx).isdisjoint(train_idx)
print("No issues found")

No issues found


In [19]:
# CHECK THAT EVERY SPLIT ACCOUNTS FOR ALL SAMPLES (TRAIN + TEST == len(y))
len_ = 0
cv_samples = kfold(x, y, n_splits=5)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    if n != 0:
        # later splits must match the total recorded for the first split
        assert len_ == len(train_idx) + len(test_idx)
        continue
    # first split: record the total and compare it with the number of labels
    len_ = len(train_idx) + len(test_idx)
    assert len_ == len(y)
print("No issues found")

No issues found


In [20]:
# CHECK THAT NO INDEX IS REPEATED WITHIN THE TRAIN PART OR WITHIN THE TEST PART
cv_samples = kfold(x, y, n_splits=5)
for n, (train_idx, test_idx) in enumerate(cv_samples):
    # a part is duplicate-free when de-duplicating it does not shrink it
    assert all(len(set(part)) == len(part) for part in (train_idx, test_idx))
print("No issues found")

No issues found


In [21]:
%%timeit
# Times one complete pass over a 5-fold split; the loop body is empty, so only
# the work done inside kfold (shuffle + building the index arrays) is measured.
cv_samples = kfold(x, y, n_splits=5)
for train_idx, test_idx in cv_samples:
    pass

986 ms ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
