In [223]:
# Toy configuration: small enough that every fold can be checked by eye.
n_folds, n_frames, n_frames_in_sample = 3, 23, 3

# A sample is a window of consecutive frames, so there is one sample per
# valid window start position.
n_samples = n_frames - n_frames_in_sample + 1

all_data = [*range(n_frames)]

n_samples, len(all_data)

(21, 23)

In [224]:
import copy


def collator(data, seq_length=3):
    """Yield every run of ``seq_length`` consecutive records from ``data``.

    Windows overlap and advance one record at a time, so an input of ``n``
    records produces ``n - seq_length + 1`` windows, and none at all when the
    input holds fewer than ``seq_length`` records.

    Args:
        data: Any iterable of records. It is traversed exactly once, so
            generators and other one-shot iterators work as well as lists.
        seq_length: Number of consecutive records per window. Values below 1
            produce no windows.

    Yields:
        list: A new list with ``seq_length`` consecutive records, in input
        order.
    """
    # Imported here so the function does not rely on another cell's imports.
    from collections import deque

    if seq_length < 1:
        return

    # A bounded deque discards the oldest record on append once it is full,
    # which slides the window without copying the underlying iterator.
    window = deque(maxlen=seq_length)
    for record in data:
        window.append(record)
        if len(window) == seq_length:
            # Copy so callers never observe later mutations of the window.
            yield list(window)


In [225]:
import numpy as np

# Build every overlapping window of consecutive frames. The window length is
# passed explicitly so the collation follows `n_frames_in_sample` instead of
# silently relying on the collator's default.
collated_samples = np.array(list(collator(all_data, seq_length=n_frames_in_sample)))
assert len(collated_samples) == n_samples
collated_samples.shape

(21, 3)

In [226]:
def leakage(train, test, samples=None):
    """Find the frames that occur in both the train and the test samples.

    Args:
        train: Row indices selecting the training samples.
        test: Row indices selecting the test samples.
        samples: Array-like whose rows are samples made of frame ids. When
            omitted, the notebook-level ``collated_samples`` is used as it is
            bound at call time.

    Returns:
        tuple: ``(n_shared, shared)`` where ``shared`` is the set of frame ids
        present on both sides and ``n_shared`` is its size.
    """
    if samples is None:
        # Backward-compatible fallback to the notebook global.
        samples = collated_samples
    samples = np.asarray(samples)

    train_frames = set(samples[train].flat)
    test_frames = set(samples[test].flat)
    shared_frames = train_frames & test_frames
    return len(shared_frames), shared_frames


# Random folds example

In [227]:
from sklearn.model_selection import KFold

# Shuffled K-fold with a fixed seed; only the first split is shown.
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
train, test = next(iter(kf.split(collated_samples)))
# +1 turns the 0-based indices into 1-based sample numbers for display.
print(train + 1, test + 1, sep="\n")


[ 1  3  4  5  6  7  8 10 13 16 17 18 19 20]
[ 2  9 11 12 14 15 21]


In [228]:
# Number of frames shared between train and test, for each fold of `kf`.
for train, test in kf.split(collated_samples):
    n_leaked_frames, leaked_frames = leakage(train, test)
    print(n_leaked_frames)


14
12
18


# In-order folds example

In [229]:
# Unshuffled K-fold; only the first split is shown.
kf = KFold(n_splits=n_folds, shuffle=False)
train, test = next(iter(kf.split(collated_samples)))
# +1 turns the 0-based indices into 1-based sample numbers for display.
print(train + 1, test + 1, sep="\n")

[ 8  9 10 11 12 13 14 15 16 17 18 19 20 21]
[1 2 3 4 5 6 7]


In [230]:
# Number of frames shared between train and test, for each fold of `kf`.
for train, test in kf.split(collated_samples):
    n_leaked_frames, leaked_frames = leakage(train, test)
    print(n_leaked_frames)


2
4
2


# Custom fold assignment

In [231]:
from dataloader import get_fold_indices

# Show only the first split produced by the project's `get_fold_indices`.
# NOTE(review): the meaning of `chunk_size` is defined in `dataloader`, which
# is not visible here.
for train, test in get_fold_indices(collated_samples, n_folds=n_folds, chunk_size=7):
    # Converted to arrays so that `+ 1` applies element-wise.
    train, test = np.array(train), np.array(test)
    # +1 turns the 0-based indices into 1-based sample numbers for display.
    print(train + 1, test + 1, sep="\n")
    break


[ 4  5  6  7 11 12 13 14 18 19 20 21]
[ 1  2  3  8  9 10 15 16 17]


In [232]:
# Number of frames shared between train and test, for each custom fold.
for train, test in get_fold_indices(collated_samples, n_folds=n_folds, chunk_size=7):
    n_leaked_frames, leaked_frames = leakage(train, test)
    print(n_leaked_frames)

10
12
10


## Leakage on real dataset

In [233]:
# Configuration for the real-dataset experiment.
n_folds, n_frames, n_frames_in_sample = 5, 20306, 3

# A sample is a window of consecutive frames, so there is one sample per
# valid window start position.
n_samples = n_frames - n_frames_in_sample + 1

all_data = [*range(n_frames)]

n_samples, len(all_data)



(20304, 20306)

In [234]:
# Build every overlapping window of consecutive frames. The window length is
# passed explicitly so the collation follows `n_frames_in_sample` instead of
# silently relying on the collator's default.
collated_samples = np.array(list(collator(all_data, seq_length=n_frames_in_sample)))
assert len(collated_samples) == n_samples
collated_samples.shape

(20304, 3)

### Random

In [235]:
# Collect every frame that leaks in at least one shuffled fold.
leak = set()
for train, test in KFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=0,
).split(collated_samples):
    n_leaked_frames, leaked_frames = leakage(train, test)
    print(n_leaked_frames)
    leak |= leaked_frames

# Leaked frames as a percentage of all frames.
len(leak) / n_frames * 100

9729
9834
9708
9793
9723


95.91746281887127

### Ordered

In [236]:
# Collect every frame that leaks in at least one unshuffled fold.
leak = set()
for train, test in KFold(
    n_splits=n_folds,
    shuffle=False,
).split(collated_samples):
    n_leaked_frames, leaked_frames = leakage(train, test)
    print(n_leaked_frames)
    leak |= leaked_frames

# Leaked frames as a percentage of all frames.
len(leak) / n_frames * 100

2
4
4
4
2


0.039397222495814045

### Custom

In [237]:
# Collect every frame that leaks in at least one custom fold.
leak = set()
for train, test in get_fold_indices(
    collated_samples,
    n_folds=n_folds,
):
    n_leaked_frames, leaked_frames = leakage(train, test)
    print(n_leaked_frames)
    leak |= leaked_frames

# Leaked frames as a percentage of all frames.
len(leak) / n_frames * 100

758
760
760
760
758


9.346991037131883