# Cross-validation

In [1]:
import ipytest
import numpy as np
import random

ipytest.autoconfig()

## Task

Given a set of instances (by their IDs), divide them into k folds to perform cross-validation.

Each fold should enumerate the instances for the train and test splits.

In [2]:
def create_folds(instances, k=5, seed=None):
    """Split instances into k train/test folds for cross-validation.

    The instances are copied and shuffled, then dealt round-robin into k
    test splits; for every fold the train split holds all remaining
    instances. The input collection itself is never modified.

    Args:
        instances: Iterable of instance IDs.
        k: Number of folds. Must be at least 1 and no larger than the
            number of instances, so that every test split is non-empty.
        seed: Optional seed for a private random generator, giving a
            reproducible partitioning. When None (default), the shared
            `random` module state is used, as before.

    Returns:
        A list of k dicts, each with a 'train' and a 'test' list.

    Raises:
        ValueError: If k is smaller than 1 or larger than the number of
            instances.
    """
    # Shuffle a copy so the caller's collection stays untouched.
    instances_shuffled = list(instances)

    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    if k > len(instances_shuffled):
        raise ValueError(
            f"k={k} is larger than the number of instances "
            f"({len(instances_shuffled)})"
        )

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(instances_shuffled)

    folds = []
    for fold_id in range(k):
        # Every k-th shuffled instance, starting at fold_id, is a test instance.
        test = instances_shuffled[fold_id::k]
        train = [
            instance
            for i, instance in enumerate(instances_shuffled)
            if i % k != fold_id
        ]
        folds.append({
            'train': train,
            'test': test
        })
    return folds

### Tests

One simple test is provided, which merely checks if the required number of folds is generated and that each contains the correct number of train and test instances.

Part of the exercise is to create some more advanced tests. 

  - One test should test coverage, that is, check that all instances are part of exactly one test fold and k-1 train folds.
  - Another test should check that the folds are sufficiently random, i.e., that you're not always returning the exact same partitioning of instances.

In [3]:
%%run_pytest[clean]

def test_fold_size():
    """Check the number of folds and the size of each train/test split."""
    folds = create_folds(list(range(100)), k=5)
    assert len(folds) == 5

    # 100 instances over 5 folds: 20 held out for testing, 80 left to train on.
    train_sizes = [len(split['train']) for split in folds]
    test_sizes = [len(split['test']) for split in folds]
    assert train_sizes == [80] * 5
    assert test_sizes == [20] * 5
  
def test_coverage():
    """Every instance is in exactly one test split and in k - 1 train splits."""
    k = 5
    instances = list(range(100))
    folds = create_folds(instances, k=k)

    def count_occurrences(split_name):
        # Maps each instance to the number of folds listing it under split_name.
        counts = {}
        for fold in folds:
            for member in fold[split_name]:
                counts[member] = counts.get(member, 0) + 1
        return counts

    train_counts = count_occurrences('train')
    test_counts = count_occurrences('test')

    for instance in instances:
        assert instance in train_counts
        assert train_counts[instance] == k - 1
        assert instance in test_counts
        assert test_counts[instance] == 1
    
def test_randomization():
    """Check that repeated calls spread each instance over all test folds.

    create_folds is called many times and, for every instance, the number of
    times it lands in each test fold is counted. A partitioning that never
    changes would always put an instance in the same fold.
    """
    num_instances = 100
    k = 5
    num_trials = 1000
    # With 1000 trials and 5 folds each count is expected to be 200, with
    # chance fluctuations of roughly sqrt(1000 * 0.2 * 0.8) ~ 12.6. A fixed
    # partitioning would give counts like [1000, 0, 0, 0, 0] (std = 400).
    # The bound sits far from both: a bound as tight as 30 is estimated to be
    # exceeded by pure chance in roughly 1-2% of runs (chi-square tail over
    # 100 instances), which would make the test flaky.
    max_std = 50

    instances = list(range(num_instances))
    # in_test_fold[instance][fold_id] counts how often the instance was part
    # of that test fold; the instance IDs double as row indices.
    # Each row should be roughly uniformly distributed.
    in_test_fold = [[0] * k for _ in instances]
    for _ in range(num_trials):
        folds = create_folds(instances, k=k)
        for fold_id, fold in enumerate(folds):
            for instance in fold['test']:
                in_test_fold[instance][fold_id] += 1

    for fold_distribution in in_test_fold:
        assert np.std(fold_distribution) < max_std

...                                                                                [100%]
3 passed in 0.15s
