In [None]:
import os

import numpy as np

import mcbn.data.dataset_loaders as dl
from mcbn.utils.helper import get_setup
from mcbn.utils.helper import random_subset_indices
from mcbn.utils.helper import get_logger

from mcbn.environment.constants import DATA_PATH

# Split each dataset into a test set and a training/CV set

In [None]:
# Logger reused by every cell below.
logger = get_logger()
logger.info("STEP 1: Splitting datasets into test and training/CV")

# Experiment setup; later cells index it by key
# ('split_seed' here, 'datasets' and 'test_fraction' in the split loop).
s = get_setup()

# Seed NumPy's global random generator up front so that re-running the
# notebook is intended to reproduce the same splits.
split_seed = s['split_seed']
np.random.seed(split_seed)

In [None]:
# For every configured dataset: pick the test examples, then persist both
# index sets under <DATA_PATH>/<dataset_name>/train_cv-test.
for dataset_name in s['datasets']:
    logger.info("Making splits for dataset " + dataset_name)

    # Full dataset; only the length of y is needed to size the test split.
    X, y = dl.load_uci_data_full(dataset_name)

    # Number of test examples = configured fraction of all examples,
    # rounded to the nearest integer.
    n_examples = y.shape[0]
    n_test_exact = s['test_fraction'] * n_examples
    n_test = int(round(n_test_exact))

    # Index sets for the test and the training/validation examples.
    # NOTE(review): presumably a random, disjoint partition of the examples
    # -- confirm against random_subset_indices.
    test_idx, trainval_idx = random_subset_indices(y, n_test)

    # Write the test indices first, then the training/CV indices.
    split_dir = os.path.join(DATA_PATH, dataset_name, 'train_cv-test')
    index_files = (
        ('test_indices.txt', test_idx),
        ('train_cv_indices.txt', trainval_idx),
    )
    for filename, indices in index_files:
        dl.save_indices(split_dir, filename, indices)

In [None]:
# Log that step 1 (the dataset splitting above) has run to completion.
logger.info("DONE STEP 1")