In [58]:
import numpy as np

In [192]:
def _shuffle_and_split(class_indices, test_size):
    '''
    Shuffles the indices of a single class and cuts them into a (train, test) pair of index arrays, the test part
    holding ceil(test_size * len(class_indices)) indices.
    '''
    shuffled = class_indices.copy()
    np.random.shuffle(shuffled)
    n_train = len(shuffled) - int(np.ceil(test_size * len(shuffled)))
    return shuffled[:n_train], shuffled[n_train:]


def train_test_split(X, y, K, test_size=0.1, verbose=True):
    '''
    Splits a binary-labelled dataset (X,y) into a training and a testing set while preserving (up to rounding) the
    same ratio of positive labels in the training and testing sets as in the initial dataset. The function also
    returns two kernel matrices for training and testing whose entries follow the shuffling operated on the initial
    dataset before splitting, i.e. K_train[i,j] = K[train_indices[i], train_indices[j]] and
    K_test[i,j] = K[test_indices[i], train_indices[j]].

    The larger of the two distinct label values is treated as the positive class. For each class,
    ceil(test_size * class_count) samples go to the testing set. Shuffling relies on the global numpy random
    state, so seed it with np.random.seed for reproducible splits.

    Arguments :
    - X : a 2d array of features containing as many rows as there are samples in the dataset
    - y : a 1d array containing the labels of the dataset (exactly two distinct values)
    - K : a 2d square array with as many rows and columns as there are samples, K[i,j] being the kernel value
          between samples i and j
    - test_size : a float in [0, 1] representing the ratio of the initial data that the testing set should contain
                  (default 0.1)
    - verbose : a Boolean stating whether or not the function should print information about the initial dataset
                such as number of samples, ratio of positive samples, etc. (default True)

    Outputs :
    A tuple of length 6 containing:
    - X_train : a 2d array containing the training data
    - X_test : a 2d array containing the testing data
    - y_train : a 1d array containing the training labels
    - y_test : a 1d array containing the testing labels
    - K_train : a 2d array containing the training kernel matrix
    - K_test : a 2d array containing the kernel matrix for predicting (rows: testing samples, columns: training
               samples)

    Raises :
    - AssertionError : if the sizes of X, y and K disagree, if test_size is outside [0, 1] or if y does not contain
                       exactly two distinct values
    '''
    ## Getting number of examples and range of indices
    n = len(y)
    assert n == X.shape[0] == K.shape[0] == K.shape[1], \
        'X, y and K must describe the same number of samples (K being square)'
    assert 0 <= test_size <= 1, 'test_size must lie in [0, 1]'
    indices = np.arange(n)

    ## Splitting indices according to labels (np.unique already returns the values sorted)
    target_values = np.unique(y)
    assert len(target_values) == 2, 'y must contain exactly two distinct label values'
    negative_value, positive_value = target_values
    positive_indices = indices[y == positive_value]
    negative_indices = indices[y == negative_value]

    ## Getting number of examples of each class (both are non-zero since two distinct labels were found)
    n_pos = len(positive_indices)
    n_neg = len(negative_indices)
    assert n_pos + n_neg == n

    if verbose:
        print('Total number of examples : {}'.format(n))
        print('Ratio of positive samples : {:.2f}'.format(n_pos/n))
        print('Ratio of negative to positive labels in the data : {:.2f}'.format(n_neg/n_pos))

    ## Shuffling and cutting each class separately so that both sets keep the class proportions.
    ## Positives are handled before negatives to keep the sequence of random draws stable.
    train_pos_indices, test_pos_indices = _shuffle_and_split(positive_indices, test_size)
    train_neg_indices, test_neg_indices = _shuffle_and_split(negative_indices, test_size)

    ## Combining indices from positives and negatives and re-shuffling so that classes are interleaved
    train_indices = np.concatenate((train_pos_indices, train_neg_indices), axis=0)
    test_indices = np.concatenate((test_pos_indices, test_neg_indices), axis=0)
    np.random.shuffle(train_indices)
    np.random.shuffle(test_indices)

    ## Producing train and test arrays from previously computed indices
    X_train = X[train_indices,:]
    y_train = y[train_indices]
    X_test = X[test_indices,:]
    y_test = y[test_indices]

    ## Selecting rows first and then columns gives the (rows x columns) sub-matrix of K
    K_train = K[train_indices,:][:,train_indices]
    K_test = K[test_indices,:][:,train_indices]

    return X_train, X_test, y_train, y_test, K_train, K_test