In [1]:
from kernels import * 
from learning_models import *
from tools import *
import pandas as pd
import numpy as np
from time import time 
from sklearn.svm import SVC
from autoreload import superreload

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def train_test_split(X, y, K, test_size=0.1, verbose=True, random_state=None):
    '''
    Splits a binary-labelled dataset (X,y) into a training and a testing set while preserving the same ratio of
    positive labels in the training and testing sets as in the initial dataset. The function also returns two kernel
    matrices for training and testing whose entries follow the shuffling applied to the dataset before splitting,
    i.e. K_train[i,j] = K[train_indices[i], train_indices[j]] and K_test[i,j] = K[test_indices[i], train_indices[j]].

    The larger of the two distinct label values is treated as the positive class. Within each class,
    ceil(test_size * class_size) samples go to the testing set and the rest to the training set.

    Arguments :
    - X : a 2d array of features containing as many rows as there are samples in the dataset
    - y : a 1d array containing the labels of the dataset (exactly two distinct values)
    - K : a 2d array with one row and one column per sample, holding the precomputed kernel matrix of X
    - test_size : a float in [0, 1] representing the ratio of the initial data that the testing set should contain
                  (default 0.1)
    - verbose : a Boolean stating whether or not the function should print information about the initial dataset
                such as number of samples, ratio of positive samples, etc. (default True)
    - random_state : None (default) to draw from numpy's global random state, an int seed, or a
                     np.random.RandomState instance, to make the split reproducible

    Outputs :
    A tuple of length 6 containing:
    - X_train : a 2d array containing the training data
    - X_test : a 2d array containing the testing data
    - y_train : a 1d array containing the training labels
    - y_test : a 1d array containing the testing labels
    - K_train : a 2d array containing the training kernel matrix
    - K_test : a 2d array containing the kernel matrix for predicting

    Raises :
    - AssertionError if X, y and K disagree on the number of samples, if y does not hold exactly two distinct
      labels, or if test_size is outside [0, 1]
    '''

    ## Getting number of examples and range of indices
    n = len(y)
    assert n == X.shape[0] == K.shape[0], 'X, y and K must have the same number of samples'
    assert 0 <= test_size <= 1, 'test_size must be a ratio in [0, 1]'
    indices = np.arange(n)

    ## Choosing the source of randomness (None keeps the historical behaviour: numpy's global state)
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    ## Splitting indices according to labels (np.unique already returns sorted values)
    target_values = np.unique(y)
    assert len(target_values) == 2, 'y must contain exactly two distinct labels'
    positive_indices = indices[y == target_values[1]]
    negative_indices = indices[y == target_values[0]]

    ## Getting number of examples of each class
    n_pos = len(positive_indices)
    n_neg = len(negative_indices)
    assert n_pos + n_neg == n

    if verbose:
        print('Total number of examples : {}'.format(n))
        print('Ratio of positive samples : {:.2f}'.format(n_pos/n))
        print('Ratio of negative to positive labels in the data : {:.2f}'.format(n_neg/n_pos))

    def _shuffle_and_cut(class_indices):
        # Shuffles a copy of the indices of one class and cuts it into (train part, test part)
        shuffled = class_indices.copy()
        rng.shuffle(shuffled)
        n_train_class = len(shuffled) - int(np.ceil(test_size*len(shuffled)))
        return shuffled[:n_train_class], shuffled[n_train_class:]

    ## Shuffling and cutting each class separately so that both sets keep the class ratio
    train_pos_indices, test_pos_indices = _shuffle_and_cut(positive_indices)
    train_neg_indices, test_neg_indices = _shuffle_and_cut(negative_indices)

    ## Combining indices from positives and negatives and re-shuffling so classes are interleaved
    train_indices = np.concatenate((train_pos_indices, train_neg_indices), axis=0)
    test_indices = np.concatenate((test_pos_indices, test_neg_indices), axis=0)
    rng.shuffle(train_indices)
    rng.shuffle(test_indices)

    ## Producing train and test arrays from previously computed indices
    X_train = X[train_indices,:]
    y_train = y[train_indices]
    X_test = X[test_indices,:]
    y_test = y[test_indices]

    ## np.ix_ selects the (rows x columns) sub-matrix in one pass, without a full-width intermediate copy
    K_train = K[np.ix_(train_indices, train_indices)]
    K_test = K[np.ix_(test_indices, train_indices)]

    return X_train, X_test, y_train, y_test, K_train, K_test

In [4]:
def K_train_test_split(X, Y, K, test_size = 0.25):
    '''
    Randomly splits a dataset (X,Y) and its precomputed kernel matrix K into training and testing parts.

    A single random partition of the sample positions is drawn with scikit-learn's ShuffleSplit. The split is
    not stratified: the labels in Y are not taken into account when drawing it.

    Arguments :
    - X : an array-like of features with one row per sample
    - Y : an array-like of labels with one entry per sample
    - K : a square array-like (one row and one column per sample) holding the kernel matrix of X
    - test_size : forwarded to ShuffleSplit as its test_size argument (default 0.25)

    Outputs :
    A tuple of length 6 containing:
    - X_train, X_test : the rows of X selected for training and testing
    - Y_train, Y_test : the matching entries of Y
    - K_train : a float array with K_train[i,j] = K[train_index[i], train_index[j]]
    - K_test : a float array with K_test[i,j] = K[test_index[i], train_index[j]]
    '''
    # Imported locally so the function does not depend on a star-import elsewhere
    # in the notebook to bring ShuffleSplit into scope.
    from sklearn.model_selection import ShuffleSplit

    rs = ShuffleSplit(n_splits=1, test_size=test_size)
    train_index, test_index = next(rs.split(X))

    X_arr = np.asarray(X)
    Y_arr = np.asarray(Y)
    # The kernel blocks are returned as floats, as they always have been.
    K_arr = np.asarray(K, dtype=float)

    # np.ix_ extracts the (rows x columns) sub-matrix directly, replacing a per-row Python loop.
    K_train = K_arr[np.ix_(train_index, train_index)]
    K_test = K_arr[np.ix_(test_index, train_index)]

    return X_arr[train_index], X_arr[test_index], Y_arr[train_index], Y_arr[test_index], K_train, K_test

### On prend un X très simple pour vérifier que l'algorithme fait bien ce qu'on veut. Essayez de modifier X pour voir si on obtient toujours le bon résultat.

In [25]:
# Toy dataset: 6 samples with 4 features each, filled with consecutive integers
# so that every row is unique and easy to recognise after shuffling.
X = np.arange(6*4).reshape(6,4)
# Y[i] == i: using the row number as "label" reveals which rows a split selected.
Y = np.arange(6)

In [26]:
X

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [27]:
Y

array([0, 1, 2, 3, 4, 5])

In [35]:
# Binary labels for the stratified split: rows 0-2 are positive (1), rows 3-5 negative (0).
y = np.array([1,1,1,0,0,0])

In [36]:
# Kernel matrix of the toy data, computed with a helper defined outside this notebook.
# NOTE(review): with its default parameters the displayed values match <x_i, x_j> + 1
# (e.g. K[0,0] = 14 + 1 = 15) - confirm against the helper's definition.
K = polynomial_kernel_matrix(X)

In [37]:
K

array([[   15.,    39.,    63.,    87.,   111.,   135.],
       [   39.,   127.,   215.,   303.,   391.,   479.],
       [   63.,   215.,   367.,   519.,   671.,   823.],
       [   87.,   303.,   519.,   735.,   951.,  1167.],
       [  111.,   391.,   671.,   951.,  1231.,  1511.],
       [  135.,   479.,   823.,  1167.,  1511.,  1855.]])

In [38]:
K.shape

(6, 6)

In [40]:
# Stratified split with the default test_size=0.1: ceil(0.1*3) = 1 sample of each class
# goes to the test set, leaving 4 training rows and 2 testing rows.
X_train, X_test, y_train, y_test, K_train, K_test = train_test_split(X,y,K)

Total number of examples : 6
Ratio of positive samples : 0.50
Ratio of negative to positive labels in the data : 1.00


In [41]:
X

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [13]:
X_train

array([[ 0,  1,  2,  3],
       [12, 13, 14, 15],
       [20, 21, 22, 23],
       [ 8,  9, 10, 11]])

In [14]:
X_test

array([[ 4,  5,  6,  7],
       [16, 17, 18, 19]])

D'après les sorties ci-dessus, les indices (numéros de lignes) retenus pour le jeu d'entraînement sont [0,3,5,2], et ceux pour le jeu de test [1,4].

Vérifions plus bas que les matrices K_train et K_test correspondent bien respectivement à :
- K_train = K[i,j] pour (i,j) dans [0,3,5,2]x[0,3,5,2]
- K_test = K[i,j] pour (i,j) dans [1,4]x[0,3,5,2]

In [15]:
K

array([[   15.,    39.,    63.,    87.,   111.,   135.],
       [   39.,   127.,   215.,   303.,   391.,   479.],
       [   63.,   215.,   367.,   519.,   671.,   823.],
       [   87.,   303.,   519.,   735.,   951.,  1167.],
       [  111.,   391.,   671.,   951.,  1231.,  1511.],
       [  135.,   479.,   823.,  1167.,  1511.,  1855.]])

In [16]:
K_train

array([[   15.,    87.,   135.,    63.],
       [   87.,   735.,  1167.,   519.],
       [  135.,  1167.,  1855.,   823.],
       [   63.,   519.,   823.,   367.]])

In [17]:
K_test

array([[   39.,   303.,   479.,   215.],
       [  111.,   951.,  1511.,   671.]])

Les résultats sont ceux qu'on attend, le split a l'air de fonctionner correctement !

### Essayons maintenant la fonction K_train_test_split de Charles

In [29]:
# Y (the row numbers) is passed instead of y, so y_train / y_test directly show
# which rows of X ended up in each set.
X_train, X_test, y_train, y_test, K_train, K_test = K_train_test_split(X,Y,K)

In [19]:
X

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [30]:
y_train

array([5, 3, 4, 1])

In [31]:
X_train

array([[20, 21, 22, 23],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [ 4,  5,  6,  7]])

In [33]:
y_test

array([0, 2])

In [34]:
X_test

array([[ 0,  1,  2,  3],
       [ 8,  9, 10, 11]])

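D'après y_train et y_test (Y contient les numéros de lignes), les indices du train sont [5,3,4,1], et ceux du test [0,2]. Attention : les sorties K_train et K_test affichées plus bas proviennent d'une exécution antérieure (cellules In [23] et In [24]) ; elles correspondent à un autre tirage (train [3,4,0,5], test [2,1]) et doivent être ré-exécutées pour être comparées à ce split.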

In [22]:
K

array([[   15.,    39.,    63.,    87.,   111.,   135.],
       [   39.,   127.,   215.,   303.,   391.,   479.],
       [   63.,   215.,   367.,   519.,   671.,   823.],
       [   87.,   303.,   519.,   735.,   951.,  1167.],
       [  111.,   391.,   671.,   951.,  1231.,  1511.],
       [  135.,   479.,   823.,  1167.,  1511.,  1855.]])

In [23]:
K_train

array([[  735.,   951.,    87.,  1167.],
       [  951.,  1231.,   111.,  1511.],
       [   87.,   111.,    15.,   135.],
       [ 1167.,  1511.,   135.,  1855.]])

In [24]:
K_test

array([[ 519.,  671.,   63.,  823.],
       [ 303.,  391.,   39.,  479.]])