# Loading and preprocessing the data

### Centre the data

In [None]:
# centre the data
def centre_data(train, validation, test):
    """Centre all three splits using the attribute means of the training split.

    The per-column means are computed from ``train`` only and subtracted from
    every split, so validation and test data are shifted by exactly the same
    offsets as the training data.

    The inputs are not modified; new arrays are returned. Integer inputs are
    promoted to floating point by the subtraction, so the centred values are
    never truncated back to integers.

    Args:
        train: 2-D array-like of shape (n_samples, n_attributes).
        validation: 2-D array-like with the same number of attributes.
        test: 2-D array-like with the same number of attributes.

    Returns:
        Tuple ``(train, validation, test)`` of centred arrays.
    """
    train = np.asarray(train)
    column_means = train.mean(axis=0)

    def _shift(split):
        split = np.asarray(split)
        # An empty split has nothing to centre; return it untouched so that
        # shapes such as (0,) do not trigger a broadcasting error.
        if split.size == 0:
            return split
        return split - column_means

    return _shift(train), _shift(validation), _shift(test)

### Apply PCA

In [None]:
# apply PCA on the data 
def PCA(variance_target, training_data, validation_data, testing_data):
    """Project all splits onto the leading principal components of the training data.

    The principal directions come from an SVD of ``training_data``. Components
    are kept, in order of decreasing singular value, until the accumulated
    share of squared singular values reaches ``variance_target``; if the
    target is never reached, every component is kept.

    Note the return order: (training, testing, validation), which differs
    from the order of the parameters.

    Args:
        variance_target: fraction of explained variance to retain.
        training_data: 2-D array used to derive the components.
        validation_data: 2-D array projected with the same components.
        testing_data: 2-D array projected with the same components.

    Returns:
        Tuple of projected arrays ``(training, testing, validation)``.
    """
    _, singular_values, right_vectors = np.linalg.svd(training_data, full_matrices=False)

    squared = singular_values ** 2
    variance_share = squared / np.sum(squared)

    # running_total[k] is the variance share explained by the first k
    # principal components (so running_total[0] is zero).
    cumulative = np.cumsum(variance_share)
    running_total = np.concatenate((np.zeros(1, dtype=cumulative.dtype), cumulative))

    # the smallest k whose running total meets the target is the number of
    # components to retain; fall back to all of them when none does
    reached = np.flatnonzero(running_total >= variance_target)
    if reached.size:
        n_components = int(reached[0])
    else:
        n_components = len(variance_share)

    basis = right_vectors.T[:, :n_components]

    projected_training = training_data.dot(basis)
    projected_testing = testing_data.dot(basis)
    projected_validation = validation_data.dot(basis)
    return projected_training, projected_testing, projected_validation

### Load in the data

In [None]:
from keras.datasets import cifar100

def load_in_dataset_and_preprocess(explained_variance):
    """Load CIFAR-100 (coarse labels), split off a validation set, centre and apply PCA.

    Each image is flattened to a single row. The first 49000 training rows
    are kept for training and the remaining rows become the validation set.
    All splits are converted to float64, centred with ``centre_data`` and
    reduced with ``PCA``.

    Args:
        explained_variance: fraction of explained variance passed to ``PCA``
            as its variance target.

    Returns:
        Tuple ``(training_data, training_labels, testing_data,
        testing_labels, validation_data, validation_labels)``.
    """
    (training_data, training_labels), (testing_data, testing_labels) = cifar100.load_data("coarse")

    # flatten every image into one row (the original hard-coded 50000 x 3072
    # and 10000 x 3072; deriving the sizes keeps the same result)
    training_data = training_data.reshape(len(training_data), -1)
    testing_data = testing_data.reshape(len(testing_data), -1)

    # rows from this index onwards are held out for validation
    n_train = 49000

    # np.float was removed in NumPy 1.24; np.float64 is the same dtype
    validation_data = training_data[n_train:, :].astype(np.float64)
    validation_labels = np.squeeze(training_labels[n_train:, :])
    training_data = training_data[:n_train, :].astype(np.float64)
    training_labels = np.squeeze(training_labels[:n_train, :])
    testing_labels = np.squeeze(testing_labels)
    testing_data = testing_data.astype(np.float64)

    # Centre data using the training means
    training_data, validation_data, testing_data = centre_data(training_data, validation_data, testing_data)

    # Apply PCA; note PCA returns (training, testing, validation) in that order
    training_data, testing_data, validation_data = PCA(explained_variance, training_data, validation_data, testing_data)

    return training_data, training_labels, testing_data, testing_labels, validation_data, validation_labels