Préparation des Datasets
=============

Ce notebook prépare les datasets séparés, standardisés, relabelés, etc.

In [280]:
""" !!! Long run time (csv importation) !!! """
import proj1_helpers as _help

# Read the training CSV through the project helper and bind its four return
# values. `step=25000` is forwarded to the loader as-is -- presumably a row
# sub-sampling stride; confirm against proj1_helpers.load_csv_data.
yb, input_data, ids, featnames = _help.load_csv_data(
    'all/train.csv',
    step=25000,
)

In [281]:
import numpy as np

In [282]:
# Short aliases for the loaded feature matrix and label vector.
X = input_data
y = yb

# Index of the column used to split the rows -- presumably the PRI_jet_num
# feature of the loaded matrix; verify against `featnames`.
PRI_jet_num = 22
# Machine epsilon for Python floats, used as a tolerance by later cells.
EPSILON = np.finfo(float).eps
# Positions of the three subsets inside `conditions`, `dsets` and `ybs`.
NULL, ONE, PLURAL = range(3)

# One boolean row mask per subset: value == 0, value == 1, value >= 2.
jet_counts = X[:, PRI_jet_num]
cond_null = jet_counts == 0.
cond_one = jet_counts == 1.
cond_plural = jet_counts >= 2.
conditions = (cond_null, cond_one, cond_plural)

# Apply every mask to both the features and the labels so that dsets[i] and
# ybs[i] always describe the same rows.
dsets, ybs = [], []
for mask in conditions:
    dsets.append(X[mask])
    ybs.append(y[mask])



# Imputation

In [283]:
# For now the "imputation" is simply: drop every column that contains at
# least one undefined value (encoded as -999) within the subset.

def _drop_undefined_columns(subset):
    """Return `subset` without the columns holding at least one -999 entry."""
    # True for each column that is free of -999 in every row.
    keep_column = ~(subset == -999).any(axis=0)
    kept = subset.T[keep_column].T

    # Sanity check: no -999 may survive the column removal.
    assert not (kept == -999).any()
    return kept


clean_dsets = [_drop_undefined_columns(subset) for subset in dsets]

# Standardization

We standardize each ***dset*** in ***clean_dsets***, check that the result has ***(mean, standard deviation) == (0, 1)***, and put it in ***std_dsets***.

After this cell, ***std_dsets*** contains the ***dsets*** NULL, ONE and PLURAL in standardized form.

In [284]:
# Standardize every cleaned subset feature by feature (column-wise): after
# this cell each non-constant column of each array in `std_dsets` has mean 0
# and standard deviation 1.
#
# The statistics are taken along axis=0 (over the rows / samples). Taking
# them along axis=1 would instead rescale each sample across its own,
# unrelated features, which is not a feature standardization.
std_dsets = []

for dset in clean_dsets:
    if dset.shape[0] == 0:
        # An empty subset has no statistics; keep it as-is rather than
        # filling it with NaN from a mean over zero rows.
        std_dsets.append(dset.copy())
        continue

    means = np.mean(dset, axis=0)
    norms = np.std(dset, axis=0)

    # Columns holding a single repeated value have zero spread (this happens
    # for any feature that is fixed inside a subset). Dividing them by 1
    # leaves them at exactly 0 after centering instead of producing NaN/inf.
    constant = dset.max(axis=0) == dset.min(axis=0)
    safe_norms = np.where(constant, 1.0, norms)

    # Broadcasting applies the per-column statistics to every row.
    std_dset = (dset - means) / safe_norms

    # Check new distributions are indeed N(0, 1^2). Floating-point rounding
    # accumulates over the rows, so compare with a small absolute tolerance
    # rather than with machine epsilon.
    assert np.allclose(std_dset.mean(axis=0), 0.0, atol=1e-8)
    assert np.allclose(std_dset.std(axis=0)[~constant], 1.0, atol=1e-8)

    std_dsets.append(std_dset)

In [285]:
# Prepend each subset's labels as the first column of its standardized
# feature matrix, and print the shapes as a quick consistency check.
#
# The loop variable is deliberately not named `yb`: that name holds the full
# label vector returned by the CSV loader, and rebinding it here would make
# a later re-run of the splitting cell pair the full feature matrix with
# only the last subset's labels.

augmented_sets = []

for labels, dset in zip(ybs, std_dsets):
    aug = np.column_stack((labels, dset))
    print(dset.shape, '+', labels.shape, '=', aug.shape)
    augmented_sets.append(aug)

(4, 19) + (4,) = (4, 20)
(2, 23) + (2,) = (2, 24)
(4, 30) + (4,) = (4, 31)


In [286]:
# Write one comma-separated file per subset into the current working
# directory: NULL.csv, ONE.csv and PLURAL.csv.

keys = ('NULL', 'ONE', 'PLURAL')

# `keys` and `augmented_sets` are paired by position.
for key, table in zip(keys, augmented_sets):
    out_name = f'{key}.csv'
    np.savetxt(out_name, table, delimiter=",")