Préparation des Datasets
=============

Ce notebook prépare les datasets séparés, standardisés, relabelés, etc.

In [1]:
""" !!! Long run time (csv importation) !!! """
import proj1_helpers as _help

yb, input_data, ids, featnames = _help.load_csv_data('all/train.csv')

In [2]:
import numpy as np

In [3]:
X, y = input_data, yb

i, = np.where(featnames == 'PRI_jet_num')
PRI_jet_num = np.squeeze(i)
NULL, ONE, PLURAL = 0, 1, 2

cond_null = X[:, PRI_jet_num] == 0.
cond_one = X[:, PRI_jet_num] == 1.
cond_plural = X[:, PRI_jet_num] >= 2.
conditions = (cond_null, cond_one, cond_plural)

dsets = [X[cond] for cond in conditions]
ybs = [y[cond] for cond in conditions]

# Imputation

In [7]:
#For now, just remove any column with undefined -999 values

clean_dsets = []

for dset in dsets:
    
    no_undefined = np.all(dset != -999, axis = 0)
    dset_no_undef = dset[:,no_undefined]
    clean_dsets.append(dset_no_undef)

In [14]:
test = np.array([[1,2,3],[1,4,5],[1,6,7]])

In [17]:
test != test[0]

array([[False, False, False],
       [False,  True,  True],
       [False,  True,  True]])

In [18]:
np.any(test != test[0], axis = 0)

array([False,  True,  True])

# Standardization

We standardize each ***dset*** in ***dsets***, check if ***(mean, norm) == (0, 1)*** and put the result in ***std_dsets***.

After this cell, ***std_dsets*** contains the ***dsets*** NULL, ONE and PLURAL in standardized form.

In [16]:
std_dsets = []

for dset in clean_dsets:
    means = np.mean(dset, axis=0)
    norms = np.std(dset, axis=0)
    std_dset = ((dset - means) / norms)
    
    # Check new distributions are indeed N(0, 1²)
    print(np.abs(std_dset.mean(axis=0)))
    assert np.all(np.abs(std_dset.mean(axis=0)) < 1e-5)
    assert np.all(np.abs(std_dset.std(axis=0) - 1) < 1e-5)#TODO understand why < EPSILON fails here
    
    std_dsets.append(std_dset)

[5.26615051e-17 6.34213778e-16 3.18600328e-17 9.95626025e-19
 6.22977427e-17 2.04103335e-16 2.90153870e-17 4.90701398e-17
 9.61490275e-17 6.79159181e-18 9.60067952e-19 3.30014469e-16
 1.89880106e-17 6.82714988e-18 2.13597340e-16 1.23742092e-17
 9.73935601e-17            nan            nan]


  
  # Remove the CWD from sys.path while we load stuff.


AssertionError: 

In [12]:
means.shape

(72543,)

In [285]:
# Adds yb's to newdatasets

augmented_sets = []

for yb, dset in zip(ybs, std_dsets):
    aug = np.column_stack((yb, dset))
    print(dset.shape, '+', yb.shape, '=', aug.shape)
    augmented_sets.append(aug)

(4, 19) + (4,) = (4, 20)
(2, 23) + (2,) = (2, 24)
(4, 30) + (4,) = (4, 31)


In [286]:
# Saves .csv files with results

keys = ('NULL', 'ONE', 'PLURAL')

for key, dset in zip(keys, augmented_sets):
    np.savetxt(f'{key}.csv', dset, delimiter=",")