# Initialization

This notebook contains the commands for all preparations required before training models:

- concatenate data into 3 datasets (train, test1, test2);
- produce labels for cross-validation

Note: indices for the train/test splits are already provided with this repository, but the user may recompute them.

### Data preparation

Filter inappropriate samples (blanks, etc.) from the dataset and map the labels of low-sampled identities to the '-1' class.

In [None]:
import numpy as np
import sys
sys.path.append('../src/')

from initialization import concatenateSeparateToOneDF
from initialization import filterLabels

# Directory handed to both preparation helpers below.
dirname = '../data/'
# Output names: the filtered main dataset and the concatenated additional set.
# NOTE(review): the CV cell later loads 'dataset.npz' from this directory, so the
# helpers presumably append '.npz' when saving -- confirm in src/initialization.
save_filename = 'dataset'
save_filename2 = 'test2'
main_dataset_filename = 'LCMS-IT-TOF.npz'

# Filter the main dataset. min_count=20 is presumably the minimum number of samples
# an identity needs to keep its own label -- confirm in src/initialization.
filterLabels(main_dataset_filename, dirname, min_count=20, save_filename=save_filename)
# print(...) with a single pre-formatted string prints the same text on Python 2 and 3.
print("<%s> successfully filtered into <%s>" % (main_dataset_filename, save_filename))

# Additional measurement files that are merged into the second dataset.
additional_dataset_filenames = [
    'LCMS-IT-TOF_water.npz', 'LCMS-IT-TOF_methanol.npz',
    'Agilent_QqQ_water.npz', 'Agilent_QqQ_methanol.npz', 'Agilent_QqQ_ethanol.npz'
]

concatenateSeparateToOneDF(additional_dataset_filenames, dirname, save_filename2)
print("<%s>: successfully concatenated." % (save_filename2))

### Cross validation index generation

Generate indices for repeated K-fold CV.

In [None]:
import numpy as np
import os
import sys
sys.path.append('../src/')

from initialization import concatenateSeparateToOneDF
from initialization import filterLabels
from initialization import generateRandomizedKFoldedSet

# Location of the filtered dataset produced by the data preparation cell.
dirname = '../data/'
filename = 'dataset.npz'
# Name under which the generated CV indices are saved.
filename_cv = 'cv_indices'

n_splits = 5
n_repeats = 5
# This seed must stay fixed: it guarantees the equivalence of the generated splits
# and the ones used in our research.
random_state = 235

dataset_path = os.path.join(dirname, filename)

# Only a missing input file is reported as "No file ...". Every other failure
# (unexpected archive contents, an error inside the generator, Ctrl-C) is left to
# propagate with its real traceback instead of being masked by a bare `except:`.
if not os.path.isfile(dataset_path):
    print("No file %s in %s directory." % (filename, dirname))
    print("Please run data preparation code from above section .")
else:
    df = np.load(dataset_path)
    X, y = df['data'], df['label']
    # return_result=1 asks the helper to also hand the generated indices back;
    # they are kept in i1, i2 for interactive inspection.
    i1, i2 = generateRandomizedKFoldedSet(
        X, y, n_splits, n_repeats, random_state, filename_cv,
        dirname, return_result=1
    )
    # print(...) with a single pre-formatted string prints the same text on Python 2 and 3.
    print('%d-times repeated %d-fold splits are generated and saved to <%s> in <%s> directory' % (
        n_repeats, n_splits, filename_cv, dirname
    ))

[Reopen contents](../chemfin.ipynb)