# Data handling

This notebook contains commands for all necessary preparations before training models:

- concatenate data into 3 datasets (train, test1, test2);
- produce labels for cross-validation

Note: indices for the train/test splits are already provided with this repository, but the user may recompute them.

## 1. Import packages

In [1]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../data/csv/')
sys.path.append('../src/')

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder

from initialization import concatenateSeparateToOneDF
from initialization import filterLabels



## 2. Custom functions

In [2]:
def discretize(
    dirnames_list,
    mza=100., mzb=900., mz_step=1.,
    polarity_index_in=None, polarity_index_out=None
):
    """Bin the spectra stored in per-experiment ``.npz`` archives onto a regular m/z grid.

    Every directory must contain an ``.npz`` archive holding the arrays ``mz``, ``data``
    and ``labels``.  ``mz[p]`` lists the m/z value of every point of polarity ``p`` and
    ``data[p]`` the matching ``(n_points, n_samples)`` intensities.  For each sample, a
    point with ``mza <= m/z < mzb`` goes to bin ``trunc((m/z - mza)/mz_step)`` and each
    bin keeps the largest positive intensity it receives; untouched bins stay 0.

    Parameters
    ----------
    dirnames_list : sequence of str
        Directories to read, one archive per directory.  If a directory holds several
        ``.npz`` files a warning is printed and the alphabetically first one is used.
    mza, mzb, mz_step : float
        Lower bound (inclusive), upper bound (exclusive) and bin width of the grid.
    polarity_index_in : (int, int), optional
        Indices of the positive and negative polarity inside ``mz``/``data``;
        defaults to ``(0, 1)``.
    polarity_index_out : (int, int), optional
        Last-axis positions of the positive and negative channel in the result;
        defaults to ``(0, 1)``.

    Returns
    -------
    result : ndarray, shape (n_total_samples, n_bins, 2)
        Binned intensities with ``n_bins = int((mzb - mza)//mz_step)``.
    labels : ndarray
        The ``labels`` arrays of all archives, concatenated.
    expnm : ndarray
        One experiment name per label, derived from the archive file name.

    Raises
    ------
    IndexError
        If a directory contains no ``.npz`` file, or the negative polarity has fewer
        samples than the positive one.
    """
    if polarity_index_in is None:
        pos_ind_in, neg_ind_in = 0, 1
    else:
        pos_ind_in, neg_ind_in = polarity_index_in

    if polarity_index_out is None:
        pos_ind_out, neg_ind_out = 0, 1
    else:
        pos_ind_out, neg_ind_out = polarity_index_out

    # A single bin count is used for both the allocation and the bounds check.
    # Floor division and true division can disagree in floating point
    # (1.0//0.1 == 9.0 but trunc(1.0/0.1) == 10), which previously allowed an
    # index one past the end of the array.
    n_bins = int((mzb - mza)//mz_step)

    result = []
    labels = []
    expnm = []
    for dirname in dirnames_list:
        # Sorted so that the selected archive does not depend on os.listdir() order.
        filenames = sorted(name for name in os.listdir(dirname) if name.endswith('.npz'))
        if not filenames:
            raise IndexError('%s contains no .npz file' % dirname)
        if len(filenames) > 1:
            print(
                'Warning: %s contains multiple .npz files; %s selected' % (
                    dirname,
                    filenames[0]
                )
            )
        filename = filenames[0]
        # NOTE(review): this keeps only the dot-separated piece just before the
        # extension ('ex1.1.npz' -> '1', 'exA.npz' -> 'exA').  Left untouched because
        # saved datasets contain these names -- confirm whether the full stem was meant.
        current_expname = filename[::-1].split('.')[1][::-1]
        # The context manager closes the archive once the arrays are read.
        with np.load(dirname+'/'+filename) as archive:
            mz = archive['mz']
            data = archive['data']
            current_labels = archive['labels']
        labels += current_labels.tolist()
        expnm += [current_expname]*len(current_labels)

        # The number of samples is dictated by the positive polarity.
        _, n_samples = data[pos_ind_in].shape
        block = np.zeros([n_samples, n_bins, 2])
        for ind_in, ind_out in ((pos_ind_in, pos_ind_out), (neg_ind_in, neg_ind_out)):
            binned = _bin_max_intensity(
                mz[ind_in], data[ind_in], n_samples, mza, mzb, mz_step, n_bins
            )
            # np.maximum (not plain assignment) keeps the result correct even when
            # both polarities are routed to the same output channel.
            block[:, :, ind_out] = np.maximum(block[:, :, ind_out], binned)
        result.extend(block)

    result = np.array(result)
    labels = np.array(labels)
    expnm = np.array(expnm)
    return result, labels, expnm


def _bin_max_intensity(mz_values, intensities, n_samples, mza, mzb, mz_step, n_bins):
    """Return an ``(n_samples, n_bins)`` array with the per-bin maximum positive intensity."""
    intensities = np.asarray(intensities, dtype=float)
    if intensities.shape[1] < n_samples:
        raise IndexError(
            'polarity has %d samples, %d expected' % (intensities.shape[1], n_samples)
        )
    # Only the first n_points m/z values have a matching row of intensities.
    mz_values = np.asarray(mz_values, dtype=float)[:intensities.shape[0]]
    bin_index = np.trunc((mz_values - mza)/mz_step)
    keep = (
        (mz_values >= mza) & (mz_values < mzb) &
        (bin_index >= 0) & (bin_index < n_bins)
    )
    values = intensities[keep, :n_samples]
    # Zero, negative and NaN intensities never raise a bin above its initial 0.
    values = np.where(values > 0, values, 0.)
    binned = np.zeros([n_bins, n_samples])
    # Unbuffered maximum: several points may fall into the same bin.
    np.maximum.at(binned, bin_index[keep].astype(int), values)
    return binned.T
                
def get_exdf(
    dirnames, labels_filename, uninteresting_directories=None, uninteresting_column_prefixes=None,
    data_dir=None
):
    """Collect labels and sample column names of several experiments into one DataFrame.

    For every experiment directory the labels file is read (one integer per line,
    blank lines ignored, ``-1`` entries dropped) together with the header of
    ``pos.csv`` and ``neg.csv`` (the first two lines of each CSV are skipped).
    Columns whose name starts with one of ``uninteresting_column_prefixes`` are
    discarded and ``#`` is replaced by ``_`` in the remaining names.  Positions
    labelled ``-3`` are removed from both the labels and the column names.  Column
    names of directories ending in ``'ex1.1'`` get the prefix ``'pr'``.

    Parameters
    ----------
    dirnames : iterable of str
        Experiment directory names, relative to ``data_dir``.
    labels_filename : str
        Name of the labels file inside every experiment directory.
    uninteresting_directories : list of str, optional
        Directory names to skip.
    uninteresting_column_prefixes : list of str, optional
        Prefixes of CSV columns that are not samples.
    data_dir : str, optional
        Path prefix put in front of every directory name by plain concatenation, so
        it has to end with a path separator.  Defaults to the notebook-level
        ``data_dirname`` (the historical behaviour); pass it explicitly to stay
        independent of whichever cell assigned that variable last.

    Returns
    -------
    pandas.DataFrame
        Columns ``experiment``, ``label``, ``name_pos`` and ``name_neg``; one row per
        retained sample.
    """
    if uninteresting_directories is None:
        uninteresting_directories = []
    if uninteresting_column_prefixes is None:
        uninteresting_column_prefixes = []
    if data_dir is None:
        # Backward-compatible fallback to the notebook-level variable.
        data_dir = data_dirname
    skipped_prefixes = tuple(uninteresting_column_prefixes)

    ex_dict = {
        'experiment': [],
        'label': [],
        'name_pos': [],
        'name_neg': []
    }

    for dirname in dirnames:
        if dirname in uninteresting_directories:
            continue
        experiment_path = data_dir+dirname+'/'
        with open(experiment_path+labels_filename, 'r') as f:
            lines = [line.replace('\n', '') for line in f]
        values = [int(line) for line in lines if len(line) > 0]
        tmp_labels = np.array([value for value in values if value != -1])
        # -3 marks positions that are dropped from the labels AND the column names.
        ind1 = np.where(tmp_labels == -3)[0]
        ind2 = np.where(tmp_labels != -3)[0]
        m3flag = len(ind1) > 0
        if m3flag:
            tmp_labels = tmp_labels[ind2].tolist()
        else:
            tmp_labels = tmp_labels.tolist()
        ex_dict['label'] += tmp_labels
        ex_dict['experiment'] += [dirname]*len(tmp_labels)
        for pol in ['pos', 'neg']:
            df = pd.read_csv(experiment_path+pol+'.csv', skiprows=2)
            data_columns = [
                name.replace('#', '_') for name in df.columns
                if not name.startswith(skipped_prefixes)
            ]
            if m3flag:
                data_columns = np.array(data_columns)[ind2].tolist()
            if dirname.endswith('ex1.1'):
                data_columns = ['pr'+name for name in data_columns]
            ex_dict['name_'+pol] += data_columns
    exdf = pd.DataFrame(ex_dict)
    return exdf

## 3. Converting CSVs into NPZs

In [3]:
# Directory with one sub-directory per experiment, and the labels file kept in each.
data_dirname = '../data/csv/'
label_filename = 'labels.dat'
prefix = 'ex'

# Experiment entries are those whose name starts with `prefix`, in sorted order.
ex_dirnames = sorted(
    entry for entry in os.listdir(data_dirname) if entry.startswith(prefix)
)

# Base names of the per-polarity CSV files and the prefixes of non-sample columns.
csv_filenames = ['pos', 'neg']
skip_prefixes = ['Charge', 'Blank', 'blank', 'Flav', 'flav', 'Compound', 'Comp']


In [4]:
# Convert every experiment (labels file + one CSV per polarity) into a single
# compressed .npz archive stored next to the CSV files.
for current_dirname in ex_dirnames:
    experiment_path = data_dirname+current_dirname+'/'
    with open(experiment_path+label_filename, 'r') as f:
        stripped = [raw_line.replace('\n', '') for raw_line in f]
    # Blank lines (e.g. a trailing newline) are skipped, as get_exdf() does;
    # entries equal to -1 are dropped.
    labels = [int(line) for line in stripped if len(line) > 0]
    labels = [label for label in labels if label != -1]
    mz = []
    rt = []
    data = []
    samples = []
    for polarity_name in csv_filenames:
        df = pd.read_csv(experiment_path+polarity_name+'.csv', skiprows=2)
        # An explicit list (not a lazy filter object) so it can be sliced below.
        columns = [
            name for name in df.columns.values
            if not any(name.startswith(skipped) for skipped in skip_prefixes)
        ]
        df = df[columns]
        # After filtering, the first two columns are read as m/z and retention time;
        # every remaining column is one sample.
        mz.append(df.iloc[:, 0].values)
        rt.append(df.iloc[:, 1].values)
        data.append(df.values[:, 2:])
        samples.append(columns[2:])
        print(len(labels), df.values[:, 2:].shape)
    # NOTE(review): the two polarities have different numbers of rows (see the
    # printed shapes), so mz/rt/data end up as object arrays; recent NumPy versions
    # may require an explicit dtype=object here and allow_pickle=True on load --
    # confirm before upgrading.
    np.savez_compressed(
        experiment_path+current_dirname,
        mz=mz, rt=rt, samples=samples, data=data,
        csv_filenames=csv_filenames, labels=labels
    )

    #del df

(78, (3132, 78))
(78, (4462, 78))
(36, (3049, 36))
(36, (3904, 36))
(216, (3129, 216))
(216, (3036, 216))
(216, (3859, 216))
(216, (2925, 216))
(105, (3917, 105))
(105, (1448, 105))
(104, (3507, 104))
(104, (2584, 104))
(115, (2927, 115))
(115, (2206, 115))
(1636, (4843, 1636))
(1636, (5977, 1636))
(26, (2896, 26))
(26, (1667, 26))
(26, (1682, 26))
(26, (1720, 26))


## 4. Stacking main dataset

In [5]:
# Positive polarity is entry 0 of the archives and is written to output channel 1;
# the negative polarity takes the other index in both cases.
pos_ind_in, pos_ind_out = 0, 1
neg_ind_in, neg_ind_out = (pos_ind_in+1) % 2, (pos_ind_out+1) % 2

# Every experiment except the last two (in sorted order) goes into the main dataset.
dirnames = [data_dirname+name for name in ex_dirnames[:-2]]
result, labels, expnm = discretize(
    dirnames,
    100., 900., 1.,
    (pos_ind_in, neg_ind_in),
    (pos_ind_out, neg_ind_out)
)

In [7]:
labels, expnm = np.array(labels), np.array(expnm)
# Positions of the samples with a strictly positive label.
i = np.flatnonzero(labels > 0)
print(labels[i].shape, result[i].shape)

((2262,), (2262, 800, 2))


In [8]:
save_dirname, save_filename = '../data/', 'LCMS-IT-TOF.npz'
# Only the samples selected by `i` are written to the archive.
selected_arrays = dict(expnm=expnm[i], label=labels[i], data=result[i])
np.savez_compressed(save_dirname+save_filename, **selected_arrays)

In [9]:
dirname = '../data/'
save_filename = 'dataset'
save_filename_without_moc = 'dataset_without_moc'
filename_without_moc = 'dataset_without_moc.npz'

main_dataset_filename = 'LCMS-IT-TOF.npz'

# Two filtered copies of the main dataset, differing only in `min_count`.
filterLabels(main_dataset_filename, dirname, min_count=20, save_filename=save_filename)
print("<%s> successfully filtered into <%s>" % (main_dataset_filename, save_filename))

filterLabels(main_dataset_filename, dirname, min_count=1, save_filename=save_filename_without_moc)
print("<%s>: successfully filtered into <%s>." % (main_dataset_filename, save_filename_without_moc))


filename_without_moc_parts = 'dataset_parts.npz'
species_filename = 'species.csv'
try:
    df = pd.read_csv(dirname+species_filename)
except IOError:
    # Without the species table the mapping below cannot be built, so report the
    # missing file and stop here instead of failing later on an undefined name.
    print("No file %s in %s directory." % (species_filename, dirname))
    raise
# First column -> last column of the species table (a later duplicate key wins).
labelMapperParts = dict(zip(df.iloc[:, 0], df.iloc[:, -1]))

# The context manager closes the archive once both arrays are read.
with np.load(dirname+filename_without_moc) as archive:
    X, y = archive['data'], archive['label']
y_new = [labelMapperParts[label] for label in y]
labelEncoder = LabelEncoder()
y_new = labelEncoder.fit_transform(y_new)
class_names = labelEncoder.classes_
np.savez_compressed(
    dirname+filename_without_moc_parts, data=X, label=y_new
)
print('Parts dataset: ', dict((i, class_names[i]) for i in range(len(class_names))))

<LCMS-IT-TOF.npz> successfully filtered into <dataset>
<LCMS-IT-TOF.npz>: successfully filtered into <dataset_without_moc>.
('Parts dataset: ', {0: 'bark', 1: 'buds', 2: 'flowers', 3: 'fructus', 4: 'leaves', 5: 'roots', 6: 'roots and rhizomes', 7: 'seeds'})


## 5. Generating CV indices [for prediction (species, part) class]

In [10]:
data_dirname = '../data/csv/'

# Experiments left out of the cross-validation table.
uninteresting_directories = ['ex2.2', 'ex2.3']

# Remaining experiment directories: name starts with 'ex', not excluded, is a directory.
dirnames = sorted(
    name for name in os.listdir(data_dirname)
    if name.startswith('ex')
    and name not in uninteresting_directories
    and os.path.isdir(data_dirname+name)
)

# CSV columns starting with one of these prefixes are not sample columns.
uninteresting_column_prefixes = [
    'Flav_all', 'Blank', 'blank', 'Compound', 'Comp', 'm/z', 'Charge', 'Retention time (min)'
]

labels_filename = 'labels.dat'

exdf = get_exdf(
    dirnames, labels_filename, uninteresting_directories, uninteresting_column_prefixes
)

In [11]:
def correctorCSV(x):
    """Fix the part of a CSV column name that precedes its first '_Seg'.

    The text from '_Seg' onwards is returned unchanged.  Raises ValueError when
    `x` contains no '_Seg'.
    """
    head, tail = x.split('_Seg', 1)
    if 'ul_AutoMSMS' in head:
        head = 'pr'+head
    head = head.replace('G26', 'G25')
    if '10rc_1k10_3' in head:
        head = head.replace('10rc', '10c')
    # Drop a trailing 'r' from names containing 'c_1k10_'.
    if head.endswith('r') and ('c_1k10_' in head):
        head = head[:-1]
    return head + '_Seg' + tail

def corrector(x):
    """Normalise the part of a sample name that precedes its first '_Seg'.

    '0_u' and '5_u' are rewritten to '0u' and '5u', and a few specific names are
    replaced as listed below.  The text from '_Seg' onwards is returned unchanged.
    Raises ValueError when `x` contains no '_Seg'.
    """
    # Whole-name replacements, applied after the substring fixes.
    exact_fixes = {
        'C52-1_13': 'C52-1_3',
        '15+_1k5_10ul_12': '15+_1k5_10ul_2',
        '33_1k10_10ul_': '33_1k10_10ul_6',
        '33h_1k10_10ul_': '33h_1k10_10ul_6',
    }
    head, tail = x.split('_Seg', 1)
    for old, new in (('0_u', '0u'), ('5_u', '5u')):
        head = head.replace(old, new)
    head = exact_fixes.get(head, head)
    return head + '_Seg' + tail

def corrector2(x):
    """Rewrite '_20ul_' to '_10ul_' for six specific sample names; return others as is."""
    affected_names = (
        'pr1_1k10_20ul_AutoMSMS_Pos__1_Seg1Ev1',
        'pr16_1k10_20ul_AutoMSMS_Pos__1_Seg1Ev1',
        'pr18_1k10_20ul_AutoMSMS_Pos__1_Seg1Ev1',
        'pr19_1k10_20ul_AutoMSMS_Pos__1_Seg1Ev1',
        'pr35_1k10_20ul_AutoMSMS_Pos__1_Seg1Ev1',
        'pr20_1k10_20ul_AutoMSMS_Pos__1_Seg1Ev1',
    )
    if x not in affected_names:
        return x
    return x.replace('_20ul_', '_10ul_')

def trimmer(x):
    """Cut a sample name down to its leading part and apply the name fixes.

    Names containing 'AutoMSMS' keep everything before the first '_AutoMSMS' plus
    '_AutoMSMS'; all other names keep everything before the first '_Seg'.  The same
    replacements as in `corrector` are then applied to that leading part.
    """
    if 'AutoMSMS' in x:
        head = x.partition('_AutoMSMS')[0] + '_AutoMSMS'
    else:
        head = x.partition('_Seg')[0]
    for old, new in (('0_u', '0u'), ('5_u', '5u')):
        head = head.replace(old, new)
    # Whole-name replacements, applied after the substring fixes.
    exact_fixes = {
        'C52-1_13': 'C52-1_3',
        '15+_1k5_10ul_12': '15+_1k5_10ul_2',
        '33_1k10_10ul_': '33_1k10_10ul_6',
        '33h_1k10_10ul_': '33h_1k10_10ul_6',
    }
    return exact_fixes.get(head, head)

# Both name columns go through `corrector` first and `correctorCSV` second.
for name_column in ('name_neg', 'name_pos'):
    exdf[name_column] = exdf[name_column].apply(corrector).apply(correctorCSV)
exdf.head()

Unnamed: 0,experiment,label,name_neg,name_pos
0,ex1.1,56,pr1_1k10_20ul_1_Seg1Ev2,pr1_1k10_20ul_1_Seg1Ev1
1,ex1.1,56,pr1_1k10_20ul_2_Seg1Ev2,pr1_1k10_20ul_2_Seg1Ev1
2,ex1.1,53,pr2_1k10_10ul_1_Seg1Ev2,pr2_1k10_10ul_1_Seg1Ev1
3,ex1.1,53,pr2_1k10_10ul_2_Seg1Ev2,pr2_1k10_10ul_2_Seg1Ev1
4,ex1.1,57,pr3_1k10_10ul_1_Seg1Ev2,pr3_1k10_10ul_1_Seg1Ev1


In [12]:
# Tab-separated sample table; its sample names get the same fixes as the CSV names.
sample_table_path = data_dirname+'s_plant_species_ident.txt'
df = pd.read_csv(sample_table_path, sep='\t')
df['Sample Name'] = df['Sample Name'].map(corrector)
df.head()

Unnamed: 0,Source Name,Factor Value[year],Unit,Term Source REF,Term Accession Number,Characteristics[Organism],Term Source REF.1,Term Accession Number.1,Characteristics[Organism part],Term Source REF.2,Term Accession Number.2,Protocol REF,Sample Name,Factor Value[location],Term Source REF.3,Term Accession Number.3
0,no_treatment,2016,year,CCONT,http://purl.obolibrary.org/obo/UO_0000036,Melilotus officinalis,NCBITAXON,,leaf,BTO,,Sample collection,10-1_1_Seg1Ev1,Altai Krai,,
1,no_treatment,2016,year,CCONT,http://purl.obolibrary.org/obo/UO_0000036,Melilotus officinalis,NCBITAXON,,leaf,BTO,,Sample collection,10-1_1_Seg1Ev2,Altai Krai,,
2,no_treatment,2016,year,CCONT,http://purl.obolibrary.org/obo/UO_0000036,Melilotus officinalis,NCBITAXON,,leaf,BTO,,Sample collection,10-1_2_Seg1Ev1,Altai Krai,,
3,no_treatment,2016,year,CCONT,http://purl.obolibrary.org/obo/UO_0000036,Melilotus officinalis,NCBITAXON,,leaf,BTO,,Sample collection,10-1_2_Seg1Ev2,Altai Krai,,
4,no_treatment,2016,year,CCONT,http://purl.obolibrary.org/obo/UO_0000036,Melilotus officinalis,NCBITAXON,,leaf,BTO,,Sample collection,10-1_3_Seg1Ev1,Altai Krai,,


In [13]:
# 'Source Name' of the FIRST row of every sample name -- the row the former
# per-name lookup `df[df['Sample Name'] == x]['Source Name'].values[0]` picked.
# Built once, so each name costs one dictionary lookup instead of a table scan.
source_by_sample = (
    df.drop_duplicates(subset='Sample Name')
    .set_index('Sample Name')['Source Name']
    .to_dict()
)

unknown_names = [
    name for name in list(exdf.name_pos)+list(exdf.name_neg)
    if name not in source_by_sample
]
if unknown_names:
    raise KeyError('Sample names missing from the sample table: %r' % (unknown_names[:10],))

treatment_pos = [
    dict(name_pos=name, treatment=source_by_sample[name]) for name in exdf.name_pos
]
treatment_neg = [
    dict(name_neg=name, treatment=source_by_sample[name]) for name in exdf.name_neg
]

tmp_dfpos = pd.DataFrame(treatment_pos)
tmp_dfneg = pd.DataFrame(treatment_neg)

assert np.all(tmp_dfpos.treatment.values == tmp_dfneg.treatment.values), \
    'positive and negative runs of a sample disagree on the treatment'
assert np.all(tmp_dfpos.name_pos.values == exdf.name_pos.values)

exdf['treatment'] = tmp_dfpos.treatment.values
exdf.head()

Unnamed: 0,experiment,label,name_neg,name_pos,treatment
0,ex1.1,56,pr1_1k10_20ul_1_Seg1Ev2,pr1_1k10_20ul_1_Seg1Ev1,no_treatment
1,ex1.1,56,pr1_1k10_20ul_2_Seg1Ev2,pr1_1k10_20ul_2_Seg1Ev1,no_treatment
2,ex1.1,53,pr2_1k10_10ul_1_Seg1Ev2,pr2_1k10_10ul_1_Seg1Ev1,no_treatment
3,ex1.1,53,pr2_1k10_10ul_2_Seg1Ev2,pr2_1k10_10ul_2_Seg1Ev1,no_treatment
4,ex1.1,57,pr3_1k10_10ul_1_Seg1Ev2,pr3_1k10_10ul_1_Seg1Ev1,no_treatment


In [14]:
exdf['name_pos'] = exdf['name_pos'].apply(corrector2)

# For each polarity: trim the name, then split it at its last '_' into the sample
# part (before) and the trial part (after).  The sample column is created first so
# that the column order of `exdf` stays sample, trial.
for polarity in ('pos', 'neg'):
    sample_column = 'sample_'+polarity
    exdf[sample_column] = exdf['name_'+polarity].apply(trimmer)
    trimmed_names = exdf[sample_column]
    exdf['trial_'+polarity] = trimmed_names.apply(lambda x: x[::-1].split('_', 1)[0][::-1])
    exdf[sample_column] = trimmed_names.apply(lambda x: x[::-1].split('_', 1)[1][::-1])

# Sample identifier: negative-polarity sample name + '_' + treatment.
tmp = (
    np.char.array(exdf['sample_neg'].values) + '_'
    + np.char.array(exdf['treatment'].values)
)
exdf['sample'] = tmp
print(len(np.unique(tmp)), len(exdf))
exdf.head()

(652, 2262)


Unnamed: 0,experiment,label,name_neg,name_pos,treatment,sample_pos,trial_pos,sample_neg,trial_neg,sample
0,ex1.1,56,pr1_1k10_20ul_1_Seg1Ev2,pr1_1k10_20ul_1_Seg1Ev1,no_treatment,pr1_1k10_20ul,1,pr1_1k10_20ul,1,pr1_1k10_20ul_no_treatment
1,ex1.1,56,pr1_1k10_20ul_2_Seg1Ev2,pr1_1k10_20ul_2_Seg1Ev1,no_treatment,pr1_1k10_20ul,2,pr1_1k10_20ul,2,pr1_1k10_20ul_no_treatment
2,ex1.1,53,pr2_1k10_10ul_1_Seg1Ev2,pr2_1k10_10ul_1_Seg1Ev1,no_treatment,pr2_1k10_10ul,1,pr2_1k10_10ul,1,pr2_1k10_10ul_no_treatment
3,ex1.1,53,pr2_1k10_10ul_2_Seg1Ev2,pr2_1k10_10ul_2_Seg1Ev1,no_treatment,pr2_1k10_10ul,2,pr2_1k10_10ul,2,pr2_1k10_10ul_no_treatment
4,ex1.1,57,pr3_1k10_10ul_1_Seg1Ev2,pr3_1k10_10ul_1_Seg1Ev1,no_treatment,pr3_1k10_10ul,1,pr3_1k10_10ul,1,pr3_1k10_10ul_no_treatment


In [15]:
labels_unique = list(set(exdf.label.values))
exdf['label'] = exdf['label'].astype('i')
k = 0  # assigned but not read anywhere in this cell

# labels_dict[label][sample] -> row indices of `exdf` for that (label, sample) pair.
labels_dict = {}
for label in labels_unique:
    rows_of_label = exdf[exdf['label'] == label]
    labels_dict[label] = dict(
        (sample, rows_of_label[rows_of_label['sample'] == sample].index.values)
        for sample in np.unique(rows_of_label['sample'].values)
    )

# One entry per (label, sample) pair.
phys_labels = []
for key in labels_unique:
    phys_labels.extend([key]*len(labels_dict[key]))
print(len(phys_labels))

652


In [16]:
data_dirname, data_filename = '../data/', 'dataset.npz'

# Labels that are present in the filtered dataset.
labels_init = np.load(data_dirname+data_filename)['label']
labels_actual_unique = np.unique(labels_init)

def labelMapper(x):
    """Return `x` if it occurs in `labels_actual_unique`, otherwise -1."""
    return x if x in labels_actual_unique else -1

exdf['label_actual'] = exdf['label'].map(labelMapper)

# labels_actual_dict[label][sample] -> row indices of `exdf` for that pair.
# Samples are inserted in np.unique() order.
labels_actual_dict = {}
for label in labels_actual_unique:
    rows_of_label = exdf[exdf['label_actual'] == label]
    labels_actual_dict[label] = dict(
        (sample, rows_of_label[rows_of_label['sample'] == sample].index.values)
        for sample in np.unique(rows_of_label['sample'].values)
    )


In [17]:
# One label entry per (label, sample) pair, plus the sample names in the same order.
phys_labels_actual = []
sample_names = []
for key in labels_actual_unique:
    samples_of_key = labels_actual_dict[key]
    phys_labels_actual.extend([key]*len(samples_of_key))
    sample_names.append(samples_of_key.keys())
print(len(phys_labels_actual))
uni_actual, cnt_actual = np.unique(phys_labels_actual, return_counts=True)
# Mean and sample standard deviation of the per-label counts, first label excluded.
print(np.mean(cnt_actual[1:]), np.std(cnt_actual[1:], ddof=1))

np.savez_compressed(
    data_dirname+'physical',
    phys_labels_actual=phys_labels_actual,
    labels_actual_dict=labels_actual_dict,
    sample_names=sample_names
)

# Flat list of sample names, aligned position by position with phys_labels_actual.
snm = reduce(lambda joined, names: joined+names, sample_names)

652
(9.758620689655173, 2.4373579658309357)


In [18]:
n_splits=5
n_repeats=5
random_state=235
filename='physical_cv_indices_nc'
data_dirname = '../data/'

# The folds are drawn over (label, sample) pairs, stratified by label.
y = phys_labels_actual
X = np.empty([len(y), 0])

kfold = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=random_state
)
train_indices = []
test_indices = []
np.random.seed(random_state)
for train_index, test_index in kfold.split(X, y):
    # Training: every row of every training pair.
    phys_train_index = []
    for train_ind in train_index:
        phys_train_index += labels_actual_dict[y[train_ind]][snm[train_ind]].tolist()
    # Testing: one randomly chosen row per test pair.  The draws happen in
    # test_index order, exactly as before, so the random sequence is unchanged.
    phys_test_index = [
        np.random.choice(labels_actual_dict[y[test_ind]][snm[test_ind]])
        for test_ind in test_index
    ]
    train_indices.append(phys_train_index)
    test_indices.append(phys_test_index)

# Written once after all folds are collected (it used to be rewritten after every
# fold).  The path uses this cell's own `data_dirname`; the previously used
# `dirname` from an earlier cell holds the same '../data/' value.
np.savez_compressed(
    data_dirname+filename, n_splits=n_splits, n_repeats=n_repeats, random_state=random_state,
    train_indices=train_indices, test_indices=test_indices
)

## 6. Generating CV indices [for prediction parts]

In [19]:
# Label -> part name via labelMapperParts, then part name -> integer via labelEncoder.
part_names = exdf['label'].map(labelMapperParts)
exdf['parts_label'] = labelEncoder.transform(part_names)
exdf.head()

Unnamed: 0,experiment,label,name_neg,name_pos,treatment,sample_pos,trial_pos,sample_neg,trial_neg,sample,label_actual,parts_label
0,ex1.1,56,pr1_1k10_20ul_1_Seg1Ev2,pr1_1k10_20ul_1_Seg1Ev1,no_treatment,pr1_1k10_20ul,1,pr1_1k10_20ul,1,pr1_1k10_20ul_no_treatment,56,5
1,ex1.1,56,pr1_1k10_20ul_2_Seg1Ev2,pr1_1k10_20ul_2_Seg1Ev1,no_treatment,pr1_1k10_20ul,2,pr1_1k10_20ul,2,pr1_1k10_20ul_no_treatment,56,5
2,ex1.1,53,pr2_1k10_10ul_1_Seg1Ev2,pr2_1k10_10ul_1_Seg1Ev1,no_treatment,pr2_1k10_10ul,1,pr2_1k10_10ul,1,pr2_1k10_10ul_no_treatment,53,6
3,ex1.1,53,pr2_1k10_10ul_2_Seg1Ev2,pr2_1k10_10ul_2_Seg1Ev1,no_treatment,pr2_1k10_10ul,2,pr2_1k10_10ul,2,pr2_1k10_10ul_no_treatment,53,6
4,ex1.1,57,pr3_1k10_10ul_1_Seg1Ev2,pr3_1k10_10ul_1_Seg1Ev1,no_treatment,pr3_1k10_10ul,1,pr3_1k10_10ul,1,pr3_1k10_10ul_no_treatment,57,5


In [20]:
parts_labels_unique = np.unique(exdf['parts_label'])

# parts_labels_dict[label][sample] -> row indices of `exdf` for that pair.
# Samples are inserted in np.unique() order.
parts_labels_dict = {}
for label in parts_labels_unique:
    rows_of_label = exdf[exdf['parts_label'] == label]
    parts_labels_dict[label] = dict(
        (sample, rows_of_label[rows_of_label['sample'] == sample].index.values)
        for sample in np.unique(rows_of_label['sample'].values)
    )

# One label entry per (label, sample) pair, plus the sample names in the same order.
phys_parts_labels = []
parts_sample_names = []
for key in parts_labels_unique:
    samples_of_key = parts_labels_dict[key]
    phys_parts_labels.extend([key]*len(samples_of_key))
    parts_sample_names.append(samples_of_key.keys())
print(len(phys_parts_labels))
# Flat list of sample names, aligned position by position with phys_parts_labels.
psnm = reduce(lambda joined, names: joined+names, parts_sample_names)

652


In [21]:
filename_without_moc_parts_cv = 'physical_cv_indices_parts'

n_splits = 4
n_repeats = 5
# The next variable must guarantee the equivalence of the generated splits and the
# ones used in our research.
random_state = 235

# The folds are drawn over (parts label, sample) pairs, stratified by parts label.
y = phys_parts_labels
X = np.empty([len(y), 0])

kfold = RepeatedStratifiedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=random_state
)
train_indices = []
test_indices = []
np.random.seed(random_state)
for train_index, test_index in kfold.split(X, y):
    # Training: every row of every training pair.
    phys_parts_train_index = []
    for train_ind in train_index:
        phys_parts_train_index += parts_labels_dict[y[train_ind]][psnm[train_ind]].tolist()
    # Testing: one randomly chosen row per test pair.  The draws happen in
    # test_index order, exactly as before, so the random sequence is unchanged.
    phys_parts_test_index = [
        np.random.choice(parts_labels_dict[y[test_ind]][psnm[test_ind]])
        for test_ind in test_index
    ]
    train_indices.append(phys_parts_train_index)
    test_indices.append(phys_parts_test_index)

# Written once after all folds are collected (it used to be rewritten after every
# fold).  `dirname` is the output directory assigned in an earlier cell.
np.savez_compressed(
    dirname+filename_without_moc_parts_cv, n_splits=n_splits, n_repeats=n_repeats,
    random_state=random_state,
    train_indices=train_indices, test_indices=test_indices, class_names=class_names
)


## 7. Stacking test dataset

In [22]:
### dataset2
data_dirname = '../data/csv/'

# Same polarity layout as for the main dataset: positive is entry 0 of the archives
# and is written to output channel 1.
pos_ind_in, pos_ind_out = 0, 1
neg_ind_in, neg_ind_out = (pos_ind_in+1) % 2, (pos_ind_out+1) % 2

# The last two experiments (in sorted order) form the second test set.
dirnames = [data_dirname+name for name in ex_dirnames[-2:]]
result2, labels2, expnm2 = discretize(
    dirnames,
    100., 900., 1.,
    (pos_ind_in, neg_ind_in),
    (pos_ind_out, neg_ind_out)
)

In [23]:
data_dirname = '../data/csv/'

uninteresting_directories = ['ex2.1']

# Directories whose name starts with 'ex2', minus the excluded one, in sorted order.
dirnames = sorted(
    name for name in os.listdir(data_dirname)
    if name.startswith('ex2')
    and name not in uninteresting_directories
    and os.path.isdir(data_dirname+name)
)

# CSV columns starting with one of these prefixes are not sample columns.
uninteresting_column_prefixes = [
    'Flav_all', 'Blank', 'blank', 'Compound', 'Comp', 'm/z', 'Charge', 'Retention time (min)'
]

labels_filename = 'labels.dat'
print(dirnames)

exdf2 = get_exdf(
    dirnames, labels_filename, uninteresting_directories, uninteresting_column_prefixes
)

['ex2.2', 'ex2.3']


In [24]:
# Output directory and base name of the concatenated second test set.
data_dirname, save_filename2 = '../data/', 'test2'

def corrector3(x):
    """Return the part of `x` before its first '_'; without '_', before its first '-'.

    Names containing neither character are returned unchanged.
    """
    if '_' in x:
        return x.partition('_')[0]
    if '-' in x:
        return x.partition('-')[0]
    return x

def getEluentName(x):
    """Return 'm' if `x` contains an 'm', else 'w' if it contains a 'w', else 'e'.

    The marker may occur anywhere in the name, not only at its end.
    """
    if 'm' in x:
        return 'm'
    return 'w' if 'w' in x else 'e'

# The eluent marker is taken from the negative-polarity name and must agree with
# the one found in the positive-polarity name.
exdf2['eluent'] = exdf2['name_neg'].map(getEluentName)
assert np.all(exdf2['eluent'].values == exdf2['name_pos'].apply(getEluentName).values)

exdf2['name_neg'] = exdf2['name_neg'].map(corrector3)
exdf2['name_pos'] = exdf2['name_pos'].map(corrector3)

equipment = {
    'ex2.2': 'LCMS-IT-TOF',
    'ex2.3': 'Agilent_QqQ'
}
eluent_dict = {
    'w': 'water',
    'm': 'methanol',
    'e': 'ethanol'
}


# One archive per (equipment, eluent) combination.
for j_key in equipment:
    for i_key in eluent_dict:
        selected = (exdf2['eluent'] == i_key) & (exdf2['experiment'] == j_key)
        ind = np.where(selected.values)[0]
        current_expnm = exdf2.iloc[ind].values
        current_label = labels2[ind]
        # Rows of exdf2 and entries of labels2/result2 must be aligned.
        assert np.all(current_label == exdf2.label.iloc[ind].values)
        output_path = data_dirname+equipment[j_key]+'_'+eluent_dict[i_key]
        np.savez_compressed(
            output_path,
            label=current_label,
            expnm=current_expnm,
            data=result2[ind]
        )
        print(output_path)

# NOTE(review): 'LCMS-IT-TOF_ethanol.npz' is written by the loop above but is not
# part of this list -- confirm that leaving it out of test2 is intentional.
additional_dataset_filenames = [
    'LCMS-IT-TOF_water.npz', 'LCMS-IT-TOF_methanol.npz',
    'Agilent_QqQ_water.npz', 'Agilent_QqQ_methanol.npz', 'Agilent_QqQ_ethanol.npz'
]

concatenateSeparateToOneDF(additional_dataset_filenames, data_dirname, save_filename2)
print("<%s>: successfully concatenated." % (save_filename2))


../data/LCMS-IT-TOF_ethanol
../data/LCMS-IT-TOF_methanol
../data/LCMS-IT-TOF_water
../data/Agilent_QqQ_ethanol
../data/Agilent_QqQ_methanol
../data/Agilent_QqQ_water
<test2>: successfully concatenated.


In [25]:
filename_test2 = 'test2.npz'
filename_test2_parts = 'test2_parts'
dirname = '../data/'

# The context manager closes the archive once both arrays are read.
with np.load(dirname+filename_test2) as archive:
    Xtest2, ytest2 = archive['data'], archive['label']
# Same label -> part mapping and the same fitted encoder as for the main dataset,
# so the integer codes agree between the two files.
ytest2_new = [labelMapperParts[label] for label in ytest2]
ytest2_new = labelEncoder.transform(ytest2_new)
np.savez_compressed(
    dirname+filename_test2_parts, data=Xtest2, label=ytest2_new
)
