In [1]:
import numpy as np
import pandas as pd
import scipy as sp

from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = [20,10]

from scipy.sparse import load_npz
from scipy.sparse import csr_matrix, hstack, save_npz

## Reading Wim Data

In [2]:
data = np.load('inga_out.npz')

In [3]:
mass = data['arr_0']
mass

array([ 552.293448,  338.046024,  724.221464, ...,  328.072907,
        262.19328 ,  636.278192])

In [4]:
name = data['arr_1']
name

array(['UNPD98266', 'UNPD207163', 'UNPD3499', ..., 'UNPD98267',
       'UNPD47332', 'UNPD101003'], dtype=object)

In [5]:
massabund = data['arr_2']
massabund

array([[  41.00329 ,    1.688456],
       [  43.01894 ,    2.135631],
       [  55.01894 ,    1.105409],
       ..., 
       [ 549.23414 ,   24.377134],
       [ 551.24979 ,   22.666363],
       [ 591.2447  ,    5.496205]])

In [6]:
blockind = data['arr_3']
blockind

array([     0,      0,      0, ..., 220988, 220988, 220988], dtype=uint32)

In [7]:
wim = pd.DataFrame({'comp_name': name, 'mass': mass})
wim.shape

(220989, 2)

In [8]:
wim_inga = wim[wim['comp_name'].str.match('Inga')]
wim_inga.shape

(3291, 2)

In [9]:
wim_inga.head()

Unnamed: 0,comp_name,mass
131,Inga_compound_2296,1728.7834
132,Inga_compound_2295,585.08972
133,Inga_compound_2293,987.533777
134,Inga_compound_2292,1154.5703
135,Inga_compound_2291,722.174222


## Reading Peak & Loss Data

In [10]:
raw = pd.read_pickle('mz_and_losses_long_2018_07_09.pkl')
raw.shape

(2615424, 3)

In [11]:
raw.head()

Unnamed: 0,comp_name,feature_name,intensity
4,UNPD98266,peak_110,1.29489
5,UNPD98266,peak_191,2.076612
12,UNPD98266,peak_5691,16.325846
18,UNPD98266,peak_6984,2.461923
24,UNPD3499,peak_191,3.612248


### Peak Data

In [12]:
peak = raw[raw['feature_name'].str.match('peak')]
peak.shape

(1214577, 3)

In [13]:
peak_inga = peak[peak['comp_name'].str.match('Inga')]
peak_inga.shape

(53508, 3)

In [14]:
peak_inga.head()

Unnamed: 0,comp_name,feature_name,intensity
1391,Inga_compound_2296,peak_398,261.491486
1392,Inga_compound_2296,peak_506,452.270844
1393,Inga_compound_2296,peak_690,853.831665
1394,Inga_compound_2296,peak_867,1476.220703
1395,Inga_compound_2296,peak_1058,228.0


### Loss Data

In [15]:
loss = raw[raw['feature_name'].str.match('loss')]
loss.shape

(1400847, 3)

In [16]:
loss_inga = loss[loss['comp_name'].str.match('Inga')]
loss_inga.shape

(53508, 3)

In [17]:
loss_inga.head()

Unnamed: 0,comp_name,feature_name,intensity
1391,Inga_compound_2296,loss_42207,1.0
1392,Inga_compound_2296,loss_42072,1.0
1393,Inga_compound_2296,loss_41877,1.0
1394,Inga_compound_2296,loss_41699,1.0
1395,Inga_compound_2296,loss_41546,1.0


## Reading Classes Data

In [18]:
classes = pd.read_pickle('compound_names_with_classes.pkl')
classes.head()

Unnamed: 0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin,compound_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD98266
1,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3499
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3493
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3492
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3490


In [19]:
classes = classes.rename(columns={'compound_id': 'comp_name'})
classes.head()

Unnamed: 0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin,comp_name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD98266
1,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3499
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3493
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3492
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,UNPD3490


In [20]:
classes.shape

(76903, 72)

In [21]:
classes_inga = classes[classes['comp_name'].str.match('Inga_compound')]
classes_inga.head()

Unnamed: 0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin,comp_name
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_3927
1,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,Inga_compound_3924
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_3714
3,1,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Inga_compound_3447
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_3258


In [22]:
classes_inga.shape

(842, 72)

In [101]:
classes_names = np.array(classes.columns)
classes_names.shape

(72,)

In [103]:
np.save('classes_names', classes_names[:-1])

## Dealing with Peak Data

### feature_name dictionary

In [23]:
fnames = peak['feature_name'].unique()
fnames

array(['peak_110', 'peak_191', 'peak_5691', ..., 'peak_33790',
       'peak_35474', 'peak_3893'], dtype=object)

In [24]:
fnumbers = np.array([int(str(x)[5:]) for x in fnames])
fnumbers

array([  110,   191,  5691, ..., 33790, 35474,  3893])

In [25]:
fnumbers_sorted = np.sort(fnumbers)
fnumbers_sorted

array([    2,     3,     4, ..., 37325, 37329, 37331])

In [26]:
fnames_sorted = np.array(['peak_'+str(x) for x in fnumbers_sorted])
fnames_sorted

array(['peak_2', 'peak_3', 'peak_4', ..., 'peak_37325', 'peak_37329',
       'peak_37331'],
      dtype='<U10')

In [27]:
fdict = pd.Series(data=np.arange(len(fnames)), index=fnames_sorted, name='feature_id')
fdict.head()

peak_2    0
peak_3    1
peak_4    2
peak_7    3
peak_8    4
Name: feature_id, dtype: int64

### comp_name dictionary

In [28]:
cnames_inga1 = set(peak_inga['comp_name'].unique())
len(cnames_inga1)

3291

In [29]:
cnames_inga2 = set(classes_inga['comp_name'].unique())
len(cnames_inga2)

842

#### training

In [30]:
cnames_inga_training = cnames_inga1 & cnames_inga2
len(cnames_inga_training)

842

In [31]:
cnumbers_inga_training = np.array([int(str(x)[14:]) for x in cnames_inga_training])
cnumbers_inga_training.shape

(842,)

In [32]:
cnumbers_inga_training_sorted = np.sort(cnumbers_inga_training)
cnumbers_inga_training_sorted.shape

(842,)

In [33]:
cnames_inga_training_sorted = np.array(['Inga_compound_'+str(x) for x in cnumbers_inga_training_sorted])
cnames_inga_training_sorted.shape

(842,)

In [34]:
cdict_inga_training = pd.Series(data=np.arange(len(cnames_inga_training)), index=cnames_inga_training_sorted, name='comp_id')
cdict_inga_training.head()

Inga_compound_15    0
Inga_compound_18    1
Inga_compound_20    2
Inga_compound_24    3
Inga_compound_25    4
Name: comp_id, dtype: int64

In [105]:
cdict_inga_training2 = pd.Series(index=np.arange(len(cnames_inga_training)), data=cnames_inga_training_sorted, name='comp_id')
cdict_inga_training2.head()

0    Inga_compound_15
1    Inga_compound_18
2    Inga_compound_20
3    Inga_compound_24
4    Inga_compound_25
Name: comp_id, dtype: object

In [112]:
cdict_inga_training2.to_csv('dict_training.csv', index=False)

#### testing

In [35]:
cnames_inga_testing = cnames_inga1 - cnames_inga2
len(cnames_inga_testing)

2449

In [36]:
cnumbers_inga_testing = np.array([int(str(x)[14:]) for x in cnames_inga_testing])
cnumbers_inga_testing.shape

(2449,)

In [37]:
cnumbers_inga_testing_sorted = np.sort(cnumbers_inga_testing)
cnumbers_inga_testing_sorted.shape

(2449,)

In [38]:
cnames_inga_testing_sorted = np.array(['Inga_compound_'+str(x) for x in cnumbers_inga_testing_sorted])
cnames_inga_testing_sorted.shape

(2449,)

In [39]:
cdict_inga_testing = pd.Series(data=np.arange(len(cnames_inga_testing)), index=cnames_inga_testing_sorted, name='comp_id')
cdict_inga_testing.head()

Inga_compound_1    0
Inga_compound_2    1
Inga_compound_3    2
Inga_compound_4    3
Inga_compound_5    4
Name: comp_id, dtype: int64

In [107]:
cdict_inga_testing2 = pd.Series(index=np.arange(len(cnames_inga_testing)), data=cnames_inga_testing_sorted, name='comp_id')
cdict_inga_testing2.head()

0    Inga_compound_1
1    Inga_compound_2
2    Inga_compound_3
3    Inga_compound_4
4    Inga_compound_5
Name: comp_id, dtype: object

In [111]:
cdict_inga_testing2.to_csv('dict_testing.csv', index=False)

### Appending new columns

#### training

In [40]:
ind_peak_inga_training = peak_inga['comp_name'].apply(lambda x: x in cnames_inga_training)
ind_peak_inga_training.shape

(53508,)

In [41]:
new_peak_inga_training = peak_inga[ind_peak_inga_training]
new_peak_inga_training.shape

(13599, 3)

In [42]:
new_peak_inga_training['feature_id'] = new_peak_inga_training['feature_name'].map(lambda x: fdict.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
new_peak_inga_training['comp_id'] = new_peak_inga_training['comp_name'].map(lambda x: cdict_inga_training.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [44]:
new_peak_inga_training.head()

Unnamed: 0,comp_name,feature_name,intensity,feature_id,comp_id
1412,Inga_compound_2295,peak_746,15504.016632,413,470
1413,Inga_compound_2295,peak_907,13900.255432,490,470
1414,Inga_compound_2295,peak_1249,7572.683838,655,470
1415,Inga_compound_2295,peak_1259,12585.825684,659,470
1416,Inga_compound_2295,peak_1272,15757.078171,665,470


In [45]:
new_peak_inga_training.shape

(13599, 5)

#### testing

In [46]:
ind_peak_inga_testing = peak_inga['comp_name'].apply(lambda x: x in cnames_inga_testing)
ind_peak_inga_testing.shape

(53508,)

In [47]:
new_peak_inga_testing = peak_inga[ind_peak_inga_testing]
new_peak_inga_testing.shape

(39909, 3)

In [48]:
new_peak_inga_testing['feature_id'] = new_peak_inga_testing['feature_name'].map(lambda x: fdict.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [49]:
new_peak_inga_testing['comp_id'] = new_peak_inga_testing['comp_name'].map(lambda x: cdict_inga_testing.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [50]:
new_peak_inga_testing.head()

Unnamed: 0,comp_name,feature_name,intensity,feature_id,comp_id
1391,Inga_compound_2296,peak_398,261.491486,201,1475
1392,Inga_compound_2296,peak_506,452.270844,266,1475
1393,Inga_compound_2296,peak_690,853.831665,379,1475
1394,Inga_compound_2296,peak_867,1476.220703,469,1475
1395,Inga_compound_2296,peak_1058,228.0,560,1475


In [51]:
new_peak_inga_testing.shape

(39909, 5)

### Generating sparse matrix

In [52]:
def generateSparse(data, row, col, fdict):
    mat_raw = csr_matrix((data, (row, col)))
    mat_raw_shape = mat_raw.shape
    mat_addition = np.zeros((mat_raw.shape[0], fdict.max()+1-mat_raw.shape[1]))
    mat = hstack([mat_raw, mat_addition])
    return mat

#### training

In [53]:
mat_peak_inga_training = generateSparse(new_peak_inga_training['intensity'], 
                                           new_peak_inga_training['comp_id'],
                                           new_peak_inga_training['feature_id'],
                                           fdict)
mat_peak_inga_training

<842x7492 sparse matrix of type '<class 'numpy.float64'>'
	with 13599 stored elements in COOrdinate format>

#### testing

In [54]:
mat_peak_inga_testing = generateSparse(new_peak_inga_testing['intensity'], 
                                          new_peak_inga_testing['comp_id'],
                                          new_peak_inga_testing['feature_id'],
                                          fdict)
mat_peak_inga_testing

<2449x7492 sparse matrix of type '<class 'numpy.float64'>'
	with 39909 stored elements in COOrdinate format>

## Dealing with Mass Data

### training

#### Filtering out unselected records

In [55]:
ind_wim_inga_training = wim_inga['comp_name'].apply(lambda x: x in cnames_inga_training)
ind_wim_inga_training.shape

(3291,)

In [56]:
new_wim_inga_training = wim_inga[ind_wim_inga_training]
new_wim_inga_training.shape

(842, 2)

#### Sorting by comp_id

In [57]:
new_wim_inga_training['comp_id'] = new_wim_inga_training['comp_name'].map(lambda x: cdict_inga_training.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [58]:
new_wim_inga_training = new_wim_inga_training.set_index('comp_id')

In [59]:
new_wim_inga_training = new_wim_inga_training.sort_index()

In [60]:
new_wim_inga_training.shape

(842, 2)

In [61]:
new_wim_inga_training.head()

Unnamed: 0_level_0,comp_name,mass
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Inga_compound_15,471.055939
1,Inga_compound_18,449.107866
2,Inga_compound_20,938.509671
3,Inga_compound_24,1043.5413
4,Inga_compound_25,435.092107


#### Generating dense matrix

In [62]:
mat_wim_inga_training = new_wim_inga_training.iloc[:,-1].as_matrix()

In [63]:
mat_wim_inga_training.shape

(842,)

#### Merging with sparse peak matrix

In [64]:
mat_features_inga_training = hstack([mat_wim_inga_training.reshape(-1,1), mat_peak_inga_training])

In [65]:
mat_features_inga_training.toarray().shape

(842, 7493)

### testing

#### Filtering out unselected records

In [66]:
ind_wim_inga_testing = wim_inga['comp_name'].apply(lambda x: x in cnames_inga_testing)
ind_wim_inga_testing.shape

(3291,)

In [67]:
new_wim_inga_testing = wim_inga[ind_wim_inga_testing]
new_wim_inga_testing.shape

(2449, 2)

#### Sorting by comp_id

In [68]:
new_wim_inga_testing['comp_id'] = new_wim_inga_testing['comp_name'].map(lambda x: cdict_inga_testing.get(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [69]:
new_wim_inga_testing = new_wim_inga_testing.set_index('comp_id')

In [70]:
new_wim_inga_testing = new_wim_inga_testing.sort_index()

In [71]:
new_wim_inga_testing.shape

(2449, 2)

In [72]:
new_wim_inga_testing.head()

Unnamed: 0_level_0,comp_name,mass
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Inga_compound_1,319.045218
1,Inga_compound_2,1888.8495
2,Inga_compound_3,1845.9285
3,Inga_compound_4,1904.9725
4,Inga_compound_5,1690.815957


#### Generating dense matrix

In [73]:
mat_wim_inga_testing = new_wim_inga_testing.iloc[:,-1].as_matrix()

In [74]:
mat_wim_inga_testing.shape

(2449,)

#### Merging with sparse peak matrix

In [75]:
mat_features_inga_testing = hstack([mat_wim_inga_testing.reshape(-1,1), mat_peak_inga_testing])

In [76]:
mat_features_inga_testing.toarray().shape

(2449, 7493)

## Dealing with Class Data

#### Filtering out unselected records

In [77]:
ind_classes_inga = classes_inga['comp_name'].apply(lambda x: x in cnames_inga_training)
ind_classes_inga.shape

(842,)

In [78]:
new_classes_inga = classes_inga[ind_classes_inga]
new_classes_inga.shape

(842, 72)

#### Sorting by comp_id

In [79]:
new_classes_inga['comp_id'] = new_classes_inga['comp_name'].map(lambda x: cdict_inga_training.get(x))

In [80]:
new_classes_inga = new_classes_inga.set_index('comp_id')

In [81]:
new_classes_inga = new_classes_inga.sort_index()

In [82]:
new_classes_inga.shape

(842, 72)

In [83]:
new_classes_inga.head()

Unnamed: 0_level_0,Phenolic,Flavonoid,Flavone,Flavonol,DihydroFlavone,Flavan3ol,Dihydroflavonol,Anthocyanidin,Chalcone,Aurone,...,QuinicAcid,Naringenin,Catechin_OR_Epicatechin,Myricetin,Kaempferol,Afzelechin,Gallocatechin,Quercetin,Apigenin,comp_name
comp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,Inga_compound_15
1,1,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,Inga_compound_18
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_20
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Inga_compound_24
4,1,1,0,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,Inga_compound_25


#### Generating dense matrix

In [84]:
mat_classes_inga = new_classes_inga.iloc[:,:-1].as_matrix()

In [85]:
mat_classes_inga.shape

(842, 71)

## Writing to Files

In [92]:
mat_features_inga_training

<842x7493 sparse matrix of type '<class 'numpy.float64'>'
	with 14441 stored elements in COOrdinate format>

In [93]:
mat_features_inga_testing

<2449x7493 sparse matrix of type '<class 'numpy.float64'>'
	with 42358 stored elements in COOrdinate format>

In [95]:
mat_classes_inga.shape

(842, 71)

In [96]:
np.save('Xtrain', mat_features_inga_training.todense())

In [97]:
np.save('Xtest', mat_features_inga_testing.todense())

In [98]:
np.save('Ytrain', mat_classes_inga)