Generate a corpus from a large spectra (MGF) file using ms2lda.
Store the output as JSON, NumPy array (\*.npy), and SciPy sparse matrix (\*.npz) files.

In [1]:
import sys

# Make the ms2ldaviz sources importable; both directories are resolved relative
# to the notebook's working directory and are appended in this order.
sys.path.extend([
    '../ms2ldaviz/lda/code',
    '../ms2ldaviz/ms2ldaviz',
])

In [2]:
from ms2lda_feature_extraction import MakeBinnedFeatures, LoadMGF

The MGF file was downloaded from https://gnps.ucsd.edu/ProteoSAFe/gnpslibrary.jsp?library=all (direct link: ftp://ccms-ftp.ucsd.edu/Spectral_Libraries/ALL_GNPS.mgf).

In [3]:
# Input spectra file, resolved relative to the notebook's working directory.
# NOTE(review): this file name differs from the ALL_GNPS.mgf mentioned in the
# text above -- confirm which input the recorded outputs were produced from.
ms2_file = 'METABOLOMICS-SNETS-6e22f85a-download_clustered_spectra-main.mgf'

In [4]:
# Configure the MGF loader. The meaning and units of the thresholds are defined
# by ms2lda_feature_extraction.LoadMGF -- TODO confirm against that module.
loader = LoadMGF(
    min_ms1_intensity=0.0,  # no MS1 intensity cut-off
    min_ms2_intensity=25,   # MS2 intensity threshold (see "Filtering MS2 on intensity" in the load output)
    mz_tol=5,               # m/z tolerance (presumably ppm -- verify)
    rt_tol=10,              # retention-time tolerance (presumably seconds -- verify)
    peaklist=None,          # no peak list supplied
)

In [5]:
# Load the memory_profiler IPython extension; it provides the %memit magic used
# in the cells below to report peak memory.
%load_ext memory_profiler

In [6]:
# Load the spectra file. load_spectra returns three values, bound here as
# ms1, ms2 and metadata; only ms2 is used by the later cells shown in this notebook.
# %time reports CPU/wall time and %memit reports peak memory for the call.
%time %memit ms1, ms2, metadata = loader.load_spectra([ms2_file])

Filtering MS2 on intensity
17025699 MS2 remaining
peak memory: 4300.30 MiB, increment: 4252.23 MiB
CPU times: user 4min 20s, sys: 5.19 s, total: 4min 25s
Wall time: 4min 26s


In [16]:
fm = MakeBinnedFeatures()
%time %memit corpus, features = fm.make_features(ms2)
corpus = corpus[corpus.keys()[0]]

107122 documents
After removing empty words, 229899 words left
peak memory: 5872.04 MiB, increment: 1733.77 MiB
CPU times: user 4min 7s, sys: 7.98 s, total: 4min 15s
Wall time: 4min 19s


In [17]:
# ms2 is not referenced again in this notebook, so drop the reference;
# %memit reports the memory use around the deletion.
%memit del ms2

peak memory: 5890.69 MiB, increment: 0.23 MiB


In [20]:
import json

In [21]:
# Persist the feature description returned by make_features as a JSON document.
with open('METABOLOMICS-SNETS-6e22f85a-download_clustered_spectra-main.features.json', 'w') as features_file:
    json.dump(features, features_file)

In [25]:
import numpy as np
from scipy.sparse import dok_matrix

In [28]:
# Row labels of the matrix: one entry per document key, in corpus iteration order.
# list(corpus) yields the same keys as corpus.keys() but is always a real list;
# where keys() returns a view (Python 3), np.array would wrap the view as a
# single 0-d object instead of building a 1-d array of keys.
doc_index = np.array(list(corpus))

In [32]:
# Save the document (row) index. np.save writes a binary .npy stream, so the
# file is opened in binary mode: text mode ('w') fails on Python 3 and can
# corrupt the data through newline translation on Windows.
with open('METABOLOMICS-SNETS-6e22f85a-download_clustered_spectra-main.docindex.npy', 'wb') as f:
    np.save(f, doc_index)

In [33]:
word_index = set()   # vocabulary accumulator: filled below with every distinct word key found in the corpus

In [35]:
# Collect the vocabulary: the union of the word keys of every document.
# set.update only adds elements that are not yet present, so no explicit
# membership test is needed, and iterating a dict yields its keys.
# values() is used instead of itervalues() so the cell also runs on Python 3.
for words in corpus.values():
    word_index.update(words)

In [44]:
# Freeze the vocabulary into an array; its order defines the matrix columns.
np_word_index = np.array(list(word_index))
# np.save writes a binary .npy stream, so the file is opened in binary mode:
# text mode ('w') fails on Python 3 and can corrupt the data through newline
# translation on Windows.
with open('METABOLOMICS-SNETS-6e22f85a-download_clustered_spectra-main.wordindex.npy', 'wb') as f:
    np.save(f, np_word_index)

In [46]:
# Empty DOK (dictionary-of-keys) sparse matrix with one row per entry of
# doc_index and one column per entry of np_word_index; it is filled entry by
# entry in a later cell.
m = dok_matrix((len(doc_index), len(np_word_index)))

In [47]:
# Reverse lookup: document key -> row number, following the order of doc_index.
index_doc = {doc_key: row for row, doc_key in enumerate(doc_index)}

In [49]:
# Reverse lookup: word -> column number.
# The mapping is derived from np_word_index (the array that is saved to
# wordindex.npy) rather than from a second pass over the word_index set, so the
# matrix columns are guaranteed to match the saved index even if the set is
# modified or rebuilt in between. tolist() converts the NumPy scalars back to
# native Python objects for use as dictionary keys.
index_word = {word: col for col, word in enumerate(np_word_index.tolist())}

In [50]:
# Copy every (document, word) score from the nested-dict corpus into the sparse
# matrix, using the two lookup tables to translate keys into row/column numbers.
for doc_key, word_scores in corpus.items():
    for word_key, value in word_scores.items():
        row = index_doc[doc_key]
        col = index_word[word_key]
        m[row, col] = value

In [52]:
# The scores have been copied into m; drop the nested-dict corpus
# (presumably to release memory before the format conversion).
del corpus

In [53]:
# Convert the DOK matrix to CSR form; m2 is the object that gets written to disk.
m2 = m.tocsr()

In [54]:
from scipy.sparse import save_npz

In [55]:
# Write the document-by-word matrix as a SciPy .npz file. Its row and column
# labels are stored separately in the docindex/wordindex .npy files saved earlier.
save_npz('METABOLOMICS-SNETS-6e22f85a-download_clustered_spectra-main.corpus.npz', m2)