Initial run to test proteomics data with LDA
=============================================

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.display import display, HTML
import numpy as np
import pandas as pd
import re

import os
import sys
# Put the parent directory on the import path ahead of the project-local import
# that follows (presumably where multifile_feature lives; confirm against the repo layout).
basedir = '..'
sys.path.append(basedir)

from multifile_feature import MultifileFeatureExtractor

# living dangerously by suppressing all annoying warning messages
# NOTE(review): a bare 'ignore' filter hides every warning category from this
# point on, including deprecation and numerical warnings.
import warnings
warnings.filterwarnings('ignore')

<h2>1. Feature Extraction</h2>

We need to parse the .MGF file and turn it into the count matrix

In [2]:
def parse_mgf(filename, debug=False):
    """Parse an MGF file into MS1 (precursor) and MS2 (fragment) peak tables.

    Every ``BEGIN IONS`` ... ``END IONS`` record yields one MS1 row built from
    its ``PEPMASS``, ``RTINSECONDS`` and ``CHARGE`` headers, plus one MS2 row
    per peak line. Peak IDs come from a single running counter starting at 1:
    the precursor takes its ID when the record opens and its fragments take
    the IDs that follow.

    Parameters
    ----------
    filename : str
        Path of the MGF file to read.
    debug : bool, optional
        If True, print each precursor and its fragments while parsing.

    Returns
    -------
    (ms1, ms2) : tuple of pandas.DataFrame
        Both frames are indexed by peak ID and share the columns ``peakID``,
        ``MSnParentPeakID``, ``msLevel``, ``mz``, ``rt``, ``intensity`` and
        ``charge``. MS1 rows carry msLevel 1, a NaN parent and intensity 0;
        MS2 rows carry msLevel 2, the precursor's ID as parent, rt 0 and a
        NaN charge. The charge is kept as the raw string from the file
        (e.g. ``'2+'``) and is NaN when the record has no CHARGE header.

    Raises
    ------
    ValueError
        If a PEPMASS/RTINSECONDS value or a peak line is not numeric, or a
        peak line has fewer than two fields.
    """
    columns = ['peakID', 'MSnParentPeakID', 'msLevel', 'mz', 'rt', 'intensity', 'charge']
    ms1_peakids = []
    ms1_peakdata = []
    ms2_peakids = []
    ms2_peakdata = []
    with open(filename, "r") as ins:

        ms1_id = None
        pep_mass = None
        pep_rt = None
        pep_charge = np.nan
        fragments = []
        peak_id = 1
        for line in ins:

            line = line.strip()
            # skip empty lines and MGF comment lines (leading '#', ';', '!' or '/')
            if not line or line[0] in '#;!/':
                continue

            # KEY=value header line; peak lines never contain '='
            if '=' in line:
                key, _, value = line.partition('=')
                key = key.strip().upper()
                fields = value.split()
                # only the first field matters, e.g. PEPMASS may be "mass intensity"
                first = fields[0] if fields else ''
                if key == 'RTINSECONDS':
                    pep_rt = float(first)
                elif key == 'PEPMASS':
                    pep_mass = float(first)
                elif key == 'CHARGE':
                    pep_charge = first
                # TITLE and any other header (SCANS, MSLEVEL, ...) are ignored
                # rather than being mistaken for a peak line
                continue

            fields = line.split()  # any whitespace, so tab-separated peaks work too
            tok = fields[0].upper()

            if tok == 'BEGIN':
                # allocate the precursor ID when the record opens, so records
                # without RTINSECONDS still get their own ID
                ms1_id = peak_id
                peak_id += 1
            elif tok == 'END':

                if debug:
                    print('%s %s %s %s' % (ms1_id, pep_mass, pep_rt, pep_charge))
                ms1_peakdata.append((ms1_id, np.nan, 1, pep_mass, pep_rt, 0, pep_charge))
                ms1_peakids.append(ms1_id)

                for ms2_id, ms2_mass, ms2_intensity in fragments:
                    if debug:
                        print('- %d %f %f' % (ms2_id, ms2_mass, ms2_intensity))
                    ms2_peakdata.append((ms2_id, ms1_id, 2, ms2_mass, 0, ms2_intensity, np.nan))
                    ms2_peakids.append(ms2_id)
                if debug:
                    print('')

                # reset for the next record
                pep_mass = None
                pep_rt = None
                pep_charge = np.nan
                fragments = []

            else: # read the fragments
                if len(fields) < 2:
                    raise ValueError('Malformed peak line: %r' % line)
                ms2_mass = float(fields[0])
                ms2_intensity = float(fields[1])
                fragments.append((peak_id, ms2_mass, ms2_intensity))
                peak_id += 1

    ms1 = pd.DataFrame(ms1_peakdata, index=ms1_peakids, columns=columns)
    ms2 = pd.DataFrame(ms2_peakdata, index=ms2_peakids, columns=columns)

    return ms1, ms2

In [3]:
# Input spectra file; the relative path is resolved against the notebook's
# working directory.
filename = 'iPRG2012_small.mgf'
# ms1 holds one row per precursor, ms2 one row per fragment peak.
ms1, ms2 = parse_mgf(filename)

In [4]:
# Preview the parsed precursor (MS1) table and report its (rows, columns) shape.
display(ms1.head(10))
# Parenthesised print: identical output on Python 2 for a single argument, and valid on Python 3.
print(ms1.shape)

Unnamed: 0,peakID,MSnParentPeakID,msLevel,mz,rt,intensity,charge
1,1,,1,986.222592,1.85,0,
6,6,,1,1117.290047,2.35,0,
10,10,,1,951.174259,114.576,0,2+
60,60,,1,685.120003,115.109,0,2+
97,97,,1,818.148264,115.209,0,2+
141,141,,1,943.18612,115.309,0,
162,162,,1,1076.713199,115.409,0,2+
184,184,,1,1084.202678,115.559,0,2+


(8, 7)


In [5]:
# Preview the parsed fragment (MS2) table and report its (rows, columns) shape.
display(ms2.head(10))
# Parenthesised print: identical output on Python 2 for a single argument, and valid on Python 3.
print(ms2.shape)

Unnamed: 0,peakID,MSnParentPeakID,msLevel,mz,rt,intensity,charge
2,2,1,2,986.331999,0,69.148811,
3,3,1,2,989.62616,0,72.000984,
4,4,1,2,989.716248,0,61.076389,
5,5,1,2,989.794898,0,94.243019,
7,7,6,2,1114.994507,0,69.292564,
8,8,6,2,1117.045898,0,61.075764,
9,9,6,2,1118.765479,0,62.225277,
11,11,10,2,159.020981,0,9.268942,
12,12,10,2,213.025406,0,12.0,
13,13,10,2,213.038666,0,11.268942,


(219, 7)


In [6]:
# Settings for the feature extraction cells that follow.
# One-element list of (ms1, ms2) pairs, i.e. a single input file.
input_set = [(ms1, ms2)]
# Grouping tolerances passed to the MultifileFeatureExtractor constructor
# (units are not visible in this notebook; presumably ppm, TODO confirm).
fragment_grouping_tol = 7
loss_grouping_tol = 15
# Loss thresholds passed to the extractor constructor; their exact meaning is
# defined in MultifileFeatureExtractor, which is not shown here.
loss_threshold_min_count = 15
loss_threshold_max_val = 200
# Second argument of extractor.normalise().
scaling_factor = 1000

In [7]:
# Build the extractor directly from the in-memory (ms1, ms2) DataFrames in
# input_set, hence input_type='dataframe'.
extractor = MultifileFeatureExtractor(input_set, fragment_grouping_tol, loss_grouping_tol, 
                                      loss_threshold_min_count, loss_threshold_max_val,
                                     input_type='dataframe')

Loading MS1 dataframe 8 X 7
Loading MS2 dataframe 219 X 7


In [8]:
# Queue up the fragment features, then group them using the fragment tolerance
# stored on the extractor.
fragment_q = extractor.make_fragment_queue()
fragment_groups = extractor.group_features(fragment_q, extractor.fragment_grouping_tol)

Processing fragments for file 0
Total groups=173


In [9]:
# Same grouping step for the losses, using the loss tolerance. check_threshold=True
# is only passed here, not for fragments (presumably it applies the loss_threshold_*
# settings; confirm in MultifileFeatureExtractor.group_features).
loss_q = extractor.make_loss_queue()
loss_groups = extractor.group_features(loss_q, extractor.loss_grouping_tol, 
                                       check_threshold=True)

Processing losses for file 0
Total groups=1


In [10]:
# Build the per-file feature matrix from the fragment and loss groups;
# sparse=False requests the dense representation.
extractor.create_dataframes(fragment_groups, loss_groups, sparse=False)

173 fragment words
1 loss words
Initialising dense dataframe
Populating dataframes
Populating dataframe for fragment group 0/173
Populating dataframe for fragment group 100/173
Populating dataframe for loss group 0/1


In [11]:
# Index of the (only) input file in input_set.
f = 0
extractor.normalise(f, scaling_factor)
# NOTE: this rebinds ms1 and ms2, replacing the frames returned by parse_mgf()
# with whatever the extractor returns for file f.
df, vocab, ms1, ms2 = extractor.get_entry(f)

file 0 data shape (8, 174)


In [12]:
# Show the normalised feature matrix returned by get_entry().
display(df)

Unnamed: 0,fragment_159.02098,fragment_159.02271,fragment_213.02541,fragment_213.02794,fragment_213.03315,fragment_213.03867,fragment_214.97896,fragment_231.02635,fragment_231.03772,fragment_231.03993,...,fragment_1103.18982,fragment_1103.23975,fragment_1105.17065,fragment_1105.19177,fragment_1105.20837,fragment_1114.99451,fragment_1117.0459,fragment_1118.76548,fragment_1369.2898,loss_871.17021
986.22259_1.85_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1117.29005_2.35_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,359.0,317.0,323.0,0.0,0.0
951.17426_114.576_10,2.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,5.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
685.12_115.109_60,0.0,9.0,0.0,0.0,24.0,0.0,6.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
818.14826_115.209_97,0.0,4.0,0.0,11.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
943.18612_115.309_141,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1076.7132_115.409_162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1084.20268_115.559_184,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4.0


<hr/>

Sparse test

In [13]:
# Re-run the whole extraction pipeline from scratch on the same input_set.
# NOTE(review): the recorded log for this cell reports the MS2 frame as 219 X 9,
# against 219 X 7 on the first run, which suggests the first run added columns to
# the shared ms2 DataFrame in place. Confirm, and pass copies if that is unintended.
extractor = MultifileFeatureExtractor(input_set, fragment_grouping_tol, loss_grouping_tol, 
                                      loss_threshold_min_count, loss_threshold_max_val,
                                     input_type='dataframe')
fragment_q = extractor.make_fragment_queue()
fragment_groups = extractor.group_features(fragment_q, extractor.fragment_grouping_tol)
loss_q = extractor.make_loss_queue()
loss_groups = extractor.group_features(loss_q, extractor.loss_grouping_tol, 
                                       check_threshold=True)
# NOTE(review): this section is titled "Sparse test", yet sparse=False is passed,
# exactly as in the dense run earlier (the log also says "Initialising dense
# dataframe"). Confirm whether sparse=True was intended.
extractor.create_dataframes(fragment_groups, loss_groups, sparse=False)
f = 0
extractor.normalise(f, scaling_factor)
# Only the feature matrix is kept; the vocabulary and peak tables are discarded.
df2, _, _, _ = extractor.get_entry(f)
display(df2)

Loading MS1 dataframe 8 X 7
Loading MS2 dataframe 219 X 9
Processing fragments for file 0
Total groups=173
Processing losses for file 0
Total groups=1
173 fragment words
1 loss words
Initialising dense dataframe
Populating dataframes
Populating dataframe for fragment group 0/173
Populating dataframe for fragment group 100/173
Populating dataframe for loss group 0/1
file 0 data shape (8, 174)


Unnamed: 0,fragment_159.02098,fragment_159.02271,fragment_213.02541,fragment_213.02794,fragment_213.03315,fragment_213.03867,fragment_214.97896,fragment_231.02635,fragment_231.03772,fragment_231.03993,...,fragment_1103.18982,fragment_1103.23975,fragment_1105.17065,fragment_1105.19177,fragment_1105.20837,fragment_1114.99451,fragment_1117.0459,fragment_1118.76548,fragment_1369.2898,loss_871.17021
986.22259_1.85_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1117.29005_2.35_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,359.0,317.0,323.0,0.0,0.0
951.17426_114.576_10,2.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,5.0,...,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
685.12_115.109_60,0.0,9.0,0.0,0.0,24.0,0.0,6.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
818.14826_115.209_97,0.0,4.0,0.0,11.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
943.18612_115.309_141,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1076.7132_115.409_162,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1084.20268_115.559_184,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4.0


<h2>2. Analysis</h2>

<h3>a. Run LDA</h3>

Once the data has been loaded and the feature matrix has been built in step 1, we're ready to run LDA.

In [None]:
# Ms2Lda is otherwise only imported in the "resume project" cell further down, so
# on a fresh top-to-bottom run this cell would fail with a NameError without the
# import here. The parent directory is already on sys.path from the first cell.
from lda_for_fragments import Ms2Lda

# Wrap the feature matrix, vocabulary and peak tables from get_entry() for LDA.
ms2lda = Ms2Lda(df, vocab, ms1, ms2)

In [None]:
### all the parameters you need to specify to run LDA ###
# NOTE(review): no random seed is set anywhere in this notebook; whether run_lda
# accepts one is not visible here, so results may differ between runs.

n_topics = 300 # 300 - 400 topics from cross-validation
n_samples = 10 # 100 is probably okay for testing. For manuscript, use > 500-1000.
n_burn = 0 # if 0 then we only use the last sample
n_thin = 1 # every n-th sample to use for averaging after burn-in. Ignored if n_burn = 0
alpha = 50.0/n_topics # hyper-parameter for document-topic distributions
beta = 0.1 # hyper-parameter for topic-word distributions

# Run LDA on the data held by ms2lda with the settings above.
ms2lda.run_lda(n_topics, n_samples, n_burn, n_thin, alpha, beta)

In [None]:
# leave the message parameter out if nothing to say
ms2lda.save_project('results/analysis.project', message="First try")

<hr/>

**resume project**

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Setup repeated from the top of the notebook, presumably so a saved project can
# be resumed on a fresh kernel without re-running the feature extraction.
import sys
basedir = '../'
sys.path.append(basedir)

from lda_for_fragments import Ms2Lda

In [None]:
# Reload the project from the same path that the save_project() cell writes to.
ms2lda = Ms2Lda.resume_from('results/analysis.project')

In [None]:
# Apply cut-offs to the document-topic and topic-word values (going by the keyword
# names; the exact semantics are defined in Ms2Lda.do_thresholding, not shown here).
ms2lda.do_thresholding(th_doc_topic=0.05, th_topic_word=0.01)

In [None]:
# Print the words of each topic (presumably those that survive the thresholding
# step; confirm in Ms2Lda.print_topic_words).
ms2lda.print_topic_words()

In [None]:
# Plot the LDA results for the fragments; interactive=True requests the
# interactive mode of the plot (behaviour defined in Ms2Lda, not shown here).
ms2lda.plot_lda_fragments(interactive=True)