In [2]:
from collections import defaultdict
from functools import reduce
from pathlib import Path
from time import perf_counter
import sys
from pandas import CategoricalDtype
import numpy as np
from pyopenms import *
import pandas as pd
import os

In [9]:

class ConsensusMapDF(ConsensusMap):
    def __init__(self):
        super().__init__()

    def get_intensity_df(self):
        labelfree = self.getExperimentType() == "label-free"
        filemeta = self.getColumnHeaders()  # type: dict[int, ColumnHeader]
        labels = list(set([header.label for header in
                           filemeta.values()]))  # TODO could be more efficient. Do we require same channels in all files?
        files = list(set([header.filename for header in filemeta.values()]))
        label_to_idx = {k: v for v, k in enumerate(labels)}
        file_to_idx = {k: v for v, k in enumerate(files)}

        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        if not labelfree:
            # TODO write two functions for LF and labelled. One has only one channel, the other has only one file per CF
            def extractRowBlocksChannelWideFileLong(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                filerows = defaultdict(lambda: [0] * len(labels))  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row = filerows[header.filename]
                    row[label_to_idx[header.label]] = fh.getIntensity()
                return (f.getUniqueId(), filerows)

            def extractRowsChannelWideFileLong(f: ConsensusFeature):
                uniqueid, rowdict = extractRowBlocksChannelWideFileLong(f)
                for file, row in rowdict.items():
                    row.append(file)
                    yield tuple([uniqueid] + row)

            if len(labels) == 1:
                labels[0] = "intensity"
            dtypes = [('id', np.dtype('uint64'))] + list(zip(labels, ['f'] * len(labels)))
            dtypes.append(('file', 'U300'))
            # For TMT we know that every feature can only be from one file, since feature = PSM
            #cnt = 0
            #for f in self:
            #    cnt += f.size()

            intyarr = np.fromiter(iter=gen(self, extractRowsChannelWideFileLong), dtype=dtypes, count=self.size())
            return pd.DataFrame(intyarr).set_index('id')
        else:
            # Specialized for LabelFree which has to have only one channel
            def extractRowBlocksChannelLongFileWideLF(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                row = [0.] * len(files)  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row[file_to_idx[header.filename]] = fh.getIntensity()
                yield tuple([f.getUniqueId()] + row)

            dtypes = [('id', np.dtype('uint64'))] + list(zip(files, ['f'] * len(files)))
            # cnt = self.size()*len(files) # TODO for this to work, we would need to fill with NAs for CFs that do not go over all files
            cnt = self.size()

            intyarr = np.fromiter(iter=gen(self, extractRowBlocksChannelLongFileWideLF), dtype=dtypes, count=cnt)
            return pd.DataFrame(intyarr).set_index('id')

    def get_metadata_df(self):
        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        def extractMetaData(f: ConsensusFeature):
            # subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
            pep = f.getPeptideIdentifications()  # type: list[PeptideIdentification]
            if len(pep) != 0:
                hits = pep[0].getHits()
                if len(hits) != 0:
                    besthit = hits[0]  # type: PeptideHit
                    # TODO what else
                    yield f.getUniqueId(), besthit.getSequence().toString(), f.getCharge(), f.getRT(), f.getMZ(), f.getQuality()
                else:
                    yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality()
            else:
                yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality()

        cnt = self.size()

        mddtypes = [('id', np.dtype('uint64')), ('sequence', 'U200'), ('charge', 'i4'), ('RT', 'f'), ('mz', 'f'),
                    ('quality', 'f')]
        mdarr = np.fromiter(iter=gen(self, extractMetaData), dtype=mddtypes, count=cnt)
        return pd.DataFrame(mdarr).set_index('id')

In [5]:

    cmap = ConsensusMapDF()
    ConsensusXMLFile().load("BSA_linked.consensusXML", cmap)

    print(cmap.get_intensity_df())
    print(cmap.get_metadata_df())

                      share/OpenMS/examples/FRACTIONS/BSA2_F1_aligned.featureXML  \
id                                                                                 
3980443836649891777                                          27983300.0            
6980891348972231939                                           2201550.0            
3593342995735647332                                           2367000.0            
14923483173153195451                                          1864890.0            
11488955034596218319                                           162836.0            
...                                                                 ...            
3982769909008338823                                            275799.0            
2764560358899928592                                          17297700.0            
2354319252995205019                                            756676.0            
4374451759732988964                                            367928.0     

In [8]:
    cmap = ConsensusMapDF()
    # TODO find a file, from a more complex experiment with merged cXML from multiple files
    ConsensusXMLFile().load("IsobaricAnalyzer_output_1.consensusXML", cmap)

    print(cmap.get_intensity_df())
    print(cmap.get_metadata_df())

                      itraq4plex_117  itraq4plex_114  itraq4plex_116  \
id                                                                     
5233264595117471314         215523.0        682505.0        159273.0   
4835329514588776807         320332.0        893328.0        269176.0   
17749660155506638460        355042.0        941902.0        277135.0   
7804704400743266335         195769.0        612458.0        159026.0   
15004869347769368353        245388.0        684796.0        197707.0   

                      itraq4plex_115  \
id                                     
5233264595117471314         447939.0   
4835329514588776807         867501.0   
17749660155506638460        969365.0   
7804704400743266335         631364.0   
15004869347769368353        634356.0   

                                                                   file  
id                                                                       
5233264595117471314   /home/sachsenb/OpenMS/src/tests/topp/Isobari

In [0]:
# TODO test SILAC