In [28]:
from collections import defaultdict
from functools import reduce
from pathlib import Path
from time import perf_counter
import sys
from pandas import CategoricalDtype
import numpy as np
from pyopenms import *
import pandas as pd
import os

In [29]:

class ConsensusMapDF(ConsensusMap):
    def __init__(self):
        super().__init__()

    def get_intensity_df(self):
        labelfree = self.getExperimentType() == "label-free"
        filemeta = self.getColumnHeaders()  # type: dict[int, ColumnHeader]
        labels = list(set([header.label for header in
                           filemeta.values()]))  # TODO could be more efficient. Do we require same channels in all files?
        files = list(set([header.filename for header in filemeta.values()]))
        label_to_idx = {k: v for v, k in enumerate(labels)}
        file_to_idx = {k: v for v, k in enumerate(files)}

        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        if not labelfree:
            # TODO write two functions for LF and labelled. One has only one channel, the other has only one file per CF
            def extractRowBlocksChannelWideFileLong(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                filerows = defaultdict(lambda: [0] * len(labels))  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row = filerows[header.filename]
                    row[label_to_idx[header.label]] = fh.getIntensity()
                return (f.getUniqueId(), filerows)

            def extractRowsChannelWideFileLong(f: ConsensusFeature):
                uniqueid, rowdict = extractRowBlocksChannelWideFileLong(f)
                for file, row in rowdict.items():
                    row.append(file)
                    yield tuple([uniqueid] + row)

            if len(labels) == 1:
                labels[0] = "intensity"
            dtypes = [('id', np.dtype('uint64'))] + list(zip(labels, ['f'] * len(labels)))
            dtypes.append(('file', 'U300'))
            # For TMT we know that every feature can only be from one file, since feature = PSM
            #cnt = 0
            #for f in self:
            #    cnt += f.size()

            intyarr = np.fromiter(iter=gen(self, extractRowsChannelWideFileLong), dtype=dtypes, count=self.size())
            return pd.DataFrame(intyarr).set_index('id')
        else:
            # Specialized for LabelFree which has to have only one channel
            def extractRowBlocksChannelLongFileWideLF(f: ConsensusFeature):
                subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
                row = [0.] * len(files)  # TODO use numpy array?
                for fh in subfeatures:
                    header = filemeta[fh.getMapIndex()]
                    row[file_to_idx[header.filename]] = fh.getIntensity()
                yield tuple([f.getUniqueId()] + row)

            dtypes = [('id', np.dtype('uint64'))] + list(zip(files, ['f'] * len(files)))
            # cnt = self.size()*len(files) # TODO for this to work, we would need to fill with NAs for CFs that do not go over all files
            cnt = self.size()

            intyarr = np.fromiter(iter=gen(self, extractRowBlocksChannelLongFileWideLF), dtype=dtypes, count=cnt)
            return pd.DataFrame(intyarr).set_index('id')

    def get_metadata_df(self):
        def gen(cmap: ConsensusMap, fun):
            for f in cmap:
                yield from fun(f)

        def extractMetaData(f: ConsensusFeature):
            # subfeatures = f.getFeatureList()  # type: list[FeatureHandle]
            pep = f.getPeptideIdentifications()  # type: list[PeptideIdentification]
            if len(pep) != 0:
                hits = pep[0].getHits()
                if len(hits) != 0:
                    besthit = hits[0]  # type: PeptideHit
                    # TODO what else
                    yield f.getUniqueId(), besthit.getSequence().toString(), f.getCharge(), f.getRT(), f.getMZ(), f.getQuality()
                else:
                    yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality()
            else:
                yield f.getUniqueId(), None, f.getCharge(), f.getRT(), f.getMZ(), f.getQuality()

        cnt = self.size()

        mddtypes = [('id', np.dtype('uint64')), ('sequence', 'U200'), ('charge', 'i4'), ('RT', 'f'), ('mz', 'f'),
                    ('quality', 'f')]
        mdarr = np.fromiter(iter=gen(self, extractMetaData), dtype=mddtypes, count=cnt)
        return pd.DataFrame(mdarr).set_index('id')

In [30]:

    cmap = ConsensusMapDF()
    from urllib.request import urlretrieve
    urlretrieve ("https://raw.githubusercontent.com/OpenMS/OpenMS/develop/src/tests/class_tests/openms/data/BSA.consensusXML", "label-free.consensusXML")

    ConsensusXMLFile().load("label-free.consensusXML", cmap)
    

In [31]:
display(cmap.get_intensity_df())

Unnamed: 0_level_0,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA3_F2.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA1_F2.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA3_F1.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA1_F1.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA2_F1.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA2_F2.mzML
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18055798904544351710,0.0,0.0,0.0,0.0,2788.05,0.0
16751911815002726321,0.0,0.0,461846.0,1358150.0,0.0,0.0
1766075384941176729,0.0,0.0,104389.0,0.0,214030.0,0.0
6714187641100376547,0.0,0.0,0.0,0.0,3881570.0,0.0
300941239321730683,0.0,0.0,3691860.0,20567800.0,11036700.0,0.0
8470403259047476092,0.0,0.0,760472.0,1971500.0,4102750.0,0.0
17001643603461665041,0.0,0.0,0.0,12925300.0,13581200.0,0.0
5658659041765702685,0.0,0.0,13570600.0,62024400.0,34746200.0,0.0
11003401133233860035,0.0,0.0,2598460.0,12406600.0,0.0,0.0
17968946775179838221,0.0,0.0,145653.0,0.0,0.0,0.0


In [32]:
display(cmap.get_metadata_df())

Unnamed: 0_level_0,sequence,charge,RT,mz,quality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18055798904544351710,DGDIEAEISR,3,1523.370605,368.843781,-2.95842
16751911815002726321,SHC(Carbamidomethyl)IAEVEK,3,1550.230469,358.174591,4.05841
1766075384941176729,SHCIAEVEK,2,1646.545044,508.247498,4.30258
6714187641100376547,QEPERNEC(Carbamidomethyl)FLSHK,3,1717.691528,558.594849,3.741
300941239321730683,C(Carbamidomethyl)C(Carbamidomethyl)TESLVNR,2,1726.187988,569.752625,4.41006
8470403259047476092,LC(Carbamidomethyl)VLHEK,2,1726.379639,449.744385,4.09243
17001643603461665041,LC(Carbamidomethyl)VLHEK,3,1727.822021,300.165344,3.8209
5658659041765702685,DDSPDLPK,2,1731.364868,443.711273,4.19915
11003401133233860035,EC(Carbamidomethyl)C(Carbamidomethyl)DKPLLEK,3,1743.927124,431.205536,3.66304
17968946775179838221,CC(Carbamidomethyl)TESLVNR,2,1750.726318,541.241882,4.10223


In [33]:
cmap.get_metadata_df().merge(cmap.get_intensity_df(), how='left', on='id') # single table

Unnamed: 0_level_0,sequence,charge,RT,mz,quality,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA3_F2.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA1_F2.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA3_F1.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA1_F1.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA2_F1.mzML,/Users/pfeuffer/git/OpenMS-inference-src/share/OpenMS/examples/FRACTIONS/BSA2_F2.mzML
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
18055798904544351710,DGDIEAEISR,3,1523.370605,368.843781,-2.95842,0.0,0.0,0.0,0.0,2788.05,0.0
16751911815002726321,SHC(Carbamidomethyl)IAEVEK,3,1550.230469,358.174591,4.05841,0.0,0.0,461846.0,1358150.0,0.0,0.0
1766075384941176729,SHCIAEVEK,2,1646.545044,508.247498,4.30258,0.0,0.0,104389.0,0.0,214030.0,0.0
6714187641100376547,QEPERNEC(Carbamidomethyl)FLSHK,3,1717.691528,558.594849,3.741,0.0,0.0,0.0,0.0,3881570.0,0.0
300941239321730683,C(Carbamidomethyl)C(Carbamidomethyl)TESLVNR,2,1726.187988,569.752625,4.41006,0.0,0.0,3691860.0,20567800.0,11036700.0,0.0
8470403259047476092,LC(Carbamidomethyl)VLHEK,2,1726.379639,449.744385,4.09243,0.0,0.0,760472.0,1971500.0,4102750.0,0.0
17001643603461665041,LC(Carbamidomethyl)VLHEK,3,1727.822021,300.165344,3.8209,0.0,0.0,0.0,12925300.0,13581200.0,0.0
5658659041765702685,DDSPDLPK,2,1731.364868,443.711273,4.19915,0.0,0.0,13570600.0,62024400.0,34746200.0,0.0
11003401133233860035,EC(Carbamidomethyl)C(Carbamidomethyl)DKPLLEK,3,1743.927124,431.205536,3.66304,0.0,0.0,2598460.0,12406600.0,0.0,0.0
17968946775179838221,CC(Carbamidomethyl)TESLVNR,2,1750.726318,541.241882,4.10223,0.0,0.0,145653.0,0.0,0.0,0.0


In [34]:
    cmap = ConsensusMapDF()
    
    urlretrieve ("https://raw.githubusercontent.com/OpenMS/OpenMS/develop/src/tests/topp/IsobaricAnalyzer_output_1.consensusXML", "itraq.consensusXML")    
    ConsensusXMLFile().load("itraq.consensusXML", cmap)


In [35]:
display(cmap.get_intensity_df())

Unnamed: 0_level_0,itraq4plex_117,itraq4plex_115,itraq4plex_114,itraq4plex_116,file
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5233264595117471314,215523.0,447939.0,682505.0,159273.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
4835329514588776807,320332.0,867501.0,893328.0,269176.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
17749660155506638460,355042.0,969365.0,941902.0,277135.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
7804704400743266335,195769.0,631364.0,612458.0,159026.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
15004869347769368353,245388.0,634356.0,684796.0,197707.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...


In [36]:
display(cmap.get_metadata_df())

Unnamed: 0_level_0,sequence,charge,RT,mz,quality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5233264595117471314,,2,3611.092041,769.393799,0.0
4835329514588776807,,3,3611.397705,421.558594,0.0
17749660155506638460,,3,3611.701416,447.907074,0.0
7804704400743266335,,3,3612.005127,407.579285,0.0
15004869347769368353,,2,3612.311768,748.901794,0.0


In [37]:
cmap.get_metadata_df().merge(cmap.get_intensity_df(), how='left', on='id') # single table

Unnamed: 0_level_0,sequence,charge,RT,mz,quality,itraq4plex_117,itraq4plex_115,itraq4plex_114,itraq4plex_116,file
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
5233264595117471314,,2,3611.092041,769.393799,0.0,215523.0,447939.0,682505.0,159273.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
4835329514588776807,,3,3611.397705,421.558594,0.0,320332.0,867501.0,893328.0,269176.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
17749660155506638460,,3,3611.701416,447.907074,0.0,355042.0,969365.0,941902.0,277135.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
7804704400743266335,,3,3612.005127,407.579285,0.0,195769.0,631364.0,612458.0,159026.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...
15004869347769368353,,2,3612.311768,748.901794,0.0,245388.0,634356.0,684796.0,197707.0,/home/sachsenb/OpenMS/src/tests/topp/IsobaricA...


In [38]:
    cmap = ConsensusMapDF()
    from urllib.request import urlretrieve
    urlretrieve ("https://raw.githubusercontent.com/OpenMS/OpenMS/develop/src/tests/topp/FeatureFinderMultiplex_10_output.consensusXML", "silac.consensusXML")
    ConsensusXMLFile().load("silac.consensusXML", cmap)    

In [39]:
display(cmap.get_intensity_df())

Unnamed: 0_level_0,label 0,label 1,file
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4835329514588776807,10667330.0,0.0,FeatureFinderMultiplex_10_input.mzML
17749660155506638460,14698730.0,0.0,FeatureFinderMultiplex_10_input.mzML
7804704400743266335,466288000.0,0.0,FeatureFinderMultiplex_10_input.mzML
15004869347769368353,34953240.0,31916350.0,FeatureFinderMultiplex_10_input.mzML
3332699010107892018,7784946.0,5863498.0,FeatureFinderMultiplex_10_input.mzML
13440783915218733453,22511620.0,0.0,FeatureFinderMultiplex_10_input.mzML


In [40]:
display(cmap.get_metadata_df())

Unnamed: 0_level_0,sequence,charge,RT,mz,quality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4835329514588776807,,3,2053.195312,644.968445,1.0
17749660155506638460,,2,2053.780029,694.842407,1.0
7804704400743266335,,2,2054.967041,683.853943,1.0
15004869347769368353,,3,2055.814453,626.334656,1.0
3332699010107892018,,2,2056.463623,650.868042,1.0
13440783915218733453,,2,2058.406494,600.363159,1.0


In [41]:
cmap.get_metadata_df().merge(cmap.get_intensity_df(), how='left', on='id') # single table

Unnamed: 0_level_0,sequence,charge,RT,mz,quality,label 0,label 1,file
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4835329514588776807,,3,2053.195312,644.968445,1.0,10667330.0,0.0,FeatureFinderMultiplex_10_input.mzML
17749660155506638460,,2,2053.780029,694.842407,1.0,14698730.0,0.0,FeatureFinderMultiplex_10_input.mzML
7804704400743266335,,2,2054.967041,683.853943,1.0,466288000.0,0.0,FeatureFinderMultiplex_10_input.mzML
15004869347769368353,,3,2055.814453,626.334656,1.0,34953240.0,31916350.0,FeatureFinderMultiplex_10_input.mzML
3332699010107892018,,2,2056.463623,650.868042,1.0,7784946.0,5863498.0,FeatureFinderMultiplex_10_input.mzML
13440783915218733453,,2,2058.406494,600.363159,1.0,22511620.0,0.0,FeatureFinderMultiplex_10_input.mzML


In [42]:
cmap.get_intensity_df().index

UInt64Index([ 4835329514588776807, 17749660155506638460,  7804704400743266335,
             15004869347769368353,  3332699010107892018, 13440783915218733453],
            dtype='uint64', name='id')