In [59]:
from collections import defaultdict
from functools import reduce
from pathlib import Path
from time import perf_counter
import sys

from IPython.core.display import display
from pandas import CategoricalDtype
import numpy as np
from pyopenms import *
import pandas as pd
import os

In [60]:
from typing import List


def _to_text(value):
    """Return ``value`` decoded as UTF-8 if it is ``bytes``; return anything else unchanged."""
    return value.decode("utf-8") if isinstance(value, bytes) else value


def peptide_identifications_to_df(peps: List["PeptideIdentification"], decode_ontology: bool = True):
    """Build a DataFrame with one row per peptide identification.

    Every row holds the identification's identifier, RT and m/z, followed by
    the score, charge and meta values of its first hit (``getHits()[0]``,
    presumably the best-scoring hit -- confirm against the producing tool).
    The meta value columns and their types are taken from the first hit of
    the first identification that has any hits.

    Args:
        peps: Sequence of peptide identifications; must support ``len()``.
            (The annotation is quoted so that defining this function does
            not require the pyopenms name to be resolvable yet.)
        decode_ontology: If true, meta value keys that start with ``"MS:"``
            are replaced by the name of the matching term in the PSI-MS
            vocabulary loaded from the OpenMS data path. If false, the keys
            are used unchanged as column names.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``RT``, ``mz``,
        ``<score type>`` (``"score"`` if no identification has hits),
        ``charge`` and one column per meta value key. When nothing is missing
        the columns keep the NumPy record dtypes. Cells that could not be
        filled (no hits, or a hit without that meta value) are missing
        values; integer and boolean columns containing such cells are
        converted to the nullable ``Int32`` / ``boolean`` dtypes.

    Raises:
        KeyError: If a meta value of the schema-defining hit is not a
            ``bool``, ``int``, ``float`` or string.
    """
    # NumPy type code for every Python type a meta value may have.
    # TODO find a heuristic for the length of strings, especially sequences and
    #  spectrum ids: 'U100' silently truncates anything longer than 100 chars.
    type_codes = {bool: '?', int: 'i', float: 'f', str: 'U100'}
    # Stand-ins stored in the NumPy record where a value is missing. NumPy
    # integer/boolean fields cannot hold NaN, so these cells are masked later.
    placeholders = {'?': False, 'i': 0, 'f': np.nan, 'U100': ''}
    # pandas dtypes able to represent a missing integer / boolean.
    nullable = {'i': 'Int32', '?': 'boolean'}

    # --- schema: taken from the first identification that has hits ---------
    metavals = []
    types = []
    mainscorename = "score"
    for pep in peps:
        hits = pep.getHits()
        if len(hits) > 0:
            hits[0].getKeys(metavals)  # fills `metavals` in place
            mainscorename = _to_text(pep.getScoreType())
            for k in metavals:
                if _to_text(k) == "target_decoy":
                    # stored as a boolean: True for values starting with 't'
                    types.append('?')
                else:
                    mv = _to_text(hits[0].getMetaValue(k))
                    types.append(type_codes[type(mv)])
            break

    # --- column names ------------------------------------------------------
    # Keys are always converted to text because they become field names.
    names = [_to_text(m) for m in metavals]
    if decode_ontology and any(n.startswith("MS:") for n in names):
        cv = ControlledVocabulary()
        cv.loadFromOBO("psims", File.getOpenMSDataPath() + "/CV/psi-ms.obo")
        names = [_to_text(cv.getTerm(n).name) if n.startswith("MS:") else n for n in names]

    clearcols = ["id", "RT", "mz", mainscorename, "charge"] + names
    coltypes = ['U100', 'f', 'f', 'f', 'i'] + types
    dt = list(zip(clearcols, coltypes))

    # One list of per-column "is missing" flags per row, appended by extract().
    missing = []

    def extract(pep):
        """Return the record tuple for ``pep`` and remember which cells are missing."""
        row = [_to_text(pep.getIdentifier()), pep.getRT(), pep.getMZ()]
        flags = [False, False, False]

        hits = pep.getHits()
        if len(hits) == 0:
            # no hit: score, charge and every meta value are unavailable
            values = [None] * (2 + len(metavals))
        else:
            besthit = hits[0]
            values = [besthit.getScore(), besthit.getCharge()]
            for k in metavals:
                if not besthit.metaValueExists(k):
                    values.append(None)
                    continue
                val = _to_text(besthit.getMetaValue(k))
                if _to_text(k) == "target_decoy":
                    # startswith() also copes with an empty value
                    val = str(val).startswith('t')
                values.append(val)

        for code, val in zip(coltypes[3:], values):
            flags.append(val is None)
            row.append(placeholders[code] if val is None else val)
        missing.append(flags)
        return tuple(row)

    #TODO implement hasHits function in C++
    psmarr = np.fromiter((extract(pep) for pep in peps), dtype=dt, count=len(peps))
    #TODO make spectrum_ref the index, if available?
    df = pd.DataFrame(psmarr)

    # Replace the placeholders by real missing values, column by column.
    if missing:
        missing_cells = np.array(missing, dtype=bool)
        for j, (name, code) in enumerate(dt):
            col_missing = missing_cells[:, j]
            if not col_missing.any():
                continue  # untouched columns keep their NumPy dtype
            column = df[name]
            if code in nullable:
                column = column.astype(nullable[code])
            df[name] = column.mask(col_missing)
    return df

In [61]:
# Both lists are handed to IdXMLFile.load, which is expected to fill them
# in place with the contents of the idXML file.
prots, peps = [], []
IdXMLFile().load("MSGFPlusAdapter_1_out.idXML", prots, peps)


In [62]:
# Convert the loaded peptide identifications to a DataFrame and render it.
display(peptide_identifications_to_df(peps))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


Unnamed: 0,id,RT,mz,SpecEValue,charge,MS-GF:RawScore,MS-GF:DeNovoScore,MS-GF:SpecEValue,MS-GF:EValue,AssumedDissociationMethod,calcMZ,pass_threshold,start,end,target_decoy,isotope_error
0,MS-GF+_2020-05-20T23:45:50_15199737197459537604,4587.668945,1063.209839,2.3492369999999998e-26,3,164.0,199.0,2.3492369999999998e-26,9.678857e-24,HCD,1063.209351,1,1,28,True,0
1,MS-GF+_2020-05-20T23:45:50_15199737197459537604,4923.777344,775.387207,1.624272e-19,3,151.0,188.0,1.624272e-19,6.513329000000001e-17,HCD,775.385437,1,1,23,True,0
2,MS-GF+_2020-05-20T23:45:50_15199737197459537604,2655.095703,520.262817,4.6521299999999995e-19,3,123.0,125.0,4.6521299999999995e-19,1.63755e-16,HCD,520.26355,1,1,14,True,0
