In [62]:
import re
from collections import namedtuple, defaultdict
from enum import Enum

from six import add_metaclass

In [63]:
class PrefixSavingMeta(type):
    def __new__(mcs, name, parents, attrs):
        new_type = type.__new__(mcs, name, parents, attrs)
        prefix = attrs.get("prefix_name")
        if prefix:
            new_type.prefix_map[prefix] = new_type
        short = attrs.get("short_prefix")
        if short:
            new_type.prefix_map[short] = new_type
        return new_type

In [102]:
class TagTypeEnum(Enum):
    unimod = 0
    psimod = 1
    massmod = 2
    generic = 3
    info = 4
    gnome = 5
    formula = 6
    glycan = 7
    xlmod = 8
    localization_marker = 9
    group_placeholder = 999
    

@add_metaclass(PrefixSavingMeta)
class TagBase(object):
    __slots__ = ("type", "value", "extra", "group_id")

    prefix_name = None
    short_prefix = None
    prefix_map = {}
    
    def __init__(self, type, value, extra=None, group_id=None):
        self.type = type
        self.value = value
        self.extra = extra or []
        self.group_id = group_id

    def __str__(self):
        part = self._format_main()
        if self.extra:
            rest = [str(e) for e in self.extra]
            label = '|'.join([part] + rest)
        else:
            label = part
        if self.group_id:
            label = '%s#%s' % (label, self.group_id)
        return label
    
    def __repr__(self):
        template = "{self.__class__.__name__}({self.value!r}, {self.extra!r}, {self.group_id!r})"
        return template.format(self=self)


class LocalizationMarker(TagBase):
    __slots__ = ()
    
    def __init__(self, value, extra=None, group_id=None):
        assert group_id is not None
        super(LocalizationMarker, self).__init__(TagTypeEnum.localization_marker, float(value), extra, group_id)
    
    def _format_main(self):
        return "#{self.group_id}({self.value!f})".format(self=self)
    
    
class MassModification(TagBase):
    __slots__ = ()
    
    def __init__(self, value, extra=None, group_id=None):
        super(MassModification, self).__init__(TagTypeEnum.massmod, float(value), extra, group_id)
    
    def _format_main(self):
        return '%0.4f' % self.value

    
class ControlledVocabularyModificationBase(TagBase):
    _tag_type = None
    __slots__ = ()
    
    def __init__(self, value, extra=None, group_id=None):
        super(ControlledVocabularyModificationBase, self).__init__(
            self._tag_type, value, extra, group_id)

    def _format_main(self):
        return "{self.prefix_name}:{self.value}".format(self=self)

    
class GenericModification(TagBase):
    __slots__ = ()
    
    def __init__(self, value, extra=None, group_id=None):
        super(GenericModification, self).__init__(TagTypeEnum.generic, value, extra, group_id)
    
    def _format_main(self):
        return self.value
    

class UnimodModification(ControlledVocabularyModificationBase):
    __slots__ = ()
    
    prefix_name = "UNIMOD"
    short_prefix = "U"
    _tag_type = TagTypeEnum.unimod


class PSIModModification(ControlledVocabularyModificationBase):
    __slots__ = ()
    
    prefix_name = "MOD"
    short_prefix = 'M'
    _tag_type = TagTypeEnum.psimod


class GNOmeModification(ControlledVocabularyModificationBase):
    __slots__ = ()
    
    prefix_name = "GNO"
    short_prefix = 'G'
    _tag_type = TagTypeEnum.gnome

    
class XLMODModification(ControlledVocabularyModificationBase):
    __slots__ = ()
    
    prefix_name = "XLMOD"
#     short_prefix = 'XL'
    _tag_type = TagTypeEnum.xlmod
    
    
class TagParserStateEnum(Enum):
    start = 0
    group_id = 1

def split_tags(tokens):
    starts = [0]
    ends = []
    for i, c in enumerate(tokens):
        if c == '|':
            ends.append(i)
            starts.append(i + 1)
    ends.append(len(tokens))
    out = []
    for i, start in enumerate(starts):
        end = ends[i]
        out.append(tokens[start:end])
    return out

def find_prefix(tokens):
    for i, c in enumerate(tokens):
        if c == ':':
            return ''.join(tokens[:i]), ''.join(tokens[i + 1:])
    return None, tokens
    
def process_tag_tokens(tokens):
    parts = split_tags(tokens)
    main_tag = parts[0]
    if main_tag[0] in ('+', '-'):
        main_tag = ''.join(main_tag)
        main_tag = MassModification(main_tag)
    else:
        prefix, value = find_prefix(main_tag)
        if prefix is None:
            main_tag = GenericModification(''.join(value))
        else:
            tag_type = TagBase.prefix_map[prefix]
            main_tag = tag_type(value)
    if len(parts) > 1:
        extras = []
        for part in parts:
            prefix, value = find_prefix(part)
            if prefix is None:
                if value.startswith("#"):
                    main_tag.group_id = value
                else:
                    main_tag.extra.append(GenericModification(''.join(value)))
            else:
                tag_type = TagBase.prefix_map[prefix]
                main_tag.extra.append(tag_type(value))
    return main_tag

In [103]:
from pyteomics import parser
''.join(parser.std_amino_acids)

'QWERTYIPASDFGHKLCVNM'

In [104]:
class ParserStateEnum(Enum):
    before_sequence = 0
    tag_before_sequence = 1
    global_tag = 2
    fixed_spec = 3
    labile_tag = 4
    sequence = 5
    tag_in_sequence = 6
    interval_tag = 7
    tag_after_sequence = 8
    
    done = 999


BEFORE = ParserStateEnum.before_sequence
TAG_BEFORE = ParserStateEnum.tag_before_sequence
FIXED = ParserStateEnum.fixed_spec
GLOBAL = ParserStateEnum.global_tag
LABILE = ParserStateEnum.labile_tag
SEQ = ParserStateEnum.sequence
TAG = ParserStateEnum.tag_in_sequence
INTERVAL_TAG = ParserStateEnum.interval_tag
TAG_AFTER = ParserStateEnum.tag_after_sequence
DONE = ParserStateEnum.done

VALID_AA = set("QWERTYIPASDFGHKLCVNM")

def tokenize_proforma(sequence):
    labile_modifications = []
    fixed_modifications = []
    unlocalized_modifications = []
    intervals = []
    isotopes = []
    
    n_term = None
    c_term = None
    
    i = 0
    n = len(sequence)
    
    positions = []
    state = BEFORE
    depth = 0
    
    current_aa = None
    current_tag = []
    current_interval = None
    
    while i < n:
        c = sequence[i]
        i += 1
        if state == BEFORE:
            if c == '[':
                state = TAG_BEFORE
                depth = 1
            elif c == '{':
                state = LABILE
                depth = 1
            elif c == '<':
                state = FIXED
            elif c in VALID_AA:
                current_aa = c
                state = SEQ
            elif c == '?':
                if current_tag:
                    unlocalized_modifications.append(process_tag_tokens(current_tag))
                    current_tag = []
                else:
                    raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
            elif c == '-':
                if current_tag:
                    n_term = process_tag_tokens(current_tag)
                    current_tag = []
                else:
                    raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
            else:
                raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
        elif state == SEQ:
            if c in VALID_AA:
                positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))
                current_aa = c
                current_tag = []
            elif c == '[':
                state = TAG
                depth = 1
            elif c == '(':
                current_interval = [len(positions), None, None]
            elif c == ')':
                if current_interval is None:
                    raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
                else:
                    current_interval[1] = len(positions)
                    if i >= n or sequence[i] != '[':
                        raise Exception("Missing Interval Tag")
                    i += 1
                    depth = 1
                    state = INTERVAL_TAG
            elif c == '-':
                state = TAG_AFTER
                if i >= n or sequence[i] != '[':
                    raise Exception("Missing Interval Tag")
                i += 1
                depth = 1                
            else:
                raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
        elif state == TAG or state == TAG_BEFORE or state == TAG_AFTER:
            if c == '[':
                depth += 1
            elif c == ']':
                depth -= 1
                if depth <= 0:
                    depth = 0
                    if state == TAG: 
                        state = SEQ
                    elif state == TAG_BEFORE:
                        state = BEFORE
                    elif state == TAG_AFTER:
                        c_term = process_tag_tokens(current_tag)
                        state = DONE
            else:
                current_tag.append(c)
        elif state == LABILE:
            if c == '{':
                depth += 1
            elif c == '}':
                depth -= 1
                if depth <= 0:
                    depth = 0
                    labile_modifications.append(process_tag_tokens(current_tag))
                    current_tag = []
                    state = BEFORE
            else:
                current_tag.append(c)
        elif state == INTERVAL_TAG:
            if c == '[':
                depth += 1
            elif c == ']':
                depth -= 1
                if depth <= 0:
                    depth = 0
                    current_interval[2] = process_tag_tokens(current_tag)
                    current_tag = []
                    intervals.append(current_interval)
                    current_interval = None
                    state = SEQ
            else:
                current_tag.append(c)
        else:
            raise Exception("Error In State {state}, unexpected {c} found at index {i}".format(**locals()))
    if current_aa:
        positions.append((current_aa, process_tag_tokens(current_tag) if current_tag else None))
    return positions, {
        'n_term': n_term,
        'c_term': c_term,
        'unlocalized_modifications': unlocalized_modifications,
        'labile_modifications': labile_modifications,
        'intervals': intervals,
    }

In [110]:
seq, fields = tokenize_proforma("{Foo}[Hex]-ST[U:Ox](EPP)[+18]ING")
seq, fields

([('S', None),
  ('T', UnimodModification('Ox', [], None)),
  ('E', None),
  ('P', None),
  ('P', None),
  ('I', None),
  ('N', None),
  ('G', None)],
 {'n_term': GenericModification('Hex', [], None),
  'c_term': None,
  'unlocalized_modifications': [],
  'labile_modifications': [GenericModification('Foo', [], None)],
  'intervals': [[1, 4, MassModification(18.0, [], None)]]})