In [1]:
import spacy

In [8]:
text = "The quick brown             fox jumps over the lazy dog." 

In [9]:
nlp = spacy.load("en")

In [10]:
doc = nlp(text)

In [12]:
[tok for tok in doc]

[The, quick, brown,             , fox, jumps, over, the, lazy, dog, .]

In [7]:
doc

The quick brown fox jumps over the lazy dog.

In [22]:
import collections

def Matcher(tok, *args, **kwargs):
    """Match x
    
    
    tok: :class:`spacy.tokens.Token`
        A spacy token    
    *args, **kwargs:
        See description
      
    Match a token aginst a series of rules. By default, the result is ``True``. 
    If all patterns are matched, the function returns ``True``. If any 
    pattern does not match, the function returns ``False``.
    
    Global patterns: Let ``x`` be the token, and ``pat`` the pattern:
    
    - ``None``: ``False``
    - ``bool``: ``pat``
    - callables, including functions: ``pat(x)``
    - regular expression: ``pat.search(x.text)``
    - iterable: ``x in pat``
    - otherwise: ``x == pat``
    
    Attribute patterns: Let ``x`` be the value of the attribute, and ``pat`` the pattern:
    
    - callables, including functions: ``pat(x)``
    - regular expression: ``pat.search(str(x))``
    - iterable: ``x in pat``
    - otherwise: ``x == pat``
    
    
    """
    def __init__(self, *args, **kwargs):
        self._global_patterns = args
        self._attr_patterns = kwargs

    def match(self, token):
        for pat in self._global_patterns:
            if pat is None:
                return False
            elif isinstance(pat, bool):
                if not pat:
                    return False
            if isinstance(pat, collections.Callable):
                if not pat(tok):
                    return False
            elif hasattr(pat, 'match'):
                # regexp
                if not arg.search(tok.text):
                    return False
            elif hasattr(pat, '__iter__'):
                # match against an iterable
                if tok not in pat:
                    return False
            else:
                # if all else fails, test equality
                if tok != pat:
                    return False
        for attr, pat in self._attr_patterns:            
            if isinstance(pat, collections.Callable):
                if not pat(tok.__getattr__(attr)):
                    return False
            elif hasattr(arg, 'match'):
                # regexp
                if not arg.search(str(tok.__getattr__(attr))):
                    return False
            # if attribute is iterable, maybe I should do something different?
            # if object is an iterable, matching equality also become an issue
            elif hasattr(arg, '__iter__'):
                # match against an iterable
                if tok.__getattr__(attr) not in pat:
                    return False
            else:
                if tok.__getattr__(attr) != pat:
                    return False
        # default is that 
        # only return True if all patterns have matched
        return True
        

The basic idea of matching objects has several 

- LINQ: 

    - https://github.com/viralogic/py-enumerable
    - https://pypi.python.org/pypi/py-linq/0.4.0
    - https://github.com/heynemann/pynq/wiki
    - maybe LINQ is overthinking it: http://sayspy.blogspot.com.au/2006/02/why-python-doesnt-need-something-like.html
    
- Functional pattern matching

    - https://pypi.python.org/pypi/pypatt
    - https://github.com/Suor/patterns
    - https://github.com/martinblech/pyfpm
    
- Regular exprssions for objects: https://github.com/machinalis/refo (This is closest to what I'm looking for)
- PyParsing: http://pyparsing.wikispaces.com/. But what if it used objects instead of strings? 
    

Some NLP stuff

- CoreNLP: TokensRegex https://nlp.stanford.edu/software/tokensregex.html
- CoreNLP: Tregex, Tsurgeon, and Semgrex: https://nlp.stanford.edu/software/tregex.shtml
- tgrep2: http://tedlab.mit.edu/~dr/Tgrep2/
- nltk tgrep: https://pypi.python.org/pypi/nltk_tgrep/
- A python implementation of Tregex: https://github.com/yandex/dep_tregex but for dependency parsing
- pyparsing example with tgrep: https://www.programcreek.com/python/example/74156/pyparsing.White
- NLTK chunker: http://www.nltk.org/api/nltk.chunk.html

- A javascript POS regex https://github.com/markbirbeck/pos-chunker

In [23]:
- practNLPTools has https://github.com/jawahar273/practNLPTools-lite

SyntaxError: invalid syntax (<ipython-input-23-cb451c3dfab3>, line 1)

Adapt code from conll-U

In [None]:
import re
from collections import OrderedDict, defaultdict

DEFAULT_FIELDS = ('id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc')

def parse(text, fields=DEFAULT_FIELDS):
    return [
        [
            parse_line(line, fields)
            for line in sentence.split("\n")
            if line and not line.strip().startswith("#")
        ]
        for sentence in text.split("\n\n")
        if sentence
    ]

def parse_line(line, fields=DEFAULT_FIELDS):
    line = re.split(r"\t| {2,}", line)
    data = OrderedDict()

    for i, field in enumerate(fields):
        # Allow parsing CoNNL-U files with fewer columns
        if i >= len(line):
            break

        if field == "id":
            value = parse_int_value(line[i])

        elif field == "xpostag":
            value = parse_nullable_value(line[i])

        elif field == "feats":
            value = parse_dict_value(line[i])

        elif field == "head":
            value = parse_int_value(line[i])

        elif field == "deps":
            value = parse_nullable_value(line[i])

        elif field == "misc":
            value = parse_dict_value(line[i])

        else:
            value = line[i]

        data[field] = value

    return data

def parse_int_value(value):
    if value.isdigit():
        return int(value)

    return None

def parse_dict_value(value):
    if "=" in value:
        return OrderedDict([
            (part.split("=")[0], parse_nullable_value(part.split("=")[1]))
            for part in value.split("|")
        ])

    return parse_nullable_value(value)

def parse_nullable_value(value):
    if not value or value == "_":
        return None

    return value


spacy