In [27]:
import re
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy.tokenizer import Tokenizer

def custom_tokenizer(nlp):
    """Build a ``Tokenizer`` for ``nlp`` that also splits on hyphens.

    The language defaults for prefixes, suffixes and infixes are each extended
    with a bare ``-`` pattern, so a hyphen is split off whether it appears at
    the start, at the end or in the middle of a chunk.

    Args:
        nlp: Loaded spaCy pipeline; its ``vocab`` and ``Defaults`` are used.

    Returns:
        A new ``Tokenizer`` bound to ``nlp.vocab``. No ``rules`` argument is
        passed and ``token_match`` is explicitly ``None``.
    """
    hyphen = [r'-']
    defaults = nlp.Defaults

    prefix_search = spacy.util.compile_prefix_regex(list(defaults.prefixes) + hyphen).search
    suffix_search = spacy.util.compile_suffix_regex(list(defaults.suffixes) + hyphen).search
    infix_finditer = spacy.util.compile_infix_regex(list(defaults.infixes) + hyphen).finditer

    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=None,
    )

# Load the es_core_news_md model and replace its tokenizer with the one
# built by custom_tokenizer.
nlp = spacy.load("es_core_news_md")
nlp.tokenizer = custom_tokenizer(nlp)

In [28]:
# Sample text. Several lines end in a hyphen that splits a word across the
# line break ("tie-" / "ne", "pla-" / "nos", "ver-" / "des"); it also contains
# hyphens that are not word splits ("-y", and "oculta-" followed by "Dos").
poem = """el remedio tie-
ne de otros pla-
nos malamente
pero eran ver-
des y azules -y
que otro sol oculta-
Dos gemas muy bonitas."""

In [33]:
class Tmesis:
    """Pipeline component that rejoins words hyphenated across a line break.

    Calling the component on a ``Doc``:

    * merges every four-token sequence matched by the pattern in ``__call__``
      (a token containing lowercase letters, a token ending in ``-``, a token
      containing a newline, a token starting with a lowercase letter) into a
      single token, setting ``token._.has_tmesis`` to ``True`` and
      ``token._.tmesis_text`` to the rejoined word;
    * numbers the lines: ``token._.verse`` is the number of preceding tokens
      whose text contains a newline.
    """

    def __init__(self, nlp):
        """Store ``nlp`` and register the custom token extensions.

        Args:
            nlp: The pipeline whose ``Defaults.lemma_lookup`` is consulted
                when a split word is merged.
        """
        self.nlp = nlp
        # Register each extension on its own so a partially registered state
        # (e.g. only "has_tmesis" already present) neither raises nor leaves
        # "tmesis_text" missing.
        for name, default in (("has_tmesis", False), ("tmesis_text", ""), ("verse", 0)):
            if not Token.has_extension(name):
                Token.set_extension(name, default=default)

    def __call__(self, doc):
        """Merge split words in ``doc`` and assign verse numbers.

        Args:
            doc: The ``Doc`` to process; it is modified in place.

        Returns:
            The same ``doc``.
        """
        matcher = Matcher(doc.vocab)
        matcher.add('tmesis', None, [
            {"TEXT": {"REGEX": r"[a-zñ]+"}},
            {"TEXT": {"REGEX": r"-$"}},
            {"TEXT": {"REGEX": r"\n+"}},
            {"TEXT": {"REGEX": r"^[a-zñ]+"}},
        ])
        # Process matches left to right so overlapping ones can be detected.
        matches = sorted(matcher(doc), key=lambda match: (match[1], match[2]))
        with doc.retokenize() as retokenizer:
            last_end = 0
            for _, start, end in matches:
                # Chained splits ("a-\nb-\nc") yield matches that share a
                # token; merging overlapping spans is rejected by the
                # retokenizer, so only the leftmost match is merged.
                if start < last_end:
                    continue
                last_end = end
                span = doc[start:end]
                span_text_raw = span.text
                # Remove the hyphen together with all whitespace around the
                # line break: the newline token may hold several newlines or
                # be surrounded by spaces, not just a single "\n".
                span_text = re.sub(r"-\s*\n\s*", "", span_text_raw)
                attrs = {
                    # Fall back to the rejoined word, not the raw text that
                    # still contains the hyphen and the line break.
                    "LEMMA": self.nlp.Defaults.lemma_lookup.get(span_text, span_text),
                    "_": {"has_tmesis": True, "tmesis_text": span_text}
                }
                retokenizer.merge(span, attrs=attrs)
        verse_count = 0
        for token in doc:
            # A token that contains the line break (including a merged split
            # word) still belongs to the line it starts on.
            token._.verse = verse_count
            if '\n' in token.text:
                verse_count += 1
        return doc

In [34]:
# Drop any previously registered "tmesis" component, then add a fresh one at
# the front of the pipeline and process the poem.
if nlp.has_pipe("tmesis"):
    nlp.remove_pipe("tmesis")
nlp.add_pipe(Tmesis(nlp), name="tmesis", first=True)
# [(t.text, t.lemma_, t._.verse, t._.tmesis_text, t.tag_) for t in nlp(poem)]
doc = nlp(poem)

In [35]:
# Second pass over the pipeline: each token contributes its rejoined word
# (tmesis_text) when it has one, otherwise its own text, joined by spaces.
doc_pos = nlp(' '.join(t._.tmesis_text or t.text for t in doc))

In [38]:
# Tokens are paired by position, so both docs must have the same length;
# zip() would otherwise stop at the shorter one and silently misalign tags.
assert len(doc) == len(doc_pos), (
    "token count mismatch between doc (%d) and doc_pos (%d)" % (len(doc), len(doc_pos))
)
# Copy the coarse and fine-grained tags from the rejoined text back onto the
# original tokens.
for token, token_pos in zip(doc, doc_pos):
    token.pos = token_pos.pos
    token.pos_ = token_pos.pos_
    token.tag = token_pos.tag
    token.tag_ = token_pos.tag_
[(str(t),t.pos_, t._.verse, t.is_oov,t._.has_tmesis) for t in doc]

[('el', 'DET', 0, False, False),
 ('remedio', 'NOUN', 0, False, False),
 ('tie-\nne', 'VERB', 0, True, True),
 ('de', 'ADP', 1, False, False),
 ('otros', 'DET', 1, False, False),
 ('pla-\nnos', 'NOUN', 1, True, True),
 ('malamente', 'ADV', 2, False, False),
 ('\n', 'SPACE', 2, True, False),
 ('pero', 'CONJ', 3, False, False),
 ('eran', 'AUX', 3, False, False),
 ('ver-\ndes', 'ADJ', 3, True, True),
 ('y', 'CONJ', 4, False, False),
 ('azules', 'ADJ', 4, False, False),
 ('-', 'PUNCT', 4, False, False),
 ('y', 'CONJ', 4, False, False),
 ('\n', 'SPACE', 4, True, False),
 ('que', 'PRON', 5, False, False),
 ('otro', 'DET', 5, False, False),
 ('sol', 'NOUN', 5, False, False),
 ('oculta', 'ADJ', 5, False, False),
 ('-', 'PUNCT', 5, False, False),
 ('\n', 'SPACE', 5, True, False),
 ('Dos', 'NUM', 6, False, False),
 ('gemas', 'NOUN', 6, False, False),
 ('muy', 'ADV', 6, False, False),
 ('bonitas', 'ADJ', 6, False, False),
 ('.', 'PUNCT', 6, False, False)]

In [42]:
# NOTE(review): dumps the names in the current namespace; looks like leftover
# debugging — consider removing this cell before sharing the notebook.
vars().keys()

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', '_i', '_ii', '_iii', '_i1', 're', 'spacy', 'Matcher', 'Token', 'Tokenizer', 'custom_tokenizer', 'nlp', '_i2', 'poem', '_i3', 'Tmesis', '_i4', 'doc', '_i5', 'doc_pos', '_i6', 'token', 'token_pos', '_6', '_i7', '_exit_code', '_i8', '_i9', '_i10', '_i11', '_i12', '_i13', '_13', '_i14', '_i15', '_i16', '_i17', '_i18', '_i19', '_19', '_i20', '_i21', '_i22', '_i23', '_i24', '_i25', '_i26', '_26', '_i27', '_i28', '_i29', '_i30', '_i31', '_i32', '_32', '_i33', '_i34', '_i35', '_i36', '_36', '_i37', '_37', '_i38', '_38', '_i39', '_39', '_i40', '_40', '_i41', '_41', '_i42'])