In [1]:
import re
import spacy

from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.tokens import Token


def custom_tokenizer(nlp):
    """
    Build a tokenizer whose prefix, suffix and infix rules are the language
    defaults extended with a bare hyphen.
    :param nlp: Object exposing ``Defaults.prefixes``, ``Defaults.suffixes``,
        ``Defaults.infixes`` and ``vocab``
    :return: A ``Tokenizer`` bound to ``nlp.vocab`` with ``token_match`` set to None
    """
    # The same extra rule is appended to all three rule sets.
    hyphen_rule = [r'-']

    prefix_pattern = spacy.util.compile_prefix_regex(
        list(nlp.Defaults.prefixes) + hyphen_rule
    )
    suffix_pattern = spacy.util.compile_suffix_regex(
        list(nlp.Defaults.suffixes) + hyphen_rule
    )
    infix_pattern = spacy.util.compile_infix_regex(
        list(nlp.Defaults.infixes) + hyphen_rule
    )

    tokenizer_options = {
        "prefix_search": prefix_pattern.search,
        "suffix_search": suffix_pattern.search,
        "infix_finditer": infix_pattern.finditer,
        "token_match": None,
    }
    return Tokenizer(nlp.vocab, **tokenizer_options)


# Load the 'es_core_news_md' spaCy model and replace its tokenizer with the
# one built by custom_tokenizer, so hyphens are governed by the extra rule.
nlp = spacy.load('es_core_news_md')
nlp.tokenizer = custom_tokenizer(nlp)

In [58]:
class Tmesis:
    """
    Pipeline component that merges every ``word``, ``-``, newline, ``word``
    token sequence into a single token and stamps each token with a line
    counter.

    Token extensions used: ``has_tmesis`` (bool), ``tmesis_text`` (str) and
    ``line`` (int).
    """

    def __init__(self, nlp):
        """
        :param nlp: Language object; its ``Defaults.lemma_lookup`` mapping is
            read every time the component is called
        """
        self.nlp = nlp
        # Each extension is guarded on its own, so a partially registered set
        # (e.g. "has_tmesis" present but "tmesis_text" missing) is completed
        # instead of being left half-configured.
        for name, default in (("has_tmesis", False), ("tmesis_text", ""), ("line", 0)):
            if not Token.has_extension(name):
                Token.set_extension(name, default=default)

    def __call__(self, doc):
        """
        Merge hyphen-plus-newline splits in ``doc`` and number its lines.
        :param doc: Document to process; it is modified in place
        :return: The same ``doc`` object
        """
        matcher = Matcher(doc.vocab)
        matcher.add('tmesis', None, [
            {"TEXT": {"REGEX": r"[a-zñ]+"}},
            {"TEXT": {"REGEX": r"-$"}},
            {"TEXT": {"REGEX": r"\n+"}},
            {"TEXT": {"REGEX": r"^[a-zñ]+"}},
        ])
        with doc.retokenize() as retokenizer:
            lookup = self.nlp.Defaults.lemma_lookup
            # Chained splits such as "des-\ncom-\npuesto" yield matches that
            # share a token, and the retokenizer refuses to merge overlapping
            # spans. Walk the matches left to right and keep only the first
            # of each overlapping group.
            last_end = 0
            ordered_matches = sorted(matcher(doc), key=lambda match: (match[1], match[2]))
            for _, start, end in ordered_matches:
                if start < last_end:
                    continue
                span_text_raw = str(doc[start:end])
                span_text = re.sub(r"-\n", "", span_text_raw)
                # Key membership is checked first because it avoids scanning
                # every value; the boolean result is the same either way.
                has_tmesis = span_text in lookup.keys() or span_text in lookup.values()
                attrs = {
                    "LEMMA": lookup.get(span_text, span_text),
                    "_": {"has_tmesis": has_tmesis, "tmesis_text": span_text}
                }
                # NOTE(review): the span is merged even when the joined text
                # is not found in the lookup table - confirm this is intended.
                retokenizer.merge(doc[start:end], attrs=attrs)
                last_end = end
        line_count = 0
        for token in doc:
            token._.line = line_count  # noqa
            # A token containing a newline advances the counter by exactly
            # one, however many newline characters it holds.
            if '\n' in token.text:
                line_count += 1
        return doc


In [59]:
# Drop any "tmesis" pipe left over from an earlier run of this cell, then
# register a fresh instance at the front of the pipeline.
if nlp.has_pipe("tmesis"):
    nlp.remove_pipe("tmesis")
nlp.add_pipe(Tmesis(nlp), name="tmesis", first=True)

In [60]:
# Sample poem for the enjambment scan. Two words are hyphenated across a line
# break ("llan-" / "to" and "apos-" / "tar").
text = """El año 2000
-a la luz de un candil-
los supervivientes
no vean más plan,
que un terrible llan-
to y crujir de dientes
bellos se puede apos-
tar,
sin menospreciar
a aquellos profetas,
que aseguran que,
el remedio viene
de otros planetas
malamente pero ok."""

In [61]:
def has_tmesis_enjambment(previous_token, token, next_token):
    """
    Report whether a token carries the tmesis flag.
    :param previous_token: Not used by this check
    :param token: Token whose ``has_tmesis`` extension is read
    :param next_token: Not used by this check
    :return: The value of ``token._.has_tmesis``
    """
    extensions = token._  # noqa
    return extensions.has_tmesis


def get_sirrematic_enjambment(previous_token, next_token):
    """
    Look for a sirrematic enjambment between the two words around a line break.

    The part-of-speech tags of both tokens, in either order, must equal one of
    the known tag pairs, and one token must be an ancestor of the other.
    :param previous_token: The word before a newline character
    :param next_token: The word after a newline character
    :return: The matching tag pair as a two-item list, or None if there is none
    """
    # Listed in the order in which the pairs are tried.
    candidate_pairs = (
        ('ADV', 'VERB'),
        ('ADJ', 'ADV'),
        ('ADP', 'NOUN'),
        ('ADP', 'ADJ'),
        ('ADV', 'NOUN'),
        ('ADJ', 'NOUN'),
    )
    for first_tag, second_tag in candidate_pairs:
        observed_tags = sorted((previous_token.pos_, next_token.pos_))
        if observed_tags != sorted((first_tag, second_tag)):
            continue
        # The dependency check only runs once the tags have matched.
        if next_token.is_ancestor(previous_token) or previous_token.is_ancestor(next_token):
            return [first_tag, second_tag]
    return None


def get_enjambment(original_poem):
    """
    Scan a text for all possible enjambment types.
    :param original_poem: Text of the poem; it is processed with the module-level ``nlp``
    :return: Dict keyed by the ``line`` extension of the token where the
        enjambment was found. Values are ``('tmesis', parts)``, with the token
        text split on the hyphen-newline, or ``(type, detail)`` as returned by
        the matching ``get_<type>_enjambment`` function
    """
    enjambment_types = ['sirrematic']
    enjambments = {}
    nlp_poem = nlp(original_poem)
    # The last token is left out of the iteration so that a following token
    # always exists
    for i, token in enumerate(nlp_poem[:-1]):
        if token._.has_tmesis:  # noqa
            # Kept from the original: echoes each tmesis token as it is found
            print(token.text)
            enjambments[token._.line] = ('tmesis', token.text.split('-\n'))  # noqa
            continue
        # A newline in first position has no word before it; indexing i - 1
        # there would wrap around to the last token of the poem
        if i == 0 or token.text != '\n':
            continue
        previous_token = nlp_poem[i - 1]
        next_token = nlp_poem[i + 1]
        # We look for enjambment only when there are words (not punctuation)
        # both before and after the newline character
        if previous_token.is_punct or next_token.is_punct:
            continue
        for enjambment_type in enjambment_types:
            enjambment_func = globals()[f'get_{enjambment_type}_enjambment']
            enjambment = enjambment_func(previous_token, next_token)
            if enjambment:
                enjambments[token._.line] = (enjambment_type, enjambment)  # noqa
                break
    return enjambments

In [62]:
# Run the scan on the sample poem; the result dict is keyed by line number.
get_enjambment(text)

llan-
to
apos-
tar


{4: ('tmesis', ['llan', 'to']),
 6: ('tmesis', ['apos', 'tar']),
 12: ('sirrematic', ['ADV', 'NOUN'])}

In [63]:
# Keep a handle on the lemma lookup table from the pipeline defaults for
# ad-hoc inspection in the cells below.
lookup = nlp.Defaults.lemma_lookup

In [69]:
# Spot check: is 'verdes' one of the lookup table's keys?
'verdes' in lookup.keys()

True

In [67]:
# Inspect the concrete class of the loaded pipeline object.
type(nlp)

spacy.lang.es.Spanish