In [11]:
import spacy
from spacy.language import Language
from spacy.tokens import Doc, Span, SpanGroup
import numerizer
import pint

In [12]:
nlp = spacy.load('en_core_web_md')
ureg = pint.UnitRegistry()
ureg.define('USD = [currency] = dollar')
ureg.define('cent = 0.01 * USD')

In [13]:
@Language.component("numerizer_component")
def numerizer_component(doc):
    numerized_text = numerizer.numerize(doc.text)
    return Doc(doc.vocab, words=numerized_text.split())

In [14]:
if "numerizer_component" in nlp.pipe_names:
    nlp.remove_pipe("numerizer_component")
nlp.add_pipe("numerizer_component", first=True)

<function __main__.numerizer_component(doc)>

In [15]:
@Language.component("fuzzy_entities")
def fuzzy_entities(doc):
    span_group = doc.spans["fuzzy_ents"] = SpanGroup(doc)
    fuzzy_term_pos = ["ADV", "ADJ", "NOUN", "ADP"]
    fuzzy_range_pos = ["CCONJ"]
    fuzzy_range_terms = ["to"] # ADP that can be used as a range term
    for i in range(len(doc) - 1):
        if i> 0 and (doc[i].like_num or doc[i].text.isdigit()): 
            if doc[i - 1].pos_ in fuzzy_term_pos:
                span_group.append(Span(doc, i - 1, i, label="FUZZY_TERM"))
                if i < (len(doc)-1) \
                        and doc[i + 1].pos_ not in fuzzy_range_pos \
                        and doc[i + 1].text not in fuzzy_range_terms:
                    span_group.append(Span(doc, i, i + 1, label="FUZZY_VALUE"))
            if (doc[i - 1].pos_ in fuzzy_range_pos
                or doc[i - 1].text in fuzzy_range_terms) \
                    and (doc[i - 2].like_num or doc[i - 2].text.isdigit()):
                span_group.append(Span(doc, i - 2, i + 1, label="FUZZY_RANGE"))
        try:
            ureg(doc[i].text)
            if i > 0 and (doc[i - 1].like_num or doc[i - 1].text.isdigit()):
                span_group.append(Span(doc, i, i + 1, label="MEASUREMENT_UNIT"))
        except:
            continue

    return doc

In [16]:
if "fuzzy_entities" in nlp.pipe_names:
    nlp.remove_pipe("fuzzy_entities")
nlp.add_pipe("fuzzy_entities", after="ner")

<function __main__.fuzzy_entities(doc)>

In [17]:
doc = nlp("My break lasts from 40 to 50 minutes, and I take around 10 and a half minutes to eat a lunch, how much time do I have left to rest?")
[(ent.text, ent.label_) for ent in doc.spans["fuzzy_ents"]]

[('from', 'FUZZY_TERM'),
 ('40 to 50', 'FUZZY_RANGE'),
 ('minutes,', 'MEASUREMENT_UNIT'),
 ('around', 'FUZZY_TERM'),
 ('10.5', 'FUZZY_VALUE'),
 ('minutes', 'MEASUREMENT_UNIT')]