In [1]:
import datetime
import spacy
from spacy.tokens import Span
from dateparser.search import search_dates

In [2]:
# Dutch pipeline with an entity ruler appended; "overwrite_ents" lets the ruler's
# matches replace entities already set by earlier pipeline components.
nlp = spacy.load("nl_core_news_sm")
ruler = nlp.add_pipe(
    "entity_ruler",
    config=dict(overwrite_ents=True),
)

# Exact-phrase patterns that get tagged with the DIAGNOSIS label.
patterns = [
    dict(label="DIAGNOSIS", pattern="CVID"),
]
ruler.add_patterns(patterns)

In [4]:
def get_diag_date(span):
    """Find the dates mentioned alongside a DIAGNOSIS entity.

    Two layouts are handled:

    * List-like text: when there is a whitespace token (``is_space``) both
      before and after the entity, everything between the nearest such tokens
      is searched.
    * Running text: otherwise the sentence containing the entity is searched.

    Parameters
    ----------
    span : spacy.tokens.Span
        The entity span this getter is evaluated for.

    Returns
    -------
    list[tuple[str, datetime.datetime]] or None
        The value returned by ``dateparser.search.search_dates`` for the
        selected text, or ``None`` when the span is not labelled DIAGNOSIS.
    """
    if span.label_ != 'DIAGNOSIS':
        # Previously `dates` was never assigned on this path, so reading the
        # extension on any other entity raised UnboundLocalError.
        return None

    doc = span.doc
    spaces_to_left = [tok for tok in doc[:span.start] if tok.is_space]
    # Start scanning after the entity itself so its own tokens are never
    # considered as the right-hand boundary.
    spaces_to_right = [tok for tok in doc[span.end:] if tok.is_space]

    if spaces_to_left and spaces_to_right:
        # Slice ends are exclusive, so stopping at the whitespace token's index
        # already leaves it out; the former extra "-1" also discarded the last
        # real token before it (e.g. the year in "CVID: 2014").
        first = spaces_to_left[-1].i + 1
        last = spaces_to_right[0].i
        text = doc[first:last].text
    else:
        # We might not be in a list; just use the sentence the diagnosis occurs in.
        text = span.sent.text

    return search_dates(
        text,
        languages=['nl'],  # look for dates in Dutch text
        settings={
            # if no month/day is given, assume the 1st of January
            'RELATIVE_BASE': datetime.datetime(2020, 1, 1),
            # only accept matches that contain a year
            'REQUIRE_PARTS': ['year'],
            # avoid relative times / timestamps
            'PARSERS': ['custom-formats', 'absolute-time'],
        },
    )

In [5]:
# Expose the getter as `span._.diag_date`; it is evaluated on each access.
# force=True keeps this cell re-runnable in a live kernel: without it spaCy
# raises because the "diag_date" extension is already registered on Span.
Span.set_extension("diag_date", getter=get_diag_date, force=True)

In [22]:
# List-style example: a heading followed by one "date: condition" entry per line.
# The line breaks around the CVID entry are what get_diag_date relies on to
# isolate that single entry.
text_example_list = """Voorgeschiedenis

2010: DM type 2
2014-11: CVID
2018: trauma capitis

"""

# Running-text example: two sentences, each containing a date; only the second
# sentence mentions CVID.
text_example_sent = "Eerste opname: 01-04-2022. CVID is gediagnosticeerd in jan 2023. "

In [25]:
# Process the list-style note and print every DIAGNOSIS entity with its dates.
doc = nlp(text_example_list)
print([
    (diagnosis.label_, diagnosis._.diag_date)
    for diagnosis in doc.ents
    if diagnosis.label_ == "DIAGNOSIS"
])

[('DIAGNOSIS', [('2014-11', datetime.datetime(2014, 11, 1, 0, 0))])]


In [24]:
# Process the running-text note and print every DIAGNOSIS entity with its dates.
doc = nlp(text_example_sent)
print([
    (diagnosis.label_, diagnosis._.diag_date)
    for diagnosis in doc.ents
    if diagnosis.label_ == "DIAGNOSIS"
])

[('DIAGNOSIS', [('in jan 2023', datetime.datetime(2023, 1, 1, 0, 0))])]
