In [1]:
import spacy

# Load spaCy's small English pipeline; later cells that do not reload the
# model themselves reuse this `nlp` object.
nlp = spacy.load(name='en_core_web_sm')

In [98]:
from spacy.matcher import Matcher

# Rule-based extraction with Matcher: each rule is registered under a string
# key, and the loop below prints only the spans that matched one of the rules.
matcher = Matcher(nlp.vocab)
patterns = [
  [
    {'lower': 'from'},
    # Disabled experiments for also capturing the token after "from":
    # {'text': {'regex': r'\w+'}},
    # {'tag': {'regex': r'^NNP?$'}}
  ]
]
matcher.add('DEPATURE', patterns)
matcher.add('DESTINATION', [[
  {'lower': 'to'},
  # Raw string: `\w` is not a valid Python string escape, so a plain literal
  # triggers an invalid-escape warning on recent Python versions. The regex
  # value itself is unchanged.
  {'text': {'regex': r'\w+'}}
]])

doc = nlp('From the Japan, Hong Kong  to JP, from 12/12')
matches = matcher(doc)
for match_id, start, end in matches:
  # match_id is a hash; the vocab's string store maps it back to the rule key.
  string_id = nlp.vocab.strings[match_id]
  phrase_span = doc[start:end]
  # Last token of the match (the token following "to" for DESTINATION).
  last_token = doc[end-1]
  print(match_id, string_id, start, end, phrase_span.text, last_token.text, last_token.tag_)

11686797903579700592 DEPATURE 0 1 From From IN
762183493788757442 DESTINATION 7 9 to JP JP NNP
11686797903579700592 DEPATURE 10 11 from from IN


In [9]:
# PhraseMatcher does not support REGEX: its patterns are Doc objects, so the
# `\w+` below is presumably matched as literal text rather than as a wildcard
# (verify against the spaCy docs).
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
# Raw string keeps the backslash literal without an invalid-escape warning;
# the resulting text is identical to the original 'from \w+'.
matcher.add('depature', [nlp(r'from \w+')])
doc = nlp('from hk to jp')

matches = matcher(doc)
for match_id, start, end in matches:
  # Map the hashed match id back to the key passed to matcher.add().
  string_id = nlp.vocab.strings[match_id]
  print(match_id, string_id, start, end)

from spacy

In [71]:
from spacy.pipeline import EntityRuler

# Extend the pretrained pipeline with rule-based entities for travel queries:
# an `entity_ruler` component is added (default settings) to a freshly loaded
# model, so doc.ents contains both the model's own entities and the custom
# labels defined below.
nlp = spacy.load('en_core_web_sm')
ruler = nlp.add_pipe('entity_ruler')
patterns = [
  {
    'label': 'FROM',
    'pattern': [
      {'lower': 'from'}
    ]
  },
  {
    'label': 'TO',
    'pattern': [
      {'lower': 'to'}
    ]
  },
  # NOTE(review): FROM/DPTL and TO/DSTT can all match a single "from"/"to"
  # token; which label wins for an identical span is not controlled here --
  # confirm the intended priority or drop one of each pair.
  {
    'label': 'DPTL', # departure location
    'pattern': [
      # 'depature' (misspelled) is kept for backward compatibility; the
      # correct spelling 'departure' is accepted as well.
      {'lemma': {'in': ['depature', 'departure', 'from']}},
      {'lower': {'in': ['location', 'place']}, 'op': '?'}
      # Disabled experiments for capturing the location token itself:
      # {'text': {'regex': '.+'}},
      # {'TAG': {'IN': ['NNP', 'NN']}},
    ]
  },
  {
    'label': 'DSTT', # destination
    'pattern': [
      {'lemma': {'in': ['destination', 'to']}},
      # 'op' must sit at token level (as in DPTL above). Nested inside the
      # 'lower' predicate dict it is not an operator, which made the
      # "location"/"place" token mandatory instead of optional.
      {'lower': {'in': ['location', 'place']}, 'op': '?'}
      # Disabled experiments for capturing the destination token itself:
      # {'text': {'regex': r'\w+'}},
      # {'tag': {'regex': 'NNP?'}},
    ]
  },
  {
    'label': 'DPTD', # departure date
    'pattern': [
      # Lemma match, so inflected forms of "leave" are covered too.
      {'lemma': {'in': ['depature', 'departure', 'leave']}},
      {'lower': {'in': ['time', 'date', 'at', 'in']}}
    ]
  },
  {
    'label': 'RTND', # return date
    'pattern': [
      {'lemma': {'in': ['return', 'back']}},
      {'lower': {'in': ['time', 'date', 'on', 'at', 'in']}}
    ]
  }
]
ruler.add_patterns(patterns)
doc = nlp('from the US, Japan, Hong Kong to japan or US, and from 12/12 and return date is 12/12/2022')
print(doc.ents)
for ent in doc.ents:
  # Text, label, character offsets, and the entity text split on spaces.
  print(ent.text, ent.label_, ent.start_char, ent.end_char, ent.text.split(' '))

(from, US, Japan, Hong Kong, to, japan, US, from, 12/12, return date, 12/12/2022)
from FROM 0 4 ['from']
US GPE 9 11 ['US']
Japan GPE 13 18 ['Japan']
Hong Kong GPE 20 29 ['Hong', 'Kong']
to TO 30 32 ['to']
japan GPE 33 38 ['japan']
US GPE 42 44 ['US']
from DPTL 50 54 ['from']
12/12 CARDINAL 55 60 ['12/12']
return date RTND 65 76 ['return', 'date']
12/12/2022 DATE 80 90 ['12/12/2022']


In [68]:


# Probe the departure-date / return-date rules with several inflections and
# capitalisations. Assumes `nlp` is the pipeline that already has the
# entity_ruler patterns added -- run that cell first.
doc = nlp(
  'depature time, leave date, leaves time, leaving date, '
  'Leave time return time, return Date, Returns on'
)
for entity in doc.ents:
  print(f'{entity.text} {entity.label_}')

depature time DPTD
leave date DPTD
leaves time DPTD
leaving date DPTD
Leave time DPTD
return time RTND
Returns on RTND


In [72]:
# Persist the current pipeline (whatever `nlp` refers to at this point) to a
# directory relative to the notebook's working directory.
nlp.to_disk(path='./model-extend')

In [19]:
from spacy.pipeline import EntityRuler

# Start again from a freshly loaded model and add a single rule: any token
# whose lowercase form is "book" becomes an ACTION entity.
nlp = spacy.load('en_core_web_sm')
ruler = nlp.add_pipe('entity_ruler')

book_token = {'lower': {'in': ['book']}}
patterns = [{'label': 'ACTION', 'pattern': [book_token]}]
ruler.add_patterns(patterns)

doc = nlp('we want book a flight, the book is good. The flight')
print(doc.ents)
for ent in doc.ents:
  # Entity text split on single spaces, printed alongside the offsets.
  words = ent.text.split(' ')
  print(ent.text, ent.label_, ent.start_char, ent.end_char, words)

(book a, book is)
book a ACTION 8 14 ['book', 'a']
book is ACTION 27 34 ['book', 'is']
