- https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

https://spacy.io/usage/visualizers

https://explosion.ai/blog/deep-learning-formula-nlp

https://spacy.io/usage/training#ner
- video 
https://www.youtube.com/watch?v=l4scwf8KeIA
- training 
https://towardsdatascience.com/a-review-of-named-entity-recognition-ner-using-automatic-summarization-of-resumes-5248a75de175

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
example = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [7]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [11]:
sent = preprocess(example)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [12]:
# NP noun phrase
# DT optional dterminer
# JJ any number of adjectives
# NN noun

pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [15]:
# chunking
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

In [17]:
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [18]:
# IOB = standard way to represent chunk structures in files

In [19]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [22]:
iob_tagged = tree2conlltags(cs)
iob_tagged

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]

In [35]:
# conlltags2tree to convert the tag sequences into a chunk tree
from nltk import ne_chunk
ne_tree = ne_chunk(iob_tagged)
print(ne_tree)

(S
  (GPE European/JJ/O)
  authorities/NNS/O
  fined/VBD/O
  (PERSON Google/NNP/O)
  a/DT/B-NP
  record/NN/I-NP
  $/$/O
  5.1/CD/O
  billion/CD/O
  on/IN/O
  Wednesday/NNP/O
  for/IN/O
  abusing/VBG/O
  its/PRP$/O
  power/NN/B-NP
  in/IN/O
  the/DT/B-NP
  mobile/JJ/I-NP
  phone/NN/I-NP
  market/NN/B-NP
  and/CC/O
  ordered/VBD/O
  the/DT/B-NP
  company/NN/I-NP
  to/TO/O
  alter/VB/O
  its/PRP$/O
  practices/NNS/O)


### Spacy

In [36]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

  return f(*args, **kwds)
  return f(*args, **kwds)


In [37]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

In [39]:
pprint([(X.text, X.label_) for X in doc.ents])
#NORD : nationalities or religious or political groups

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [40]:
# BILUO tagging scheme to describe the entity boundaries
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

# B : first token of a multi-token entity
# I : inner token of a multi-token entity
# L : final token of a multi-token entity
# U : single-token entity
# O : non-entity token

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [45]:
list(filter(lambda x: x.ent_iob_!='O', [X for X in doc]))

[European, Google, $, 5.1, billion, Wednesday]

In [46]:
# display
displacy.render(doc, jupyter=True, style='ent')