# Named entity recognition
Based on [this article](https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da).

In [1]:
import nltk, spacy

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/kimmo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kimmo/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/kimmo/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/kimmo/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [4]:
def preprocess(string):
    tokenized = nltk.word_tokenize(string)
    pos_tagged = nltk.pos_tag(tokenized)
    return tokenized, pos_tagged
  
tokenized, pos_tagged = preprocess(ex)
pos_tagged

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

A noun phrase (NP) formed when an optional determiner (DT) is followed by any number of adjectives (JJ) and a noun (NN) finishes the phrase.

In [5]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

Use the `RegexpParser` to parse `pos_tagged` sentence with the `pattern`:

In [6]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(pos_tagged)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [7]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


### Named entity recognition

In [8]:
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

NameError: name 'pos_tag' is not defined

## Spacy

In [9]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [10]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [11]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


European is `NORD` (nationalities or religious or political groups), `Google` is an organization, `$5.1 billion` is monetary value and `Wednesday` is a `Date`. 

Now work on multi-token entities using the [IOB](https://spacy.io/api/annotation#iob) scheme (`B` for a token that begins an entity, `I` for token inside entity, `O` for non-entity token).

In [12]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


## Label a news article

Install [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/):

In [13]:
!pip install beautifulsoup4
!pip install html5lib



Read a news article from `nytimes.com`:

In [14]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, "html5lib")
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
ny_bb[0:500]


'     F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times                                                                       SectionsSEARCHSkip to contentSkip to site indexPoliticsSubscribeLog InLog InToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredAdvertisementSupported byF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredImagePeter Strzok, a top F.B.I. counterintelligence agent who was taken off the sp'

Run SpaCy on article and render named entities:

In [23]:
article = nlp(ny_bb)
displacy.render(article, jupyter=True, style='ent')

Show all available SpaCy attributes on words:

In [29]:
pprint([attribute for attribute in dir(article[0]) if not attribute.startswith("_")])

['ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_extension',
 'has_vector',
 'head',
 'i',
 'idx',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'left_edge',
 'lefts',
 'lemma',
 'lemma_',
 'lex_id',
 'like_email',
 'like_num',
 'like_url',
 'lower',
 'lower_',
 'n_lefts',
 'n_rights',
 'nbor',
 'norm',
 'norm_',
 'orth',
 'orth_',
 'pos',
 'pos_',
 'prefix',
 'prefix_',
 'prob',
 'rank',
 'remove_extension',
 'right_edge',
 'rights',
 'sent',
 'sent_start',
 'sentiment',
 'set_extension',
 'shape',
 'shape_',
 'similarity',
 'string',
 'subtree',
 'suffix',
 'suffix_',
 'tag',
 'tag_',
 'text',
 'text_with_ws',
 'vector',
 'vector_n

Print `IOB` schemes and entity types:

In [30]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in article[0:20]])

[(     , 'O', ''),
 (F.B.I., 'O', ''),
 (Agent, 'O', ''),
 (Peter, 'B', 'PERSON'),
 (Strzok, 'I', 'PERSON'),
 (,, 'O', ''),
 (Who, 'O', ''),
 (Criticized, 'B', 'PERSON'),
 (Trump, 'I', 'PERSON'),
 (in, 'O', ''),
 (Texts, 'B', 'GPE'),
 (,, 'O', ''),
 (Is, 'O', ''),
 (Fired, 'O', ''),
 (-, 'O', ''),
 (The, 'O', ''),
 (New, 'O', ''),
 (York, 'O', ''),
 (Times, 'O', ''),
 (                                                                      ,
  'O',
  '')]


Count entity labels:

In [31]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'PERSON': 82,
         'GPE': 16,
         'CARDINAL': 5,
         'ORG': 39,
         'DATE': 24,
         'NORP': 2,
         'ORDINAL': 1,
         'FAC': 1,
         'PRODUCT': 2,
         'LOC': 1,
         'TIME': 1})

Take a closer look at one of the sentences:

In [35]:
example_sentence = [X for X in article.sents][10]
example_sentence

The F.B.I. had been under immense political pressure by Mr. Trump to dismiss Mr. Strzok, who was removed last summer from the staff of the special counsel, Robert S. Mueller III.

Render entities:

In [38]:
displacy.render(nlp(str(example_sentence)), jupyter=True, style='ent')

Visualize dependencies:

In [44]:
displacy.render(example_sentence, style='dep', jupyter = True, options = {'distance': 120})

In [45]:
[(x.orth_,x.pos_, x.lemma_) for x in [word 
                                      for word
                                      in example_sentence
                                      if not word.is_stop and word.pos_ != 'PUNCT']]

[('F.B.I.', 'PROPN', 'F.B.I.'),
 ('immense', 'ADJ', 'immense'),
 ('political', 'ADJ', 'political'),
 ('pressure', 'NOUN', 'pressure'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Trump', 'PROPN', 'Trump'),
 ('dismiss', 'VERB', 'dismiss'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('removed', 'VERB', 'remove'),
 ('summer', 'NOUN', 'summer'),
 ('staff', 'NOUN', 'staff'),
 ('special', 'ADJ', 'special'),
 ('counsel', 'NOUN', 'counsel'),
 ('Robert', 'PROPN', 'Robert'),
 ('S.', 'PROPN', 'S.'),
 ('Mueller', 'PROPN', 'Mueller'),
 ('III', 'PROPN', 'III')]

Extract entities from example sentence:

In [47]:
dict([(str(x), x.label_) for x in example_sentence.ents])

{'F.B.I.': 'ORG',
 'Trump': 'PERSON',
 'Strzok': 'PERSON',
 'last summer': 'DATE',
 'Robert S. Mueller III': 'PERSON'}