<a href="https://colab.research.google.com/github/hientrinh93/AI/blob/main/Copy_of_Token_classification_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Named entity recognition (NER)**, also called **token classification** or **entity extraction**, is the first step of information extraction: it classifies pieces of text into predefined categories such as person names, organizations, locations, times, and percentages.

In this tutorial, a web page is scraped (https://www.cnbc.com/2019/12/31/the-stock-market-boomed-in-2019-heres-how-it-happened.html) and named entities are extracted from its text using three different methods: NLTK, spaCy, and a BERT model from Hugging Face.


In [None]:
# libraries required
!pip install transformers
!pip install language_tool_python

import nltk
# Fetch the NLTK resources used below: tokenizer, POS tagger, NE chunker, word list.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(nltk_resource)
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import tree2conlltags

import spacy
from spacy import displacy
import en_core_web_sm

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

from bs4 import BeautifulSoup
import requests
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# web scraping
url = "https://www.cnbc.com/2019/12/31/the-stock-market-boomed-in-2019-heres-how-it-happened.html"
# requests applies no timeout by default, so bound the wait explicitly;
# otherwise a stalled connection would hang the notebook indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
response.raise_for_status()
html_content = response.content
soup = BeautifulSoup(html_content, "html.parser")

In [None]:
# extract the main content
# Collect every <p> element and concatenate the paragraph texts,
# each one followed by a single space.
job_elements = soup.find_all("p")
content = ''.join(f"{job_element.text} " for job_element in job_elements)
print(content)



In [None]:
# word tokenization and part-of-speech tagging to the sentence
# Part Of Speech (POS) Tagging: https://www.ibm.com/docs/en/wca/3.5.0?topic=analytics-part-speech-tag-sets
def postag(sentence):
    """Tokenize ``sentence`` into words and tag each word with its part of speech.

    Args:
        sentence: Raw text to process.

    Returns:
        The result of ``pos_tag`` applied to the ``word_tokenize`` output,
        i.e. a list of ``(word, tag)`` pairs.
    """
    return pos_tag(word_tokenize(sentence))

In [None]:
# Tokenize and POS-tag the scraped article text, then display the (word, tag) pairs.
preprocess = postag(sentence=content)
preprocess

[('It', 'PRP'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('year', 'NN'),
 ('that', 'WDT'),
 ('began', 'VBD'),
 ('with', 'IN'),
 ('investors', 'NNS'),
 ('courting', 'VBG'),
 ('a', 'DT'),
 ('bear', 'JJ'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ended', 'VBD'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('biggest', 'JJS'),
 ('gains', 'NNS'),
 ('from', 'IN'),
 ('stocks', 'NNS'),
 ('since', 'IN'),
 ('2013', 'CD'),
 ('.', '.'),
 ('Twelve', 'CD'),
 ('months', 'NNS'),
 ('ago', 'RB'),
 (',', ','),
 ('few', 'JJ'),
 ('could', 'MD'),
 ('have', 'VB'),
 ('imagined', 'VBN'),
 ('the', 'DT'),
 ('S', 'NNP'),
 ('&', 'CC'),
 ('P', 'NNP'),
 ('500', 'CD'),
 ('delivering', 'VBG'),
 ('a', 'DT'),
 ('gain', 'NN'),
 ('of', 'IN'),
 ('more', 'JJR'),
 ('than', 'IN'),
 ('28', 'CD'),
 ('%', 'NN'),
 ('in', 'IN'),
 ('2019', 'CD'),
 ('.', '.'),
 ('It', 'PRP'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('performance', 'NN'),
 ('that', 'WDT'),
 ('flirted', 'VBD'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('31', 'CD'),
 ('%', 'NN'),
 ('gain', 'NN'),
 ('of', '

In [None]:
# Chunk grammar: a noun phrase (NP) is an optional determiner (DT),
# followed by any number of adjectives (JJ), followed by exactly one noun (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Build the regular-expression chunker and run it over the POS-tagged tokens.
chunk_parser = nltk.RegexpParser(pattern)
chunking_noun = chunk_parser.parse(preprocess)

# Only subtrees carry a label; plain (word, tag) leaves are skipped.
for chunk in (node for node in chunking_noun if hasattr(node, 'label')):
    phrase = ' '.join(leaf[0] for leaf in chunk)
    print(f"{chunk.label()} {phrase}")

NP a year
NP a bear market
NP a gain
NP %
NP a performance
NP %
NP gain
NP %
NP return
NP a gain
NP %
NP money
NP position
NP %
NP a year
NP a global economic slowdown
NP disruptive trade
NP policy
NP The year
NP an unforeseen boom
NP the tech
NP sector
NP the major stock
NP the key
NP the market
NP success
NP a low base
NP A steep sell-off
NP %
NP a bear market
NP %
NP decline
NP closing
NP peak
NP a loss
NP %
NP trading
NP perspective
NP %
NP the average return
NP %
NP the stock
NP market
NP a dramatic policy
NP shift
NP hike
NP key rate
NP percent
NP a different story
NP a change
NP heart
NP interest
NP a quest
NP yield
NP money
NP key rate
NP a range
NP %
NP %
NP clarity
NP top
NP the central bank
NP insurance
NP rate
NP global economy
NP global economic growth
NP trade
NP the re-crafting
NP trade
NP war
NP the financial news
NP the year
NP tweet
NP the course
NP the year
NP the trade
NP war
NP the stock
NP market
NP the year
NP a close
NP replacement
NP administration
NP a phase
N

**IOB tags**: Each token is tagged with one of three special chunk tags, I (inside), O (outside), or B (begin). 

A token is tagged as B if it marks the beginning of a chunk. 

Tokens within the chunk are tagged I. 

All other tokens are tagged O. 

In [None]:
# Flatten the chunk tree into CoNLL-style (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(chunking_noun)
for word, pos, iob in iob_tagged:
    print((word, pos, iob))

('It', 'PRP', 'O')
('was', 'VBD', 'O')
('a', 'DT', 'B-NP')
('year', 'NN', 'I-NP')
('that', 'WDT', 'O')
('began', 'VBD', 'O')
('with', 'IN', 'O')
('investors', 'NNS', 'O')
('courting', 'VBG', 'O')
('a', 'DT', 'B-NP')
('bear', 'JJ', 'I-NP')
('market', 'NN', 'I-NP')
('and', 'CC', 'O')
('ended', 'VBD', 'O')
('with', 'IN', 'O')
('the', 'DT', 'O')
('biggest', 'JJS', 'O')
('gains', 'NNS', 'O')
('from', 'IN', 'O')
('stocks', 'NNS', 'O')
('since', 'IN', 'O')
('2013', 'CD', 'O')
('.', '.', 'O')
('Twelve', 'CD', 'O')
('months', 'NNS', 'O')
('ago', 'RB', 'O')
(',', ',', 'O')
('few', 'JJ', 'O')
('could', 'MD', 'O')
('have', 'VB', 'O')
('imagined', 'VBN', 'O')
('the', 'DT', 'O')
('S', 'NNP', 'O')
('&', 'CC', 'O')
('P', 'NNP', 'O')
('500', 'CD', 'O')
('delivering', 'VBG', 'O')
('a', 'DT', 'B-NP')
('gain', 'NN', 'I-NP')
('of', 'IN', 'O')
('more', 'JJR', 'O')
('than', 'IN', 'O')
('28', 'CD', 'O')
('%', 'NN', 'B-NP')
('in', 'IN', 'O')
('2019', 'CD', 'O')
('.', '.', 'O')
('It', 'PRP', 'O')
('was', 'VBD',

In [None]:
# NER using NLTK
# Tokenize -> POS-tag -> named-entity chunk, then list each labelled chunk.
tagged_tokens = pos_tag(nltk.word_tokenize(content))
ner_tree = nltk.ne_chunk(tagged_tokens)
for chunk in ner_tree:
    if not hasattr(chunk, 'label'):
        continue  # plain (word, tag) leaf, not a named entity
    entity_text = ' '.join(leaf[0] for leaf in chunk)
    print(f"{chunk.label()} {entity_text}")

PERSON Nasdaq
PERSON Apple
PERSON Microsoft
ORGANIZATION Dow Jones Industrial Average
ORGANIZATION Federal Reserve
ORGANIZATION Federal Reserve
ORGANIZATION Fed
ORGANIZATION Fed
ORGANIZATION Fed
ORGANIZATION Fed
PERSON Fed Chairman Jerome Powell
ORGANIZATION Fed
GPE U.S.
PERSON Donald Trump
GPE China
LOCATION North
ORGANIZATION American Free Trade
GPE Canada
GPE Mexico
PERSON Trump
ORGANIZATION House
ORGANIZATION United
PERSON Trump
ORGANIZATION NAFTA
ORGANIZATION Senate
ORGANIZATION Trump
GPE China
GPE U.S.
GPE China
ORGANIZATION European Union
PERSON Brexit
ORGANIZATION International Monetary Fund
GPE EU
GPE U.S.
GPE China
PERSON Energy
LOCATION West Texas
GPE Saudi
PERSON Aramco
GPE August
GPE Europe
GPE U.S.
ORGANIZATION Federal Reserve
ORGANIZATION Treasuries
ORGANIZATION Fed
GPE U.S.
ORGANIZATION Institute
PERSON Supply Management
GPE U.S.
ORGANIZATION ISM
PERSON P Dow Jones Indices
PERSON Apple
PERSON Microsoft
GPE U.S.
PERSON Facebook
GPE Alphabet
ORGANIZATION Google
PERSON Ama

In [None]:
# NER using SpaCy
# Load the pipeline shipped in the `en_core_web_sm` package imported at the top
# of the notebook; `nlp` is then called on raw text to produce a parsed document.
nlp = en_core_web_sm.load()

# For larger, real-world datasets the medium or large spaCy models can be
# swapped in instead (presumably they need to be installed separately first):
# nlp = spacy.load('en_core_web_md')
# nlp = spacy.load('en_core_web_lg')

In [None]:
# Run the spaCy pipeline over the article and list each entity with its label.
doc = nlp(content)
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

a year DATE
2013 DATE
Twelve months ago DATE
more than 28% PERCENT
2019 DATE
31% PERCENT
1997 DATE
one CARDINAL
29.6% PERCENT
2013 DATE
Nasdaq ORG
35% PERCENT
Apple ORG
Microsoft ORG
trillion-dollar MONEY
22% PERCENT
a year DATE
Federal Reserve ORG
The year DATE
One CARDINAL
2019 DATE
December 2018 DATE
just 0.2% PERCENT
20% PERCENT
2018 DATE
more than 6% PERCENT
2,485.74 CARDINAL
Dec. 31, 2018 DATE
the final hours TIME
2019 DATE
3,220 CARDINAL
S&P ORG
2019 DATE
about 10% PERCENT
2018 DATE
roughly 2,900 CARDINAL
90 years DATE
9.8% PERCENT
2019 DATE
the Federal Reserve ORG
Fed ORG
four CARDINAL
2018 DATE
December 2018 DATE
2.5 percent PERCENT
2019 DATE
Fed ORG
three CARDINAL
Fed ORG
1.50% to 1.75% PERCENT
Fed ORG
2020 DATE
Fed ORG
Jerome Powell PERSON
Fed ORG
U.S. GPE
Donald Trump PERSON
China GPE
the North American Free Trade Agreement GPE
Canada GPE
Mexico GPE
the year DATE
Trump PERSON
the year DATE
the year DATE
House ORG
the United States GPE
Trump PERSON
NAFTA ORG
Senate ORG
Trump

In [None]:
# Look up spaCy's human-readable description of the "ORG" entity label.
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [None]:
# visualize
# Render the entities found in `doc` (style="ent") with displaCy;
# jupyter=True asks displaCy to display the result in the notebook output.
displacy.render(doc, style="ent", jupyter=True)

In [None]:
# lemmatization
# List (surface form, coarse POS tag, lemma) for every content-bearing token.
# Reuses the already-parsed `doc` instead of running the whole spaCy pipeline
# over `content` a second time, and skips whitespace tokens so the listing is
# not polluted with (' ', 'SPACE', ' ') entries. Stop words and punctuation
# are excluded as before.
[(token.orth_, token.pos_, token.lemma_)
 for token in doc
 if not token.is_stop and not token.is_space and token.pos_ != 'PUNCT']

[(' ', 'SPACE', ' '),
 ('year', 'NOUN', 'year'),
 ('began', 'VERB', 'begin'),
 ('investors', 'NOUN', 'investor'),
 ('courting', 'VERB', 'court'),
 ('bear', 'NOUN', 'bear'),
 ('market', 'NOUN', 'market'),
 ('ended', 'VERB', 'end'),
 ('biggest', 'ADJ', 'big'),
 ('gains', 'NOUN', 'gain'),
 ('stocks', 'NOUN', 'stock'),
 ('2013', 'NUM', '2013'),
 ('months', 'NOUN', 'month'),
 ('ago', 'ADV', 'ago'),
 ('imagined', 'VERB', 'imagine'),
 ('S&P', 'PROPN', 'S&P'),
 ('500', 'NUM', '500'),
 ('delivering', 'VERB', 'deliver'),
 ('gain', 'NOUN', 'gain'),
 ('28', 'NUM', '28'),
 ('%', 'NOUN', '%'),
 ('2019', 'NUM', '2019'),
 ('performance', 'NOUN', 'performance'),
 ('flirted', 'VERB', 'flirt'),
 ('31', 'NUM', '31'),
 ('%', 'NOUN', '%'),
 ('gain', 'NOUN', 'gain'),
 ('1997', 'NUM', '1997'),
 ('came', 'VERB', 'come'),
 ('close', 'ADV', 'close'),
 ('topping', 'VERB', 'top'),
 ('29.6', 'NUM', '29.6'),
 ('%', 'NOUN', '%'),
 ('return', 'NOUN', 'return'),
 ('2013', 'NUM', '2013'),
 (' ', 'SPACE', ' '),
 ('tech',

In [None]:
# NER using BERT Hugging Face
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# `grouped_entities=True` is deprecated (removal announced for v5.0.0);
# aggregation_strategy="simple" is its replacement and likewise merges
# word pieces into whole entities.
generator = pipeline("ner",
                     model=model,
                     tokenizer=tokenizer,
                     aggregation_strategy="simple")

# print the entities recognised as organizations
# Test the 'entity_group' field directly: comparing every value in the dict
# against 'ORG' would also match an entity whose *word* happens to be "ORG"
# and could print the same entity more than once.
for entity in generator(content):
    if entity['entity_group'] == 'ORG':
        print(entity)


  f'`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="{aggregation_strategy}"` instead.'


{'entity_group': 'ORG', 'score': 0.99783033, 'word': 'Apple', 'start': 455, 'end': 460}
{'entity_group': 'ORG', 'score': 0.9978947, 'word': 'Microsoft', 'start': 465, 'end': 474}
{'entity_group': 'ORG', 'score': 0.982275, 'word': 'Federal Reserve', 'start': 698, 'end': 713}
{'entity_group': 'ORG', 'score': 0.9968617, 'word': 'S & P', 'start': 1233, 'end': 1236}
{'entity_group': 'ORG', 'score': 0.89660454, 'word': 'P', 'start': 1348, 'end': 1349}
{'entity_group': 'ORG', 'score': 0.9968209, 'word': 'Federal Reserve', 'start': 1470, 'end': 1485}
{'entity_group': 'ORG', 'score': 0.9983499, 'word': 'Fed', 'start': 1491, 'end': 1494}
{'entity_group': 'ORG', 'score': 0.996633, 'word': 'Fed', 'start': 1665, 'end': 1668}
{'entity_group': 'ORG', 'score': 0.99665403, 'word': 'Fed', 'start': 1838, 'end': 1841}
{'entity_group': 'ORG', 'score': 0.9967719, 'word': 'Fed', 'start': 1913, 'end': 1916}
{'entity_group': 'ORG', 'score': 0.9969798, 'word': 'Fed', 'start': 2043, 'end': 2046}
{'entity_group':