In [51]:
#Data Frame
import pandas

#NLP libraries
## Import nltk and dependcies
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
from nltk.tag import StanfordPOSTagger # For french language
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

#Extracting Named entity from article
from bs4 import BeautifulSoup
import requests
import re

##Import spacy for named Entity
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to /home/keberger/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/keberger/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/keberger/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/keberger/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [52]:
def preprocesss_and_tag(txt):
    txt = nltk.word_tokenize(txt)
    txt = nltk.pos_tag(txt)
    return txt

In [53]:
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [61]:
#hbr : what boards needs to know : https://hbr.org/2019/05/what-boards-need-to-know-about-ai
url = "https://hbr.org/2019/05/what-boards-need-to-know-about-ai"
text = url_to_string(url)
article = nlp(text)
len(article.ents)

66

In [59]:
#There are 66 entities in the article and they are represented as 10 unique labels:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'ORG': 27,
         'PERSON': 16,
         'WORK_OF_ART': 1,
         'DATE': 4,
         'GPE': 4,
         'CARDINAL': 7,
         'EVENT': 1,
         'PERCENT': 2,
         'LANGUAGE': 1,
         'LAW': 1,
         'PRODUCT': 1,
         'NORP': 1})

In [62]:
#The following are three most frequent tokens.
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('ML', 4), ('AI', 3), ('Douglas Merrill', 2)]

In [79]:
#Let’s randomly select one sentence to learn more.
sentences = [x for x in article.sents]
print(sentences)

[          What Boards Need to Know About AI                                                                                                      , 2/3 Free Articles, leftRemaining  , Register for more  , |  Subscribe + Save!, Subscribe Sign, In CLEAR SUGGESTED TOPICS Explore HBR, The Latest, The Magazine, Most Popular Podcasts Video Store Webinars Newsletters Popular Topics, Managing, Yourself Leadership Strategy Managing Teams Gender Innovation Work-life, Balance All Topics, For Subscribers, The Big Idea Visual Library Reading Lists Case, Selections Subscribe My Account My Library, Topic Feeds Purchases Account Settings Email, Preferences Log Out, Sign, In Subscribe The Latest Podcasts Video, The Magazine Store Webinars Newsletters, All Topics, The Big Idea Visual Library Reading Lists Case Selections, My Library Account Settings Log Out Sign, In Your Cart, Your Shopping Cart is empty., Visit Our Store Guest User  , Subscriber My Library Topic Feeds Purchases Account Settings Email P

In [80]:
#Let’s run displacy.render to generate the raw markup.
displacy.render(nlp(str(sentences)), jupyter=True, style='ent')

In [89]:
#Using spaCy’s built-in displaCy visualizer, here’s what the above sentence and its dependencies look like:
displacy.render(nlp(str(sentences[1:5])), style='dep', jupyter = True, options = {'distance': 120})

In [82]:
#Next, we verbatim, extract part-of-speech and lemmatize this sentence.
[(x.orth_,x.pos_, x.lemma_) for x in [y for y in nlp(str(sentences))if not y.is_stop and y.pos_ != 'PUNCT']]

[('         ', 'SPACE', '         '),
 ('Boards', 'PROPN', 'Boards'),
 ('Need', 'VERB', 'need'),
 ('Know', 'VERB', 'know'),
 ('AI', 'PROPN', 'AI'),
 ('                                                                                                     ',
  'SPACE',
  '                                                                                                     '),
 ('2/3', 'NUM', '2/3'),
 ('Free', 'PROPN', 'Free'),
 ('Articles', 'PROPN', 'Articles'),
 ('leftRemaining', 'NOUN', 'leftremaining'),
 (' ', 'SPACE', ' '),
 ('Register', 'PROPN', 'Register'),
 (' ', 'SPACE', ' '),
 ('|', 'PROPN', '|'),
 (' ', 'SPACE', ' '),
 ('Subscribe', 'PROPN', 'Subscribe'),
 ('+', 'CCONJ', '+'),
 ('Save', 'VERB', 'save'),
 ('Subscribe', 'PROPN', 'Subscribe'),
 ('Sign', 'PROPN', 'Sign'),
 ('CLEAR', 'PROPN', 'CLEAR'),
 ('SUGGESTED', 'VERB', 'suggest'),
 ('TOPICS', 'PROPN', 'TOPICS'),
 ('Explore', 'PROPN', 'Explore'),
 ('HBR', 'PROPN', 'HBR'),
 ('Latest', 'ADJ', 'late'),
 ('Magazine', 'PROPN', 'Magazin

In [88]:
#Final Dictionnary to upgrade
dict([(str(x), x.label_) for x in nlp(str(sentences)).ents])

{'2/3': 'CARDINAL',
 'Register': 'PERSON',
 'Subscribe Sign': 'PERSON',
 'HBR': 'ORG',
 'The Magazine, Most Popular Podcasts Video Store': 'ORG',
 'Managing, Yourself Leadership Strategy Managing Teams Gender Innovation Work-life': 'ORG',
 'Balance All Topics': 'ORG',
 'Subscribers': 'PERSON',
 'The Big Idea Visual Library Reading Lists Case': 'ORG',
 'Sign': 'ORG',
 'All Topics': 'ORG',
 'The Big Idea Visual Library Reading Lists Case Selections': 'ORG',
 'Library Account Settings Log Out Sign': 'PERSON',
 'Guest User  ': 'PERSON',
 'Boards': 'GPE',
 'Boards Need': 'WORK_OF_ART',
 'Douglas Merrill': 'PERSON',
 'May 24, 2019  ': 'DATE',
 'Summary Full Text  ': 'PERSON',
 'Save  Share  ': 'PERSON',
 'Comment  ': 'PERSON',
 'Text Size  ': 'PERSON',
 'Print Loading...': 'PERSON',
 'Summary': 'PERSON',
 'AI': 'ORG',
 'C. J. Burton': 'PERSON',
 'Getty Images': 'PERSON',
 'More than half': 'CARDINAL',
 'Gartner CIO Survey': 'EVENT',
 'the end of 2020': 'DATE',
 '14%': 'PERCENT',
 'today': 'D

In [57]:
pprint([(X.text, X.label_) for X in article.ents])

[('Boards Need', 'ORG'),
 ('Know', 'PERSON'),
 ('Register', 'PERSON'),
 ('Subscribe Sign', 'PERSON'),
 ('Balance All Topics', 'ORG'),
 ('Library Account Settings Log Out Sign', 'PERSON'),
 ('Reading List Reading Lists Boards', 'WORK_OF_ART'),
 ('Douglas Merrill', 'PERSON'),
 ('May 24, 2019  ', 'DATE'),
 ('Text  ', 'PERSON'),
 ('Summary', 'PERSON'),
 ('Boards', 'GPE'),
 ('C. J. Burton', 'PERSON'),
 ('Getty Images', 'PERSON'),
 ('More than half', 'CARDINAL'),
 ('Gartner CIO Survey', 'EVENT'),
 ('the end of 2020', 'DATE'),
 ('14%', 'PERCENT'),
 ('today', 'DATE'),
 ('NewVantage Partners', 'ORG'),
 ('75%', 'PERCENT'),
 ('half', 'CARDINAL'),
 ('billions', 'CARDINAL'),
 ('four', 'CARDINAL'),
 ('ML', 'ORG'),
 ('AI', 'ORG'),
 ('AI', 'ORG'),
 ('English', 'LANGUAGE'),
 ('ERP', 'ORG'),
 ('ML', 'ORG'),
 ('ML', 'ORG'),
 ('one', 'CARDINAL'),
 ('Facebook', 'PERSON'),
 ('Google', 'ORG'),
 ('ML', 'ORG'),
 ('’s', 'ORG'),
 ('AI', 'ORG'),
 ('one', 'CARDINAL'),
 ('Formula 1', 'LAW'),
 ('’s', 'GPE'),
 ('Doug