# Extracting named entities from an article

In [29]:
from bs4 import BeautifulSoup
import requests
import re
from ner_spacy import nlp
from collections import Counter
from spacy import displacy

# https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

In [17]:
def url_to_string(url, timeout=10):
    """Download a news article and return the text of its body paragraphs.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. The
    article body is taken to be the first ``<div>`` carrying the CSS class
    ``description``; the text of every ``<p>`` inside it is joined with
    single spaces.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        str: The concatenated paragraph text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page contains no ``<div class="description">``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of trying to scrape an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # https://medium.com/@adamaulia/python-simple-crawling-using-beautifulsoup-8247657c2de5
    # find news content
    news_content = soup.find("div", {'class': 'description'})
    if news_content is None:
        # soup.find returns None when nothing matches; report that clearly
        # rather than failing later with an AttributeError on None.
        raise ValueError(
            "no <div class='description'> element found at {!r}".format(url)
        )

    # find paragraphs in news content
    paragraphs = news_content.find_all('p')
    content = ' '.join(item.text for item in paragraphs)

    # Round-trip through UTF-8 with errors='replace' so that any character
    # that cannot be encoded is replaced instead of raising later on.
    return content.encode('utf8', 'replace').decode('utf-8')

In [18]:
# Fetch the article text, run it through `nlp` (imported from `ner_spacy`),
# and show how many entities were detected in the resulting document.
ny_bb = url_to_string(
    'https://www.indiatoday.in/crime/story/uttar-pradesh-fatehpur-man-kills-wife-mob-beats-death-flee-1615197-2019-11-02'
)
article = nlp(ny_bb)
len(article.ents)

24

In [21]:
# Count how often each entity label occurs in the article.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'GPE': 4, 'DATE': 3, 'PERSON': 8, 'CARDINAL': 4, 'ORG': 5})

In [22]:
# Three most frequent entity texts in the article.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Qureshi', 4), ('Fatehpur', 2), ('three', 2)]

In [24]:
# Collect the article's sentences and print one of them (index 10) as a sample.
sentences = list(article.sents)
print(sentences[10])

A doctor, who conducted the post-mortem examination, said that apart from sustaining injuries to his head and mouth, a number of Qureshi's bones were fractured.


In [32]:
# Named-entity visualization (style='ent') of sample sentence 10; the
# sentence text is re-parsed through `nlp` before being handed to displaCy.
displacy.render(
    nlp(str(sentences[10])),
    style='ent',
    jupyter=True,
)

In [34]:
# Dependency-parse visualization (style='dep') of sample sentence 10,
# rendered with the 'distance' option set to 120.
displacy.render(
    nlp(str(sentences[10])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [36]:
# Verbatim text, part-of-speech tag and lemma of every token in sample
# sentence 10 that is neither a stop word nor punctuation. A single filtered
# comprehension is enough; no intermediate list of tokens is needed.
[(token.orth_, token.pos_, token.lemma_)
 for token in nlp(str(sentences[10]))
 if not token.is_stop and token.pos_ != 'PUNCT']

[('doctor', 'NOUN', 'doctor'),
 ('conducted', 'VERB', 'conduct'),
 ('post', 'ADJ', 'post'),
 ('-', 'ADJ', '-'),
 ('mortem', 'ADJ', 'mortem'),
 ('examination', 'NOUN', 'examination'),
 ('said', 'VERB', 'say'),
 ('apart', 'ADV', 'apart'),
 ('sustaining', 'VERB', 'sustain'),
 ('injuries', 'NOUN', 'injury'),
 ('head', 'NOUN', 'head'),
 ('mouth', 'NOUN', 'mouth'),
 ('number', 'NOUN', 'number'),
 ('Qureshi', 'PROPN', 'Qureshi'),
 ('bones', 'NOUN', 'bone'),
 ('fractured', 'VERB', 'fracture')]

In [37]:
# Map each entity text found in sample sentence 10 to its entity label.
{str(entity): entity.label_ for entity in nlp(str(sentences[10])).ents}

{'Qureshi': 'PERSON'}

In [38]:
# For every token of sample sentence 10, print the token together with its
# entity IOB tag (ent_iob_) and entity type (ent_type_).
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in sentences[10]
])

[(A, 'O', ''), (doctor, 'O', ''), (,, 'O', ''), (who, 'O', ''), (conducted, 'O', ''), (the, 'O', ''), (post, 'O', ''), (-, 'O', ''), (mortem, 'O', ''), (examination, 'O', ''), (,, 'O', ''), (said, 'O', ''), (that, 'O', ''), (apart, 'O', ''), (from, 'O', ''), (sustaining, 'O', ''), (injuries, 'O', ''), (to, 'O', ''), (his, 'O', ''), (head, 'O', ''), (and, 'O', ''), (mouth, 'O', ''), (,, 'O', ''), (a, 'O', ''), (number, 'O', ''), (of, 'O', ''), (Qureshi, 'B', 'PERSON'), ('s, 'O', ''), (bones, 'O', ''), (were, 'O', ''), (fractured, 'O', ''), (., 'O', '')]


In [47]:
# Show every entity span detected across the whole article.
article.ents

(Fatehpur,
 Saturday,
 Nisar Qureshi's,
 three,
 Uttar,
 Police OP Singh,
 Amethi,
 Wednesday,
 Qureshi,
 Chhattisgarh,
 Fatehpur,
 Afsari,
 35,
 Qureshi,
 DGP,
 Gazipur Station,
 Qureshi,
 Ishfaq,
 100-150,
 Friday,
 Ishfaq,
 Police Prashant Verma,
 three,
 Qureshi)

In [39]:
# Named-entity visualization (style='ent') of the complete article.
displacy.render(
    article,
    style='ent',
    jupyter=True,
)