In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the `en_core_web_sm` pipeline once; later cells call `nlp(...)` on raw text.
nlp = en_core_web_sm.load()

In [2]:
# Parse a short recommendation letter; `doc` is reused by the following cells.
doc = nlp(
    'I am happy to write this letter of recommendation for AAA who intends to pursue a Masters degree at your prestigious university. I have known him only for the past year and it was easy to observe that he is a quick learner and hardworking. I handled the course Natural Language Processing for him during his sixth semester.  '
)

# Show every recognised entity as a (text, label) pair.
print([
    (entity.text, entity.label_)
    for entity in doc.ents
])

[('AAA', 'ORG'), ('the past year', 'DATE'), ('his sixth semester', 'DATE')]


In [3]:
# Per-token view: the token, its IOB entity flag, and its entity type
# (empty string when the token is outside any entity).
print([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])

[(I, 'O', ''), (am, 'O', ''), (happy, 'O', ''), (to, 'O', ''), (write, 'O', ''), (this, 'O', ''), (letter, 'O', ''), (of, 'O', ''), (recommendation, 'O', ''), (for, 'O', ''), (AAA, 'B', 'ORG'), (who, 'O', ''), (intends, 'O', ''), (to, 'O', ''), (pursue, 'O', ''), (a, 'O', ''), (Masters, 'O', ''), (degree, 'O', ''), (at, 'O', ''), (your, 'O', ''), (prestigious, 'O', ''), (university, 'O', ''), (., 'O', ''), (I, 'O', ''), (have, 'O', ''), (known, 'O', ''), (him, 'O', ''), (only, 'O', ''), (for, 'O', ''), (the, 'B', 'DATE'), (past, 'I', 'DATE'), (year, 'I', 'DATE'), (and, 'O', ''), (it, 'O', ''), (was, 'O', ''), (easy, 'O', ''), (to, 'O', ''), (observe, 'O', ''), (that, 'O', ''), (he, 'O', ''), (is, 'O', ''), (a, 'O', ''), (quick, 'O', ''), (learner, 'O', ''), (and, 'O', ''), (hardworking, 'O', ''), (., 'O', ''), (I, 'O', ''), (handled, 'O', ''), (the, 'O', ''), (course, 'O', ''), (Natural, 'O', ''), (Language, 'O', ''), (Processing, 'O', ''), (for, 'O', ''), (him, 'O', ''), (during, 'O',

In [4]:
# Number of entities recognised in the letter.
len(doc.ents)

3

In [5]:
# Collect the label of each entity in the letter, then tally them.
labels = [entity.label_ for entity in doc.ents]
Counter(labels)

Counter({'ORG': 1, 'DATE': 2})

In [6]:
# Materialise the sentence spans of the letter and show them.
sentences = list(doc.sents)
print(sentences)

[I am happy to write this letter of recommendation for AAA who intends to pursue a Masters degree at your prestigious university., I have known him only for the past year and it was easy to observe that he is a quick learner and hardworking., I handled the course Natural Language Processing for him during his sixth semester.  ]


In [7]:
# Highlight the entities of the already-parsed letter.
# Rendering `doc` directly avoids turning a list of sentence spans into its
# Python repr (which adds "[", ", " and "]" to the text) and running the
# pipeline a second time on that string.
displacy.render(doc, style='ent', jupyter=True)

In [8]:
# List each token together with the character offset where it starts in the text.
for token in doc:
    print(f"{token} {token.idx}")

I 0
am 2
happy 5
to 11
write 14
this 20
letter 25
of 32
recommendation 35
for 50
AAA 54
who 58
intends 62
to 70
pursue 73
a 80
Masters 82
degree 90
at 97
your 100
prestigious 105
university 117
. 127
I 129
have 131
known 136
him 142
only 146
for 151
the 155
past 159
year 164
and 169
it 173
was 176
easy 180
to 185
observe 188
that 196
he 201
is 204
a 207
quick 209
learner 215
and 223
hardworking 227
. 238
I 240
handled 242
the 250
course 254
Natural 261
Language 269
Processing 278
for 289
him 293
during 297
his 304
sixth 308
semester 314
. 322
  324


In [9]:
# Draw the dependency parse of the whole letter inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

In [10]:
from bs4 import BeautifulSoup
import requests
import re

In [11]:
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as a single string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            through to ``requests.get``. Without it a stalled connection
            would block the notebook indefinitely.

    Returns:
        The text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by one
        space. Runs of ordinary spaces are left as they are.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status,
            so an error page is never parsed as if it were the article.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the readable page content.
    for element in soup(["script", "style", 'aside']):
        element.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [12]:
# Fetch the Student News Daily article archive from the web as plain text.
# NOTE(review): the name `ny_bb` does not describe this page — consider renaming
# it together with its use in the next cell.

ny_bb = url_to_string('https://www.studentnewsdaily.com/archive/daily-news-article/')

In [13]:
# Run the pipeline over the downloaded page text and count the entities it found.
article = nlp(ny_bb)
len(article.ents)

1509

In [14]:
# Collect the label of each entity in the article, then tally them.
labels = [entity.label_ for entity in article.ents]
Counter(labels)

Counter({'ORG': 551,
         'WORK_OF_ART': 23,
         'DATE': 283,
         'CARDINAL': 143,
         'PERSON': 54,
         'GPE': 259,
         'NORP': 63,
         'ORDINAL': 23,
         'FAC': 10,
         'EVENT': 11,
         'LOC': 19,
         'PERCENT': 2,
         'MONEY': 43,
         'TIME': 6,
         'PRODUCT': 10,
         'QUANTITY': 8,
         'LAW': 1})

In [15]:
# Surface text of every entity in the article, then the three most frequent ones.
items = [entity.text for entity in article.ents]
Counter(items).most_common(3)

[('Reuters', 66), ('U.S.', 36), ('New York Post', 35)]

In [16]:
# Materialise the sentence spans of the article and peek at one of them.
sentences = list(article.sents)
print(sentences[10])

Questions relating to the week's Daily News Articles                                                     keyboard_arrow_right                                                                                       


In [17]:
# Highlight the entities of the already-parsed article.
# Rendering `article` directly avoids turning a list of sentence spans into its
# Python repr (which adds "[", ", " and "]" to the text) and running the
# pipeline a second time over the whole page.
displacy.render(article, style='ent', jupyter=True)

In [18]:
# Dependency parse of a single article sentence, re-parsed on its own text;
# the 'distance' option sets the spacing between words in the diagram.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [21]:
 print("TEXT	START	END	LABEL	DESCRIPTION")
for ent in article.ents:
    print('\n',ent.text,'\t',ent.start_char,'\t', ent.end_char,'\t', ent.label_)
   

TEXT	START	END	LABEL	DESCRIPTION

 Student News Daily                                                                       Archives                                                                                                 	 212 	 405 	 ORG

 Daily News 	 405 	 415 	 WORK_OF_ART

 keyboard_arrow_right 	 517 	 537 	 ORG

 Tuesday 	 624 	 631 	 DATE

 Three 	 667 	 672 	 CARDINAL

 Tuesday 	 688 	 695 	 DATE

 keyboard_arrow_right 	 748 	 768 	 ORG

 Wednesday 	 855 	 864 	 DATE

 Media Bias 	 878 	 888 	 ORG

 Thursday 	 1103 	 1111 	 DATE

 Friday 	 1339 	 1345 	 DATE

 week 	 1404 	 1408 	 DATE

 Daily News 	 1411 	 1421 	 ORG

 Friday 	 1590 	 1596 	 DATE

 Editorial Cartoon                      	 1599 	 1637 	 ORG

 Human Interest News 	 1877 	 1896 	 ORG

 keyboard_arrow_right 	 1949 	 1969 	 CARDINAL

 Liberal Beliefs 	 2390 	 2405 	 PERSON

 Media Bias 	 2941 	 2951 	 ORG

 keyboard_arrow_right 	 3189 	 3209 	 CARDINAL

 keyboard_arrow_right 	 3545 	 3565 	 ORG

 keyboard_a

 11/21/2019 	 23289 	 23299 	 DATE

 US 	 23302 	 23304 	 GPE

 Israeli 	 23310 	 23317 	 NORP

 NPR 	 23379 	 23382 	 ORG

 South Korea 	 23534 	 23545 	 GPE

 UPI 	 23601 	 23604 	 ORG

 11/18/2019 - US military academy athletes 	 23732 	 23773 	 MONEY

 pro - Associated Press 	 23800 	 23822 	 ORG

 Trump 	 23976 	 23981 	 ORG

 Democrats 	 23989 	 23998 	 NORP

 New York Post 	 24015 	 24028 	 ORG

 The Washington Post 	 24267 	 24286 	 ORG

 USS Grayback 	 24452 	 24464 	 FAC

 75-year 	 24481 	 24488 	 DATE

 UPI 	 24513 	 24516 	 ORG

 ABC News 	 24521 	 24529 	 ORG

 11/12/2019 	 24657 	 24667 	 CARDINAL

 1 	 24677 	 24678 	 MONEY

 China 	 24681 	 24686 	 GPE

 11/8/2019 - 2019 Veterans Day 	 24847 	 24876 	 DATE

 11/7/2019 - US 	 25004 	 25018 	 MONEY

 YahooNews 	 25081 	 25090 	 ORG

 Iranians 	 25448 	 25456 	 NORP

 American 	 25466 	 25474 	 NORP

 Ronald Reagan’s 	 25487 	 25502 	 PERSON

 40 years ago 	 25520 	 25532 	 DATE

 Associated Press 	 25535 	 25551 	 ORG

 

 PhillyVoice 	 45800 	 45811 	 ORG

 WSJ 	 45820 	 45823 	 ORG

 Mexico 	 45998 	 46004 	 GPE

 New York Post 	 46014 	 46027 	 ORG

 El Paso Times 	 46032 	 46045 	 ORG

 3/6/2019 	 46173 	 46181 	 CARDINAL

 India 	 46230 	 46235 	 GPE

 Turkey - UPI 	 46237 	 46249 	 ORG

 3/5/2019 	 46377 	 46385 	 CARDINAL

 1 	 46395 	 46396 	 MONEY

 U.S. 	 46399 	 46403 	 GPE

 THAAD missile 	 46412 	 46425 	 PRODUCT

 Israel 	 46444 	 46450 	 GPE

 3/4/2019 	 46588 	 46596 	 QUANTITY

 CNN 	 46658 	 46661 	 ORG

 Wire 	 46662 	 46666 	 LAW

 UPI 	 46671 	 46674 	 ORG

 3/1/2019 	 46802 	 46810 	 CARDINAL

 North Korea 	 46830 	 46841 	 GPE

 Fox News 	 46888 	 46896 	 ORG

 February 2019 	 46961 	 46974 	 DATE

 2/28/2019 	 47037 	 47046 	 DATE

 Senate 	 47049 	 47055 	 ORG

 Democrats 	 47056 	 47065 	 NORP

 Born-Alive Abortion Survivors Protection Act - Fox News 	 47072 	 47127 	 ORG

 2/27/2019 	 47255 	 47264 	 CARDINAL

 US 	 47267 	 47269 	 GPE

 Vietnam 	 47288 	 47295 	 GPE

 New Yor

 5/7/2018 - Iowa 	 72243 	 72258 	 DATE

 Catholic News Agency 	 72306 	 72326 	 ORG

 5/4/2018 	 72454 	 72462 	 CARDINAL

 Haley 	 72465 	 72470 	 GPE

 America 	 72521 	 72528 	 GPE

 UN 	 72539 	 72541 	 ORG

 n’t - Fox News 	 72561 	 72575 	 ORG

 5/3/2018 - Calif. 	 72703 	 72720 	 DATE

 7-Eleven 	 72721 	 72729 	 ORG

 5/2/2018 	 72930 	 72938 	 DATE

 Reuters 	 72985 	 72992 	 ORG

 5/1/2018 - World #1 	 73120 	 73139 	 MONEY

 N. Korea 	 73142 	 73150 	 GPE

 May 	 73187 	 73190 	 DATE

 US 	 73199 	 73201 	 GPE

 Agence France-Presse 	 73212 	 73232 	 ORG

 April 2018 	 73297 	 73307 	 DATE

 4/30/2018 - U.S. 	 73370 	 73386 	 DATE

 Saudi Arabia 	 73388 	 73400 	 GPE

 Israel 	 73405 	 73411 	 GPE

 Iran 	 73414 	 73418 	 GPE

 NY Post 	 73453 	 73460 	 ORG

 4/27/2018 	 73588 	 73597 	 DATE

 Rock’ - Philly.com 	 73613 	 73631 	 ORG

 USA Today 	 73636 	 73645 	 ORG

 nearly $400M 	 73821 	 73833 	 MONEY

 Coast Guard - New York Post 	 73844 	 73871 	 ORG

 4/25/2018 	 739

 Houston Council 	 102076 	 102091 	 ORG

 Red Cross - Houston Press 	 102134 	 102159 	 ORG

 The Wall Street Journal 	 102351 	 102374 	 ORG

 Chicago Tribune 	 102581 	 102596 	 ORG

 9/12/2017 - Irma 	 102724 	 102740 	 ORG

 about 7.3 million 	 102761 	 102778 	 CARDINAL

 Reuters 	 102781 	 102788 	 ORG

 9/11/2017 - 	 102916 	 102927 	 DATE

 9/8/2017 - Trump 	 103107 	 103123 	 PRODUCT

 Democrats 	 103135 	 103144 	 NORP

 Reuters 	 103161 	 103168 	 ORG

 9/7/2017 	 103296 	 103304 	 CARDINAL

 - Appeals 	 103305 	 103314 	 ORG

 Texas 	 103327 	 103332 	 GPE

 The Hill 	 103360 	 103368 	 LOC

 North Korea 	 103520 	 103531 	 GPE

 US 	 103579 	 103581 	 GPE

 The Hill 	 103592 	 103600 	 LOC

 May 2017 	 103665 	 103673 	 DATE

 5/26/2017 	 103736 	 103745 	 DATE

 Memorial Day 2017 - USMemorialDay.org 	 103767 	 103804 	 EVENT

 5/25/2017 - Thousands 	 103932 	 103953 	 CARDINAL

 Philippine 	 103959 	 103969 	 NORP

 Islamic State - Compiled 	 103992 	 104016 	 ORG

 Reut