In [16]:
import spacy
from spacy import displacy

import nltk
from nltk.tokenize import word_tokenize
from nltk import ne_chunk

In [2]:
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [3]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", spacy.explain(ent.label_))

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 billion  |  MONEY  |  Monetary values, including unit


In [4]:
displacy.render(doc, style = "ent")

In [5]:
nlp.pipe_labels["ner"]

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [6]:
doc = nlp("Henry Ford founded Ford in 1903")

for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Henry Ford | PERSON | People, including fictional
Ford | ORG | Companies, agencies, institutions, etc.
1903 | DATE | Absolute or relative dates or periods


In [7]:
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")

for ent in doc.ents:
    print(ent.text, " | ", ent.label_, " | ", ent.start_char, "|", ent.end_char)

Tesla Inc  |  ORG  |  0 | 9
Twitter Inc  |  ORG  |  30 | 41
$45 billion  |  MONEY  |  46 | 57


In [8]:
doc = nlp("Tesla is going to acquire Twitter for $45 billion")

for ent in doc.ents:
    print(ent.text, " | ", ent.label_)

Twitter  |  PERSON
$45 billion  |  MONEY


In [9]:
s = doc[2:5]
s

going to acquire

In [10]:
type(s)

spacy.tokens.span.Span

In [11]:
from spacy.tokens import Span

In [12]:
s1 = Span(doc, 0, 1, label = "ORG")
s2 = Span(doc, 5, 6, label = "ORG")

doc.set_ents([s1, s2], default = "unmodified")

In [13]:
for ent in doc.ents:
    print(ent.text, " | ", ent.label_)

Tesla  |  ORG
Twitter  |  ORG
$45 billion  |  MONEY


In [14]:
text = "Barack Obama was born in Hawaii and became the 44th President of the United States."

In [15]:
tokens = word_tokenize(text)

#perform named entity recognition
tagged = nltk.pos_tag(tokens)
entities = ne_chunk(tagged)

#iterate over the entities and print named entities
for entity in entities:
    if hasattr(entity, "label") and entity.label() in ["PERSON", "ORGANIZATION", "LOCATION"]:
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))

PERSON Barack
PERSON Obama
