In [1]:
import spacy
# Load the small English pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [2]:
# The tokenizer splits off the contraction, the punctuation and the hyphen,
# but keeps the email address and the URL as single tokens.
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")
for token in doc2:
    print(token.text)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [3]:
# Units and currency symbols become separate tokens:
# "5km" -> "5", "km" and "$10.30" -> "$", "10.30".
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
for token in doc3:
    print(token.text)

A
5
km
NYC
cab
ride
costs
$
10.30


In [4]:
# Punctuation that belongs to a known abbreviation ("St.", "U.S.") stays
# attached to its token, while the sentence-final period is split off.
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")
for token in doc4:
    print(token.text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


In [5]:
# len() of a Doc is its number of tokens.
len(doc3), len(doc4)

(9, 11)

In [6]:
# Vocabulary size reported by each Doc (all three were created by the same `nlp`).
tuple(len(d.vocab) for d in (doc3, doc4, doc2))

(789, 789, 789)

In [7]:
# Displaying a Doc shows its text.
doc3

A 5km NYC cab ride costs $10.30

In [8]:
# Individual tokens can be retrieved by index position,
# and spans of tokens can be retrieved through slicing.
doc3[5]

ride

In [9]:
# Slice of tokens 2 through 5 (the end index is exclusive).
doc3[2:6]

km NYC cab ride

In [10]:
# Negative indices count from the end: the last four tokens.
doc3[-4:]

ride costs $10.30

In [12]:
# Tokens cannot be reassigned.
# Build two parallel sentences; the next cell tries to swap a token between them.

doc6 = nlp(u'My dinner was horrible.')
doc7 = nlp(u'Your dinner was delicious.')

In [13]:
# Try to change "My dinner was horrible" to "My dinner was delicious".
# Doc objects do not support item assignment, so this raises a TypeError.
# The error is caught and printed so that the demonstration does not
# abort a "Restart Kernel -> Run All" of the notebook.
try:
    doc6[3] = doc7[3]
except TypeError as err:
    print('{}: {}'.format(type(err).__name__, err))

TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment

#  Named Entities

Going a step beyond tokens, *named entities* add another layer of context.
The language model recognizes that certain words are organizational names while
others are locations, and still other combinations relate to money, dates, etc.
Named entities are accessible through the `ents` property of a `Doc` object.

In [15]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

# Show every token on a single line, each followed by ' | '.
for token in doc8:
    print(token.text, end=' | ')
print('\n----')

# For each named entity: its text, its label, and spaCy's description of that label.
for entity in doc8.ents:
    label = entity.label_
    print('-'.join([entity.text, label, str(spacy.explain(label))]))

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple-ORG-Companies, agencies, institutions, etc.
Hong Kong-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


# Noun Chunks 

In [16]:
# Iterate over the noun chunks of the parsed sentence and print each one.
doc9 = nlp(u'Autonomous cars shift insurance liability toward manufacturers.')
for noun_phrase in doc9.noun_chunks:
    print(noun_phrase.text)

Autonomous cars
insurance liability
manufacturers


In [17]:
# Iterate over the noun chunks of the parsed sentence and print each one.
doc10 = nlp(u"Red cars do not carry higher insurance rates.")
for noun_phrase in doc10.noun_chunks:
    print(noun_phrase.text)

Red cars
higher insurance rates


In [18]:
# A long run of modifiers still ends up inside a single noun chunk.
doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")
for noun_phrase in doc11.noun_chunks:
    print(noun_phrase.text)

He
a one-eyed, one-horned, flying, purple people-eater


# Visualizing the dependency parse

In [19]:
from spacy import displacy

In [23]:
# Render the dependency parse of the sentence inline in the notebook.
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    # NOTE(review): per the displaCy docs, `distance` sets the spacing between words — confirm.
    options={'distance': 130},
)

In [24]:
# Render the sentence with the 'ent' (named-entity) style inline in the notebook.
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)