# `Tokenization`

In [1]:
import spacy
# Load the `en_core_web_sm` pipeline once; every later cell reuses this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Sample text that itself contains double quotes and an apostrophe.
mystring = "\"We're moving to L.A.!\""

In [3]:
# print() writes the raw text: the inner double quotes appear, the escaping backslash does not.
print(mystring)

"We're moving to L.A.!"


In [4]:
# Run the loaded pipeline over the string; `doc` is what the next cell iterates token by token.
doc = nlp(mystring)

In [5]:
# One token per line: quotes, the contraction "'re" and "!" are split off, "L.A." stays whole.
for tok in doc:
    print(tok.text)

"
We
're
moving
to
L.A.
!
"


In [6]:
# Punctuation and the hyphen in "snail-mail" are split off, while the e-mail
# address and the URL each stay a single token.
doc2 = nlp("We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")

for token in doc2:
    # Use .text explicitly, matching the first token loop, instead of relying on str(token).
    print(token.text)

We
're
here
to
help
!
Send
snail
-
mail
,
email
support@oursite.com
or
visit
us
at
http://www.oursite.com
!


In [7]:
# The unit is split from the number ("5" / "km") and the currency symbol
# from the amount ("$" / "10.30"); the decimal amount stays one token.
doc3 = nlp('A 5km NYC cab ride costs $10.30')

for token in doc3:
    # Use .text explicitly, matching the first token loop, instead of relying on str(token).
    print(token.text)

A
5
km
NYC
cab
ride
costs
$
10.30


In [8]:
# Abbreviations such as "St." and "U.S." keep their periods; only the
# sentence-final period becomes its own token.
doc4 = nlp("Let's visit St. Louis in the U.S. next year.")

for token in doc4:
    # Use .text explicitly, matching the first token loop, instead of relying on str(token).
    print(token.text)

Let
's
visit
St.
Louis
in
the
U.S.
next
year
.


## Get number of tokens

In [9]:
len(doc4) # number of tokens in doc4 (11, matching the tokens printed above)

11

In [10]:
len(doc4.vocab) # number of lexemes currently held in the Vocab (per spaCy's Vocab.__len__ docs) -- session-dependent, not the model's total vocabulary size

791

In [11]:
# Short sentence used by the next two cells to demonstrate indexing and slicing.
doc5 = nlp('It is better to give than to receive.')

In [12]:
# Integer indexing returns a single token; index 0 is the first one ("It").
doc5[0]

It

In [13]:
# Slicing returns the tokens at positions 2, 3 and 4 (end index excluded): "better to give".
doc5[2:5]

better to give

## Named entities

In [14]:
# Sample sentence for the named-entity demo (text kept exactly as before, including its trailing space).
doc8 = nlp("Apple to build a Hong Kong factory for $6 million ")

In [15]:
# Show every token on a single line, each one followed by " | " (no trailing newline).
token_row = "".join(f"{tok.text} | " for tok in doc8)
print(token_row, end="")

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [16]:
# One row per named entity: entity text | label | human-readable label description.
# Text and label are padded to 10 characters so the columns line up.
for entity in doc8.ents:
    description = spacy.explain(entity.label_)
    print(f"{entity.text:10} | {entity.label_:10} | {description}")

Apple      | ORG        | Companies, agencies, institutions, etc.
Hong Kong  | GPE        | Countries, cities, states
$6 million | MONEY      | Monetary values, including unit


## Noun chunks

In [17]:
# A noun chunk is a noun together with the words that describe it.
# Example: in "Autonomous cars", "cars" is the noun and "Autonomous" describes it.
doc9 = nlp("Autonomous cars shift insurance liability toward manufacturers.")

In [19]:
# List each noun chunk found in the sentence, one per line.
for noun_chunk in doc9.noun_chunks:
    print(noun_chunk.text)

Autonomous cars
insurance liability
manufacturers


In [None]:
# "manufacturers" is also a noun chunk, but it stands alone: no describing word is attached to the noun.