# **Tokenization in Spacy**

In [23]:
import spacy

**Word Tokenization**

In [24]:
nlp = spacy.blank('en')

doc = nlp('Dr. Strange love pav bhaji of karachi as it costs only 2$ per plate.')

In [25]:
for word in doc:
  print(word)

Dr.
Strange
love
pav
bhaji
of
karachi
as
it
costs
only
2
$
per
plate
.


In [26]:
doc[0:4]

Dr. Strange love pav

In [27]:
doc = nlp('Tony gave two $ to Peter')

In [28]:
token0 = doc[0]
token0

Tony

In [29]:
token0.like_num

False

In [30]:
token1 = doc[2]
token1

two

In [31]:
token1.like_num

True

In [32]:
token2 = doc[3]
token2

$

In [33]:
token2.is_currency

True

In [35]:
text = 'Dayton high school, 8th grade students information\n ==================================================\n \n Name\tbirth day   \temail\n -----\t------------\t------\n Virat   5 June, 1882    virat@kohli.com\n Maria\t12 April, 2001  maria@sharapova.com\n Serena  24 June, 1998   serena@williams.com \n Joe      1 May, 1997    joe@root.com\n \n \n \n'

In [36]:
doc = nlp(text)

In [37]:
emails = []
for tokens in doc:
  if tokens.like_email:
    emails.append(tokens.text)


In [38]:
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [39]:
nlp = spacy.blank('ur')

In [40]:
doc = nlp('کیسی ہیں آپ اور کیسی جا رہی ہے۔')

In [41]:
for tokens in doc:
  print(tokens)

کیسی
ہیں
آپ
اور
کیسی
جا
رہی
ہے
۔


In [42]:
doc = nlp('gimme double cheese extra large healthy pizza')

In [45]:
tokens = [token.text for token in doc]

In [46]:
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [50]:
from spacy.symbols import ORTH
nlp.tokenizer.add_special_case('gimme', [
    {ORTH: 'gim'},
    {ORTH: 'me'}
])

doc = nlp('gimme double cheese extra large healthy pizza')

In [51]:
tokens_1 = [token.text for token in doc]
tokens_1

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [52]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7a98607a6640>

In [53]:
doc = nlp('Dr. Strange loves pav bhaji of karachi. Hulk loves chaat of Khairpur Mirs')

for sentence in doc.sents:
  print(sentence)

Dr.
Strange loves pav bhaji of karachi.
Hulk loves chaat of Khairpur Mirs
