# Tokenization with the spaCy library

In [1]:
# download the module
!pip install spacy accelerate



In [2]:
# import module
import spacy

### spaCy performs word-level tokenization by default

In [7]:
# Create a blank English pipeline and run it on a sample sentence.
nlp = spacy.blank("en")

sample_sentence = "Dr. Strange loves pav bhaji of Mumbai as it costs only 2$ per plate."
doc = nlp(sample_sentence)

# Iterating over a Doc yields its tokens one at a time.
for token in doc:
    print(token.text)

Dr.
Strange
loves
pav
bhaji
of
Mumbai
as
it
costs
only
2
$
per
plate
.


In [8]:
# Quotation marks, the contraction "'s" and the abbreviation "N.Y." each end up
# as separate tokens; print one token per line.
quoted_text = '''"Let's go to N.Y.!"'''
doc = nlp(quoted_text)
print(*doc, sep="\n")

"
Let
's
go
to
N.Y.
!
"


In [9]:
# Inspect the class of the pipeline object returned by spacy.blank("en").
type(nlp)

spacy.lang.en.English

In [11]:
# Slice the Doc: tokens at index 1 and 2 (the end index is exclusive).
doc[1:3]

Let's

In [13]:
# Keep a single token (index 1 of the Doc) to inspect its attributes below.
token0 = doc[1]
token0

Let

In [18]:
# like_num: does the token look like a number?
token0.like_num

False

In [19]:
# is_alpha: does the token consist of alphabetic characters only?
token0.is_alpha

True

In [22]:
# A second sample containing a spelled-out number and a currency symbol.
doc2 = nlp("Tony gave peter a two $ in cash")

In [24]:
# Print a one-line report of lexical flags for every token in doc2.
# The "is_num" column shows like_num, which is also True for spelled-out
# numbers such as "two".
for token in doc2:
    flags = (
        "is_alpha : ", token.is_alpha,
        "is_num : ", token.like_num,
        "is_currency : ", token.is_currency,
        "is_punct : ", token.is_punct,
    )
    print(token, "-->", "Index:", token.i, *flags)

Tony --> Index: 0 is_alpha :  True is_num :  False is_currency :  False is_punct :  False
gave --> Index: 1 is_alpha :  True is_num :  False is_currency :  False is_punct :  False
peter --> Index: 2 is_alpha :  True is_num :  False is_currency :  False is_punct :  False
a --> Index: 3 is_alpha :  True is_num :  False is_currency :  False is_punct :  False
two --> Index: 4 is_alpha :  True is_num :  True is_currency :  False is_punct :  False
$ --> Index: 5 is_alpha :  False is_num :  False is_currency :  True is_punct :  False
in --> Index: 6 is_alpha :  True is_num :  False is_currency :  False is_punct :  False
cash --> Index: 7 is_alpha :  True is_num :  False is_currency :  False is_punct :  False


In [28]:
# Read the student roster as a list of lines (each entry keeps its trailing newline).
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open("/kaggle/input/studentdata/students.txt", "r", encoding="utf-8") as f:
    text = f.readlines()
print(text)

['Modern Public Academy -- 12th grade student\n', '--------------------------------------------\n', '\n', 'Name        Birthday        email\n', '1. Virat    5 June 1982     virat@kohli.com\n', '2. Maria    12 April 2001   maria@sharapova.com\n', '3. Serena   24 June 1998    serena@williams.com\n', '4. Joe      1 May 1997      joe@root.com\n']


In [29]:
# Join the lines into a single string, using a space as the delimiter.
# The guard makes the cell safe to re-run: joining an already-joined string
# would insert a space between every character.
if not isinstance(text, str):
    text = " ".join(text)
text

'Modern Public Academy -- 12th grade student\n --------------------------------------------\n \n Name        Birthday        email\n 1. Virat    5 June 1982     virat@kohli.com\n 2. Maria    12 April 2001   maria@sharapova.com\n 3. Serena   24 June 1998    serena@williams.com\n 4. Joe      1 May 1997      joe@root.com\n'

In [31]:
# Tokenize the roster and keep the text of every token flagged as an e-mail address.
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [32]:
# Collect the token texts with a list comprehension.
# NOTE: this rebinds `text` (previously the joined file contents) to a Doc.
text = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in text]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [34]:
# Customize the tokenizer with a special-case rule.
from spacy.symbols import ORTH

# Split "gimme" into two tokens; the ORTH pieces concatenate back to the
# original string.
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Re-tokenize the same sentence to see the effect of the new rule.
text = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in text]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [35]:
# Attempt at sentence tokenization: iterating a Doc directly yields its
# individual tokens, not sentences, so each token is printed on its own line.
text = nlp("Dr. Strange loves pav bhaji of Mumbai. Hulk loves chat of Delhi.")
for token in text:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
Mumbai
.
Hulk
loves
chat
of
Delhi
.


In [42]:
# Start again from a fresh blank English pipeline to inspect its components.
# Rebinding `nlp` replaces the earlier pipeline object, so the "gimme" special
# case registered on the previous tokenizer does not carry over.
nlp = spacy.blank("en")

In [43]:
# List the names of the components currently in the pipeline.
nlp.pipe_names

[]

In [45]:
# Add the sentencizer component to the pipeline.
# Guarded with has_pipe so that re-running the cell returns the existing
# component instead of raising because the name is already in the pipeline.
nlp.get_pipe("sentencizer") if nlp.has_pipe("sentencizer") else nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7e0ba63f3680>

In [46]:
# Confirm that the component now appears in the pipeline.
nlp.pipe_names

['sentencizer']