In [5]:
import nltk

# Fetch the "punkt" package (the log shows it unzipping under tokenizers/).
# The call's return value is left as the cell's displayed result.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Character Tokenizer

In [6]:
# Character-level tokenization: a str is already an iterable of characters,
# so list() splits it directly (the space is kept as its own token).
text = "Hello World"
lst = list(text)
print(lst)

['H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd']


## Word Tokenizer

In [None]:
from nltk.tokenize import word_tokenize

In [7]:
# word_tokenize emits punctuation and clitics ("'re", "'s") as separate tokens,
# as the recorded output shows.
text = "hello there! You're welcome to N.L.P's and San Francisco."
word_tokens = word_tokenize(text)
print(word_tokens)

['hello', 'there', '!', 'You', "'re", 'welcome', 'to', 'N.L.P', "'s", 'and', 'San', 'Francisco', '.']


## Sentence Tokenizer

In [8]:
from nltk.tokenize import sent_tokenize

In [9]:
# Sentence splitting: in the recorded output the periods inside "U.S.A." do not
# start a new sentence.
text = "hello there! You're welcome to U.S.A. and San Francisco. This is the best!"
sentences = sent_tokenize(text)
print(sentences)

['hello there!', "You're welcome to U.S.A. and San Francisco.", 'This is the best!']


## Whitespace Tokenizer

In [10]:
from nltk.tokenize import WhitespaceTokenizer

In [11]:
# Splits on runs of spaces, tabs and newlines; punctuation stays attached to
# the neighbouring word (e.g. 'there!', 'U.S.A.').
wt = WhitespaceTokenizer()
text = "hello there! \nYou're welcome to U.S.A. and San Francisco. \tThis is   the best!"
whitespace_tokens = wt.tokenize(text)
print(whitespace_tokens)

['hello', 'there!', "You're", 'welcome', 'to', 'U.S.A.', 'and', 'San', 'Francisco.', 'This', 'is', 'the', 'best!']


## Word Punctuation Tokenizer

In [12]:
from nltk.tokenize import WordPunctTokenizer

In [13]:
# Separates alphabetic runs from punctuation, so "U.S.A." comes out as
# 'U', '.', 'S', '.', 'A', '.' and "You're" as 'You', "'", 're'.
wt = WordPunctTokenizer()
text = "hello there! \nYou're welcome to U.S.A. and San Francisco. \tThis is   the best!"
wordpunct_tokens = wt.tokenize(text)
print(wordpunct_tokens)

['hello', 'there', '!', 'You', "'", 're', 'welcome', 'to', 'U', '.', 'S', '.', 'A', '.', 'and', 'San', 'Francisco', '.', 'This', 'is', 'the', 'best', '!']


In [14]:
import re

In [15]:
text = "hello-hi-dhanya,how are you"
# Collapse every run of non-word characters into a single space.
# The pattern is a raw string: "\W" in a plain literal is an invalid escape
# sequence and triggers a warning on current Python versions.
re.sub(r"\W+", " ", text)

'hello hi dhanya how are you'

## RegEx Tokenizer

In [16]:
from nltk.tokenize import RegexpTokenizer

In [17]:
# Keep only runs of word characters; hyphens, commas and spaces act as separators.
wt = RegexpTokenizer(r"\w+")
text = "hello-hi-dhanya,how are you"
regex_tokens = wt.tokenize(text)
print(regex_tokens)

['hello', 'hi', 'dhanya', 'how', 'are', 'you']


In [None]:
# Print the tokens space-separated (print's default separator is a single space).
print(*wt.tokenize(text))

hello hi dhanya how are you


In [18]:
import pandas as pd
from nltk.tokenize import word_tokenize

df = pd.DataFrame({
    'Phrases': [
        'The greatest glory in living lies not in never falling, but in rising every time we fall.',
        'The way to get started is to quit talking and begin doing.',
        'If life were predictable it would cease to be life, and be without flavor.',
        "If you set your goals ridiculously high and it's a failure, you will fail above everyone else's success.",
    ]
})
# Tokenize the one text column directly with the function imported above;
# a row-wise DataFrame.apply(axis=1) is not needed for a single column.
df['tokenized'] = df['Phrases'].apply(word_tokenize)
df.head()

Unnamed: 0,Phrases,tokenized
0,The greatest glory in living lies not in never...,"[The, greatest, glory, in, living, lies, not, ..."
1,The way to get started is to quit talking and ...,"[The, way, to, get, started, is, to, quit, tal..."
2,If life were predictable it would cease to be ...,"[If, life, were, predictable, it, would, cease..."
3,If you set your goals ridiculously high and it...,"[If, you, set, your, goals, ridiculously, high..."


In [19]:
text = "This is the first line of text.\nThis is the second line of text."
print(text)
# Split the same string on a newline, a tab, the character 's', and finally on
# any run of whitespace (sep=None is what a bare text.split() uses).
for sep in ('\n', '\t', 's', None):
    print(text.split(sep))
# For comparison: word_tokenize also separates the trailing periods.
print(word_tokenize(text))

This is the first line of text.
This is the second line of text.
['This is the first line of text.', 'This is the second line of text.']
['This is the first line of text.\nThis is the second line of text.']
['Thi', ' i', ' the fir', 't line of text.\nThi', ' i', ' the ', 'econd line of text.']
['This', 'is', 'the', 'first', 'line', 'of', 'text.', 'This', 'is', 'the', 'second', 'line', 'of', 'text.']
['This', 'is', 'the', 'first', 'line', 'of', 'text', '.', 'This', 'is', 'the', 'second', 'line', 'of', 'text', '.']


## Line Tokenizer

In [None]:
from nltk.tokenize import LineTokenizer

In [None]:
# One token per line; spaces around each line are kept (' Hi this ').
wt = LineTokenizer()
text = "hello world.\n Hi this \nis dhanya"
line_tokens = wt.tokenize(text)
print(line_tokens)

['hello world.', ' Hi this ', 'is dhanya']


## Tweet Tokenizer

In [None]:
from nltk.tokenize import TweetTokenizer

In [None]:
# The emoticon ':)' and the hashtag '#cool' each stay a single token.
wt = TweetTokenizer()
text = "hello world.\n Hi this \nis soooo coooool Damn:) #cool"
tweet_tokens = wt.tokenize(text)
print(tweet_tokens)

['hello', 'world', '.', 'Hi', 'this', 'is', 'soooo', 'coooool', 'Damn', ':)', '#cool']


## Space Tokenizer

In [None]:
from nltk.tokenize import SpaceTokenizer

In [None]:
# Splits on the single space character only: newlines remain inside tokens and
# consecutive spaces produce empty-string tokens.
wt = SpaceTokenizer()
text = "hello world.\n Hi this \nis so    cool !"
space_tokens = wt.tokenize(text)
print(space_tokens)

['hello', 'world.\n', 'Hi', 'this', '\nis', 'so', '', '', '', 'cool', '!']


## spaCy Tokenizer

In [None]:
import spacy
# Load the small English pipeline; `nlp` is the callable used by the cells below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Tokenize with the loaded pipeline and list one token per line.
doc = nlp("Hello world! Welcome to U.S.A")
print(*(tok.text for tok in doc), sep="\n")

Hello
world
!
Welcome
to
U.S.A


In [None]:
# The contraction is split into two tokens ("Do" and "n't").
doc = nlp("Don't")
print(*(tok.text for tok in doc), sep="\n")

Do
n't


In [None]:
# "Let's" splits into "Let" + "'s", while "U.S.A" stays one token.
doc = nlp("Let's go to U.S.A")
print(*(tok.text for tok in doc), sep="\n")

Let
's
go
to
U.S.A


In [None]:
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
# Construction 1: a Tokenizer given only the English vocab, with no
# prefix/suffix/infix or exception rules passed in — "Let's" stays whole.
tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer("Let's go to N.Y.")
print("Blank tokenizer", end=" : ")
print(*tokens, sep=', ', end=', ')

Blank tokenizer : Let's, go, to, N.Y., 

In [None]:
# Construction 2: reuse the tokenizer that English() is created with, which
# does split "Let's" into "Let" + "'s".
from spacy.lang.en import English

nlp = English()
tokenizer = nlp.tokenizer
tokens = tokenizer("Let's go to N.Y.")
print("\nDefault tokenizer", end=' : ')
print(*tokens, sep=', ', end=', ')


Default tokenizer : Let, 's, go, to, N.Y., 

In [None]:
from spacy.symbols import ORTH
# Rebind `nlp` to the en_core_web_sm pipeline (the previous cells set it to a blank English()).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Baseline before adding a special case: "ViratKohli" is a single token.
doc = nlp("ViratKohli is the world's best batsman")
print(*doc, sep=', ', end=', ')

ViratKohli, is, the, world, 's, best, batsman, 

In [None]:
# Register a special-case rule that splits the exact string "ViratKohli" into
# two tokens, then re-tokenize the same sentence.
special_case = [{ORTH: part} for part in ("Virat", "Kohli")]
nlp.tokenizer.add_special_case("ViratKohli", special_case)
doc = nlp("ViratKohli is the world's best batsman")
print(*doc, sep=', ', end=', ')

Virat, Kohli, is, the, world, 's, best, batsman, 

In [None]:
# Special-case rule mapping the contraction "gimme" to the tokens "give" and "me".
# NOTE(review): these ORTH values do not concatenate back to "gimme"; newer
# spaCy releases reportedly reject rules that change the text — confirm
# against the installed version.
special_case = [{ORTH: part} for part in ("give", "me")]
nlp.tokenizer.add_special_case("gimme", special_case)
doc = nlp("Please gimme the book")
print(*doc, sep=', ', end=', ')

Please, give, me, the, book, 

In [None]:
nlp = English()
text = "Let's don't move to L.A."
doc = nlp(text)
# Each explain() entry is indexed as (rule label, token text); print the token
# text first, then the rule that produced it.
tok_exp = nlp.tokenizer.explain(text)
for rule, piece in tok_exp:
    print(piece, "\t", rule)

Let 	 SPECIAL-1
's 	 SPECIAL-2
do 	 SPECIAL-1
n't 	 SPECIAL-2
move 	 TOKEN
to 	 TOKEN
L.A. 	 TOKEN


In [None]:
# Build the Doc in this cell so the printed text does not depend on which cell
# last rebound `doc`; the recorded output came from a different `doc` than the
# one the preceding cell creates.
doc = nlp("I'm having a good time!!!")
print(doc)

I'm having a good time!!!


In [None]:
# Each trailing "!" becomes its own token.
doc = nlp("I'm having a good time!!!")
print(*(tok.text for tok in doc), sep=' | ', end=' | ')

I | 'm | having | a | good | time | ! | ! | ! | 

In [None]:
# The e-mail address and the URL each survive as a single token.
doc1 = nlp("We're here to help! send mail to abc@gmail.com or visit www.abc.com")
print(*(tok.text for tok in doc1), sep=' | ', end=' | ')

We | 're | here | to | help | ! | send | mail | to | abc@gmail.com | or | visit | www.abc.com | 

In [None]:
# "5km" splits into "5" + "km" and "$10.00" into "$" + "10.00".
doc2 = nlp("A 5km ola cab ride costs $10.00")
print(*(tok.text for tok in doc2), sep=' | ', end=' | ')

A | 5 | km | ola | cab | ride | costs | $ | 10.00 | 

In [None]:
# Slice of the Doc: the tokens at positions 2, 3 and 4.
doc2[2:5]

km ola cab

In [None]:
# Negative slice: the last four tokens of doc2.
doc2[-4:]

ride costs $10.00

In [None]:
doc3 = nlp("my dinner was horrible")
doc4 = nlp("my dinner was good")
# Assigning to a Doc index raises TypeError. The error is the point of this
# cell, so catch and display it instead of letting it abort a Run All.
try:
    doc3[3] = doc4[3]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: ignored

In [None]:
# Rebind `nlp` to the en_core_web_sm pipeline: an earlier cell set it to a blank
# English(), and the cells below read doc.ents and doc.noun_chunks.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Token view of the sentence used for the entity and dependency examples below.
doc5 = nlp("Apple to build a Hong Kong factory for $6 million")
print(*(tok.text for tok in doc5), sep=' | ', end=' | ')

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 

In [None]:
# One line per named entity: text, label, and spaCy's description of the label.
for entity in doc5.ents:
    label = entity.label_
    print(f"{entity.text}-{label}-{spacy.explain(label)}")

Apple-ORG-Companies, agencies, institutions, etc.
Hong Kong-GPE-Countries, cities, states
$6 million-MONEY-Monetary values, including unit


In [None]:
# Same listing for a sentence with a lowercase "apple"; the recorded run
# printed nothing, i.e. no entities were found.
doc6 = nlp("This apple looks delicious")
for entity in doc6.ents:
    label = entity.label_
    print(f"{entity.text}-{label}-{spacy.explain(label)}")

In [None]:
# List the noun chunks; in the recorded run the whole phrase is one chunk.
doc7 = nlp("autonomous cars insurance liability ")
for chunk in doc7.noun_chunks:
    print(chunk)

autonomous cars insurance liability


In [None]:
from spacy import displacy

# Render the dependency view of doc5 inline; 'distance' is forwarded to
# displaCy through the options dict.
displacy.render(
    doc5,
    style='dep',
    jupyter=True,
    options={'distance': 110},
)

In [None]:
# Render the entity view of a new sentence inline.
doc = nlp('Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(
    doc,
    style='ent',
    jupyter=True,
)

In [21]:
# Baseline tokenization before the tokenizer rules are customized below.
# NOTE(review): the recorded NameError means this cell was last run in a
# session where `nlp` was not defined; it needs the earlier spacy.load cell.
text = "I like bananabread breadbanana breadbananabread"
print([t.text for t in nlp(text)])

NameError: ignored

In [20]:
# Put "banana" in front of the stock prefix, suffix and infix patterns.
# tuple() normalizes the Defaults collections first: depending on the spaCy
# version they are tuples or lists, and tuple + list raises TypeError.
prefixes = ("banana",) + tuple(nlp.Defaults.prefixes)
suffixes = ("banana",) + tuple(nlp.Defaults.suffixes)
infixes = ("banana",) + tuple(nlp.Defaults.infixes)

NameError: ignored

In [None]:
# Compile each extended pattern collection into one regex object using
# spaCy's helper functions.
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
infix_regex = spacy.util.compile_infix_regex(infixes)

In [None]:
# Point the live tokenizer at the new regexes. This mutates `nlp.tokenizer`
# in place, so every later nlp(...) call uses the "banana" rules.
nlp.tokenizer.prefix_search = prefix_regex.search
nlp.tokenizer.suffix_search = suffix_regex.search
nlp.tokenizer.infix_finditer = infix_regex.finditer

In [None]:
# Re-tokenize the same `text` with the customized rules: "banana" is now split
# off wherever it occurs.
print([t.text for t in nlp(text)])

['I', 'like', 'banana', 'bread', 'bread', 'banana', 'bread', 'banana', 'bread']


In [None]:
# Baseline before the suffix change: the trailing "[" is split from "hello-".
doc = nlp("hello-[ world] #. :/")
print([t.text for t in doc])

['hello-', '[', 'world', ']', '#', '.', ':/']


In [None]:
# Start again from the stock suffix patterns (so the "banana" suffix added
# earlier is not carried over) and drop the pattern for a literal "[".
suffixes = list(nlp.Defaults.suffixes)
# list.remove raises ValueError if the pattern is not present.
suffixes.remove("\\[")
suffix_regex = spacy.util.compile_suffix_regex(suffixes)
# Install the reduced suffix rules on the live tokenizer.
nlp.tokenizer.suffix_search = suffix_regex.search

In [None]:
# With the "[" suffix pattern removed, "hello-[" is kept as one token.
doc = nlp("hello-[ world] $. :)")
print([t.text for t in doc])

['hello-[', 'world', ']', '$', '.', ':)']
