In [3]:
# Sentence tokenization with NLTK.
# If nltk is missing from your environment, install it first:
# !pip install nltk
from nltk.tokenize import sent_tokenize
# Two sentences in one string; adjacent literals are concatenated by Python.
text = (
    "All work and no play makes jack dull boy. "
    "All work and no play makes jack a dull boy."
)
print(sent_tokenize(text))

['All work and no play makes jack dull boy.', 'All work and no play makes jack a dull boy.']


In [1]:
# Sample tweet-like text reused by every tokenizer cell. It deliberately mixes
# an abbreviation, hyphenated words, @handles, #hashtags, an e-mail address
# and an emoticon. Adjacent string literals are concatenated by Python.
sentence = (
    'Sunil tweeted at U.S.A., "Witnessing 70th '
    'Republic Day of India from Rajpath, '
    'New Delhi. Mesmerizing performance by Indian Army! '
    'Exciting to see the state-of-the-art weapons! '
    'Awesome airshow! @india_official '
    '@indian_army #India #70thRepublic_Day. '
    'For more photos ping me at e-mail email sunil@photoking.com :)'
)

In [2]:
# Baseline tokenizer: cut the text on runs of whitespace and nothing else,
# so punctuation stays glued to the neighbouring word.
whitespace_tokens = sentence.split()
print(whitespace_tokens)

['Sunil', 'tweeted', 'at', 'U.S.A.,', '"Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath,', 'New', 'Delhi.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army!', 'Exciting', 'to', 'see', 'the', 'state-of-the-art', 'weapons!', 'Awesome', 'airshow!', '@india_official', '@indian_army', '#India', '#70thRepublic_Day.', 'For', 'more', 'photos', 'ping', 'me', 'at', 'e-mail', 'email', 'sunil@photoking.com', ':)']


In [7]:
# using nltk's tokenizer
# Compared with plain whitespace splitting, the recorded output shows punctuation
# emitted as separate tokens; '@' and '#' are also detached from handles and
# hashtags, and the e-mail address is broken at '@'.
from nltk.tokenize import word_tokenize
words = word_tokenize(sentence)  # `sentence` is the sample text defined in an earlier cell
print(words)

['Sunil', 'tweeted', 'at', 'U.S.A.', ',', '``', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', ',', 'New', 'Delhi', '.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', '!', 'Exciting', 'to', 'see', 'the', 'state-of-the-art', 'weapons', '!', 'Awesome', 'airshow', '!', '@', 'india_official', '@', 'indian_army', '#', 'India', '#', '70thRepublic_Day', '.', 'For', 'more', 'photos', 'ping', 'me', 'at', 'e-mail', 'email', 'sunil', '@', 'photoking.com', ':', ')']


In [8]:
# NLTK's tweet-oriented tokenizer: in the recorded output it keeps @handles,
# #hashtags, the e-mail address and the ':)' emoticon intact, but breaks the
# abbreviation U.S.A. into single letters and periods.
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
print(tweet_tokenizer.tokenize(sentence))

['Sunil', 'tweeted', 'at', 'U', '.', 'S', '.', 'A', '.', ',', '"', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', ',', 'New', 'Delhi', '.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', '!', 'Exciting', 'to', 'see', 'the', 'state-of-the-art', 'weapons', '!', 'Awesome', 'airshow', '!', '@india_official', '@indian_army', '#India', '#70thRepublic_Day', '.', 'For', 'more', 'photos', 'ping', 'me', 'at', 'e-mail', 'email', 'sunil@photoking.com', ':)']


In [9]:
# using textblob's tokenizer
# In the recorded output, blob.words contains no standalone punctuation tokens:
# the '@'/'#' prefixes and the ':)' emoticon are gone, and 'U.S.A.' loses its
# final period.
from textblob import TextBlob
blob = TextBlob(sentence)  # `sentence` is the sample text defined in an earlier cell
print(blob.words)

['Sunil', 'tweeted', 'at', 'U.S.A', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', 'New', 'Delhi', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', 'Exciting', 'to', 'see', 'the', 'state-of-the-art', 'weapons', 'Awesome', 'airshow', 'india_official', 'indian_army', 'India', '70thRepublic_Day', 'For', 'more', 'photos', 'ping', 'me', 'at', 'e-mail', 'email', 'sunil', 'photoking.com']


In [10]:
# using spacy's tokenizer
import spacy
# Load the English pipeline by its package name. The old 'en' shortcut link
# was removed in spaCy v3 and raises OSError there; the full name works on
# both v2 and v3.
# Install the model once with: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence)
# Iterating a Doc yields its Token objects, which print as their text.
print(list(doc))

[Sunil, tweeted, at, U.S.A., ,, ", Witnessing, 70th, Republic, Day, of, India, from, Rajpath, ,, New, Delhi, ., Mesmerizing, performance, by, Indian, Army, !, Exciting, to, see, the, state, -, of, -, the, -, art, weapons, !, Awesome, airshow, !, @india_official, @indian_army, #, India, #, 70thRepublic_Day, ., For, more, photos, ping, me, at, e, -, mail, email, sunil@photoking.com, :)]


Many other tokenizers exist, and there is no clear winner. Use the one that best meets your requirements!
* **MWE tokenizer**: MWE stands for Multi-Word Expression. Here, certain groups of multiple words are treated as one entity during tokenization, such as "United States of America," "People's Republic of China," "not only", etc.
* **Regular expression tokenizer**: These tokenizers are developed using regular expressions. Sentences are split based on the occurrence of a particular pattern.
* **WordPunct tokenizer**: This splits a text into runs of alphanumeric characters and runs of non-alphanumeric (punctuation) characters.

In [11]:
# Multi-word expression tokenizer: registered word sequences are merged into
# a single token (see 'Republic_Day' in the output).
from nltk.tokenize import MWETokenizer
mwe_tokenizer = MWETokenizer([('Republic', 'Day')])
mwe_tokenizer.add_mwe(('Indian', 'Army'))  # more expressions can be added after construction
# The MWE tokenizer works on an already-tokenized list, here a whitespace split.
split_tokens = sentence.split()
print(mwe_tokenizer.tokenize(split_tokens))
# Exercise: 'Indian Army' was registered as an MWE but is not merged in the
# output. Why? How would you fix it?

['Sunil', 'tweeted', 'at', 'U.S.A.,', '"Witnessing', '70th', 'Republic_Day', 'of', 'India', 'from', 'Rajpath,', 'New', 'Delhi.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army!', 'Exciting', 'to', 'see', 'the', 'state-of-the-art', 'weapons!', 'Awesome', 'airshow!', '@india_official', '@indian_army', '#India', '#70thRepublic_Day.', 'For', 'more', 'photos', 'ping', 'me', 'at', 'e-mail', 'email', 'sunil@photoking.com', ':)']


In [12]:
# WordPunct tokenizer applied to the raw sample text.
from nltk.tokenize import WordPunctTokenizer
wp_tokenizer = WordPunctTokenizer()
wp_tokens = wp_tokenizer.tokenize(sentence)
print(wp_tokens)
# As with some tokenizers above, the abbreviation U.S.A. is not handled
# appropriately: it is broken into single letters and periods.

['Sunil', 'tweeted', 'at', 'U', '.', 'S', '.', 'A', '.,', '"', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', ',', 'New', 'Delhi', '.', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', '!', 'Exciting', 'to', 'see', 'the', 'state', '-', 'of', '-', 'the', '-', 'art', 'weapons', '!', 'Awesome', 'airshow', '!', '@', 'india_official', '@', 'indian_army', '#', 'India', '#', '70thRepublic_Day', '.', 'For', 'more', 'photos', 'ping', 'me', 'at', 'e', '-', 'mail', 'email', 'sunil', '@', 'photoking', '.', 'com', ':)']


In [13]:
# regex tokenizer
# pro: you can write your own regex to define the rule to tokenize
from nltk import regexp_tokenize
# Custom rule set; pass it as the second argument of regexp_tokenize in place
# of r'\w+' to try it.
# (?x) enables verbose mode so the layout whitespace and the comments inside
# the pattern are ignored; without it every alternative would require a
# literal newline and match nothing. The groups are non-capturing so each
# match comes back as a plain string rather than a tuple of groups.
pattern = r'''(?x)
  (?:[A-Z]\.)+          # abbreviations such as U.S.A.
| \w+(?:-\w+)*          # words, optionally with internal hyphens
| \$?\d+(?:\.\d+)?%?    # currency amounts such as $12.40, optional trailing %
| \.\.\.                # ellipsis
| [][.,'"?():_`-]       # single punctuation marks; the hyphen is last so it is literal
'''
# The output below comes from the naive \w+ rule.
# mistakes/undesirable splits: USA, e-mail
print(regexp_tokenize(sentence,r'\w+'))

['Sunil', 'tweeted', 'at', 'U', 'S', 'A', 'Witnessing', '70th', 'Republic', 'Day', 'of', 'India', 'from', 'Rajpath', 'New', 'Delhi', 'Mesmerizing', 'performance', 'by', 'Indian', 'Army', 'Exciting', 'to', 'see', 'the', 'state', 'of', 'the', 'art', 'weapons', 'Awesome', 'airshow', 'india_official', 'indian_army', 'India', '70thRepublic_Day', 'For', 'more', 'photos', 'ping', 'me', 'at', 'e', 'mail', 'email', 'sunil', 'photoking', 'com']
