# Tokenization


In [1]:
sentence = "The capital of China is Beijing"
sentence.split()

['The', 'capital', 'of', 'China', 'is', 'Beijing']

## Issues

In [2]:
sentence = "China's capital is Beijing"
sentence.split()

["China's", 'capital', 'is', 'Beijing']

In [3]:
sentence = "Beijing is where we'll go"
sentence.split()

['Beijing', 'is', 'where', "we'll", 'go']

In [4]:
sentence = "I'm going to travel to Beijing"
sentence.split()

["I'm", 'going', 'to', 'travel', 'to', 'Beijing']

In [5]:
sentence = "Let's travel to Hong Kong from Beijing"
sentence.split()

["Let's", 'travel', 'to', 'Hong', 'Kong', 'from', 'Beijing']

In [6]:
sentence = "A friend is pursuing his M.S from Beijing"
sentence.split()

['A', 'friend', 'is', 'pursuing', 'his', 'M.S', 'from', 'Beijing']

## Types

In [9]:
#Regular expression based tokenizer
from nltk.tokenize import RegexpTokenizer
s = "A Rolex watch costs in the range of $3000.0 - $8000.0 in USA."
tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
tokenizer.tokenize(s)

['A',
 'Rolex',
 'watch',
 'costs',
 'in',
 'the',
 'range',
 'of',
 '$3000.0',
 '-',
 '$8000.0',
 'in',
 'USA',
 '.']

In [10]:
#Treebank Tokenizer
from nltk.tokenize import TreebankWordTokenizer
s = "I'm going to buy a Rolex watch that doesn't cost more than $3000.0"
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(s)

['I',
 "'m",
 'going',
 'to',
 'buy',
 'a',
 'Rolex',
 'watch',
 'that',
 'does',
 "n't",
 'cost',
 'more',
 'than',
 '$',
 '3000.0']

In [11]:
#Tweet Tokenizer
from nltk.tokenize import TweetTokenizer
s = "@amankedia I'm going to buy a Rolexxxxxxxx watch!!! :-D #happiness #rolex <3"
tokenizer = TweetTokenizer()
tokenizer.tokenize(s)

['@amankedia',
 "I'm",
 'going',
 'to',
 'buy',
 'a',
 'Rolexxxxxxxx',
 'watch',
 '!',
 '!',
 '!',
 ':-D',
 '#happiness',
 '#rolex',
 '<3']

In [12]:
from nltk.tokenize import TweetTokenizer
s = "@amankedia I'm going to buy a Rolexxxxxxxx watch!!! :-D #happiness #rolex <3"
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokenizer.tokenize(s)

["I'm",
 'going',
 'to',
 'buy',
 'a',
 'Rolexxx',
 'watch',
 '!',
 '!',
 '!',
 ':-D',
 '#happiness',
 '#rolex',
 '<3']

# Understanding Word Normalization

### Stemming

In [13]:
from nltk.stem.snowball import SnowballStemmer
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [15]:
plurals = ['caresses', 'flies', 'dies', 'mules', 'died', 'agreed', 'owned','humbled', 'sized', 'meeting', 'stating','siezing', 'itemization',
           'traditional', 'reference', 'colonizer', 'plotted','having', 'generously']

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have gener


In [16]:
stemmer2 = SnowballStemmer(language='english')
singles = [stemmer2.stem(plural) for plural in plurals]
print(' '.join(singles))

caress fli die mule die agre own humbl size meet state siez item tradit refer colon plot have generous


## Lemmatization

### WordNet Lemmatizer

In [17]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [18]:
lemmatizer = WordNetLemmatizer()
s = "We are putting in efforts to enhance our understanding of \
Lemmatization"
token_list = s.split()
print("The tokens are: ", token_list)
lemmatized_output = ' '.join([lemmatizer.lemmatize(token) for token \
in token_list])
print("The lemmatized output is: ", lemmatized_output)

The tokens are:  ['We', 'are', 'putting', 'in', 'efforts', 'to', 'enhance', 'our', 'understanding', 'of', 'Lemmatization']
The lemmatized output is:  We are putting in effort to enhance our understanding of Lemmatization


In [19]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [20]:
pos_tags = nltk.pos_tag(token_list)
pos_tags

[('We', 'PRP'),
 ('are', 'VBP'),
 ('putting', 'VBG'),
 ('in', 'IN'),
 ('efforts', 'NNS'),
 ('to', 'TO'),
 ('enhance', 'VB'),
 ('our', 'PRP$'),
 ('understanding', 'NN'),
 ('of', 'IN'),
 ('Lemmatization', 'NN')]

In [21]:
from nltk.corpus import wordnet
##This is a common method which is widely used across the NLP community of practitioners and readers
def get_part_of_speech_tags(token):
    """Maps POS tags to first character lemmatize() accepts.
    We are focusing on Verbs, Nouns, Adjectives and Adverbs here."""
    tag_dict = {"J": wordnet.ADJ,
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "R": wordnet.ADV}
    tag = nltk.pos_tag([token])[0][1][0].upper()
    return tag_dict.get(tag, wordnet.NOUN)

In [23]:
lemmatized_output_with_POS_information = [lemmatizer.lemmatize(token, get_part_of_speech_tags(token)) for token in token_list]

print(' '.join(lemmatized_output_with_POS_information))

We be put in effort to enhance our understand of Lemmatization


In [24]:
#Comparision with Snowball Stemmer
stemmer2 = SnowballStemmer(language='english')
stemmed_sentence = [stemmer2.stem(token) for token in token_list]
print(' '.join(stemmed_sentence))

we are put in effort to enhanc our understand of lemmat


### Spacy lemmatizer

In [25]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [26]:
doc = nlp("We are putting in efforts to enhance our understanding of Lemmatization")

" ".join([token.lemma_ for token in doc])

'we be put in effort to enhance our understanding of lemmatization'

### Stopword Removal

In [27]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [28]:
stop = set(stopwords.words('english'))

", ".join(stop)

"been, haven, each, their, as, will, mightn, that'll, it, don, does, o, what, shouldn, most, into, wouldn, didn't, she, is, you'll, he, such, below, had, under, ll, mightn't, just, mustn't, are, couldn't, won, how, during, weren't, this, shouldn't, down, herself, being, why, haven't, other, over, shan, our, than, do, don't, needn, were, with, until, some, didn, who, hasn, when, hasn't, those, where, ours, himself, mustn, itself, or, now, own, yours, them, wasn, aren, through, no, an, won't, you'd, up, off, y, because, m, wasn't, you're, hadn, ain, needn't, shan't, she's, if, between, from, it's, did, which, has, but, at, before, should've, yourselves, about, that, only, you, his, hadn't, while, out, both, once, after, isn't, myself, a, ma, was, here, and, in, on, aren't, ourselves, should, not, having, for, doesn't, then, theirs, s, him, of, me, have, doing, more, be, my, further, i, so, doesn, few, against, all, by, nor, same, themselves, these, any, hers, yourself, there, whom, your,

In [29]:
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
stop = set(stopwords.words('english'))
sentence = "how are we putting in efforts to enhance our understanding of Lemmatization"

for word in wh_words:
    stop.remove(word)
    sentence_after_stopword_removal = [token for token in sentence.split() if token not in stop]

" ".join(sentence_after_stopword_removal)

'how putting efforts enhance understanding Lemmatization'

### Case Folding

In [30]:
s = "We are putting in efforts to enhance our understanding of Lemmatization"
s = s.lower()
s

'we are putting in efforts to enhance our understanding of lemmatization'

### N-grams

In [32]:
from nltk.util import ngrams
s = "Natural Language Processing is the way to go"
tokens = s.split()
bigrams = list(ngrams(tokens, 2))

[" ".join(token) for token in bigrams]

['Natural Language',
 'Language Processing',
 'Processing is',
 'is the',
 'the way',
 'way to',
 'to go']

In [33]:
s = "Natural Language Processing is the way to go"
tokens = s.split()
trigrams = list(ngrams(tokens, 3))

[" ".join(token) for token in trigrams]

['Natural Language Processing',
 'Language Processing is',
 'Processing is the',
 'is the way',
 'the way to',
 'way to go']

### Taking care of HTML tags

In [34]:
html = "<!DOCTYPE html><html><body><h1>My First Heading</h1><p>My first paragraph.</p></body></html>"
from bs4 import BeautifulSoup
soup = BeautifulSoup(html)
text = soup.get_text()
print(text)

My First HeadingMy first paragraph.
