In [3]:
import nltk
#nltk.download()  #<-- Run this if it's your first time using nltk to download all of the datasets and models
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer


from nltk.util import ngrams
from nltk.corpus import stopwords

import pandas as pd
import re # Regular expression library
import string

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [4]:
### sentence tokensization using sentences

my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?"
print(sent_tokenize(my_text))


['Hi Mr. Smith!', 'I’m going to buy some vegetables (tomatoes and cucumbers) from the store.', 'Should I pick up some black-eyed peas as well?']


In [5]:
#### Tokenization using words

my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?"

print(word_tokenize(my_text)) 


['Hi', 'Mr.', 'Smith', '!', 'I', '’', 'm', 'going', 'to', 'buy', 'some', 'vegetables', '(', 'tomatoes', 'and', 'cucumbers', ')', 'from', 'the', 'store', '.', 'Should', 'I', 'pick', 'up', 'some', 'black-eyed', 'peas', 'as', 'well', '?']


In [6]:
### Tokenize from text to sentences to words

my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?"
sentences = sent_tokenize(my_text)

#print (sentences)
my_text_tokens = []

for sentence in sentences:
   my_text_tokens.append(word_tokenize(sentence))  
print (my_text_tokens)



[['Hi', 'Mr.', 'Smith', '!'], ['I', '’', 'm', 'going', 'to', 'buy', 'some', 'vegetables', '(', 'tomatoes', 'and', 'cucumbers', ')', 'from', 'the', 'store', '.'], ['Should', 'I', 'pick', 'up', 'some', 'black-eyed', 'peas', 'as', 'well', '?']]


In [7]:
#### Tokenization in bigrams

my_text = "European authorities fined Google a record 5.1 billion on Wednesday for abusing its power in the mobile phone market"


bigram_mytext = list(ngrams(word_tokenize(my_text),2))
print (bigram_mytext)

[('European', 'authorities'), ('authorities', 'fined'), ('fined', 'Google'), ('Google', 'a'), ('a', 'record'), ('record', '5.1'), ('5.1', 'billion'), ('billion', 'on'), ('on', 'Wednesday'), ('Wednesday', 'for'), ('for', 'abusing'), ('abusing', 'its'), ('its', 'power'), ('power', 'in'), ('in', 'the'), ('the', 'mobile'), ('mobile', 'phone'), ('phone', 'market')]


In [6]:

# tokenization in bigrams
my_words = word_tokenize(my_text) # This is the list of all words
twograms = list(ngrams(my_words,2)) # This is for two-word combos, but can pick any nprint(twograms)
print (twograms)

[('European', 'authorities'), ('authorities', 'fined'), ('fined', 'Google'), ('Google', 'a'), ('a', 'record'), ('record', '5.1'), ('5.1', 'billion'), ('billion', 'on'), ('on', 'Wednesday'), ('Wednesday', 'for'), ('for', 'abusing'), ('abusing', 'its'), ('its', 'power'), ('power', 'in'), ('in', 'the'), ('the', 'mobile'), ('mobile', 'phone'), ('phone', 'market')]


In [7]:
# Use regular expression to tokenize using white space

whitespace_tokenizer = RegexpTokenizer("\s+", gaps=True)
print(whitespace_tokenizer.tokenize(my_text))

['European', 'authorities', 'fined', 'Google', 'a', 'record', '5.1', 'billion', 'on', 'Wednesday', 'for', 'abusing', 'its', 'power', 'in', 'the', 'mobile', 'phone', 'market']


In [8]:

# RegexpTokenizer to match only capitalized words
cap_tokenizer = RegexpTokenizer("[A-Z]['\w]+")
print(cap_tokenizer.tokenize(my_text))

['European', 'Google', 'Wednesday']


In [9]:

# Replace punctuations with a white space
my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?"
clean_text = re.sub('[%s]' % re.escape(string.punctuation), ' ', my_text)
clean_text2 = re.sub("[.,\/#!$%\^&\*;:?{}=\-_`~()]", " ",  my_text)
#replace(/[.,\/#!$%\^&\*;:{}=\-_`~()]/g,"")

print("my_text :", my_text)
print("clean_text :",clean_text)
print("clean_text ;",clean_text2)

my_text : Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from the store. Should I pick up some black-eyed peas as well?
clean_text : Hi Mr  Smith  I’m going to buy some vegetables  tomatoes and cucumbers  from the store  Should I pick up some black eyed peas as well 
clean_text ; Hi Mr  Smith  I’m going to buy some vegetables  tomatoes and cucumbers  from the store  Should I pick up some black eyed peas as well 


In [10]:
# Change to lower case
clean_text = clean_text.lower()
clean_text

'hi mr  smith  i’m going to buy some vegetables  tomatoes and cucumbers  from the store  should i pick up some black eyed peas as well '

In [11]:
# Removes all words containing digits
clean_text = re.sub('\w*\d\w*', ' ', clean_text)
clean_text

'hi mr  smith  i’m going to buy some vegetables  tomatoes and cucumbers  from the store  should i pick up some black eyed peas as well '

In [12]:
# Get stop words from NLTK
my_stopwords = set(stopwords.words('english'))
my_stopwords.add('OMG')
my_stopwords.remove('because')
print (my_stopwords)



{"needn't", "aren't", 'the', 'she', 'just', "won't", 'hadn', "hasn't", "should've", 'that', 'did', 'it', 'below', 'for', 'not', 'nor', 'isn', 'this', 'which', 'wouldn', "that'll", 'haven', 'same', 'more', 'needn', 'before', 'has', 'didn', 'of', 'yours', 'but', 'your', 'had', 'what', 'you', 'down', 'during', 'his', 'on', "mustn't", "hadn't", 'only', "haven't", 'as', 'few', 'him', 'most', "shouldn't", 'me', 'a', "mightn't", 'over', 'in', 'an', 'hers', 'after', 'being', 'under', 'own', 'each', "wouldn't", 'been', 'again', 'so', 'they', 'wasn', 'll', 's', 'then', "wasn't", 'into', 'is', 'doesn', 'o', 'its', 'aren', 'above', 'can', 'these', 'OMG', 'too', 'doing', 'will', 'through', 'until', 'mightn', 'we', 'no', "didn't", 'to', "shan't", 'at', 'their', 'when', 'whom', 'very', 'ours', 'themselves', 'up', "doesn't", 'having', 'don', 'yourselves', 'if', 'here', 'd', 'won', 'couldn', 'by', 'or', 'there', 'who', 'from', 'be', "you've", "you'd", 'any', 'both', 'himself', 'does', "weren't", 'other

In [13]:
## danger of mindlessly removing stopwords

text = 'the product is not good'
stop_words = set(stopwords.words('english')) 
#stop_words.remove('not')
word_tokens = word_tokenize(text) 
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
 
print("Original word tokens: " , word_tokens) 
print("Filtered word tokens: " , filtered_sentence) 

# To resolve this above problem, you may have to remove the stop word "NOT" from the list

Original word tokens:  ['the', 'product', 'is', 'not', 'good']
Filtered word tokens:  ['product', 'good']


In [14]:
## Stemming
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()
# Try some stems
print('drive: {}'.format(stemmer.stem('drive')))
print('drives: {}'.format(stemmer.stem('drives')))
print('driver: {}'.format(stemmer.stem('driver')))
print('drivers: {}'.format(stemmer.stem('drivers')))
print('driven: {}'.format(stemmer.stem('driven')))


drive: driv
drives: driv
driver: driv
drivers: driv
driven: driv


In [15]:
# Part of speech
#nltk.download('tagsets')
from nltk.tag import pos_tag
my_text = "James Smith lives in the United States."
tokens = pos_tag(word_tokenize(my_text))
print(tokens)


[('James', 'NNP'), ('Smith', 'NNP'), ('lives', 'VBZ'), ('in', 'IN'), ('the', 'DT'), ('United', 'NNP'), ('States', 'NNPS'), ('.', '.')]


In [16]:
# If you need to know what the tag means
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [17]:
# Complex words

from nltk.tokenize import MWETokenizer # multi-word expression
my_text = "You all are the greatest students of all time."
mwe_tokenizer = MWETokenizer([('You','all'), ('of', 'all', 'time')])
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))
mwe_tokens


['You_all', 'are', 'the', 'greatest', 'students', 'of_all_time', '.']

In [18]:
import nltk
from nltk.corpus import stopwords

set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [19]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english')) 

word_tokens = word_tokenize(example_sent) 
filtered_sentence = [w for w in word_tokens if not w in stop_words] 
 
print("Original word tokens: " , word_tokens) 
print("Filtered word tokens: " , filtered_sentence) 


Original word tokens:  ['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
Filtered word tokens:  ['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


In [26]:
from nltk.corpus import stopwords
my_stopwords = set(stopwords.words('english'))
print (my_stopwords)

{"needn't", "aren't", 'the', 'she', 'just', "won't", 'hadn', "hasn't", "should've", 'that', 'did', 'it', 'below', 'for', 'not', 'nor', 'isn', 'this', 'which', 'wouldn', "that'll", 'haven', 'same', 'more', 'needn', 'before', 'has', 'didn', 'of', 'yours', 'but', 'your', 'had', 'what', 'you', 'down', 'during', 'his', 'on', "mustn't", "hadn't", 'only', "haven't", 'as', 'few', 'him', 'most', "shouldn't", 'me', 'a', "mightn't", 'over', 'in', 'an', 'hers', 'after', 'being', 'under', 'own', 'each', "wouldn't", 'been', 'again', 'so', 'they', 'wasn', 'because', 'll', 's', 'then', "wasn't", 'into', 'is', 'doesn', 'o', 'its', 'aren', 'above', 'can', 'these', 'too', 'doing', 'will', 'through', 'until', 'mightn', 'we', 'no', "didn't", 'to', "shan't", 'at', 'their', 'when', 'whom', 'very', 'ours', 'themselves', 'up', "doesn't", 'having', 'don', 'yourselves', 'if', 'here', 'd', 'won', 'couldn', 'by', 'or', 'there', 'who', 'from', 'be', "you've", "you'd", 'any', 'both', 'himself', 'does', "weren't", 'o

In [25]:
from nltk.corpus import stopwords
my_stopwords = set(stopwords.words('english'))
my_stopwords.add('OMG')
print (my_stopwords)


{"needn't", "aren't", 'the', 'she', 'just', "won't", 'hadn', "hasn't", "should've", 'that', 'did', 'it', 'below', 'for', 'not', 'nor', 'isn', 'this', 'which', 'wouldn', "that'll", 'haven', 'same', 'more', 'needn', 'before', 'has', 'didn', 'of', 'yours', 'but', 'your', 'had', 'what', 'you', 'down', 'during', 'his', 'on', "mustn't", "hadn't", 'only', "haven't", 'as', 'few', 'him', 'most', "shouldn't", 'me', 'a', "mightn't", 'over', 'in', 'an', 'hers', 'after', 'being', 'under', 'own', 'each', "wouldn't", 'been', 'again', 'so', 'they', 'wasn', 'because', 'll', 's', 'then', "wasn't", 'into', 'is', 'doesn', 'o', 'its', 'aren', 'above', 'can', 'these', 'OMG', 'too', 'doing', 'will', 'through', 'until', 'mightn', 'we', 'no', "didn't", 'to', "shan't", 'at', 'their', 'when', 'whom', 'very', 'ours', 'themselves', 'up', "doesn't", 'having', 'don', 'yourselves', 'if', 'here', 'd', 'won', 'couldn', 'by', 'or', 'there', 'who', 'from', 'be', "you've", "you'd", 'any', 'both', 'himself', 'does', "weren

In [27]:
from nltk import pos_tag
from nltk.stem import  WordNetLemmatizer

lemmatiser = WordNetLemmatizer()


words = ["car", "cars", "care", "caring", "careful", "boats", "boating"]



In [28]:
for word in words:
   print("Lemmatise %s  --> %s" % (word, lemmatiser.lemmatize(word)))

Lemmatise car  --> car
Lemmatise cars  --> car
Lemmatise care  --> care
Lemmatise caring  --> caring
Lemmatise careful  --> careful
Lemmatise boats  --> boat
Lemmatise boating  --> boating


In [29]:

for word in words:
   print("Lemmatise %s  --> %s" % (word, lemmatiser.lemmatize(word, pos="n")))

Lemmatise car  --> car
Lemmatise cars  --> car
Lemmatise care  --> care
Lemmatise caring  --> caring
Lemmatise careful  --> careful
Lemmatise boats  --> boat
Lemmatise boating  --> boating


In [30]:
for word in words:
   print("Lemmatise %s --> %s" % (word, lemmatiser.lemmatize(word, pos="v")))

Lemmatise car --> car
Lemmatise cars --> cars
Lemmatise care --> care
Lemmatise caring --> care
Lemmatise careful --> careful
Lemmatise boats --> boat
Lemmatise boating --> boat


In [31]:
# import these modules 
from nltk.stem import WordNetLemmatizer 

lemmatizer = WordNetLemmatizer() 

print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora")) 

# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a")) 
print("best :", lemmatizer.lemmatize("best", pos ="a"))

print("better :", lemmatizer.lemmatize("better", pos ="a")) 
print("best :", lemmatizer.lemmatize("best", pos ="a"))

print("transport :", lemmatizer.lemmatize("transported", pos ="v")) 
print("transport :", lemmatizer.lemmatize("transportation", pos ="v"))

rocks : rock
corpora : corpus
better : good
best : best
better : good
best : best
transport : transport
transport : transportation


In [32]:
# Reference: https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
    
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

text = "European authorities fined Google a record 5.1 billion on Wednesday for abusing its power in the mobile phone market"


tokenized_text = nltk.word_tokenize(text)
sent = nltk.pos_tag(tokenized_text)

for s in sent:
    print (s)



('European', 'JJ')
('authorities', 'NNS')
('fined', 'VBD')
('Google', 'NNP')
('a', 'DT')
('record', 'NN')
('5.1', 'CD')
('billion', 'CD')
('on', 'IN')
('Wednesday', 'NNP')
('for', 'IN')
('abusing', 'VBG')
('its', 'PRP$')
('power', 'NN')
('in', 'IN')
('the', 'DT')
('mobile', 'JJ')
('phone', 'NN')
('market', 'NN')


In [33]:
print (tokenized_text)

['European', 'authorities', 'fined', 'Google', 'a', 'record', '5.1', 'billion', 'on', 'Wednesday', 'for', 'abusing', 'its', 'power', 'in', 'the', 'mobile', 'phone', 'market']


In [34]:
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN')]

In [35]:
# Example on Extracting Compound Words

from nltk.tokenize import MWETokenizer

text = "I am interested in data science and artificial intelligence"
mwe_tokenizer = MWETokenizer([("artificial","intelligence"), ("data","science")], separator='_')

word_tokens = word_tokenize(text)
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(text))

print ("Word tokens -> ", word_tokens)
print("Compound words tokens -> ", mwe_tokens)

Word tokens ->  ['I', 'am', 'interested', 'in', 'data', 'science', 'and', 'artificial', 'intelligence']
Compound words tokens ->  ['I', 'am', 'interested', 'in', 'data_science', 'and', 'artificial_intelligence']


In [36]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [37]:
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [38]:
for tag in ['NNP', 'VBZ', 'DT']:
    print(nltk.help.upenn_tagset(tag))

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
None
VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...
None
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
None
