# NLTK

## Sentence and Word Tokenization

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
# Four short sentences used by the tokenization examples; includes an
# abbreviation ("Mr."), a hyphenated word and a contraction.
EXAMPLE_TEXT = (
    "Hello Mr. Smith, how are you doing today? "
    "The weather is great, and Python is awesome. "
    "The sky is pinkish-blue. "
    "You shouldn't eat cardboard."
)

In [3]:
# Sentence tokenization: split EXAMPLE_TEXT into a list of sentence strings.
# In the printed result "Hello Mr. Smith, ..." stays one sentence, i.e. the
# period in "Mr." is not treated as a sentence boundary.
print(sent_tokenize(EXAMPLE_TEXT))

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


In [4]:
# Word tokenization: split EXAMPLE_TEXT into word and punctuation tokens.
# As the printed list shows, punctuation marks become their own tokens and
# the contraction "shouldn't" is split into "should" and "n't".
print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


## Stopwords

In [6]:
from nltk.corpus import stopwords

In [7]:
# Display the English stop-word list as a set. Leaving the expression as the
# last line of the cell lets the notebook render it without print().
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 'd',
 'did',
 'didn',
 'do',
 'does',
 'doesn',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 'has',
 'hasn',
 'have',
 'haven',
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 'it',
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 'more',
 'most',
 'mustn',
 'my',
 'myself',
 'needn',
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 'she',
 'should',
 'shouldn',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 'the',
 'their',
 'theirs',
 'them',
 

In [9]:
# Remove English stop words from a tokenized sentence and print the token
# list before and after filtering.
example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))  # set gives fast membership tests
word_tokens = word_tokenize(example_sent)
# NOTE(review): the membership test is case-sensitive, so a capitalised token
# such as 'This' is kept even if its lowercase form is a stop word; lowercase
# the token before testing if that is not the intended behaviour.
filtered_sentence = [token for token in word_tokens if token not in stop_words]
print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


## Stemming words

In [10]:
# Porter Stemmer is a stemming algorithm
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Single PorterStemmer instance, reused by the stemming cells that follow.
ps = PorterStemmer()

In [11]:
# Variations on the word "python" used to exercise the stemmer.
example_words = [
    "python",
    "pythoner",
    "pythoning",
    "pythoned",
    "pythonly",
]

In [12]:
# Print the Porter stem of each example word, one per line.
for word in example_words:
    stem = ps.stem(word)
    print(stem)

python
python
python
python
pythonli


In [13]:
# Two sentences packed with "python" variants, used as stemmer input.
new_text = (
    "It is important to by very pythonly while you are pythoning with python. "
    "All pythoners have pythoned poorly at least once."
)

In [14]:
# Tokenize the text into words, then print the Porter stem of every token
# (punctuation tokens included), one per line.
words = word_tokenize(new_text)

for token in words:
    stemmed = ps.stem(token)
    print(stemmed)

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


## Part of Speech Tagging

### POS tag list:

CC	coordinating conjunction  
CD	cardinal digit  
DT	determiner  
EX	existential there (like: "there is" ... think of it like "there exists")  
FW	foreign word  
IN	preposition/subordinating conjunction  
JJ	adjective	'big'  
JJR	adjective, comparative	'bigger'  
JJS	adjective, superlative	'biggest'  
LS	list marker	1)  
MD	modal	could, will  
NN	noun, singular 'desk'  
NNS	noun plural	'desks'  
NNP	proper noun, singular	'Harrison'  
NNPS proper noun, plural	'Americans'  
PDT	predeterminer	'all the kids'  
POS	possessive ending	parent's  
PRP	personal pronoun	I, he, she  
PRP\$ possessive pronoun	my, his, hers  
RB	adverb	very, silently,  
RBR	adverb, comparative	better  
RBS	adverb, superlative	best  
RP	particle	give up  
TO	to	go 'to' the store.  
UH	interjection	errrrrrrrm  
VB	verb, base form	take  
VBD	verb, past tense	took  
VBG	verb, gerund/present participle	taking  
VBN	verb, past participle	taken  
VBP	verb, sing. present, non-3rd person	take  
VBZ	verb, 3rd person sing. present	takes  
WDT	wh-determiner	which  
WP	wh-pronoun	who, what  
WP$	possessive wh-pronoun	whose  
WRB	wh-adverb	where, when  

#### PunktSentenceTokenizer
> This tokenizer is capable of unsupervised machine learning, so you can actually train it on any body of text that you use.

In [15]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [17]:
# Load two State of the Union speeches as raw text: the 2005 address is the
# text the tokenizer is trained on, the 2006 address is the text it is later
# applied to.
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

In [18]:
# Train the Punkt sentence tokenizer by constructing it with the training text.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [19]:
# Split the sample speech into sentences with the trained tokenizer.
# `tokenized` is the list that process_content() reads further down.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [20]:
# Show the resulting list of sentence strings.
# NOTE(review): this prints every sentence of the speech, so the output is
# very long; consider printing only a slice such as tokenized[:5].
print(tokenized)

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', '(Applause.)', 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.', '31, 2006.', "White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.", 'We have gathered under this Capitol dome in moments of national mourning and national achievemen

In [22]:
# Create a function that will run through and tag all of the parts of speech per sentence

def process_content(sentences=None, limit=5):
    """Print the part-of-speech tags for the first few sentences.

    Each sentence is split into tokens with ``nltk.word_tokenize`` and tagged
    with ``nltk.pos_tag``; the resulting list of ``(token, tag)`` tuples is
    printed, one list per sentence.

    Args:
        sentences: Sequence of sentence strings to tag. When omitted, the
            notebook-level ``tokenized`` list is used, so the original
            zero-argument call keeps working.
        limit: Maximum number of leading sentences to process. Defaults to 5,
            the value that used to be hard-coded; ``None`` processes all.

    Returns:
        None. Results are printed. Any exception raised while tokenizing or
        tagging is caught and its message printed instead of propagating,
        which preserves the original best-effort behaviour.
    """
    try:
        # Resolve the default inside the try block so that a missing
        # `tokenized` global is reported the same way as before.
        if sentences is None:
            sentences = tokenized
        for sentence in sentences[:limit]:
            tokens = nltk.word_tokenize(sentence)
            print(nltk.pos_tag(tokens))
    except Exception as e:
        print(str(e))

In [23]:
# Each printed line is a list of (word, tag) tuples: the first element of a
# tuple is the token and the second is its part-of-speech tag.

process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

## Lemmatizing

> A very similar operation to stemming is called lemmatizing. The major difference between these is that, as you saw earlier, stemming can often create non-existent words, whereas lemmas are actual words.  

> So, your root stem, meaning the word you end up with, is not something you can just look up in a dictionary, but you can look up a lemma.  

> Sometimes you will wind up with a very similar word, but sometimes you will wind up with a completely different word.

In [24]:
from nltk.stem import WordNetLemmatizer

# Shared WordNetLemmatizer instance used by the lemmatizing examples.
lemmatizer = WordNetLemmatizer()

In [25]:
# Argument tuples for lemmatizer.lemmatize: a one-element tuple relies on the
# default part of speech, a two-element tuple supplies it explicitly
# ("a" = adjective, "v" = verb).
lemma_examples = [
    ("cats",),
    ("cacti",),
    ("geese",),
    ("rocks",),
    ("python",),
    ("better", "a"),
    ("best", "a"),
    ("run",),
    ("run", "v"),
]

for lemma_args in lemma_examples:
    print(lemmatizer.lemmatize(*lemma_args))

# The examples above show the lemma produced for each input word.
# The main thing to note is that lemmatize takes a part-of-speech parameter, "pos".
# If it is not supplied, the default is "noun", so the lemmatizer tries to find
# the closest noun, which can give surprising results for verbs and adjectives.
# Keep this in mind if you use lemmatizing!

cat
cactus
goose
rock
python
good
best
run
run


## Corpora

> The NLTK corpus is a massive dump of all kinds of natural language data sets 

In [26]:
# Open the King James Bible from the Gutenberg corpus and print its first few sentences

from nltk.tokenize import sent_tokenize, PunktSentenceTokenizer
from nltk.corpus import gutenberg

# Sample text: the raw contents of 'bible-kjv.txt' from the Gutenberg corpus.
sample = gutenberg.raw('bible-kjv.txt')

# Split the raw text into sentences.
tok = sent_tokenize(sample)

# Print the first five sentences. Iterating over a slice instead of indexing
# with range(5) avoids an IndexError if a text has fewer than five sentences.
for sentence in tok[:5]:
    print(sentence)

[The King James Bible]

The Old Testament of the King James Bible

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth.
1:2 And the earth was without form, and void; and darkness was upon
the face of the deep.
And the Spirit of God moved upon the face of the
waters.
1:3 And God said, Let there be light: and there was light.
1:4 And God saw the light, that it was good: and God divided the light
from the darkness.


## Wordnet

> Wordnet is a collection of words, definitions, examples of their use, synonyms, antonyms, and more.

In [27]:
# Import wordnet
from nltk.corpus import wordnet

In [29]:
# Look up every WordNet synset for the word "program". The printed list
# contains entries whose names carry ".n." and ".v." markers (presumably
# noun and verb senses).
syns = wordnet.synsets('program')

print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [30]:
# Name of the first synset in the list (prints 'plan.n.01').
print(syns[0].name())

plan.n.01


In [33]:
# Just the word itself: the name of the first lemma of that synset (prints 'plan').
print(syns[0].lemmas()[0].name())

plan


In [34]:
# Dictionary-style definition of that first synset.
print(syns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [35]:
# Example phrases showing the word in use; printed as a list of strings.
print(syns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [42]:
# Synonyms and antonyms of "good"
#
# Every lemma of every synset for the word counts as a synonym; a lemma's
# .antonyms() list, when non-empty, supplies an antonym (only the first
# entry is kept).

synonyms = []
antonyms = []

for synset in wordnet.synsets('good'):
    for lemma in synset.lemmas():
        synonyms.append(lemma.name())
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())

# Print as sets so each name is shown only once.
print(set(synonyms))
print(set(antonyms))

{'in_effect', 'honest', 'unspoiled', 'serious', 'ripe', 'beneficial', 'full', 'respectable', 'good', 'skilful', 'secure', 'trade_good', 'honorable', 'soundly', 'practiced', 'unspoilt', 'estimable', 'skillful', 'right', 'goodness', 'safe', 'in_force', 'adept', 'salutary', 'proficient', 'dependable', 'well', 'upright', 'expert', 'sound', 'undecomposed', 'thoroughly', 'commodity', 'near', 'dear', 'just', 'effective'}
{'evil', 'evilness', 'badness', 'ill', 'bad'}


In [44]:
# Compare how similar pairs of synsets are using wup_similarity (Wu-Palmer
# similarity). Judging by the printed scores, a higher value means the two
# concepts are more closely related (ship/boat > ship/car > ship/cat).

# 'ship' is the fixed reference synset; each other synset is compared to it.
w1 = wordnet.synset('ship.n.01')
for other_name in ('boat.n.01', 'car.n.01', 'cat.n.01'):
    w2 = wordnet.synset(other_name)
    print(w1.wup_similarity(w2))

0.9090909090909091
0.6956521739130435
0.32
