
## Installing and Using NLTK


In [2]:
import nltk
nltk.download() # download all packages/collections in nltk

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Example of Tokenizing

In [7]:
from nltk.tokenize import sent_tokenize, word_tokenize
example_text = "Hello Mr. Sampath, how are you doing today? The weather is great and NLTK is interesting. It is snowing here and really, it is pretty cold. Though at -5 degrees, actually quite warmer than usual! Okay, bye now. ttyl."
print(sent_tokenize(example_text))

['Hello Mr. Sampath, how are you doing today?', 'The weather is great and NLTK is interesting.', 'It is snowing here and really, it is pretty cold.', 'Though at -5 degrees, actually quite warmer than usual!', 'Okay, bye now.', 'ttyl.']


In [8]:
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Sampath', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', 'and', 'NLTK', 'is', 'interesting', '.', 'It', 'is', 'snowing', 'here', 'and', 'really', ',', 'it', 'is', 'pretty', 'cold', '.', 'Though', 'at', '-5', 'degrees', ',', 'actually', 'quite', 'warmer', 'than', 'usual', '!', 'Okay', ',', 'bye', 'now', '.', 'ttyl', '.']


## Filtering Stop Words
Stop words are words which are filtered out before or after processing of natural language data. Stop words usually refers to the most common words in a language.

(Note: there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list).

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
example_sentence = "This is an example sentence showing stop word filtration. Hopefully, this will work! Or, will it? Wont it?? Lets find out!!"
stop_words = set(stopwords.words("english"))
words = word_tokenize(example_sentence)
filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

# The above for loop as a one-liner
# filtered_sentence = [w for w in words if w not in stop_words]

print(filtered_sentence)

['This', 'example', 'sentence', 'showing', 'stop', 'word', 'filtration', '.', 'Hopefully', ',', 'work', '!', 'Or', ',', '?', 'Wont', '?', '?', 'Lets', 'find', '!', '!']


## Stemming
Reducing different forms of a word to a common base form. For example: 
- “I am a student” = “I be a student”;  
- “My dog’s fur is dark” = “My dog fur be dark”.

Note: Stemming may not be needed aynmore with modern NLP tools.

In [19]:
from nltk.stem import PorterStemmer # Using the Porter Stemming Algorithm from 1979!
from nltk.tokenize import word_tokenize
ps = PorterStemmer()

example_words = ["python","pythoning","pythoner","pythoned","pythonly"]
for w in example_words:
    print(ps.stem(w))

example_sentence = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned badly at least once."
words = word_tokenize(example_sentence)
for w in words:
    print(ps.stem(w))

python
python
python
python
pythonli
I
t
 
i
s
 
v
e
r
y
 
i
m
p
o
r
t
a
n
t
 
t
o
 
b
e
 
p
y
t
h
o
n
l
y
 
w
h
i
l
e
 
y
o
u
 
a
r
e
 
p
y
t
h
o
n
i
n
g
 
w
i
t
h
 
p
y
t
h
o
n
.
 
A
l
l
 
p
y
t
h
o
n
e
r
s
 
h
a
v
e
 
p
y
t
h
o
n
e
d
 
b
a
d
l
y
 
a
t
 
l
e
a
s
t
 
o
n
c
e
.
