# Manipulating Text with NLTK

You may need to update NLTK after initially installing it: **pip install -U nltk**

http://www.nltk.org/

In [None]:
import nltk

# Terms to know:

Tokenizing - word tokenizers, sentence tokenizers

Stop Words - meaningless words (or filler words) such as "like", "as", "the", "it", etc…

Corpora (Corpus) - body of text. ex: medical journals, presidential speeches, etc…

Lexicon - words and their meanings. ex: "well" can be either an adjective or a noun

# Tokenizing

In [None]:
# imports
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize

example_text = 'Hello Mr. Smith, how are you doing today? The weather is great and python is awesome. The sky is pinkish-blue. You should not eat cardboard.'

# split the text into sentences
sentences = sent_tokenize(example_text)
print('Tokenized by Sentence:\n')
print(sentences)


# split the text into words
# note: word_tokenize emits each punctuation mark as a token of its own
word_tokens = word_tokenize(example_text)
print('\nTokenized by word:\n')
print(word_tokens)

# to get the words without the punctuation tokens, use a RegexpTokenizer that
# only keeps runs of "word" characters (letters, digits and underscores)
tokenizer = RegexpTokenizer(r'\w+')
words_only = tokenizer.tokenize(example_text)
print('\nTokenized by word without punctuation:\n')
print(words_only)

# Removing Stop Words

In [None]:
# imports
from nltk.corpus import stopwords

example_sentence = 'This is an example showing off stop word filtration.'

# NLTK has multiple sets of stop words, but we are going to use the set made specifically for the English language.
stop_words = set(stopwords.words('english'))
print('Stop words:\n')
print(stop_words)

print('\nExamples of removing stop words:\n')

# word_tokenize was imported in the Tokenizing cell above
words = word_tokenize(example_sentence)

# The stop words printed above are all lowercase, so each token is lowercased
# before the membership test. Without this, capitalized stop words such as the
# leading "This" would slip through the filter. The original casing of the
# words we keep is preserved.

# non-fancy way to filter the stop words out of your text:
filtered_sentence = []
for w in words:
    if w.lower() not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)
# notice that only the "important" words (and the punctuation token) are left

# fancy way to filter the stop words out of your text:
filtered_sentence = [word for word in words if word.lower() not in stop_words] # basically the for-loop in a single line
print(filtered_sentence)

# Stemming

Stemming means taking words and stripping off their endings, such as "-ed" or "-ing", so that only the root (the "stem") is left.

So, the words "worked" and "working" would both be converted to "work" because they basically have the same root meaning.

In [None]:
# imports
from nltk.stem import PorterStemmer

# one stemmer instance is reused for every word below
ps = PorterStemmer()

example_words = ['python','pythoner','pythoning','pythoned','pythonly']

# stem each example word on its own
print('Example words:\n')
for example in example_words:
    stemmed = ps.stem(example)
    print(stemmed)

# now stem a whole sentence
print('\nStemmed sentence:\n')
sent_to_stem = 'It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once.'

# PorterStemmer stems a single word per call, so the sentence has to be
# split into word tokens first
words = word_tokenize(sent_to_stem)

for token in words:
    stemmed = ps.stem(token)
    print(stemmed)

Stemming can sometimes act strangely with "-ly" endings and, as you can see above, the word "once", but overall serves its purpose to take different forms of words and convert them to their root meaning.

# Part of Speech Tagging

In [None]:
# imports
from nltk.corpus import state_union  # pre-loaded corpus of State of the Union addresses

# load the raw text of the address we want to analyze
sample_text = state_union.raw("2006-GWBUSH.txt")

# break the text into word tokens, then label every token with its part of
# speech; the result is one long list covering the whole address
words = nltk.word_tokenize(sample_text)
tagged = nltk.pos_tag(words)

# display only the first 20 tagged tokens
tagged[:20]

### part of speech tags:

CC - coordinating conjunction<br>
CD - cardinal digit<br>
DT - determiner<br>
EX - existential there (like: "there is" ... think of it like "there exists")<br>
FW - foreign word<br>
IN - preposition/subordinating conjunction<br>
JJ - adjective	'big'<br>
JJR - adjective, comparative	'bigger'<br>
JJS - adjective, superlative	'biggest'<br>
LS - list marker	1)<br>
MD - modal	could, will<br>
NN - noun, singular 'desk'<br>
NNS - noun plural	'desks'<br>
NNP - proper noun, singular	'Harrison'<br>
NNPS - proper noun, plural	'Americans'<br>
PDT - predeterminer	'all the kids'<br>
POS - possessive ending	parent's<br>
PRP - personal pronoun	I, he, she<br>
PRP\$ - possessive pronoun	my, his, hers<br>
RB - adverb	very, silently<br>
RBR - adverb, comparative	better<br>
RBS - adverb, superlative	best<br>
RP - particle	give up<br>
TO - to	go 'to' the store.<br>
UH - interjection	errrrrrrrm<br>
VB - verb, base form	take<br>
VBD - verb, past tense	took<br>
VBG - verb, gerund/present participle	taking<br>
VBN - verb, past participle	taken<br>
VBP - verb, sing. present, non-3rd person	take<br>
VBZ - verb, 3rd person sing. present	takes<br>
WDT - wh-determiner	which<br>
WP - wh-pronoun	who, what<br>
WP$ - possessive wh-pronoun	whose<br>
WRB - wh-adverb	where, when

# Word Frequency

In [None]:
# imports
from nltk.corpus import movie_reviews # imports a corpus with a bunch of movie reviews
import re


# so that we are not finding the frequency of more or less useless things, lets remove stop words and punctuation
# movie_reviews.words() is every document in the corpus tokenized by word:
words = movie_reviews.words()
# removing stop words (stopwords was imported in the "Removing Stop Words" cell):
stop_words = set(stopwords.words('english')) # a set gives fast membership tests
# lowercase each token for the comparison so capitalized stop words are removed too
filtered_words = [word for word in words if word.lower() not in stop_words]

# compiled once, outside the loop; used with .match(), so it accepts any token
# that STARTS with an ASCII letter, which drops punctuation-only tokens
starts_with_letter = re.compile(r'[a-zA-Z]+')

# define "all_words" as a list that will be all of the kept words in the corpus
# (stemmed and converted to lowercase)
all_words = []
for word in filtered_words:
    if not starts_with_letter.match(word): # filtering out punctuation
        continue
    # lets stem and convert the words to lowercase so that we avoid duplicated words and get more meaningful results
    try:
        # stem the word ("ps" is the PorterStemmer created in the Stemming cell)
        word = ps.stem(word)
    except IndexError:
        # the stemmer could not handle this token, so keep it unstemmed
        # NOTE(review): presumably guards against a stemmer bug in some NLTK
        # versions - confirm whether this is still needed
        pass
    # append on the normal path only; the previous "finally" clause also
    # appended while an unexpected exception was propagating
    all_words.append(word.lower())


# define "word_frequencies" as the frequencies of all of the words in "all_words"
word_frequencies = nltk.FreqDist(all_words)
# print the 20 most common words
print(word_frequencies.most_common(20))