# Part-of-Speech Tagging with NLTK

In [1]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [2]:
# Raw State of the Union transcripts: the 2005 address is kept as training
# text and the 2006 address as the sample text to analyse.
train_text, sample_text = [
    state_union.raw(fileid)
    for fileid in ("2005-GWBush.txt", "2006-GWBush.txt")
]

In [27]:
# Build a Punkt sentence tokenizer from the training text, then use it to
# split the sample text into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)
# Call print as a function: with a single argument this prints the same thing
# on Python 2, also runs on Python 3, and matches the other cells.
print(tokenized[:5])

[u"PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", u'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', u'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', u'(Applause.)', u'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.']


In [26]:
# Word-tokenize each of the first five sentences and print its
# (token, part-of-speech tag) pairs.
for sentence in tokenized[:5]:
    tokens = nltk.word_tokenize(sentence)
    print(nltk.pos_tag(tokens))

[(u'PRESIDENT', 'NNP'), (u'GEORGE', 'NNP'), (u'W.', 'NNP'), (u'BUSH', 'NNP'), (u"'S", 'POS'), (u'ADDRESS', 'NNP'), (u'BEFORE', 'NNP'), (u'A', 'NNP'), (u'JOINT', 'NNP'), (u'SESSION', 'NNP'), (u'OF', 'NNP'), (u'THE', 'NNP'), (u'CONGRESS', 'NNP'), (u'ON', 'NNP'), (u'THE', 'NNP'), (u'STATE', 'NNP'), (u'OF', 'NNP'), (u'THE', 'NNP'), (u'UNION', 'NNP'), (u'January', 'NNP'), (u'31', 'CD'), (u',', ','), (u'2006', 'CD'), (u'THE', 'DT'), (u'PRESIDENT', 'NNP'), (u':', ':'), (u'Thank', 'NNP'), (u'you', 'PRP'), (u'all', 'DT'), (u'.', '.')]
[(u'Mr.', 'NNP'), (u'Speaker', 'NNP'), (u',', ','), (u'Vice', 'NNP'), (u'President', 'NNP'), (u'Cheney', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'Congress', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'the', 'DT'), (u'Supreme', 'NNP'), (u'Court', 'NNP'), (u'and', 'CC'), (u'diplomatic', 'JJ'), (u'corps', 'NNS'), (u',', ','), (u'distinguished', 'VBD'), (u'guests', 'NNS'), (u',', ','), (u'and', 'CC'), (u'fellow', 'JJ'), (u'citizens',

# Chunking with NLTK

In [29]:
# The grammar and parser do not depend on the sentence, so build them once
# instead of on every loop iteration.
# Pattern: any number of adverb tags (RB plus an optional suffix character),
# any number of verb tags (VB plus an optional suffix character), one or more
# proper nouns (NNP), and an optional trailing noun (NN).
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)

for i in tokenized[:5]:
    words = nltk.word_tokenize(i)
    tagged = nltk.pos_tag(words)
    chunked = chunkParser.parse(tagged)
    # Render the chunk tree for this sentence.
    chunked.draw()

# Named Entity Recognition with NLTK

In [30]:
# POS-tag each of the first five sentences, run NLTK's named-entity chunker
# over the tagged tokens (with binary=True), and draw the resulting tree.
for sentence in tokenized[:5]:
    pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    nltk.ne_chunk(pos_tagged, binary=True).draw()

# Lemmatizing with NLTK

In [31]:
from nltk.stem import WordNetLemmatizer

In [32]:
lemmatizer = WordNetLemmatizer()

# Lemmatize a handful of words without specifying a part of speech.
for word in ("cats", "cacti", "geese", "rocks", "python"):
    print(lemmatizer.lemmatize(word))

# The same call with pos="a", i.e. treating the word as an adjective.
for adjective in ("better", "best"):
    print(lemmatizer.lemmatize(adjective, pos="a"))

# "run" without a part of speech, then explicitly as a verb.
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run", pos="v"))

cat
cactus
goose
rock
python
good
best
run
run


# Text Classification with NLTK

In [33]:
import nltk
import random
from nltk.corpus import movie_reviews

In [36]:
# One (word_list, category) pair per review file, for every category in the
# movie_reviews corpus.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so the document order (and the
# example printed below) is the same on every run, without touching the
# global `random` state that other cells may rely on.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)

print(documents[1])

([u'"', u'book', u'"', u'should', u'have', u'remained', u'in', u'shadows', u'book', u'of', u'shadows', u':', u'blair', u'witch', u'2', u'a', u'film', u'review', u'by', u'michael', u'redman', u'copyright', u'2000', u'by', u'michael', u'redman', u'certain', u'things', u'in', u'our', u'lives', u'are', u'inevitable', u'.', u'death', u',', u'sorrow', u',', u'love', u',', u'heartbreak', u',', u'pain', u',', u'joy', u'.', u'we', u'expect', u'these', u'events', u'.', u'we', u'know', u'they', u"'", u're', u'going', u'to', u'happen', u'and', u'some', u',', u'we', u'even', u'look', u'forward', u'to', u'.', u'it', u"'", u's', u'part', u'of', u'the', u'human', u'condition', u'.', u'we', u'have', u'also', u'become', u'accustomed', u'to', u'inevitable', u'occurrences', u'in', u'our', u'society', u'.', u'as', u'we', u'near', u'the', u'fall', u'election', u',', u'several', u'of', u'them', u'are', u'hitting', u'us', u'in', u'the', u'face', u'.', u'politicians', u'exaggerate', u'their', u'own', u'importa

In [37]:
# Frequency distribution over every lower-cased token in the corpus, built
# directly from a generator instead of an intermediate list.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())

# Show the 15 most frequent tokens with their counts.
print(all_words.most_common(15))

[(u',', 77717), (u'the', 76529), (u'.', 65876), (u'a', 38106), (u'and', 35576), (u'of', 34123), (u'to', 31937), (u"'", 30585), (u'is', 25195), (u'in', 21822), (u's', 18513), (u'"', 17612), (u'it', 16107), (u'that', 15924), (u'-', 15595)]
