## Tokenizing words and sentences

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."
print "Sentence tokens : \n",sent_tokenize(EXAMPLE_TEXT)
print
print "Word tokens : \n",word_tokenize(EXAMPLE_TEXT)

Sentence tokens : 
['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]

Word tokens : 
['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


## Stopwords

In [3]:
from nltk.corpus import stopwords

example_sent = "This is a sample sentence, showing off the stop words filtration."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w in stop_words]
print "Word tokens : \n",word_tokens
print 
print "Filtered tokens : \n",filtered_sentence

Word tokens : 
['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']

Filtered tokens : 
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


## Stemming

In [4]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

for word in example_words:
    print (ps.stem(word))
    
print
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

words = word_tokenize(new_text)

for w in words:
    print (ps.stem(w))

python
python
python
python
pythonli

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


## Part of speech tagging

In [5]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_data = state_union.raw("2005-GWBush.txt")
test_data = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_data)
tokenized = custom_sent_tokenizer.tokenize(test_data)

In [6]:
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print (tagged)
    except Exception as e:
        print (str(e))
        
process_content()

[(u'PRESIDENT', 'NNP'), (u'GEORGE', 'NNP'), (u'W.', 'NNP'), (u'BUSH', 'NNP'), (u"'S", 'POS'), (u'ADDRESS', 'NNP'), (u'BEFORE', 'IN'), (u'A', 'NNP'), (u'JOINT', 'NNP'), (u'SESSION', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'CONGRESS', 'NNP'), (u'ON', 'NNP'), (u'THE', 'NNP'), (u'STATE', 'NNP'), (u'OF', 'IN'), (u'THE', 'NNP'), (u'UNION', 'NNP'), (u'January', 'NNP'), (u'31', 'CD'), (u',', ','), (u'2006', 'CD'), (u'THE', 'NNP'), (u'PRESIDENT', 'NNP'), (u':', ':'), (u'Thank', 'NNP'), (u'you', 'PRP'), (u'all', 'DT'), (u'.', '.')]
[(u'Mr.', 'NNP'), (u'Speaker', 'NNP'), (u',', ','), (u'Vice', 'NNP'), (u'President', 'NNP'), (u'Cheney', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'Congress', 'NNP'), (u',', ','), (u'members', 'NNS'), (u'of', 'IN'), (u'the', 'DT'), (u'Supreme', 'NNP'), (u'Court', 'NNP'), (u'and', 'CC'), (u'diplomatic', 'JJ'), (u'corps', 'NN'), (u',', ','), (u'distinguished', 'JJ'), (u'guests', 'NNS'), (u',', ','), (u'and', 'CC'), (u'fellow', 'JJ'), (u'citizens', 'NN

## Chunking

In [7]:
def process_content_chunking():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print (chunked)
            
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

    except Exception as e:
        print (str(e))
        
process_content_chunking()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  g

## Chinking

In [8]:
def process_content_chunking():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<.*>+}
            }<VB.?|IN|DT|TO>+{"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print (chunked)
            
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

    except Exception as e:
        print (str(e))
        
process_content_chunking()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk
    THE/NNP
    UNION/NNP
    January/NNP
    31/CD
    ,/,
    2006/CD
    THE/NNP
    PRESIDENT/NNP
    :/:
    Thank/NNP
    you/PRP)
  all/DT
  (Chunk ./.))
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP)
(Chunk ./.)
(S
  (Chunk
    Mr./NNP
    Speaker/NNP
    ,/,
    Vice/NNP
    President/NNP
    Cheney/NNP
    ,/,
    members/NNS)
  of/IN
  (Chunk Congress/NNP ,/, members/NNS)
  of/IN
  the/DT
  (Chunk
    Supreme/NNP
    Court/NNP
    and/CC
    diplomatic/JJ
    corps/NN
    ,/,
    distinguished/JJ
    guests/NNS
    ,/,
    and/CC
    fellow/JJ
  

## Named entity recognition

In [9]:
def process_content_chunking():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=True)
            print (namedEnt)
            
    except Exception as e:
        print (str(e))
        
process_content_chunking()

(S
  PRESIDENT/NNP
  (NE GEORGE/NNP)
  W./NNP
  BUSH/NNP
  'S/POS
  (NE ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  JOINT/NNP
  SESSION/NNP
  OF/IN
  (NE THE/NNP)
  (NE CONGRESS/NNP)
  ON/NNP
  THE/NNP
  STATE/NNP
  OF/IN
  (NE THE/NNP UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (NE Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (NE Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (NE Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (NE Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (NE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  and/CC
  carried/VBD
  on/IN
  a/DT
  noble/JJ
  dream/NN
  ./.)
(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted/VBN
  by/IN
  the

## Lemmatizing

In [10]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

cat
cactus
goose
rock
python
good
best
run
run


## Text Classification

In [11]:
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)),category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

print documents[1]

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

print all_words.most_common(15)
print all_words["stupid"]

([u'seen', u'may', u'31', u',', u'1999', u'on', u'home', u'video', u'(', u'rented', u')', u'.', u'one', u'of', u'the', u'best', u'things', u'about', u'the', u'movies', u'is', u'that', u'they', u'can', u'make', u'you', u'really', u'start', u'to', u'wonder', u'and', u'analyze', u'things', u'in', u'your', u'own', u',', u'everyday', u'life', u'that', u'you', u'might', u'not', u'have', u'thought', u'much', u'of', u'otherwise', u'.', u'with', u'some', u'films', u'it', u'might', u'be', u'just', u'a', u'secondary', u'issue', u',', u'but', u'with', u'`', u'dark', u'city', u"'", u'what', u'we', u'get', u'is', u'nearly', u'an', u'entire', u'film', u'committed', u'to', u'philosophizing', u'and', u'theorizing', u'about', u'human', u'memory', u'and', u'how', u'it', u'defines', u'who', u'we', u'are', u',', u'which', u'is', u'a', u'poignant', u'theme', u'made', u'through', u'an', u'absolutely', u'beautifully', u'-', u'constructed', u'film', u'with', u'the', u'kind', u'of', u'story', u'and', u'setting'

## Converting words to features

In [12]:
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
        
    return features

print find_features(movie_reviews.words('neg/cv000_29416.txt'))

featuresets = [(find_features(rev), category) for (rev, category) in documents]



## Naive Bayes Classifier

In [13]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)

print "Classifier accuracy percent : ", (nltk.classify.accuracy(classifier, testing_set))*100

classifier.show_most_informative_features(15)

Classifier accuracy percent :  69.0
Most Informative Features
               insulting = True              neg : pos    =     17.6 : 1.0
                    sans = True              neg : pos    =      9.0 : 1.0
               uplifting = True              pos : neg    =      8.6 : 1.0
            refreshingly = True              pos : neg    =      8.4 : 1.0
                 wasting = True              neg : pos    =      8.3 : 1.0
              mediocrity = True              neg : pos    =      7.7 : 1.0
               dismissed = True              pos : neg    =      7.0 : 1.0
                   wires = True              neg : pos    =      6.3 : 1.0
                  doubts = True              pos : neg    =      5.8 : 1.0
                     ugh = True              neg : pos    =      5.8 : 1.0
                    wits = True              pos : neg    =      5.7 : 1.0
                    lang = True              pos : neg    =      5.7 : 1.0
                  fabric = True       

## Saving classifiers with pickle

In [14]:
import pickle

save_classifier = open('naivebayes.pickle','wb')
pickle.dump(classifier, save_classifier)
save_classifier.close()

classifier_f = open('naivebayes.pickle','rb')
classifier = pickle.load(classifier_f)
classifier_f.close()

## Other classifiers using sklearn

In [15]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [16]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print "MultinomialNB accuracy percent:",(nltk.classify.accuracy(MNB_classifier, testing_set))*100


BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print "BernoulliNB accuracy percent:",(nltk.classify.accuracy(BNB_classifier, testing_set))*100


LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print "LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100


SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print "SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100


SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print "SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100


LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print "LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100


NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print "NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100

MultinomialNB accuracy percent: 69.0
BernoulliNB accuracy percent: 72.0
LogisticRegression_classifier accuracy percent: 66.0
SGDClassifier_classifier accuracy percent: 64.0
SVC_classifier accuracy percent: 49.0
LinearSVC_classifier accuracy percent: 60.0
NuSVC_classifier accuracy percent: 73.0


## Combining algorithms

In [17]:
from nltk.classify import ClassifierI
from statistics import mode

In [19]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf
    
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BNB_classifier,
                                  LogisticRegression_classifier)

print "voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100

print "Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100
print "Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100
print "Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100
print "Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100
print "Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100
print "Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100

voted_classifier accuracy percent: 72.0
Classification: neg Confidence %: 100
Classification: neg Confidence %: 100
Classification: pos Confidence %: 100
Classification: neg Confidence %: 100
Classification: pos Confidence %: 100
Classification: pos Confidence %: 100


# Changing the datasets

In [28]:
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')

In [31]:
short_pos = open("positive.txt","r").read()
short_neg = open("negative.txt","r").read()

In [32]:
documents = []

for r in short_pos.split('\n'):
    documents.append((r,"pos"))
    
for r in short_neg.split('\n'):
    documents.append((r,"neg"))

In [None]:
all_words = []

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

for w in short_pos_words:
    all_words.append(w.lower())
    
for w in short_neg_words:
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)