# Sentiment Analysis with `nltk.sentiment.SentimentAnalyzer` and VADER tools

## 1. Exploring the `subjectivity` corpus

In [30]:
from nltk.corpus import subjectivity

# List the identifiers of the files that make up the subjectivity corpus.
subjectivity.fileids()

['plot.tok.gt9.5000', 'quote.tok.gt9.5000']

In [31]:
# Sentences of the 'plot' file; each sentence is a list of token strings.
subjectivity.sents('plot.tok.gt9.5000')

[['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a', 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi', 'from', 'a', 'hunter', '.'], ['emerging', 'from', 'the', 'human', 'psyche', 'and', 'showing', 'characteristics', 'of', 'abstract', 'expressionism', ',', 'minimalism', 'and', 'russian', 'constructivism', ',', 'graffiti', 'removal', 'has', 'secured', 'its', 'place', 'in', 'the', 'history', 'of', 'modern', 'art', 'while', 'being', 'created', 'by', 'artists', 'who', 'are', 'unconscious', 'of', 'their', 'artistic', 'achievements', '.'], ...]

In [32]:
# Sentences of the 'quote' file; each sentence is a list of token strings.
subjectivity.sents('quote.tok.gt9.5000')

[['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], ['color', ',', 'musical', 'bounce', 'and', 'warm', 'seas', 'lapping', 'on', 'island', 'shores', '.', 'and', 'just', 'enough', 'science', 'to', 'send', 'you', 'home', 'thinking', '.'], ...]

In [33]:
# List the category labels defined for the corpus.
# The mapping between documents and categories does not depend on the file structure.
subjectivity.categories()

['obj', 'subj']

In [34]:
# Select sentences by category label ('obj') instead of by file identifier.
subjectivity.sents(categories='obj')

[['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a', 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi', 'from', 'a', 'hunter', '.'], ['emerging', 'from', 'the', 'human', 'psyche', 'and', 'showing', 'characteristics', 'of', 'abstract', 'expressionism', ',', 'minimalism', 'and', 'russian', 'constructivism', ',', 'graffiti', 'removal', 'has', 'secured', 'its', 'place', 'in', 'the', 'history', 'of', 'modern', 'art', 'while', 'being', 'created', 'by', 'artists', 'who', 'are', 'unconscious', 'of', 'their', 'artistic', 'achievements', '.'], ...]

In [35]:
# Select sentences by category label ('subj') instead of by file identifier.
subjectivity.sents(categories='subj')

[['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], ['color', ',', 'musical', 'bounce', 'and', 'warm', 'seas', 'lapping', 'on', 'island', 'shores', '.', 'and', 'just', 'enough', 'science', 'to', 'send', 'you', 'home', 'thinking', '.'], ...]

## 2. Building and testing a classifier with `SentimentAnalyzer`

In [36]:
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer # SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis.
from nltk.sentiment.util import (mark_negation, extract_unigram_feats) # mark_negation(): Append _NEG suffix to words that appear in the scope between a negation and a punctuation mark. extract_unigram_feats(): Populate a dictionary of unigram features, reflecting the presence/absence in the document of each of the tokens in unigrams.

# Take the first n_instances sentences of each category and pair every
# token list with its category label, giving (tokens, label) documents.
n_instances = 100
obj_docs, subj_docs = (
    [(tokens, label) for tokens in subjectivity.sents(categories=label)[:n_instances]]
    for label in ('obj', 'subj')
)
# Display how many documents were collected per class.
len(obj_docs), len(subj_docs)

(100, 100)

In [37]:
# Inspect one labeled document: a (token_list, label) tuple.
obj_docs[0]

(['the',
  'movie',
  'begins',
  'in',
  'the',
  'past',
  'where',
  'a',
  'young',
  'boy',
  'named',
  'sam',
  'attempts',
  'to',
  'save',
  'celebi',
  'from',
  'a',
  'hunter',
  '.'],
 'obj')

In [38]:
# Split each class into a training part and a held-out test part.
# The cut point is computed from the list length rather than from the
# hard-coded indices 80/100, so the 80/20 proportion is kept when the number
# of instances per class is changed upstream. With 100 documents per class
# this yields exactly the same 80/20 slices as before.
train_percent = 80
obj_cut = len(obj_docs) * train_percent // 100
subj_cut = len(subj_docs) * train_percent // 100

train_obj_docs, test_obj_docs = obj_docs[:obj_cut], obj_docs[obj_cut:]
train_subj_docs, test_subj_docs = subj_docs[:subj_cut], subj_docs[subj_cut:]

# Both sets list the 'obj' documents first, followed by the 'subj' documents.
training_docs = train_obj_docs + train_subj_docs
testing_docs = test_obj_docs + test_subj_docs

sentim_analyzer = SentimentAnalyzer()
# Negation-mark every training document (words in a negation scope get a
# _NEG suffix) and collect the words through the analyzer.
# NOTE(review): all_words presumably flattens the documents into a single
# word list -- confirm against the NLTK documentation.
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])

In [39]:
# Select the unigram features from the negation-marked training words.
# min_freq=4 is the frequency cutoff for keeping a word
# (NOTE(review): check the NLTK docs for whether the bound is inclusive).
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
# Number of unigram features selected.
len(unigram_feats)

83

In [40]:
# Register extract_unigram_feats on the analyzer, passing the selected
# unigrams as its `unigrams` keyword argument.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [45]:
# Run the registered feature extractors over the training documents and then
# the test documents, in that order.
training_set, test_set = map(
    sentim_analyzer.apply_features,
    (training_docs, testing_docs),
)
# Show the first featurized training instance.
training_set[0]

({'contains(")': False,
  'contains(()': False,
  'contains())': False,
  'contains(,)': False,
  'contains(,_NEG)': False,
  'contains(--)': False,
  'contains(.)': True,
  'contains(:)': False,
  'contains(;)': False,
  'contains(a)': True,
  'contains(a_NEG)': False,
  'contains(about)': False,
  'contains(all)': False,
  'contains(an)': False,
  'contains(and)': False,
  'contains(are)': False,
  'contains(as)': False,
  'contains(at)': False,
  'contains(be)': False,
  'contains(begins)': True,
  'contains(both)': False,
  'contains(but)': False,
  'contains(but_NEG)': False,
  'contains(by)': False,
  'contains(can)': False,
  'contains(even)': False,
  'contains(film)': False,
  'contains(for)': False,
  'contains(from)': True,
  'contains(has)': False,
  'contains(have)': False,
  'contains(he)': False,
  'contains(her)': False,
  'contains(him)': False,
  'contains(his)': False,
  'contains(home)': False,
  'contains(if)': False,
  'contains(in)': True,
  'contains(into)': False,
  ...},
 'obj')

In [42]:
# The trainer is passed as a callable; the analyzer invokes it on the
# featurized training set and the result is kept as `classifier`.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)

Training classifier


In [43]:
# Evaluate on the held-out set and print one "metric: value" line per
# metric, ordered alphabetically by metric name.
metrics = sentim_analyzer.evaluate(test_set)
for metric_name in sorted(metrics):
    print('{0}: {1}'.format(metric_name, metrics[metric_name]))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8


## 3. Sentiment analysis with `nltk.sentiment.vader.SentimentIntensityAnalyzer`

In [50]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Example inputs to score with VADER.
sentences = [
    "You are a piece of shit, and I will step on you.",
    "THIS SUX!!!",
    "This kinda sux...",
    "You're good, man",
    "HAHAHA YOU ARE THE BEST!!!!! VERY FUNNY!!!",
]

sid = SentimentIntensityAnalyzer()

for sentence in sentences:
    # The leading newline also terminates the previous score line, which is
    # printed without a trailing newline.
    print('\n' + sentence)
    scores = sid.polarity_scores(sentence)
    # One "name: value, " fragment per score, in alphabetical order of name.
    score_line = ''.join(
        '{0}: {1}, '.format(label, scores[label]) for label in sorted(scores)
    )
    print(score_line, end='')


You are a piece of shit, and I will step on you.
compound: -0.5574, neg: 0.286, neu: 0.714, pos: 0.0, 
THIS SUX!!!
compound: -0.5229, neg: 0.771, neu: 0.229, pos: 0.0, 
This kinda sux...
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 
You're good, man
compound: 0.4404, neg: 0.0, neu: 0.408, pos: 0.592, 
HAHAHA YOU ARE THE BEST!!!!! VERY FUNNY!!!
compound: 0.8386, neg: 0.0, neu: 0.386, pos: 0.614, 

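In the output above, `neg`, `neu` and `pos` are the proportions of the text rated negative, neutral and positive, while `compound` is the aggregated final score, normalized to the range from -1 (most negative) to +1 (most positive).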