# Sentiment Analysis with `nltk.sentiment.SentimentAnalyzer` and VADER

## 1. Exploring the `subjectivity` corpus

In [1]:
from nltk.corpus import subjectivity

# List the identifiers of the files that make up the corpus.
subjectivity.fileids()

['plot.tok.gt9.5000', 'quote.tok.gt9.5000']

In [2]:
# Sentences from the 'plot' file, each returned as a list of tokens.
subjectivity.sents('plot.tok.gt9.5000')

[['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a', 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi', 'from', 'a', 'hunter', '.'], ['emerging', 'from', 'the', 'human', 'psyche', 'and', 'showing', 'characteristics', 'of', 'abstract', 'expressionism', ',', 'minimalism', 'and', 'russian', 'constructivism', ',', 'graffiti', 'removal', 'has', 'secured', 'its', 'place', 'in', 'the', 'history', 'of', 'modern', 'art', 'while', 'being', 'created', 'by', 'artists', 'who', 'are', 'unconscious', 'of', 'their', 'artistic', 'achievements', '.'], ...]

In [3]:
# Sentences from the 'quote' file, each returned as a list of tokens.
subjectivity.sents('quote.tok.gt9.5000')

[['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], ['color', ',', 'musical', 'bounce', 'and', 'warm', 'seas', 'lapping', 'on', 'island', 'shores', '.', 'and', 'just', 'enough', 'science', 'to', 'send', 'you', 'home', 'thinking', '.'], ...]

In [4]:
# List the category labels defined for the corpus.
# The mapping between documents and categories does not depend on the file structure.
subjectivity.categories()

['obj', 'subj']

In [5]:
# Select sentences by category label instead of by file id.
# The first sentences displayed match those of 'plot.tok.gt9.5000'.
subjectivity.sents(categories='obj')

[['the', 'movie', 'begins', 'in', 'the', 'past', 'where', 'a', 'young', 'boy', 'named', 'sam', 'attempts', 'to', 'save', 'celebi', 'from', 'a', 'hunter', '.'], ['emerging', 'from', 'the', 'human', 'psyche', 'and', 'showing', 'characteristics', 'of', 'abstract', 'expressionism', ',', 'minimalism', 'and', 'russian', 'constructivism', ',', 'graffiti', 'removal', 'has', 'secured', 'its', 'place', 'in', 'the', 'history', 'of', 'modern', 'art', 'while', 'being', 'created', 'by', 'artists', 'who', 'are', 'unconscious', 'of', 'their', 'artistic', 'achievements', '.'], ...]

In [6]:
# Select sentences by category label instead of by file id.
# The first sentences displayed match those of 'quote.tok.gt9.5000'.
subjectivity.sents(categories='subj')

[['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], ['color', ',', 'musical', 'bounce', 'and', 'warm', 'seas', 'lapping', 'on', 'island', 'shores', '.', 'and', 'just', 'enough', 'science', 'to', 'send', 'you', 'home', 'thinking', '.'], ...]

## 2. Building and testing a classifier with `SentimentAnalyzer`

In [7]:
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer # SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis.
from nltk.sentiment.util import (mark_negation, extract_unigram_feats) # mark_negation(): Append _NEG suffix to words that appear in the scope between a negation and a punctuation mark. extract_unigram_feats(): Populate a dictionary of unigram features, reflecting the presence/absence in the document of each of the tokens in unigrams.

# Number of sentences sampled from each category.
n_instances = 100

# Build one list of (tokens, label) pairs per category, in the order
# ('obj', 'subj'), keeping only the first n_instances sentences of each.
obj_docs, subj_docs = (
    [(tokens, label) for tokens in subjectivity.sents(categories=label)[:n_instances]]
    for label in ('obj', 'subj')
)

# Show how many documents were collected for each category.
len(obj_docs), len(subj_docs)

(100, 100)

In [8]:
# Inspect one labelled document: a (token list, label) pair.
obj_docs[0]

(['the',
  'movie',
  'begins',
  'in',
  'the',
  'past',
  'where',
  'a',
  'young',
  'boy',
  'named',
  'sam',
  'attempts',
  'to',
  'save',
  'celebi',
  'from',
  'a',
  'hunter',
  '.'],
 'obj')

In [9]:
# Use the first 80% of each category for training and the remainder for
# testing. The split point is derived from n_instances so the partition
# stays complete if the sample size is changed.
n_train = n_instances * 4 // 5

train_obj_docs = obj_docs[:n_train]
test_obj_docs = obj_docs[n_train:]
train_subj_docs = subj_docs[:n_train]
test_subj_docs = subj_docs[n_train:]

training_docs = train_obj_docs + train_subj_docs
testing_docs = test_obj_docs + test_subj_docs

# Sanity check: every sampled document lands in exactly one of the two splits.
assert len(training_docs) + len(testing_docs) == len(obj_docs) + len(subj_docs)

sentiment_analyzer = SentimentAnalyzer()

# mark_negation appends a _NEG suffix to tokens that fall between a negation
# and the next punctuation mark; the marked training documents are then
# handed to the analyzer to gather their words for feature selection.
all_words_neg = sentiment_analyzer.all_words([mark_negation(doc) for doc in training_docs])

In [10]:
# Select the unigram features from the negation-marked training words.
# min_freq=4 is a frequency cutoff that drops rare tokens.
# NOTE(review): confirm in the NLTK docs whether the bound is inclusive.
unigram_feats = sentiment_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
# Number of unigram features retained.
len(unigram_feats)

83

In [11]:
# Register the unigram presence/absence extractor with the analyzer,
# supplying the selected unigrams as its `unigrams` keyword argument.
sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [12]:
# Run the registered feature extractor over both splits.
training_set = sentiment_analyzer.apply_features(training_docs)
test_set = sentiment_analyzer.apply_features(testing_docs)
# Inspect the first training instance; it begins with a dict of
# 'contains(<token>)' -> bool features.
training_set[0]

({'contains(.)': True,
  'contains(the)': True,
  'contains(,)': False,
  'contains(a)': True,
  'contains(and)': False,
  'contains(of)': False,
  'contains(to)': True,
  'contains(is)': False,
  'contains(in)': True,
  'contains(with)': False,
  'contains(it)': False,
  'contains(that)': False,
  'contains(his)': False,
  'contains(on)': False,
  'contains(for)': False,
  'contains(an)': False,
  'contains(who)': False,
  'contains(by)': False,
  'contains(he)': False,
  'contains(from)': True,
  'contains(her)': False,
  'contains(")': False,
  'contains(as)': False,
  'contains(film)': False,
  'contains(movie)': True,
  'contains(this)': False,
  'contains(their)': False,
  'contains(but)': False,
  'contains(at)': False,
  'contains(one)': False,
  'contains(the_NEG)': False,
  'contains(about)': False,
  'contains(are)': False,
  "contains(there's)": False,
  'contains(story)': False,
  'contains(()': False,
  'contains(to_NEG)': False,
  'contains(a_NEG)': False,
  'contains(,_

In [13]:
# Pass the Naive Bayes training routine to the analyzer, which applies it to
# the extracted training features and returns the resulting classifier.
trainer = NaiveBayesClassifier.train
classifier = sentiment_analyzer.train(trainer, training_set)

Training classifier


In [14]:
# Evaluate on the held-out set and report each metric in alphabetical order.
evaluation = sentiment_analyzer.evaluate(test_set)
for metric_name, metric_value in sorted(evaluation.items()):
    print(f'{metric_name}: {metric_value}')

Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8


## 3. Sentiment analysis with `nltk.sentiment.vader.SentimentIntensityAnalyzer`

In [15]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Sample inputs covering negative, neutral-scored and positive phrasing.
sentences = [
    "You are a piece of shit, and I will step on you.",
    "THIS SUCKS!",
    "This kinda sux...",
    "You're good, man!",
    "DAMN, YOU ARE THE BEST! VERY FUNNY!!!",
]

sid = SentimentIntensityAnalyzer()

# For every sentence, echo it and then print its polarity scores on a single
# line in alphabetical key order, followed by a blank separator line.
for sentence in sentences:
    print(sentence)
    scores = sid.polarity_scores(sentence)
    score_line = ''.join(
        '{0}: {1}, '.format(label, scores[label]) for label in sorted(scores)
    )
    print(score_line + '\n')

You are a piece of shit, and I will step on you.
compound: -0.5574, neg: 0.286, neu: 0.714, pos: 0.0, 

THIS SUCKS!
compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0, 

This kinda sux...
compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, 

You're good, man!
compound: 0.4926, neg: 0.0, neu: 0.385, pos: 0.615, 

DAMN, YOU ARE THE BEST! VERY FUNNY!!!
compound: 0.7821, neg: 0.177, neu: 0.262, pos: 0.561, 



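Above, `compound` is the aggregated, final score: a single value normalized to the range -1 (most negative) to +1 (most positive). The `neg`, `neu`, and `pos` values are the proportions of the text that fall into each category.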