In [2]:
import nltk 
from nltk.corpus import sentence_polarity
import random

In [5]:
sentences = sentence_polarity.sents()
for sent in sentences[:4]:
    print(sent)

['simplistic', ',', 'silly', 'and', 'tedious', '.']
["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.']
['exploitative', 'and', 'largely', 'devoid', 'of', 'the', 'depth', 'or', 'sophistication', 'that', 'would', 'make', 'watching', 'such', 'a', 'graphic', 'treatment', 'of', 'the', 'crimes', 'bearable', '.']
['[garbus]', 'discards', 'the', 'potential', 'for', 'pathological', 'study', ',', 'exhuming', 'instead', ',', 'the', 'skewed', 'melodrama', 'of', 'the', 'circumstantial', 'situation', '.']


In [6]:
pos_sents = sentence_polarity.sents(categories="pos")
neg_sents = sentence_polarity.sents(categories="neg")
print(len(pos_sents))
print(len(neg_sents))

5331
5331


In [7]:
documents = [(sent, cat) for cat in sentence_polarity.categories() for sent in sentence_polarity.sents(categories=cat)]

print(documents[0])
print(documents[-1])
random.shuffle(documents)

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'neg')
(['provides', 'a', 'porthole', 'into', 'that', 'noble', ',', 'trembling', 'incoherence', 'that', 'defines', 'us', 'all', '.'], 'pos')


In [10]:
all_words_list = [word for (sent, cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(2000)
word_features = [word for (word, count) in word_items]
print(word_features[:50])

['.', 'the', ',', 'a', 'and', 'of', 'to', 'is', 'in', 'that', 'it', 'as', 'but', 'with', 'film', 'this', 'for', 'its', 'an', 'movie', "it's", 'be', 'on', 'you', 'not', 'by', 'about', 'more', 'one', 'like', 'has', 'are', 'at', 'from', 'than', '"', 'all', '--', 'his', 'have', 'so', 'if', 'or', 'story', 'i', 'too', 'just', 'who', 'into', 'what']


In [15]:
def document_features(document, word_features): 
    document_words = set(document)
    features = {}
    for word in word_features: 
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [16]:
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

In [19]:
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [20]:
nltk.classify.accuracy(classifier, test_set)

0.756

In [21]:
classifier.show_most_informative_features(30)

Most Informative Features
            V_engrossing = True              pos : neg    =     21.7 : 1.0
              V_mediocre = True              neg : pos    =     15.6 : 1.0
               V_generic = True              neg : pos    =     15.6 : 1.0
                V_flawed = True              pos : neg    =     15.0 : 1.0
              V_supposed = True              neg : pos    =     15.0 : 1.0
               V_routine = True              neg : pos    =     14.3 : 1.0
                V_boring = True              neg : pos    =     13.8 : 1.0
                  V_flat = True              neg : pos    =     13.4 : 1.0
            V_refreshing = True              pos : neg    =     13.0 : 1.0
             V_inventive = True              pos : neg    =     12.3 : 1.0
                  V_warm = True              pos : neg    =     12.2 : 1.0
                  V_dull = True              neg : pos    =     11.9 : 1.0
             V_wonderful = True              pos : neg    =     11.8 : 1.0

In [22]:
# Module Subjectivity reads the subjectivity lexicon file from Wiebe et al
#    at http://www.cs.pitt.edu/mpqa/ (part of the Multiple Perspective QA project)
#
# This file has the format that each line is formatted as in this example for the word "abandoned"
#     type=weaksubj len=1 word1=abandoned pos1=adj stemmed1=n priorpolarity=negative
# In our data, the pos tag is ignored, so this program just takes the last one read
#     (typically the noun over the adjective)
#
# The data structure that is created is a dictionary where
#    each word is mapped to a list of 4 things:  
#        strength, which will be either 'strongsubj' or 'weaksubj'
#        posTag, either 'adj', 'verb', 'noun', 'adverb', 'anypos'
#        isStemmed, either true or false
#        polarity, either 'positive', 'negative', or 'neutral'

import nltk

# pass the absolute path of the lexicon file to this program
# example call:
# nancymacpath = 
#    "/Users/njmccrac/AAAdocs/research/subjectivitylexicon/hltemnlp05clues/subjclueslen1-HLTEMNLP05.tff"
# SL = readSubjectivity(nancymacpath)

# this function returns a dictionary where you can look up words and get back 
#     the four items of subjectivity information described above
def readSubjectivity(path):
    flexicon = open(path, 'r')
    # initialize an empty dictionary
    sldict = { }
    for line in flexicon:
        fields = line.split()   # default is to split on whitespace
        # split each field on the '=' and keep the second part as the value
        strength = fields[0].split("=")[1]
        word = fields[2].split("=")[1]
        posTag = fields[3].split("=")[1]
        stemmed = fields[4].split("=")[1]
        polarity = fields[5].split("=")[1]
        if (stemmed == 'y'):
            isStemmed = True
        else:
            isStemmed = False
        # put a dictionary entry with the word as the keyword
        #     and a list of the other values
        sldict[word] = [strength, posTag, isStemmed, polarity]
    return sldict

In [24]:
SLpath = "subjclueslen1-HLTEMNLP05.tff"
import Subjectivity
SL = Subjectivity.readSubjectivity(SLpath)

ModuleNotFoundError: No module named 'Subjectivity'