In [13]:
import nltk
# movie review sentences
from nltk.corpus import sentence_polarity
import random

In [14]:
## repeat the setup of the movie review sentences for classification
# for each sentence(document), get its words and category (positive/negative)
documents = [(sent, cat) for cat in sentence_polarity.categories() 
    for sent in sentence_polarity.sents(categories=cat)]
random.shuffle(documents)

In [15]:
# get all words from all movie_reviews and put into a frequency distribution
#   note lowercase, but no stemming or stopwords
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

21401


In [16]:
# get the 1500 most frequently appearing keywords in the corpus
word_items = all_words.most_common(1500)
word_features = [word for (word,count) in word_items]

In [17]:
# define features (keywords) of a document for a BOW/unigram baseline
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [18]:
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d, c) in documents]

# training using naive Baysian classifier, training set is 90% of data
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier
nltk.classify.accuracy(classifier, test_set)

0.756

In [19]:
####   adding Bigram features   ####
# set up for using bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [20]:
# create the bigram finder on all the words in sequence
print(all_words_list[:50])
finder = BigramCollocationFinder.from_words(all_words_list)

['nervy', 'and', 'sensitive', ',', 'it', 'taps', 'into', 'genuine', 'artistic', 'befuddlement', ',', 'and', 'at', 'the', 'same', 'time', 'presents', 'a', 'scathing', 'indictment', 'of', 'what', 'drives', 'hollywood', '.', 'dazzling', 'in', 'its', 'complexity', ',', 'disturbing', 'for', 'its', 'extraordinary', 'themes', ',', 'the', 'piano', 'teacher', 'is', 'a', 'film', 'that', 'defies', 'categorisation', '.', 'it', 'haunts', ',', 'horrifies']


In [21]:
# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.raw_freq, 500)
print(bigram_features[:50])

[('.', '.'), ('.', 'the'), ('.', 'a'), ('of', 'the'), (',', 'but'), (',', 'and'), ('in', 'the'), ('the', 'film'), ('is', 'a'), ('.', "it's"), (',', 'the'), ('of', 'a'), ('to', 'the'), ('and', 'the'), ('to', 'be'), ('the', 'movie'), ('.', 'it'), ('.', 'this'), ('for', 'the'), ('it', 'is'), ('.', 'an'), ('with', 'a'), ('as', 'a'), ('in', 'a'), ('on', 'the'), ('one', 'of'), ('and', 'a'), ('this', 'is'), ('a', 'movie'), ("it's", 'a'), (',', 'it'), ('.', 'if'), ('with', 'the'), ('film', 'is'), ('like', 'a'), (',', 'a'), ('for', 'a'), ('it', '.'), ('the', 'most'), ('.', 'but'), ('but', 'it'), ('film', '.'), ('of', 'its'), ('movie', '.'), (',', "it's"), (',', 'this'), ('a', 'film'), ('as', 'the'), ('from', 'the'), ('.', 'i')]


In [22]:
# examples to demonstrate the bigram feature function definition
sent = ['Arthur','carefully','rode','the','brown','horse','around','the','castle']
sentbigrams = list(nltk.bigrams(sent))
print(sentbigrams)

[('Arthur', 'carefully'), ('carefully', 'rode'), ('rode', 'the'), ('the', 'brown'), ('brown', 'horse'), ('horse', 'around'), ('around', 'the'), ('the', 'castle')]


In [23]:
# for a single bigram, test if it's in the sentence bigrams and format the feature name
bigram = ('brown','horse')
print(bigram in sentbigrams)
print('bigram({} {})'.format(bigram[0], bigram[1]))

True
bigram(brown horse)


In [24]:
# define features that include words as before 
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['bigram({} {})'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [25]:
# use this function to create feature sets for all sentences
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]

# number of features for document 0
print(len(bigram_featuresets[0][0].keys()))

# features in document 0
print(bigram_featuresets[0][0])

2000
{'contains(.)': True, 'contains(the)': True, 'contains(,)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(that)': False, 'contains(it)': True, 'contains(as)': False, 'contains(but)': False, 'contains(with)': False, 'contains(film)': False, 'contains(this)': False, 'contains(for)': False, 'contains(its)': False, 'contains(an)': False, 'contains(movie)': False, "contains(it's)": False, 'contains(be)': False, 'contains(on)': False, 'contains(you)': False, 'contains(not)': False, 'contains(by)': False, 'contains(about)': False, 'contains(one)': False, 'contains(more)': False, 'contains(like)': False, 'contains(has)': False, 'contains(are)': False, 'contains(at)': True, 'contains(from)': False, 'contains(than)': False, 'contains(")': False, 'contains(all)': False, 'contains(--)': False, 'contains(his)': False, 'contains(have)': False, 'contains(so)': False, 'contains(if)': False, 'c

In [26]:
# train a classifier and report accuracy
train_set, test_set = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.756

In [27]:
###  POS tag counts
# using the default pos tagger in NLTK (the Stanford tagger)
print(sent)
print(nltk.pos_tag(sent))

['Arthur', 'carefully', 'rode', 'the', 'brown', 'horse', 'around', 'the', 'castle']
[('Arthur', 'NNP'), ('carefully', 'RB'), ('rode', 'VBD'), ('the', 'DT'), ('brown', 'JJ'), ('horse', 'NN'), ('around', 'IN'), ('the', 'DT'), ('castle', 'NN')]


In [28]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [29]:
# define feature sets using this function
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]
# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

1504


In [30]:
# the first sentence
print(documents[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])

(['nervy', 'and', 'sensitive', ',', 'it', 'taps', 'into', 'genuine', 'artistic', 'befuddlement', ',', 'and', 'at', 'the', 'same', 'time', 'presents', 'a', 'scathing', 'indictment', 'of', 'what', 'drives', 'hollywood', '.'], 'pos')
num nouns 5
num verbs 3
num adjectives 5
num adverbs 0


In [31]:
# train and test the classifier
train_set, test_set = POS_featuresets[1000:], POS_featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.753

In [32]:
## cross-validation ##
# this function takes the number of folds, the feature sets
# it iterates over the folds, using different sections for training and testing in turn
#   it prints the accuracy for each fold and the average accuracy at the end
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [33]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5
cross_validation_accuracy(num_folds, featuresets)

0 0.7415572232645403
1 0.7471857410881801
2 0.7453095684803002
3 0.7429643527204502
4 0.7209193245778611
mean accuracy 0.7395872420262665


In [34]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds1 = 5
cross_validation_accuracy(num_folds1, bigram_featuresets)

0 0.7415572232645403
1 0.7467166979362101
2 0.7443714821763602
3 0.7424953095684803
4 0.7209193245778611
mean accuracy 0.7392120075046904


In [35]:
num_folds2 = 10
cross_validation_accuracy(num_folds2, bigram_featuresets)

0 0.7485928705440901
1 0.724202626641651
2 0.7457786116322702
3 0.7542213883677298
4 0.7382739212007504
5 0.7551594746716698
6 0.724202626641651
7 0.7560975609756098
8 0.7082551594746717
9 0.7354596622889306
mean accuracy 0.7390243902439024


In [36]:
num_folds3 = 5
cross_validation_accuracy(num_folds2, POS_featuresets)

0 0.7504690431519699
1 0.7176360225140713
2 0.7485928705440901
3 0.7495309568480301
4 0.7354596622889306
5 0.7542213883677298
6 0.7185741088180112
7 0.7532833020637899
8 0.7101313320825516
9 0.725140712945591
mean accuracy 0.7363039399624765


In [37]:
num_folds4 = 5
cross_validation_accuracy(num_folds, POS_featuresets)

0 0.7420262664165104
1 0.7415572232645403
2 0.7368667917448405
3 0.7387429643527205
4 0.725609756097561
mean accuracy 0.7369606003752345


In [38]:
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds5 = 10
cross_validation_accuracy(num_folds5, featuresets)

0 0.7495309568480301
1 0.723264540337711
2 0.7467166979362101
3 0.7551594746716698
4 0.7363977485928705
5 0.7551594746716698
6 0.724202626641651
7 0.7551594746716698
8 0.7082551594746717
9 0.7354596622889306
mean accuracy 0.7389305816135086
