## Task: Classifying Documents

#### Using Tokenization (and basic bag-of-words features)

Here is the code we went over at the start, as a first pass at classifying documents by sentiment.

In [None]:
import re
import random
import nltk
from nltk.corpus import movie_reviews
import numpy as np

# Seed both Python's and NumPy's random number generators
random.seed(100)
np.random.seed(100)

# Collect one (wordlist, category) tuple per review, using the first 200
# file ids of every category, then shuffle the documents in place
docs_tuples = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category)[:200]:
        docs_tuples.append((movie_reviews.words(fileid), category))
random.shuffle(docs_tuples)

# Build the vocabulary: the 1000 most frequent words in the entire corpus.
# Words are lowercased before counting so case variants share one count.
movie_words = [word.lower() for (wordlist, cat) in docs_tuples for word in wordlist]
all_wordfreqs = nltk.FreqDist(movie_words)
# most_common(n) selects the top n entries directly, instead of sorting
# every distinct word and slicing the result; the ordering is the same.
top_wordfreqs = all_wordfreqs.most_common(1000)
feature_words = [word for (word, count) in top_wordfreqs]

# Define a function to extract features of the form contains(word) for each document
def document_features(doc_toks):
    """Map a tokenised document to binary bag-of-words features.

    For every entry of the module-level ``feature_words`` list the result
    holds a key ``'contains(<word>)'`` whose value is 1 when that word is
    among ``doc_toks`` and 0 otherwise.
    """
    # Build the set once so each membership test below is a hash lookup
    tokens_present = set(doc_toks)
    return {
        'contains({})'.format(term): int(term in tokens_present)
        for term in feature_words
    }

# Create feature sets of document (features, category) tuples
featuresets = [(document_features(wordlist), cat) for (wordlist, cat) in docs_tuples]

# Hold out the last 100 documents as the test set and train on the rest,
# then print the accuracy and the most informative features
train_set, test_set = featuresets[:-100], featuresets[-100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
classifier.show_most_informative_features(10)

#### Using POS Tagging

We kept the overall approach from above, but each document tuple now also carries the review's sentence list (needed for POS tagging), and the feature words are now the most common adjectives:

In [None]:
# Rebuild the document list as (sentence list, category, word list) tuples,
# again using the first 200 file ids of every category
docs_tuples = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category)[:200]:
        sentences = movie_reviews.sents(fileid)
        tokens = movie_reviews.words(fileid)
        docs_tuples.append((sentences, category, tokens))

# Shuffle the documents in place
random.shuffle(docs_tuples)

# Create a list of the most frequent adjectives in the entire corpus.
# Each docs_tuples entry is (sentence list, category, word list); only the
# sentence lists are flattened here, giving one token list per sentence.
movie_sents = [sent for (sent_list, cat, wordlist) in docs_tuples for sent in sent_list]
movie_tokstags = nltk.pos_tag_sents(movie_sents)
# Keep tokens whose tag starts with 'JJ' (this also matches longer tags
# that begin with JJ, exactly as the prefix match did before).
movie_adjs = [tok for sent in movie_tokstags for (tok, tag) in sent if tag.startswith('JJ')]
# FreqDist is only reachable through the nltk namespace in this notebook.
all_adjfreqs = nltk.FreqDist(movie_adjs)
top_adjfreqs = all_adjfreqs.most_common(1000)
feature_words = [adj for (adj, count) in top_adjfreqs]

Then we left the document_features() function and the remaining code the same, apart from unpacking the new three-element document tuples:

In [None]:
# Define a function to extract features of the form contains(word) for each document
def document_features(doc_toks):
    """Return binary ``contains(word)`` features for one document.

    The result has one key ``'contains(<word>)'`` per entry of the
    module-level ``feature_words`` list; its value is 1 if that word occurs
    in ``doc_toks`` and 0 if it does not.
    """
    # Deduplicate the tokens once so every lookup below is a set membership test
    seen_tokens = set(doc_toks)
    return {
        'contains({})'.format(candidate): int(candidate in seen_tokens)
        for candidate in feature_words
    }


# Create feature sets of document (features, category) tuples; the features
# are computed from each document's flat word list
featuresets = [(document_features(wordlist), cat) for (sent_list, cat, wordlist) in docs_tuples]

# Hold out the last 100 documents as the test set and train on the rest,
# then print the accuracy and the most informative features
train_set, test_set = featuresets[:-100], featuresets[-100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
# show_most_informative_features() prints its table itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
classifier.show_most_informative_features(10)