In [12]:
import nltk
import random
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from nltk.classify import ClassifierI
from scipy.stats import mode
import pickle

In [19]:
rev_pos = open('./data/short_reviews/positive.txt', 'r').read().decode('ISO-8859-1')
rev_neg = open('./data/short_reviews/positive.txt', 'r').read().decode('ISO-8859-1')

In [20]:
documents = []
for rev in rev_pos.split('\n'):
    documents.append( (rev, 'pos') )
    
for rev in rev_neg.split('\n'):
    documents.append( (rev, 'neg') )

#random.shuffle(documents)

print len(documents)

10664


In [21]:
all_words = []

words_pos = word_tokenize(rev_pos)
words_neg = word_tokenize(rev_neg)

for w in words_pos:
    all_words.append(w.lower())

for w in words_neg:
    all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

print len(all_words)

13453


In [30]:
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
        
    return features

In [31]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [32]:
random.shuffle(featuresets)

In [33]:
train = featuresets[:9000]
test = featuresets[9000:]

# Classification

### Naive Bayes

In [34]:
classifier = nltk.NaiveBayesClassifier.train(train)
print 'Naive Bayes Accuracy: %f' % nltk.classify.accuracy(classifier, test)
classifier.show_most_informative_features(10)

Naive Bayes Accuracy: 0.222957
Most Informative Features
                    spin = True              neg : pos    =      3.0 : 1.0
                  accept = True              neg : pos    =      3.0 : 1.0
               disarming = True              neg : pos    =      3.0 : 1.0
                  hoping = True              pos : neg    =      2.3 : 1.0
                 leaping = True              pos : neg    =      2.3 : 1.0
              goldmember = True              pos : neg    =      2.3 : 1.0
                      se = True              pos : neg    =      2.3 : 1.0
                     san = True              pos : neg    =      2.3 : 1.0
                     ver = True              pos : neg    =      2.3 : 1.0
                    cops = True              pos : neg    =      2.3 : 1.0


### Multinomial Naive Bayes

In [None]:
mnbClassifier = SklearnClassifier(MultinomialNB())
mnbClassifier.train(train)
print 'MNB Classifier accuracy: %f' %  nltk.classify.accuracy(mnbClassifier, test)

### Bernoulli Naive Bayes

In [None]:
bnbClassifier = SklearnClassifier(BernoulliNB())
bnbClassifier.train(train)
print 'Bernouli NB Classifier accuracy: %f' % nltk.classify.accuracy(bnbClassifier, test)

### Logistic Regression

In [13]:
logClassifier = SklearnClassifier(LogisticRegression())
logClassifier.train(train)
print 'Logistic Classifier accuracy: %f' % nltk.classify.accuracy(logClassifier, test)

Logistic Classifier accuracy: 0.030120


### SGD Classifier

In [None]:
sgdClassifier = SklearnClassifier(SGDClassifier())
sgdClassifier.train(train)
print 'SGD Classifier accuracy: %f' % nltk.classify.accuracy(sgdClassifier, test)

### SVC

In [None]:
svcClassifier = SklearnClassifier(SVC())
svcClassifier.train(train)
print 'SVC Classifier accuracy: %f' % nltk.classify.accuracy(svcClassifier, test)

### Linear SVC

In [None]:
linsvcClassifier = SklearnClassifier(LinearSVC())
linsvcClassifier.train(train)
print 'Linear SVC Classifier accuracy: %f' % nltk.classify.accuracy(linsvcClassifier, test)

### NuSVC

In [None]:
nusvcClassifier = SklearnClassifier(NuSVC())
nusvcClassifier.train(train)
print 'NuSVC Classifier accuracy: %f' % nltk.classify.accuracy(nusvcClassifier, test)

### Decision Tree

In [None]:
dtClassifier = SklearnClassifier(DecisionTreeClassifier())
dtClassifier.train(train)
print 'DecisioTree Classifier accuracy: %f' % nltk.classify.accuracy(dtClassifier, test)

### Random Forest

In [None]:
rfClassifier = SklearnClassifier(RandomForestClassifier())
rfClassifier.train(train)
print 'RandomForest Classifier accuracy: %f' % nltk.classify.accuracy(rfClassifier, test)

## Voting

In [None]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes).mode[0]
    
    def confidence(self, feature):
        votes = []
        for c in self._classifiers:
            v = c.classify(feature)
            votes.append(v)
        choice_votes = votes.count(mode(votes).mode[0])
        conf = choice_votes / float(len(votes))
        return conf

In [None]:
voted_classifier = VoteClassifier(classifier,
                                 mnbClassifier,
                                 bnbClassifier,
                                 linsvcClassifier,
                                 nusvcClassifier,
                                 logClassifier,
                                 dtClassifier,
                                 rfClassifier)

print 'Voted Classifier Accuracy: %f' % nltk.classify.accuracy(voted_classifier, test)

In [None]:
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[0][0]), voted_classifier.confidence(test[0][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[1][0]), voted_classifier.confidence(test[1][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[2][0]), voted_classifier.confidence(test[2][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[3][0]), voted_classifier.confidence(test[3][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[4][0]), voted_classifier.confidence(test[4][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[5][0]), voted_classifier.confidence(test[5][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[6][0]), voted_classifier.confidence(test[6][0]))