In [117]:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from nltk.classify import ClassifierI
from scipy.stats import mode

In [38]:
print movie_reviews.categories()
print len(movie_reviews.words())
print len(set(movie_reviews.words()))

[u'neg', u'pos']
1583820
39768


In [36]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

# or more plainly:
#for category in movie_reviews.categories():
#    for fileid in movie_reviews.fileids(category):
#        (list(movie_reviews.words(fileid)), category)

random.shuffle(documents)
len(documents)

2000

In [None]:
print documents[0]

In [40]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
print len(all_words)
print all_words[:10]

1583820
[u'plot', u':', u'two', u'teen', u'couples', u'go', u'to', u'a', u'church', u'party']


In [41]:
all_words = nltk.FreqDist(all_words)
print len(all_words)
print all_words.most_common(10)

39768
[(u',', 77717), (u'the', 76529), (u'.', 65876), (u'a', 38106), (u'and', 35576), (u'of', 34123), (u'to', 31937), (u"'", 30585), (u'is', 25195), (u'in', 21822)]


In [42]:
print all_words["stupid"]
print all_words["good"]
print all_words["bad"]
print all_words["crazy"]
print all_words["ass"]
print all_words["sucks"]

253
2411
1395
99
101
29


In [189]:
word_features = list(all_words.keys())[:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
        
    return features

#print ((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

In [92]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [93]:
train = featuresets[:1900]
test = featuresets[1900:]

### Naive Bayes

In [94]:
classifier = nltk.NaiveBayesClassifier.train(train)
print nltk.classify.accuracy(classifier, test)
classifier.show_most_informative_features(10)

0.68
Most Informative Features
               uplifting = True              pos : neg    =     14.2 : 1.0
               insulting = True              neg : pos    =     10.7 : 1.0
                     ugh = True              neg : pos    =      9.1 : 1.0
                    sans = True              neg : pos    =      9.1 : 1.0
            refreshingly = True              pos : neg    =      8.3 : 1.0
            effortlessly = True              pos : neg    =      7.3 : 1.0
                 wasting = True              neg : pos    =      7.1 : 1.0
              mediocrity = True              neg : pos    =      6.4 : 1.0
               dismissed = True              pos : neg    =      6.3 : 1.0
             overwhelmed = True              pos : neg    =      6.3 : 1.0


# Scikit classifiers

## Individual

### Multinomial Naive Bayes

In [106]:
mnbClassifier = SklearnClassifier(MultinomialNB())
mnbClassifier.train(train)
print 'MNB Classifier accuracy: %f' %  nltk.classify.accuracy(mnbClassifier, test)

MNB Classifier accuracy: 0.670000


### Gaussian Naive Bayes

In [None]:
gnbClassifier = SklearnClassifier(GaussianNB())
gnbClassifier.train(train)
print 'GNB Classifier accuracy: %f' % nltk.classify.accuracy(gnbClassifier, test)

### Bernoulli Naive Bayes

In [107]:
bnbClassifier = SklearnClassifier(BernoulliNB())
bnbClassifier.train(train)
print 'Bernouli NB Classifier accuracy: %f' % nltk.classify.accuracy(bnbClassifier, test)

Bernouli NB Classifier accuracy: 0.680000


### Logistic Regression

In [108]:
logClassifier = SklearnClassifier(LogisticRegression())
logClassifier.train(train)
print 'Logistic Classifier accuracy: %f' % nltk.classify.accuracy(logClassifier, test)

Logistic Classifier accuracy: 0.600000


### SGD Classifier

In [109]:
sgdClassifier = SklearnClassifier(SGDClassifier())
sgdClassifier.train(train)
print 'SGD Classifier accuracy: %f' % nltk.classify.accuracy(sgdClassifier, test)

SGD Classifier accuracy: 0.620000


### SVC

In [110]:
svcClassifier = SklearnClassifier(SVC())
svcClassifier.train(train)
print 'SVC Classifier accuracy: %f' % nltk.classify.accuracy(svcClassifier, test)

SVC Classifier accuracy: 0.460000


### Linear SVC

In [111]:
linsvcClassifier = SklearnClassifier(LinearSVC())
linsvcClassifier.train(train)
print 'Linear SVC Classifier accuracy: %f' % nltk.classify.accuracy(linsvcClassifier, test)

Linear SVC Classifier accuracy: 0.600000


### NuSVC

In [112]:
nusvcClassifier = SklearnClassifier(NuSVC())
nusvcClassifier.train(train)
print 'NuSVC Classifier accuracy: %f' % nltk.classify.accuracy(nusvcClassifier, test)

NuSVC Classifier accuracy: 0.640000


### Decision Tree

In [113]:
dtClassifier = SklearnClassifier(DecisionTreeClassifier())
dtClassifier.train(train)
print 'DecisioTree Classifier accuracy: %f' % nltk.classify.accuracy(dtClassifier, test)

DecisioTree Classifier accuracy: 0.640000


### Random Forest

In [114]:
rfClassifier = SklearnClassifier(RandomForestClassifier())
rfClassifier.train(train)
print 'RandomForest Classifier accuracy: %f' % nltk.classify.accuracy(rfClassifier, test)

RandomForest Classifier accuracy: 0.620000


### Gradient Boosting

In [None]:
gbClassifier = SklearnClassifier(GradientBoostingClassifier())
gbClassifier.train(train)
print 'Gradient Boosting Classifier accuracy: %f' % nltk.classify.accuracy(gbClassifier, test)

## Voting

In [176]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes).mode[0]
    
    def confidence(self, feature):
        votes = []
        for c in self._classifiers:
            v = c.classi fy(feature)
            votes.append(v)
        choice_votes = votes.count(mode(votes).mode[0])
        conf = choice_votes / float(len(votes))
        return conf

In [177]:
voted_classifier = VoteClassifier(classifier,
                                 mnbClassifier,
                                 bnbClassifier,
                                 linsvcClassifier,
                                 nusvcClassifier,
                                 logClassifier,
                                 dtClassifier,
                                 rfClassifier)

print 'Voted Classifier Accuracy: %f' % nltk.classify.accuracy(voted_classifier, test)

Voted Classifier Accuracy: 0.700000


In [183]:
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[0][0]), voted_classifier.confidence(test[0][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[1][0]), voted_classifier.confidence(test[1][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[2][0]), voted_classifier.confidence(test[2][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[3][0]), voted_classifier.confidence(test[3][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[4][0]), voted_classifier.confidence(test[4][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[5][0]), voted_classifier.confidence(test[5][0]))
print 'Classification: %s, Confidence: %f' % (voted_classifier.classify(test[6][0]), voted_classifier.confidence(test[6][0]))

Classification: neg, Confidence: 0.750000
Classification: neg, Confidence: 1.000000
Classification: neg, Confidence: 0.500000
Classification: neg, Confidence: 0.500000
Classification: pos, Confidence: 0.750000
Classification: pos, Confidence: 1.000000
Classification: neg, Confidence: 0.875000
