In [1]:
import nltk
import random
from nltk.corpus import movie_reviews

In [25]:
documents=[(list(movie_reviews.words(fileid)),category)
          for category in movie_reviews.categories()
          for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [26]:
all_words=[]
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words=nltk.FreqDist(all_words)    
    

In [27]:
words_features=list(all_words.keys())[:3000]

In [28]:
def find_features(document):
    words=set(document)
    features={}
    for w in words_features:
        features[w]=(w in words)
    return features    

In [29]:
featuresets=[(find_features(rev),category) for (rev,category) in documents]

In [30]:
training_set=featuresets[:1900]
testing_set=featuresets[1900:]

In [31]:
classifier=nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent: ", (nltk.classify.accuracy(classifier,testing_set))*100)
classifier.show_most_informative_features(15)

Naive Bayes Algo accuracy percent:  78.0
Most Informative Features
               atrocious = True              neg : pos    =     11.8 : 1.0
                   sucks = True              neg : pos    =      9.9 : 1.0
                bothered = True              neg : pos    =      9.1 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.5 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
                 idiotic = True              neg : pos    =      7.6 : 1.0
              schumacher = True              neg : pos    =      7.5 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                  regard = True              pos : neg    =      6.9 : 1.0
                  turkey = True              neg : pos    =      6.7 : 1.0
                   kudos = True  

nltk is mainly for data preprocessing, now we are going to marry nltk with sklearn and other modules

In [32]:
from sklearn.naive_bayes import MultinomialNB

In [33]:
from nltk.classify.scikitlearn import SklearnClassifier

In [34]:
multinomial_classifier=SklearnClassifier(MultinomialNB())
multinomial_classifier.train(training_set)
print("Multinomial Algo accuracy percent: ", (nltk.classify.accuracy(multinomial_classifier,testing_set))*100)

Multinomial Algo accuracy percent:  79.0


In [35]:
#now using other ml classification algorithms
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [36]:
logistic_classifier= SklearnClassifier(LogisticRegression())
logistic_classifier.train(training_set)
print("Logistic Algo accuracy percent: ", (nltk.classify.accuracy(logistic_classifier,testing_set))*100)

Logistic Algo accuracy percent:  81.0


In [37]:
sgd_classifier= SklearnClassifier(SGDClassifier())
sgd_classifier.train(training_set)
print("SGD Algo accuracy percent: ", (nltk.classify.accuracy(sgd_classifier,testing_set))*100)



SGD Algo accuracy percent:  76.0


In [38]:
svc_classifier= SklearnClassifier(SVC())
svc_classifier.train(training_set)
print("SVC Algo accuracy percent: ", (nltk.classify.accuracy(svc_classifier,testing_set))*100)

SVC Algo accuracy percent:  84.0


In [39]:
lsvc_classifier= SklearnClassifier(LinearSVC())
lsvc_classifier.train(training_set)
print("LinearVC Algo accuracy percent: ", (nltk.classify.accuracy(lsvc_classifier,testing_set))*100)

LinearVC Algo accuracy percent:  77.0


In [40]:
nusvc_classifier= SklearnClassifier(NuSVC())
nusvc_classifier.train(training_set)
print("NuSVC Algo accuracy percent: ", (nltk.classify.accuracy(nusvc_classifier,testing_set))*100)

NuSVC Algo accuracy percent:  82.0


Now we are going to make a custom classifier combining several classifiers and taking the best vote score

In [41]:
from nltk.classify import ClassifierI
from statistics import mode

In [50]:
class VoteClassifier(ClassifierI):
    def __init__(self,*classifiers):
        self.classifiers=classifiers
    def classify(self,features):
        vote=[]
        for c in self.classifiers:
            v=c.classify(features)
            vote.append(v)
        return mode(vote)   
    def confidence(self,features):
        vote=[]
        for c in self.classifiers:
            v=c.classify(features)
            vote.append(v)
        choice_votes=vote.count(mode(vote)) 
        conf= choice_votes/len(vote)
        return conf

In [51]:
voted_classifier=VoteClassifier(logistic_classifier,svc_classifier,nusvc_classifier,nusvc_classifier,multinomial_classifier)
print("Voted Algo accuracy percent: ", (nltk.classify.accuracy(voted_classifier,testing_set))*100)

Voted Algo accuracy percent:  83.0


In [53]:
print('classification:'+str(voted_classifier.classify(testing_set[0][0]))+' actual result:'+str(testing_set[0][1])+' confidence:'+str(voted_classifier.confidence(testing_set[0][0])))
print('classification:'+str(voted_classifier.classify(testing_set[1][0]))+' actual result:'+str(testing_set[1][1])+' confidence:'+str(voted_classifier.confidence(testing_set[1][0])))
print('classification:'+str(voted_classifier.classify(testing_set[2][0]))+' actual result:'+str(testing_set[2][1])+' confidence:'+str(voted_classifier.confidence(testing_set[2][0])))
print('classification:'+str(voted_classifier.classify(testing_set[3][0]))+' actual result:'+str(testing_set[3][1])+' confidence:'+str(voted_classifier.confidence(testing_set[3][0])))
print('classification:'+str(voted_classifier.classify(testing_set[4][0]))+' actual result:'+str(testing_set[4][1])+' confidence:'+str(voted_classifier.confidence(testing_set[4][0])))

classification:neg actual result:pos confidence:1.0
classification:pos actual result:pos confidence:1.0
classification:neg actual result:neg confidence:1.0
classification:pos actual result:neg confidence:1.0
classification:pos actual result:pos confidence:1.0
