##### Text Classifier for sentiment analysis ..continued ..

In [1]:
import nltk
import random
from nltk.corpus import movie_reviews

In [2]:
#all documents from all categories
documents = [ (list(movie_reviews.words(fileid)), category)
                      for category in movie_reviews.categories()
                      for fileid in movie_reviews.fileids(category)]

all_words = [w.lower() for w in movie_reviews.words()] #convert to lower case
all_words = nltk.FreqDist(all_words)

#We want to use top 3000 words
word_features = list(all_words.keys())[:3000] 


def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:  #note that word_features contains top 3000 words only(above)
        features[w] = (w in words)
    return(features)

featuresets = [(find_features(doc), category) for (doc, category) in documents]


training_set = featuresets[:1900] #firs 1900
testing_set  = featuresets[1900:] #rest starting 1900th 


classifier = nltk.NaiveBayesClassifier.train(training_set)
nltk.classify.accuracy(classifier, testing_set)

0.77

In [3]:
import pickle

save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

classifier_f = open("naivebayes.pickle", "rb")
classifier_fromPickle = pickle.load(classifier_f)
classifier_f.close()

In [4]:
##Accuracy
nltk.classify.accuracy(classifier_fromPickle, testing_set)

0.77

### Using other classfiers

In [5]:
from nltk.classify.scikitlearn import SklearnClassifier #wrapper of nltk over scikit learn
import pickle

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.preprocessing import LabelEncoder

In [6]:
mnb_classifier = SklearnClassifier(MultinomialNB())
mnb_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(mnb_classifier, testing_set)

0.77

In [7]:
#training_set
#[({u'clamoring': False,
#   u'madsen': False,
#   u'sonja': False,
#...


In [8]:
##Some Errors in this 

#gaussian_classifier = SklearnClassifier(GaussianNB())
#gaussian_classifier.train(training_set)
#gaussian_classifier.train(training_set)

##Accuracy
#nltk.classify.accuracy(gaussian_classifier, testing_set)


In [9]:
bernoulli_classifier = SklearnClassifier(BernoulliNB())
bernoulli_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(bernoulli_classifier, testing_set)

0.77

### Other scikitlearn classiers

#LogisticRegression, SGDClassifier
#SVC, LinearSVC, NuSVC


In [10]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [11]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(LogisticRegression_classifier, testing_set)

0.8

In [12]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(SGDClassifier_classifier, testing_set)



0.78

In [13]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(SVC_classifier, testing_set)

0.58

In [14]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(LinearSVC_classifier, testing_set)

0.81

In [15]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
##Accuracy
nltk.classify.accuracy(NuSVC_classifier, testing_set)

0.8

#### Note

All of above have their own paramaters which you can customize to imporve the acuuracy

In [17]:
from nltk.classify import ClassifierI
from statistics import mode

In [18]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return(mode(votes))
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
            
        choice_votes = votes.count(mode(votes))
        conf = choice_votes/len(votes)
        return conf


In [23]:
voted_classfier = VoteClassifier(mnb_classifier,
                                 bernoulli_classifier,
                                 classifier_fromPickle,
                                 SVC_classifier,
                                 NuSVC_classifier
                                )

In [24]:
nltk.classify.accuracy(voted_classfier, testing_set)

0.75