# Voting System

The goal of this project is to combine the Naive Bayes algorithm from `nltk` with similar algorithms from `scikit-learn` into a voting system that increases the accuracy and reliability of the final confidence percentages for movie review classification. Training and testing are done on the Movie Reviews corpus from `nltk.corpus`.

Note: We will also use `pickle` to save the trained classifiers and the training/testing sets, which reduces the running time of later runs.

In [101]:
import nltk
import random
import pickle
import os

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

from nltk.corpus import movie_reviews


def pickle_object(classifier, file_path):
    """Serialise an object to ``file_path`` with ``pickle``.

    The object is pickled in memory first and the file is opened only after
    that succeeded. Opening with ``'wb'`` truncates the target, so pickling
    straight into the file would destroy an existing cache file whenever the
    object turns out not to be picklable.

    Args:
        classifier: Any picklable object. Despite the name, this file also
            passes the training and testing sets through this function.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        Whatever ``pickle.dumps`` raises for an unpicklable object, in which
        case ``file_path`` is left untouched, or ``OSError`` if the file
        cannot be written.
    """
    # Serialise before touching the file so a failure leaves the old cache intact
    payload = pickle.dumps(classifier)
    with open(file_path, 'wb') as f:
        f.write(payload)

def unpickle_object(file_path):
    """Load a pickled object from ``file_path``.

    Args:
        file_path: Path of a file previously written with ``pickle``.

    Returns:
        The unpickled object, or ``None`` when ``file_path`` is not an
        existing regular file.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        that were produced locally.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'rb') as source:
            # Rebuild the stored object so it can be used straight away
            return pickle.load(source)
    return None



# Restore the cached training/testing sets; a name is None when its file is absent
training_sets, testing_sets = (
    unpickle_object(cache_file)
    for cache_file in ('training_sets.pickle', 'testing_sets.pickle')
)

# Restore the cached classifiers the same way (names and files pair up by position)
(
    naive_bayes_classifier,
    MultinomialNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
    SVC_classifier,
    LinearSVC_classifier,
    NuSVC_classifier,
) = (
    unpickle_object(cache_file)
    for cache_file in (
        'naivebayes.pickle',
        'multinomialnb.pickle',
        'bernoullinb.pickle',
        'logistic_regression.pickle',
        'sgd_classifier.pickle',
        'svc_classifier.pickle',
        'linear_svc_classifier.pickle',
        'nu_svc_classifier.pickle',
    )
)


# Build and prepare training and testing sets if they are not cached yet
if training_sets is None or testing_sets is None:
    def reviews_words_lists(movie_reviews):
        """Return one ``(words, category)`` pair per review in the corpus."""
        return [
            (list(movie_reviews.words(file_id)), category)
            for category in movie_reviews.categories()
            for file_id in movie_reviews.fileids(category)
        ]

    # Get words from each review in separate lists marked by category
    documents = reviews_words_lists(movie_reviews)

    # Randomise review lists to mix positive and negative
    random.shuffle(documents)

    # Count the occurrences of every lower-cased word in the corpus
    all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words())

    # Take the 3000 most common of all available words as features
    word_features = [word for word, _ in all_words.most_common(3000)]

    def find_features(document, word_features):
        """Map every word in ``word_features`` to whether ``document`` contains it."""
        words = set(document)
        return {word: word in words for word in word_features}

    # For each review, mark which of the 3000 most common words it contains
    feature_sets = [
        (find_features(rev, word_features), category)
        for (rev, category) in documents
    ]

    # Shuffled training set
    training_sets = feature_sets[:1900]
    # Shuffled testing set
    testing_sets = feature_sets[1900:]

    # Serialise shuffled training set
    pickle_object(training_sets, 'training_sets.pickle')
    # Serialise shuffled testing set
    pickle_object(testing_sets, 'testing_sets.pickle')

    # The split above comes from a fresh shuffle, so a classifier restored
    # from disk was trained on a different split and may already have seen
    # reviews that are now in testing_sets. Drop them so they are retrained.
    naive_bayes_classifier = None
    MultinomialNB_classifier = None
    BernoulliNB_classifier = None
    LogisticRegression_classifier = None
    SGDClassifier_classifier = None
    SVC_classifier = None
    LinearSVC_classifier = None
    NuSVC_classifier = None


def _train_and_cache(classifier, file_path):
    """Train ``classifier`` on ``training_sets``, pickle it to ``file_path`` and return it."""
    classifier.train(training_sets)
    pickle_object(classifier, file_path)
    return classifier


# Every classifier below is trained only when no cached copy was loaded

if naive_bayes_classifier is None:
    # nltk's Naive Bayes is built by a class-level train() call
    naive_bayes_classifier = nltk.NaiveBayesClassifier.train(training_sets)
    pickle_object(naive_bayes_classifier, 'naivebayes.pickle')

if MultinomialNB_classifier is None:
    MultinomialNB_classifier = _train_and_cache(
        SklearnClassifier(MultinomialNB()), 'multinomialnb.pickle'
    )

if BernoulliNB_classifier is None:
    BernoulliNB_classifier = _train_and_cache(
        SklearnClassifier(BernoulliNB()), 'bernoullinb.pickle'
    )

if LogisticRegression_classifier is None:
    LogisticRegression_classifier = _train_and_cache(
        SklearnClassifier(LogisticRegression()), 'logistic_regression.pickle'
    )

if SGDClassifier_classifier is None:
    SGDClassifier_classifier = _train_and_cache(
        SklearnClassifier(SGDClassifier()), 'sgd_classifier.pickle'
    )

if SVC_classifier is None:
    SVC_classifier = _train_and_cache(
        SklearnClassifier(SVC()), 'svc_classifier.pickle'
    )

if LinearSVC_classifier is None:
    LinearSVC_classifier = _train_and_cache(
        SklearnClassifier(LinearSVC()), 'linear_svc_classifier.pickle'
    )

if NuSVC_classifier is None:
    NuSVC_classifier = _train_and_cache(
        SklearnClassifier(NuSVC()), 'nu_svc_classifier.pickle'
    )




We've loaded the processed and serialised training and testing datasets, as well as the trained and serialised classifiers. Let's check their accuracy percentages at this point.

In [102]:
# Accuracy of the nltk Naive Bayes classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(naive_bayes_classifier, testing_sets)*100

86.0

In [103]:
# Accuracy of the sklearn MultinomialNB classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(MultinomialNB_classifier, testing_sets)*100

85.0

In [104]:
# Accuracy of the sklearn BernoulliNB classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(BernoulliNB_classifier, testing_sets)*100

86.0

In [105]:
# Accuracy of the sklearn LogisticRegression classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(LogisticRegression_classifier, testing_sets)*100

82.0

In [106]:
# Accuracy of the sklearn SGDClassifier classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(SGDClassifier_classifier, testing_sets)*100

81.0

In [107]:
# Accuracy of the sklearn SVC classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(SVC_classifier, testing_sets)*100

86.0

In [108]:
# Accuracy of the sklearn LinearSVC classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(LinearSVC_classifier, testing_sets)*100

76.0

In [109]:
# Accuracy of the sklearn NuSVC classifier on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(NuSVC_classifier, testing_sets)*100

84.0

On the selected part of the shuffled dataset, all algorithms are doing fairly well. Now let's create a voting system that combines all of the classifiers.

In [110]:
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several already-trained classifiers.

    Each wrapped classifier must provide ``classify(features)``. The ensemble
    label is the most frequent label among the individual votes; when several
    labels are equally frequent, the one that was voted first (in the order
    the classifiers were passed in) wins.
    """

    def __init__(self, *classifiers):
        """Store the classifiers whose votes are combined."""
        self._classifiers = classifiers

    def _votes(self, features):
        """Return the label each wrapped classifier assigns to ``features``."""
        return [c.classify(features) for c in self._classifiers]

    @staticmethod
    def _winner(votes):
        """Return the most frequent vote, breaking ties by first occurrence.

        Raises:
            statistics.StatisticsError: If ``votes`` is empty.
        """
        try:
            return mode(votes)
        except ValueError:
            # StatisticsError is a ValueError. Before Python 3.8 mode() also
            # raises it when several labels tie, which an even number of
            # voters makes possible; an empty vote list is a genuine error.
            if not votes:
                raise
            # First label that reaches the highest count, matching what
            # mode() itself returns on Python 3.8 and later.
            return max(votes, key=votes.count)

    def classify(self, features):
        """Return the label chosen by the majority of the classifiers."""
        return self._winner(self._votes(features))

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label."""
        votes = self._votes(features)
        chosen_votes = votes.count(self._winner(votes))
        return chosen_votes / len(votes)

# Members of the ensemble, listed in the order their votes are collected
ensemble_members = (
    naive_bayes_classifier,
    MultinomialNB_classifier,
    BernoulliNB_classifier,
    LogisticRegression_classifier,
    SGDClassifier_classifier,
    SVC_classifier,
    LinearSVC_classifier,
    NuSVC_classifier,
)
voted_classifier = VoteClassifier(*ensemble_members)

Let's check the accuracy of the voting system.

In [111]:
# Accuracy of the voting ensemble on the testing sets, scaled by 100 to read as a percentage
nltk.classify.accuracy(voted_classifier, testing_sets)*100

86.0

Now we are going to run several reviews through our voting system to get the classification and confidence percentage of each one.

In [112]:
# Show the ensemble's label and vote share for testing reviews 20 through 39
for x in range(20, 40):
    review_features = testing_sets[x][0]
    label = voted_classifier.classify(review_features)
    print(f'Classification of the review #{x}:', label)
    confidence_pct = voted_classifier.confidence(review_features) * 100
    print('Confidence in %:', confidence_pct)
    print('--------')

Classification of the review #20: neg
Confidence in %: 87.5
--------
Classification of the review #21: pos
Confidence in %: 100.0
--------
Classification of the review #22: pos
Confidence in %: 100.0
--------
Classification of the review #23: neg
Confidence in %: 87.5
--------
Classification of the review #24: neg
Confidence in %: 100.0
--------
Classification of the review #25: neg
Confidence in %: 100.0
--------
Classification of the review #26: pos
Confidence in %: 100.0
--------
Classification of the review #27: neg
Confidence in %: 100.0
--------
Classification of the review #28: neg
Confidence in %: 87.5
--------
Classification of the review #29: neg
Confidence in %: 50.0
--------
Classification of the review #30: neg
Confidence in %: 100.0
--------
Classification of the review #31: neg
Confidence in %: 100.0
--------
Classification of the review #32: pos
Confidence in %: 62.5
--------
Classification of the review #33: pos
Confidence in %: 87.5
--------
Classification of the revi

### Conclusion:

We created a voting system based on several similar algorithms, which gave us more reliable classification results. The final accuracy of the voting system is about `84-86%`, which is rather high, but reasonably so. We have additionally run some tests checking the algorithms' confidence levels on exclusively positive or exclusively negative reviews; both tests got `85%` from the voting system.

At the level of individual reviews, the voting system gave us confidence values ranging from `50%` to `100%`. At first glance, `50%` and lower does not seem to be a very common result, but further research is required for more information.