In [17]:
import nltk
import random
import pickle
import os

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize


def pickle_object(classifier, file_path):
    """Serialise ``classifier`` to ``file_path`` with pickle.

    Args:
        classifier: The object to store (a trained classifier or a data set).
        file_path: Destination path; it is opened in binary write mode, so an
            existing file at that location is overwritten.

    Returns:
        None.
    """
    with open(file_path, 'wb') as sink:
        # Stream the pickled representation straight into the open file
        pickle.Pickler(sink).dump(classifier)

def unpickle_object(file_path):
    """Load a previously pickled object from ``file_path``.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or None when ``file_path`` is not an existing
        regular file.

    Note:
        Unpickling can execute arbitrary code, so only point this at files
        that come from a trusted source.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'rb') as source:
            # Rebuild the stored object from the file contents
            return pickle.Unpickler(source).load()
    return None



# Restore the cached train/test splits; each name is None when its file is absent
training_sets, testing_sets = (
    unpickle_object('training_sets.pickle'),
    unpickle_object('testing_sets.pickle'),
)

# Restore the cached classifiers, grouped by model family.
# Naive Bayes variants
naive_bayes_classifier, MultinomialNB_classifier, BernoulliNB_classifier = (
    unpickle_object('naivebayes.pickle'),
    unpickle_object('multinomialnb.pickle'),
    unpickle_object('bernoullinb.pickle'),
)
# Linear models
LogisticRegression_classifier, SGDClassifier_classifier = (
    unpickle_object('logistic_regression.pickle'),
    unpickle_object('sgd_classifier.pickle'),
)
# Support vector machines
SVC_classifier, LinearSVC_classifier, NuSVC_classifier = (
    unpickle_object('svc_classifier.pickle'),
    unpickle_object('linear_svc_classifier.pickle'),
    unpickle_object('nu_svc_classifier.pickle'),
)


def find_features(document, word_features):
    """Build the boolean feature dict for a single review.

    Args:
        document: Raw review text.
        word_features: Iterable of feature words to look for.

    Returns:
        dict mapping every word in ``word_features`` to True when it occurs
        among the tokens of ``document`` and to False otherwise.
    """
    # A set gives constant-time membership tests instead of rescanning the
    # token list once per feature word.
    # NOTE(review): the feature words are lowercased when collected, but the
    # tokens here are not, so capitalised occurrences never match -- confirm
    # whether that is intended.
    tokens = set(word_tokenize(document))
    return {word: word in tokens for word in word_features}


# Upload and prepare training and testing sets if not done yet
if training_sets is None or testing_sets is None:
    # Read the raw reviews; the context managers close both files right away
    with open('../positive.txt', 'r') as pos_file:
        short_pos = pos_file.read()
    with open('../negative.txt', 'r') as neg_file:
        short_neg = neg_file.read()

    # POS tags kept as candidate features: adjectives (JJ*) and adverbs (RB*)
    allowed_word_types = ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']

    # One (review, category) pair per line of each file, plus the lowercased
    # adjectives/adverbs found across all reviews
    documents = []
    all_words = []
    for text, category in ((short_pos, 'pos'), (short_neg, 'neg')):
        for review in text.split('\n'):
            documents.append((review, category))
            for token, tag in nltk.pos_tag(word_tokenize(review)):
                if tag in allowed_word_types:
                    all_words.append(token.lower())

    # Randomise review list to mix positive and negative.
    # No seed is set in this cell, so a rebuild produces a different split.
    random.shuffle(documents)

    # Count the number of occurrences of each word
    all_words = nltk.FreqDist(all_words)

    # Take the 3000 most frequent words. FreqDist.keys() is not ordered by
    # frequency, so slicing it would not select the most common words.
    word_features = [word for word, _ in all_words.most_common(3000)]

    # Mark, for every review, which of the most common words it contains
    feature_sets = [(find_features(rev, word_features), category) for (rev, category) in documents]

    # First 10000 shuffled reviews train the classifiers, the rest test them
    training_sets = feature_sets[:10000]
    testing_sets = feature_sets[10000:]

    # Serialise both shuffled sets so later runs can skip the preparation
    pickle_object(training_sets, 'training_sets.pickle')
    pickle_object(testing_sets, 'testing_sets.pickle')

print(len(training_sets))
# feature_sets only exists when the sets were rebuilt above, so report the
# total from the two splits, which are always available at this point
print(len(training_sets) + len(testing_sets))

10000
10664


In [18]:
# Fit the NLTK Naive Bayes model only when no cached copy was loaded
if naive_bayes_classifier is None:
    naive_bayes_classifier = nltk.NaiveBayesClassifier.train(training_sets)
    # Cache the fitted model on disk so later runs can reuse it
    pickle_object(
        classifier=naive_bayes_classifier,
        file_path='naivebayes.pickle',
    )

In [19]:
if MultinomialNB_classifier is None:
    # Build and fit in one expression (SklearnClassifier.train returns the
    # wrapper itself) so the name stays None if training fails or is
    # interrupted; binding it first would make a re-run skip training and
    # keep an unfitted model.
    MultinomialNB_classifier = SklearnClassifier(MultinomialNB()).train(training_sets)
    # Serialise the fitted classifier so later runs can load it
    pickle_object(MultinomialNB_classifier, 'multinomialnb.pickle')

In [20]:
if BernoulliNB_classifier is None:
    # Build and fit in one expression (SklearnClassifier.train returns the
    # wrapper itself) so the name stays None if training fails or is
    # interrupted; binding it first would make a re-run skip training and
    # keep an unfitted model.
    BernoulliNB_classifier = SklearnClassifier(BernoulliNB()).train(training_sets)
    # Serialise the fitted classifier so later runs can load it
    pickle_object(BernoulliNB_classifier, 'bernoullinb.pickle')

In [21]:
if LogisticRegression_classifier is None:
    # Build and fit in one expression (SklearnClassifier.train returns the
    # wrapper itself) so the name stays None if training fails or is
    # interrupted; binding it first would make a re-run skip training and
    # keep an unfitted model.
    LogisticRegression_classifier = SklearnClassifier(LogisticRegression()).train(training_sets)
    # Serialise the fitted classifier so later runs can load it
    pickle_object(LogisticRegression_classifier, 'logistic_regression.pickle')

In [22]:
if SGDClassifier_classifier is None:
    # Build and fit in one expression (SklearnClassifier.train returns the
    # wrapper itself) so the name stays None if training fails or is
    # interrupted; binding it first would make a re-run skip training and
    # keep an unfitted model.
    SGDClassifier_classifier = SklearnClassifier(SGDClassifier()).train(training_sets)
    # Serialise the fitted classifier so later runs can load it
    pickle_object(SGDClassifier_classifier, 'sgd_classifier.pickle')

In [None]:
if SVC_classifier is None:
    # Build and fit in one expression (SklearnClassifier.train returns the
    # wrapper itself) so the name stays None if training fails or is
    # interrupted; binding it first would make a re-run skip training and
    # keep an unfitted model.
    SVC_classifier = SklearnClassifier(SVC()).train(training_sets)
    # Serialise the fitted classifier so later runs can load it
    pickle_object(SVC_classifier, 'svc_classifier.pickle')

In [None]:

if LinearSVC_classifier is None:
    # Build and fit in one expression (SklearnClassifier.train returns the
    # wrapper itself) so the name stays None if training fails or is
    # interrupted; binding it first would make a re-run skip training and
    # keep an unfitted model.
    LinearSVC_classifier = SklearnClassifier(LinearSVC()).train(training_sets)
    # Serialise the fitted classifier so later runs can load it
    pickle_object(LinearSVC_classifier, 'linear_svc_classifier.pickle')

if NuSVC_classifier is None:
    # Build and fit in one expression (SklearnClassifier.train returns the
    # wrapper itself) so the name stays None if training fails or is
    # interrupted; binding it first would make a re-run skip training and
    # keep an unfitted model.
    NuSVC_classifier = SklearnClassifier(NuSVC()).train(training_sets)
    # Serialise the fitted classifier so later runs can load it
    pickle_object(NuSVC_classifier, 'nu_svc_classifier.pickle')