# Sentiment Polarity Prediction with Naive Bayes

This notebook contains a basic implementation of document-level sentiment analysis
for movie reviews with multinomial Naive Bayes and bag-of-words features
and of cross-validation.
* No special treatment of rare or unknown words. Unknown words in the test data are skipped.

We use the movie review polarity data set of Pang and Lee 2004 [A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts](https://www.aclweb.org/anthology/P04-1035/) in Version 2.0 available from http://www.cs.cornell.edu/People/pabo/movie-review-data (section "Sentiment polarity datasets"). This dataset contains 1000 positive and 1000 negative reviews, each tokenised, sentence-split (one sentence per line) and lowercased. Each review has been assigned to 1 of 10 cross-validation folds by the authors and this setup should be followed to compare with published results.


In [1]:
import os
import tarfile
import time
import urllib.request
import numpy
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk
import matplotlib.pyplot as plt
from IPython.display import clear_output

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

data_source = 'local-folder'
data_folder = os.path.join('data', 'txt_sentoken')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ivan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/ivan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ivan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
class PL04DataLoader_Part_1:
    
    def __init__(self):
        pass
    
    def get_labelled_dataset(self, fold = 0):
        ''' Compile a fold of the data set
        '''
        dataset = []
        for label in ('pos', 'neg'):
            for document in self.get_documents(
                fold = fold,
                label = label,
            ):
                dataset.append((document, label))
        return dataset
    
    def get_documents(self, fold = 0, label = 'pos'):
        ''' Enumerate the raw contents of all data set files.
            Args:
                data_dir: relative or absolute path to the data set folder
                fold: which fold to load (0 to n_folds-1)
                label: 'pos' or 'neg' to
                    select data with positive or negative sentiment
                    polarity
            Return:
                List of tokenised documents, each a list of sentences
                that in turn are lists of tokens
        '''
        raise NotImplementedError

In [3]:
class PL04DataLoader(PL04DataLoader_Part_1):
    
    def get_xval_splits(self):
        ''' Split data with labels for cross-validation
            returns a list of k pairs (training_data, test_data)
            for k cross-validation
        '''
        # load the folds
        folds = []
        for i in range(10):
            folds.append(self.get_labelled_dataset(
                fold = i
            ))
        # create training-test splits
        retval = []
        for i in range(10):
            test_data = folds[i]
            training_data = []
            for j in range(9):
                ij1 = (i+j+1) % 10
                assert ij1 != i
                training_data = training_data + folds[ij1]
            retval.append((training_data, test_data))
        return retval

In [4]:
class PL04DataLoaderFromStream(PL04DataLoader):
        
    def __init__(self, tgz_stream, **kwargs):
        super().__init__(**kwargs)
        self.data = {}
        counter = 0
        with tarfile.open(
            mode = 'r|gz',
            fileobj = tgz_stream
        ) as tar_archive:
            for tar_member in tar_archive:
                if counter == 2000:
                    break
                path_components = tar_member.name.split('/')
                filename = path_components[-1]
                if filename.startswith('cv') \
                and filename.endswith('.txt') \
                and '_' in filename:
                    label = path_components[-2]
                    fold = int(filename[2])
                    key = (fold, label)
                    if key not in self.data:
                        self.data[key] = []
                    f = tar_archive.extractfile(tar_member)
                    document = [
                        line.decode('utf-8').split()
                        for line in f.readlines()
                    ]
                    self.data[key].append(document)
                    counter += 1
            
    def get_documents(self, fold = 0, label = 'pos'):
        return self.data[(fold, label)]

## Read Data from the Web
This should run efficiently both on google colab and locally but has the disadvantage that the same data is downloaded each time the notebook is run.

In [5]:
class PL04DataLoaderFromURL(PL04DataLoaderFromStream):
    
    def __init__(self, data_url, **kwargs):
        with urllib.request.urlopen(data_url) as tgz_stream:
            super().__init__(tgz_stream, **kwargs)

## Read Data from a Local .tgz File

You manually download the .tgz once to a filesystem that can be accessed from the notebook, e.g. google drive on colab, and this notebook reads this file in one chunk. 

Note that if you are accessing files from google drive on colab, you will need to mount your drive and enter an authentication token:

```
from google.colab import drive
drive.mount('/content/drive')
```

You will also have to change your *data_tgz* or *data_folder* paths above so that they start with *'/content/drive/My Drive/'*

In [6]:
class PL04DataLoaderFromTGZ(PL04DataLoaderFromStream):
    
    def __init__(self, data_path, **kwargs):
        with open(data_path, 'rb') as tgz_stream:
            super().__init__(tgz_stream, **kwargs)

## Read Data from a Local Folder

Extract the .tgz to a local folder and only load the required files. This is usually the fastest option when storage is on a local SSD. On remote filesystems, however, this can be very slow.

In [7]:
class PL04DataLoaderFromFolder(PL04DataLoader):
    
    def __init__(self, data_dir, **kwargs):
        self.data_dir = data_dir
        super().__init__(**kwargs)
        
    def get_documents(self, fold = 0, label = 'pos'):
        # read folder contents
        path = os.path.join(self.data_dir, label)
        dir_entries = os.listdir(path)
        # must process entries in numeric order to
        # replicate order of original experiments
        dir_entries.sort()
        # check each entry and add to data if matching
        # selection criteria
        for filename in dir_entries:
            if filename.startswith('cv') \
            and filename.endswith('.txt'):
                if fold == int(filename[2]):
                    # correct fold
                    f = open(os.path.join(path, filename), 'rt')
                    # "yield" tells Python to return an iterator
                    # object that produces the yields of this
                    # function as elements without creating a
                    # full list of all elements
                    yield [line.split() for line in f.readlines()]
                    f.close()

In [8]:
if data_source == 'local-folder':
    data_loader = PL04DataLoaderFromFolder(data_folder)
elif data_source == 'local-tgz':
    data_loader = PL04DataLoaderFromTGZ(data_tgz)
elif data_source == 'web':
    data_loader = PL04DataLoaderFromURL(data_url)
else:
    raise ValueError('Unsupported data source %r' %data_source)

In [9]:
def get_document_preview(document, max_length = 72):
    s = []
    count = 0
    reached_limit = False
    for sentence in document:
        for token in sentence:
            if count + len(token) + len(s) > max_length:
                reached_limit = True
                break
            s.append(token)
            count += len(token)
        if reached_limit:
            break
    return '|'.join(s)
    
for label in 'pos neg'.split():
    print(f'== {label} ==')
    print('doc sentences start of first sentence')
    for index, document in enumerate(data_loader.get_documents(
        label = label
    )):
        print('%3d %7d   %s' %(
            index, len(document), get_document_preview(document)
        ))
        if index == 4:
            break

== pos ==
doc sentences start of first sentence
  0      25   films|adapted|from|comic|books|have|had|plenty|of|success|,|whether
  1      39   every|now|and|then|a|movie|comes|along|from|a|suspect|studio|,|with
  2      19   you've|got|mail|works|alot|better|than|it|deserves|to|.|in|order|to|make
  3      42   "|jaws|"|is|a|rare|film|that|grabs|your|attention|before|it|shows|you|a
  4      25   moviemaking|is|a|lot|like|being|the|general|manager|of|an|nfl|team|in
== neg ==
doc sentences start of first sentence
  0      35   plot|:|two|teen|couples|go|to|a|church|party|,|drink|and|then|drive|.
  1      13   the|happy|bastard's|quick|movie|review|damn|that|y2k|bug|.|it's|got|a
  2      23   it|is|movies|like|these|that|make|a|jaded|movie|viewer|thankful|for|the
  3      19   "|quest|for|camelot|"|is|warner|bros|.|'|first|feature-length|,
  4      37   synopsis|:|a|mentally|unstable|man|undergoing|psychotherapy|saves|a|boy


## Create Training-Test Splits for Cross-Validation

In [10]:
splits = data_loader.get_xval_splits()

print('tr-size te-size (number of documents)')
for xval_tr_data, xval_te_data in splits:
    print('%7d %7d' %(len(xval_tr_data), len(xval_te_data)))

tr-size te-size (number of documents)
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200


# Changes to Code

Below we outline the changes we made to the existing code.

## N-gram Features

The first change which we implemented in the code was to add bigram and trigram feature sets. We initialised two extra sets in order to store the bigrams and trigrams in the *reset_feature_sets* function. The bigrams and trigrams were then added to these new sets in the *add_to_feature_sets_from_data* function.

This was done by checking if the index of the token was greater than zero before adding bigrams and checking if it was greater than one before adding trigrams. We also escaped the start and end of sentences using the respective '\<s>' and '\</s>' tags. In the *finalise_feature_sets* function, a reverse map dictionary is created for both bigrams and trigrams, similar to that which was done for unigrams originally. This enables faster lookup when populating the feature matrix.

In order to store a feature matrix for each implementation, we originally created a *feature_matrices* dict in the *extract_features* function. However, when building all three feature matrices, computation time proved to be too long. Therefore, in order to save time in computations, only the feature matrix which is being used (either unigram, bigram or trigram), is populated in the function.

## Lemmatisation

We also included a lemmatisation step which can be toggled when running the model. This step essentially converts each word in the vocabulary to its base form. The lemmatiser which we opted to use is [Wordnet](https://wordnet.princeton.edu/), which is a large, freely available lexical database for the English language. This lemmatiser was downloaded and accessed through the NLTK toolkit. It is implemented in the *lemmatise_data* function.

As a second argument to the lemmatiser, we also pass the Wordnet POS tag for the word. We find the [Penn POS tag](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) for each word using *nltk.pos_tag(word)* in the *get_pos_tags* function. This tagged data is then passed to the *lemmatise_data* function where the tags are then mapped to the much simpler Wornet POS tags before being passed into the lemmatiser with each word.

## Negation

Another option which can be toggled when running the model is whether to negate the data or not. Negation is performed in the *add_negation_to_data* function. This function essentially checks if a token is "not", "no" or ends in "n't". When one of these tokens is found, each token following this has "NOT_" prefixed to it until the next full stop in the data. This is a simple but effective way to perform negation.

In the aforementioned function, a vector is also created, called *doc2negate*, which stores the number of negated words in each document. This will be used later on to create additional features. 

## Stop Word Removal

We also created a parameter in our model for removing stop words. The list of stop words which we used is from the English stop word list accessed through NLTK. These are shown below.

Stop words are removed from the data in the *remove_stopwords_from_data* function. This function works by storing the indices of stop words when iterating through each sentence and then removing these from the list of sentence tokens.

In [11]:
print(set(nltk.corpus.stopwords.words('english')))


{'will', 'while', 'some', 'did', 'he', 'only', 'wouldn', "isn't", 'until', 'whom', 'if', 'didn', 'hers', 'all', 've', 'mustn', 'there', 'this', 'other', "doesn't", 'during', 'yourselves', 'myself', 'hasn', 'each', 'down', 'does', "mightn't", 'with', "you'll", 'than', 'been', 'doing', 'few', "mustn't", 'doesn', 'when', 's', 'wasn', 'but', 'through', 'own', 'i', 'here', 'not', 'against', 'weren', 'between', 'm', "should've", "hadn't", 'shan', "haven't", 'their', 'once', 'where', 'up', 'of', 'that', 'nor', "you're", "that'll", 'most', "wasn't", 'out', 'do', 'has', 'because', 'again', "couldn't", 'both', 'its', "shan't", 'themselves', 'my', "wouldn't", 'having', 'hadn', 'yours', 'ma', 'won', 'have', 'for', 'his', 'me', 'now', 'were', 'theirs', 'which', 'and', 'being', 'over', 'our', 'her', 'are', 'why', 'had', 'these', 'should', 'you', "she's", 'an', 'before', 'in', 'needn', 'off', 'she', 'just', 'him', 'no', "won't", 'ourselves', "needn't", 'be', 'mightn', 'a', 'more', 'them', 'we', 'am',

## Additional Features

The final thing which we implemented was the creation of 3 new additional features. The first of these is formed from the *doc2negate* vector mentioned above. This vector shows the number of negated terms in each document. The second and third of these additional features represent positive and negative lexicons respectively. Similar to the *doc2negate* vector, they show the number of terms in each document which are found in positive and negative sentiment lexicons.  
  
The sentiment lexicon which we used was the [MPQA Subjectivity Lexicon](http://mpqa.cs.pitt.edu/lexicons/subj_lexicon/). It contains 1340 positive and 2441 negative terms.  
  
These 3 vectors are appended to the feature matrix which was created in the *extract_features* function.

# Interface for Sentiment Polarity Predictor
Let's define a base class to clarify how we plan to use polarity predictors. Its functions will have to be implemented in sub-classes.

In [12]:
class PolarityPredictorInterface:

    def train(self, data_with_labels):
        raise NotImplementedError
        
    def predict(self, data):
        raise NotImplementedError

In [13]:
class PolarityPredictorInit(PolarityPredictorInterface):
    
    def train(self, data_with_labels, feature):
        """
        Function which trains model.
        
        Lemmatisation, negation and removal of stopwords are performed on the data first.
        
        Then the feature matrix is created using extract_features function.
        
        This will be based on bag of unigrams, bigrams or trigrams depending on input.
        
        The targets are then fetched and passed to the learning model for training.
        """

        # Initialise ngram set objects
        self.reset_feature_sets()

        if self.lemmatise:
            """
            Get Penn POS tags
            Convert to WordNet POS tags and retrieve lemma
            Reset data
            """
            tagged_data = self.get_pos_tags(data_with_labels)
            lemmatised_data = self.lemmatise_data(tagged_data)
            data_with_labels = lemmatised_data
        
        if self.negation:
            """
            'NOT_' is added to tokens which are negated.
            """
            self.add_negation_to_data(data_with_labels)
            
        if self.remove_stopwords:
            """
            Stopwords are retrieved from NLTK library and are removed if found in the data.
            """
            self.get_stopwords()
            self.remove_stopwords_from_data(data_with_labels)
            
            
        self.add_to_feature_sets_from_data(data_with_labels)
        self.finalise_feature_sets()
        
        tr_features = self.extract_features(data_with_labels, feature)
        
        if self.additional_features:
            """
            Additional features: (for each document)
            Positive sentiment lexicon counts 
            Negative sentiment lexicon counts
            Negated word counts
            Added as columns to existing feature matrix
            """
            self.get_sentiment_lexicons()
            tr_features = self.add_additional_features(data_with_labels, tr_features, self.negation)
            
        tr_targets = self.get_targets(data_with_labels)
        self.train_model_on_features(tr_features, tr_targets)

    def reset_feature_sets(self):
        self.unigrams = set()
        self.bigrams = set()
        self.trigrams = set()
    
    def get_pos_tags(self, data):
        tagged_data = []
        for document, label in data:
            new_document = []
            for sentence in document:
                tagged_sentence = nltk.pos_tag(sentence)
                new_document.append(tagged_sentence)
            new_entry = (new_document, label)
            tagged_data.append(new_entry)
        return tagged_data

    def lemmatise_data(self, data):
        """
        POS has been retieved as in Penn form
        We convert to WordNet form by taking first letter of Penn POS
        """
        lemmatizer = nltk.stem.WordNetLemmatizer()
        for document, label in data:
            for sentence in document:
                for index, token in enumerate(sentence):
                    word = token[0]
                    penn_pos = token[1]
                    if penn_pos[0] == 'N':
                        wn_pos = nltk.corpus.wordnet.NOUN
                    elif penn_pos[0] == 'J':
                        wn_pos = nltk.corpus.wordnet.ADJ
                    elif penn_pos[0] == 'V':
                        wn_pos = nltk.corpus.wordnet.VERB
                    elif penn_pos[0] == 'R':
                        wn_pos = nltk.corpus.wordnet.ADV
                    else:
                        sentence[index] = lemmatizer.lemmatize(word)
                        continue
                    sentence[index] = lemmatizer.lemmatize(word, wn_pos)
        return data
    
    def add_negation_to_data(self, data):
        """
        if 'no' or 'not' or "...n't" are found, 
        words are negated until end of sentence.
        """
        self.doc2negate = []
        for document, label in data:
            negate_total = 0
            for sentence in document:
                negate = False
                for index, token in enumerate(sentence):
                    if token in ('not', 'no') or (token[-3:] == "n't"):
                        negate = True
                        continue
                    if token == '.':
                        negate = False
                    if negate:
                        sentence[index] = 'NOT_' + token
                        negate_total += 1
            self.doc2negate.append(negate_total)
    
    def remove_stopwords_from_data(self, data):
        for document, label in data:
            for sentence in document:
                stopword_indices = []
                for index, token in enumerate(sentence):
                    if token in self.stopwords:
                        stopword_indices.append(index)
                stopword_indices.reverse()
                # we delete stopwords in place, by deleting the latest index first.
                for index in stopword_indices:
                    del sentence[index]
    
    def get_stopwords(self):
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
    
    def get_sentiment_lexicons(self):
        """
        We form lists of positive and negative sentiment lexicons.
        We used data from MPQA: http://mpqa.cs.pitt.edu/lexicons/subj_lexicon/
        """
        sentiment_lexicon_data = "data/sentiment_lexicons.tff"
    
        self.positive_lexicons = []
        self.negative_lexicons = []
        with open(sentiment_lexicon_data, 'r') as f:
            while len(f.readline()) != 0:
                toks = f.readline().split()
                word = toks[2].split("=")[1]
                polarity = toks[-1].split("=")[1]
                if polarity in ("positive", "negative"):
                    if polarity == 'positive':
                        self.positive_lexicons.append(word)
                    if polarity == 'negative':
                        self.negative_lexicons.append(word)
        
    def add_to_feature_sets_from_data(self, data):
        """
        Parses tokens in data and adds them to each feature set.
        """
        for document, label in data:
            for sentence in document:
                sentence.insert(0, '<s>')
                sentence.append('</s>')
                prev_token = None
                for index, token in enumerate(sentence):
                    self.unigrams.add(token)
                    if index > 0:
                        bigram = (prev_token, token)
                        self.bigrams.add(bigram)
                    if index > 1:
                        trigram = (prev_prev_token, prev_token, token)
                        self.trigrams.add(trigram)
                    prev_prev_token = prev_token
                    prev_token = token
                        
    def finalise_feature_sets(self):
        """
        Creates a dict for the feature sets for faster operations.
        """
        self.unigrams = list(self.unigrams)
        # create reverse map for fast token lookup
        self.unigram2index = {}
        for index, token in enumerate(self.unigrams):
            self.unigram2index[token] = index
            
        self.bigrams = list(self.bigrams)
        # create reverse map for fast token lookup
        self.bigram2index = {}
        for index, token in enumerate(self.bigrams):
            self.bigram2index[token] = index
            
        self.trigrams = list(self.trigrams)
        # create reverse map for fast token lookup
        self.trigram2index = {}
        for index, token in enumerate(self.trigrams):
            self.trigram2index[token] = index
        
        
    def extract_features(self, data, feature):
        raise NotImplementedError
    
    def get_targets(self, data, label2index = None):
        raise NotImplementedError
        
    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

In [14]:
class PolarityPredictorExtractFeatures(PolarityPredictorInit):
    
    def __init__(self, clip_counts = True, negation=False, remove_stopwords=False, lemmatise=False, additional_features=False, learning_model=MultinomialNB()):
        self.clip_counts = clip_counts
        self.negation = negation
        self.remove_stopwords = remove_stopwords
        self.model = learning_model
        self.lemmatise = lemmatise
        self.additional_features = additional_features
        
    def extract_features(self, data, ngram):
        """
        Creates features from the data.
        Depending on the input, a feature matrix for bag of words, bag of bigrams or bag of trigrams is created.
        """
        rows = len(data)
        feature_matrices = {}
        # Initialise a feature matrix with zeros
        feature_matrices['unigram'] = numpy.zeros((rows, len(self.unigrams)), dtype=numpy.int32)
        feature_matrices['bigram'] = numpy.zeros((rows, len(self.bigrams)), dtype=numpy.int32)
        feature_matrices['trigram'] = numpy.zeros((rows, len(self.trigrams)), dtype=numpy.int32)
        # populate feature matrix
        for row, item in enumerate(data):
            document, _ = item
            for sentence in document:
                if sentence[0] != '<s>':
                    sentence.insert(0, '<s>')
                if sentence[-1] != '</s>':
                    sentence.append('</s>')
                prev_token = None
                for idx, token in enumerate(sentence):
                    if ngram == 'unigram':
                        try:
                            unigram_index = self.unigram2index[token]
                        except KeyError:
                            continue
                        if self.clip_counts:
                            feature_matrices['unigram'][row, unigram_index] = 1
                        else:
                            feature_matrices['unigram'][row, unigram_index] += 1
                    if ngram == 'bigram':
                        if idx > 0:
                            bigram = (prev_token, token)
                            try:
                                bigram_index = self.bigram2index[bigram]
                            except KeyError:
                                continue
                            if self.clip_counts:
                                feature_matrices['bigram'][row, bigram_index] = 1
                            else:
                                feature_matrices['bigram'][row, bigram_index] += 1
                    if ngram == 'trigram':
                        if idx > 1:
                            trigram = (prev_prev_token, prev_token, token)
                            try:
                                trigram_index = self.trigram2index[trigram]
                            except KeyError:
                                continue
                            if self.clip_counts:
                                feature_matrices['trigram'][row, trigram_index] = 1
                            else:
                                feature_matrices['trigram'][row, trigram_index] += 1

                    prev_prev_token = prev_token
                    prev_token = token
        return feature_matrices[ngram]
        
    def add_additional_features(self, data, feature_matrix, negation):
        """
        Additional features representing number of positive lexicons, number of negative lexicons
        and number of negated words are added to the existing feature matrix built above.
        """
        self.doc2poslex = []
        self.doc2neglex = []
        for row, item in enumerate(data):
            document, _ = item
            doc_pos_tot = 0
            doc_neg_tot = 0
            for sentence in document:
                for token in sentence:
                    if token in self.positive_lexicons:
                        doc_pos_tot += 1
                    if token in self.negative_lexicons:
                        doc_neg_tot += 1
            self.doc2poslex.append(doc_pos_tot)
            self.doc2neglex.append(doc_neg_tot)
        if negation:
            additional_features = numpy.array((self.doc2negate, self.doc2poslex, self.doc2neglex)).T
        else:
            additional_features = numpy.array((self.doc2poslex, self.doc2neglex)).T
        new_feature_matrix = numpy.append(feature_matrix, additional_features, 1)
        return new_feature_matrix

In [15]:
class PolarityPredictorAssignTargets(PolarityPredictorExtractFeatures):
 
    def get_targets(self, data):
        ''' 
        Column vector with target labels is created.
        '''
        targets = numpy.zeros(len(data), dtype=numpy.int8)
        index = 0
        for _, label in data:
            if label == 'pos':
                targets[index] = 1
            index += 1
        return targets

    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

In [16]:
class PolarityPredictor(PolarityPredictorAssignTargets):

    def train_model_on_features(self, tr_features, tr_targets):
        self.model.fit(tr_features, tr_targets)
        
    def predict(self, data, feature, get_accuracy = False, get_confusion_matrix = False):
        
        if self.lemmatise:
            tagged_data = self.get_pos_tags(data)
            lemmatised_data = self.lemmatise_data(tagged_data)
            data = lemmatised_data
        
        if self.negation:
            self.add_negation_to_data(data)
        
        if self.remove_stopwords:
            self.remove_stopwords_from_data(data)
        
        features = self.extract_features(data, feature)
        
        if self.additional_features:
            self.get_sentiment_lexicons()
            features = self.add_additional_features(data, features, self.negation)
            
        y_pred = self.model.predict(features)
        
        labels = []
        for is_positive in y_pred:
            if is_positive:
                labels.append('pos')
            else:
                labels.append('neg')
                
        if get_accuracy or get_confusion_matrix:
            retval = []
            retval.append(labels)
            y_true = self.get_targets(data)
            if get_accuracy:
                retval.append(metrics.accuracy_score(y_true, y_pred))
            if get_confusion_matrix:
                retval.append(metrics.confusion_matrix(y_true, y_pred))
            return retval
        else:
            return labels

# Experiments

## Evaluation Table

Our model builds three different feature representations from the input data:
* Bag-of-Words (Unigrams)
* Bag-of-Bigrams
* Bag-of-Trigrams

The *PolarityPredictor* model takes in a *feature* parameter which specifies which feature of the above to use. There is also a *learning_model* parameter which specifies which particular learning model to use.

The model can also be tweaked to perform the following:
* Clip Counts in feature matrices
* Negation
* Removal of StopWords
* Lemmatisation
* Include additional features

We plan to run many different experiments using different feature representations and different learning models.   Therefore, having an evaluation table which contains the details of each experiment and the corresponding evalation results will be useful. 
The following values will be recorded in the dataframe for each model:
* Average 10 Fold Cross-Validation Accuracy
* Root Mean Square Error (RMSE)
* Minimum Accuracy
* Maximum Accuracy

Below, we define table to store these.

In [17]:
def evaluate_model(model, splits, feature, verbose = False):
    accuracies = []
    fold = 0
    for tr_data, te_data in splits:
        if verbose:
            print('Evaluating fold %d of %d' %(fold+1, len(splits)))
            fold += 1
        model.train(tr_data, feature)
        _, accuracy = model.predict(te_data, feature, get_accuracy = True)
        accuracies.append(accuracy)
        if verbose:
            print('-->', accuracy)
    n = float(len(accuracies))
    avg = sum(accuracies) / n
    mse = sum([(x-avg)**2 for x in accuracies]) / n
    return (avg, mse**0.5, min(accuracies),
            max(accuracies))

def print_first_predictions(model, te_data, feature, n = 12):
    predictions = model.predict(te_data, feature)
    for i in range(n):
        document, label = te_data[i]
        prediction = predictions[i]
        print('%4d %s %s %s' %(i, label, prediction, get_document_preview(document),))

# 1. Multinomial NB

In [18]:
learning_model = MultinomialNB()

## 1.1 Baseline

In [19]:
evaluation_dataframes = {}
evaluation_dataframes['MultinomialNB'] = pd.DataFrame(columns=['name', 'learning_model', 'feature', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'additional_features', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])

We run the baseline approach as a functionality test. The settings used are listed in the cell below.

In [20]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
additional_features = False
feature = 'unigram'

In [21]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['MultinomialNB'].loc[0] = ['baseline-NB', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, additional_features, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

Evaluating fold 1 of 10
--> 0.795
Evaluating fold 2 of 10
--> 0.84
Evaluating fold 3 of 10
--> 0.84
Evaluating fold 4 of 10
--> 0.825
Evaluating fold 5 of 10
--> 0.835
Evaluating fold 6 of 10
--> 0.83
Evaluating fold 7 of 10
--> 0.84
Evaluating fold 8 of 10
--> 0.845
Evaluating fold 9 of 10
--> 0.785
Evaluating fold 10 of 10
--> 0.855


So, the baseline approach achieves an average accuracy score of 82.4% and has been to our evaluation table. The format of this dataframe can be seen below,

In [22]:
evaluation_dataframes['MultinomialNB']

Unnamed: 0,name,learning_model,feature,clip_counts,negation,remove_stopwords,lemmatise,additional_features,avg_cv_acc,rmse,min_acc,max_acc
0,baseline-NB,MultinomialNB(),unigram,True,False,False,False,False,0.829,0.021071,0.785,0.855


## 1.2 Parameter Experimentation

We will now run experiments using the MultinomialNB model with different parameters.


In [23]:
index = 1
remove_stopwords = False
feature = 'unigram'
for clip_counts in (True, False):
    for negation in (True, False):
        for lemmatise in (True, False):
            for additional_features in (True, False):
                print('Start')
                print(index)
                model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
                avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = True)
                evaluation_dataframes['MultinomialNB'].loc[evaluation_dataframes['MultinomialNB'].index.max()+1] = [f'param-exp-NB-{index}', learning_model, feature, clip_counts, 
                                                                  negation, remove_stopwords, lemmatise, additional_features, avg, rmse, min_acc, max_acc]
                index += 1
                clear_output(wait=True)
                print(evaluation_dataframes['MultinomialNB'])
                print('End')

               name   learning_model  feature clip_counts negation  \
0       baseline-NB  MultinomialNB()  unigram        True    False   
1    param-exp-NB-1  MultinomialNB()  unigram        True     True   
2    param-exp-NB-2  MultinomialNB()  unigram        True     True   
3    param-exp-NB-3  MultinomialNB()  unigram        True     True   
4    param-exp-NB-4  MultinomialNB()  unigram        True     True   
5    param-exp-NB-5  MultinomialNB()  unigram        True    False   
6    param-exp-NB-6  MultinomialNB()  unigram        True    False   
7    param-exp-NB-7  MultinomialNB()  unigram        True    False   
8    param-exp-NB-8  MultinomialNB()  unigram        True    False   
9    param-exp-NB-9  MultinomialNB()  unigram       False     True   
10  param-exp-NB-10  MultinomialNB()  unigram       False     True   
11  param-exp-NB-11  MultinomialNB()  unigram       False     True   
12  param-exp-NB-12  MultinomialNB()  unigram       False     True   
13  param-exp-NB-13 

In [None]:
feature = 'bigram'
remove_stopwords = False
for clip_counts in (True, False):
    for negation in (True, False):
        for lemmatise in [True, False]:
            for additional_features in [True, False]:
                print('Start')
                print(index)
                model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
                avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = True)
                evaluation_dataframes['MultinomialNB'].loc[evaluation_dataframes['MultinomialNB'].index.max()+1] = [f'param-exp-NB-{index}', learning_model, feature, clip_counts, 
                                                                  negation, remove_stopwords, lemmatise, additional_features, avg, rmse, min_acc, max_acc]
                index += 1
                clear_output(wait=True)
                print(evaluation_dataframes['MultinomialNB'])
                print('End')

Start
17
Evaluating fold 1 of 10
--> 0.785
Evaluating fold 2 of 10
--> 0.84
Evaluating fold 3 of 10


In [None]:
evaluation_dataframes['MultinomialNB'].to_csv('output/multinomialnb.csv')

# 2. Logistic Regression

In [None]:
evaluation_dataframes['LogisticRegression'] = pd.DataFrame(columns=['name', 'learning_model', 'feature', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'additional_features', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])
from sklearn.linear_model import LogisticRegression
learning_model = LogisticRegression()

In [None]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
additional_features = False
feature = 'unigram'

## 2.1 Baseline

In [None]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['LogisticRegression'].loc[0] = ['baseline-LR', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, additional_features, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

## 2.2 Parameter Experimentation

In [None]:
index = 1
for feature in ['unigram', 'bigram', 'trigram']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    for additional_features in (True, False):
                        model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                        avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                        evaluation_dataframes['LogisticRegression'].loc[evaluation_dataframes['LogisticRegression'].index.max()+1] = [f'param-exp-LR-{index}', learning_model, feature, clip_counts, 
                                                                          negation, remove_stopwords, lemmatise, additional_features, avg, rmse, min_acc, max_acc]
                        index += 1
                        clear_output(wait=True)
                        evaluation_dataframes['LogisticRegression']

In [None]:
evaluation_dataframes['LogisticRegression'].to_csv('output/logisticregression.csv')

# 3. Decision Tree

In [None]:
evaluation_dataframes['DecisionTree'] = pd.DataFrame(columns=['name', 'learning_model', 'feature', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'additional_features', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])
from sklearn.tree import DecisionTreeClassifier
learning_model = DecisionTreeClassifier()

In [None]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
additional_features = False
feature = 'unigram'

## 3.1 Baseline

In [None]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['DecisionTree'].loc[0] = ['baseline-DT', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, additional_features, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

## 3.2 Parameter Estimation

In [None]:
index = 1
for feature in ['unigram', 'bigram', 'trigram']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    for additional_features in (True, False):
                        model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                        avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                        evaluation_dataframes['DecisionTree'].loc[evaluation_dataframes['DecisionTree'].index.max()+1] = [f'param-exp-DT-{index}', learning_model, feature, clip_counts, 
                                                                          negation, remove_stopwords, lemmatise, additional_features, avg, rmse, min_acc, max_acc]
                        index += 1
                        clear_output(wait=True)
                        evaluation_dataframes['DecisionTree']

In [None]:
evaluation_dataframes['DecisionTree'].to_csv('output/decisiontree.csv')

# 4. SVM Classifier

In [None]:
evaluation_dataframes['SVM'] = pd.DataFrame(columns=['name', 'learning_model', 'feature', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'additional_features', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])
from sklearn.svm import SVC
learning_model = SVC()

In [None]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
additional_features = False
feature = 'unigram'

## 4.1 Baseline

In [None]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['SVM'].loc[0] = ['baseline-SVM', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, additional_features, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

## 4.2 Parameter Estimation

In [None]:
index = 1
for feature in ['unigram', 'bigram', 'trigram']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    for additional_features in (True, False):
                        model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                        avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                        evaluation_dataframes['SVM'].loc[evaluation_dataframes['SVM'].index.max()+1] = [f'param-exp-SVM-{index}', learning_model, feature, clip_counts, 
                                                                          negation, remove_stopwords, lemmatise, additional_features, avg, rmse, min_acc, max_acc]
                        index += 1
                        clear_output(wait=True)
                        evaluation_dataframes['SVM']

In [None]:
evaluation_dataframes['SVM'].to_csv('output/svm.csv')

# 5. Random Forest Classifier

In [None]:
evaluation_dataframes['RandomForest'] = pd.DataFrame(columns=['name', 'learning_model', 'feature', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'additional_features', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])
from sklearn.ensemble import RandomForestClassifier
learning_model = RandomForestClassifier()

In [None]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
additional_features = False
feature = 'unigram'

## 4.1 Baseline

In [None]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['RandomForest'].loc[0] = ['baseline-RF', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, additional_features, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

## 4.2 Parameter Estimation

In [None]:
index = 1
for feature in ['unigram', 'bigram', 'trigram']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    for additional_features in (True, False):
                        model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                        avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                        evaluation_dataframes['RandomForest'].loc[evaluation_dataframes['RandomForest'].index.max()+1] = [f'param-exp-RF-{index}', learning_model, feature, clip_counts, 
                                                                          negation, remove_stopwords, lemmatise, additional_features, avg, rmse, min_acc, max_acc]
                        index += 1
                        clear_output(wait=True)
                        evaluation_dataframes['RandomForest']

In [None]:
evaluation_dataframes['RandomForest'].to_csv('output/randomforest.csv')