# Sentiment Polarity Prediction with Naive Bayes

This notebook contains a basic implementation of document-level sentiment analysis
for movie reviews with multinomial Naive Bayes and bag-of-words features
and of cross-validation.
* No special treatment of rare or unknown words. Unknown words in the test data are skipped.

We use the movie review polarity data set of Pang and Lee 2004 [A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts](https://www.aclweb.org/anthology/P04-1035/) in Version 2.0 available from http://www.cs.cornell.edu/People/pabo/movie-review-data (section "Sentiment polarity datasets"). This dataset contains 1000 positive and 1000 negative reviews, each tokenised, sentence-split (one sentence per line) and lowercased. Each review has been assigned to 1 of 10 cross-validation folds by the authors and this setup should be followed to compare with published results.


In [21]:
import os
import tarfile
import time
import urllib.request
import numpy
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk
import matplotlib.pyplot as plt

nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

data_source = 'local-folder'
data_folder = os.path.join('data', 'txt_sentoken')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ivan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /Users/ivan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ivan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
class PL04DataLoader_Part_1:
    
    def __init__(self):
        pass
    
    def get_labelled_dataset(self, fold = 0):
        ''' Compile a fold of the data set
        '''
        dataset = []
        for label in ('pos', 'neg'):
            for document in self.get_documents(
                fold = fold,
                label = label,
            ):
                dataset.append((document, label))
        return dataset
    
    def get_documents(self, fold = 0, label = 'pos'):
        ''' Enumerate the raw contents of all data set files.
            Args:
                data_dir: relative or absolute path to the data set folder
                fold: which fold to load (0 to n_folds-1)
                label: 'pos' or 'neg' to
                    select data with positive or negative sentiment
                    polarity
            Return:
                List of tokenised documents, each a list of sentences
                that in turn are lists of tokens
        '''
        raise NotImplementedError

In [23]:
class PL04DataLoader(PL04DataLoader_Part_1):
    
    def get_xval_splits(self):
        ''' Split data with labels for cross-validation
            returns a list of k pairs (training_data, test_data)
            for k cross-validation
        '''
        # load the folds
        folds = []
        for i in range(10):
            folds.append(self.get_labelled_dataset(
                fold = i
            ))
        # create training-test splits
        retval = []
        for i in range(10):
            test_data = folds[i]
            training_data = []
            for j in range(9):
                ij1 = (i+j+1) % 10
                assert ij1 != i
                training_data = training_data + folds[ij1]
            retval.append((training_data, test_data))
        return retval

In [24]:
class PL04DataLoaderFromStream(PL04DataLoader):
        
    def __init__(self, tgz_stream, **kwargs):
        super().__init__(**kwargs)
        self.data = {}
        counter = 0
        with tarfile.open(
            mode = 'r|gz',
            fileobj = tgz_stream
        ) as tar_archive:
            for tar_member in tar_archive:
                if counter == 2000:
                    break
                path_components = tar_member.name.split('/')
                filename = path_components[-1]
                if filename.startswith('cv') \
                and filename.endswith('.txt') \
                and '_' in filename:
                    label = path_components[-2]
                    fold = int(filename[2])
                    key = (fold, label)
                    if key not in self.data:
                        self.data[key] = []
                    f = tar_archive.extractfile(tar_member)
                    document = [
                        line.decode('utf-8').split()
                        for line in f.readlines()
                    ]
                    self.data[key].append(document)
                    counter += 1
            
    def get_documents(self, fold = 0, label = 'pos'):
        return self.data[(fold, label)]

## Read Data from the Web
This should run efficiently both on google colab and locally but has the disadvantage that the same data is downloaded each time the notebook is run.

In [25]:
class PL04DataLoaderFromURL(PL04DataLoaderFromStream):
    
    def __init__(self, data_url, **kwargs):
        with urllib.request.urlopen(data_url) as tgz_stream:
            super().__init__(tgz_stream, **kwargs)

## Read Data from a Local .tgz File

You manually download the .tgz once to a filesystem that can be accessed from the notebook, e.g. google drive on colab, and this notebook reads this file in one chunk. 

Note that if you are accessing files from google drive on colab, you will need to mount your drive and enter an authentication token:

```
from google.colab import drive
drive.mount('/content/drive')
```

You will also have to change your *data_tgz* or *data_folder* paths above so that they start with *'/content/drive/My Drive/'*

In [26]:
class PL04DataLoaderFromTGZ(PL04DataLoaderFromStream):
    
    def __init__(self, data_path, **kwargs):
        with open(data_path, 'rb') as tgz_stream:
            super().__init__(tgz_stream, **kwargs)

## Read Data from a Local Folder

Extract the .tgz to a local folder and only load the required files. This is usually the fastest option when storage is on a local SSD. On remote filesystems, however, this can be very slow.

In [27]:
class PL04DataLoaderFromFolder(PL04DataLoader):
    
    def __init__(self, data_dir, **kwargs):
        self.data_dir = data_dir
        super().__init__(**kwargs)
        
    def get_documents(self, fold = 0, label = 'pos'):
        # read folder contents
        path = os.path.join(self.data_dir, label)
        dir_entries = os.listdir(path)
        # must process entries in numeric order to
        # replicate order of original experiments
        dir_entries.sort()
        # check each entry and add to data if matching
        # selection criteria
        for filename in dir_entries:
            if filename.startswith('cv') \
            and filename.endswith('.txt'):
                if fold == int(filename[2]):
                    # correct fold
                    f = open(os.path.join(path, filename), 'rt')
                    # "yield" tells Python to return an iterator
                    # object that produces the yields of this
                    # function as elements without creating a
                    # full list of all elements
                    yield [line.split() for line in f.readlines()]
                    f.close()

In [28]:
if data_source == 'local-folder':
    data_loader = PL04DataLoaderFromFolder(data_folder)
elif data_source == 'local-tgz':
    data_loader = PL04DataLoaderFromTGZ(data_tgz)
elif data_source == 'web':
    data_loader = PL04DataLoaderFromURL(data_url)
else:
    raise ValueError('Unsupported data source %r' %data_source)

In [29]:
def get_document_preview(document, max_length = 72):
    s = []
    count = 0
    reached_limit = False
    for sentence in document:
        for token in sentence:
            if count + len(token) + len(s) > max_length:
                reached_limit = True
                break
            s.append(token)
            count += len(token)
        if reached_limit:
            break
    return '|'.join(s)
    
for label in 'pos neg'.split():
    print(f'== {label} ==')
    print('doc sentences start of first sentence')
    for index, document in enumerate(data_loader.get_documents(
        label = label
    )):
        print('%3d %7d   %s' %(
            index, len(document), get_document_preview(document)
        ))
        if index == 4:
            break

== pos ==
doc sentences start of first sentence
  0      25   films|adapted|from|comic|books|have|had|plenty|of|success|,|whether
  1      39   every|now|and|then|a|movie|comes|along|from|a|suspect|studio|,|with
  2      19   you've|got|mail|works|alot|better|than|it|deserves|to|.|in|order|to|make
  3      42   "|jaws|"|is|a|rare|film|that|grabs|your|attention|before|it|shows|you|a
  4      25   moviemaking|is|a|lot|like|being|the|general|manager|of|an|nfl|team|in
== neg ==
doc sentences start of first sentence
  0      35   plot|:|two|teen|couples|go|to|a|church|party|,|drink|and|then|drive|.
  1      13   the|happy|bastard's|quick|movie|review|damn|that|y2k|bug|.|it's|got|a
  2      23   it|is|movies|like|these|that|make|a|jaded|movie|viewer|thankful|for|the
  3      19   "|quest|for|camelot|"|is|warner|bros|.|'|first|feature-length|,
  4      37   synopsis|:|a|mentally|unstable|man|undergoing|psychotherapy|saves|a|boy


## Create Training-Test Splits for Cross-Validation

In [30]:
splits = data_loader.get_xval_splits()

print('tr-size te-size (number of documents)')
for xval_tr_data, xval_te_data in splits:
    print('%7d %7d' %(len(xval_tr_data), len(xval_te_data)))

tr-size te-size (number of documents)
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200


## Interface for Sentiment Polarity Predictor
Let's define a base class to clarify how we plan to use polarity predictors. Its functions will have to be implemented in sub-classes.

In [31]:
class PolarityPredictorInterface:

    def train(self, data_with_labels):
        raise NotImplementedError
        
    def predict(self, data):
        raise NotImplementedError

In [32]:
class PolarityPredictorInit(PolarityPredictorInterface):
    
    def train(self, data_with_labels, feature):
        """
        Function which trains model.
        
        Lemmatisation, Negation and Removal of stopwords are performed on the data first.
        
        Then the feature matrix is created using extract_features function.
        
        This will be based on bag of unigrams, bigrams or trigrams depending on input.
        
        The targets are then fetched and passed to the learning model for training.
        """

        # Initialise ngram set objects
        self.reset_feature_sets()

        if self.lemmatise:
            """
            Get Penn POS tags
            Convert to WordNet POS tags and retrieve lemma
            Reset data
            """
            tagged_data = self.get_pos_tags(data_with_labels)
            lemmatised_data = self.lemmatise_data(tagged_data)
            data_with_labels = lemmatised_data
        
        if self.negation:
            """
            'NOT_' is added to tokens which are negated.
            """
            self.add_negation_to_data(data_with_labels)
            
        if self.remove_stopwords:
            """
            Stopwords are retrieved from NLTK library and are removed if found in the data.
            """
            self.get_stopwords()
            self.remove_stopwords_from_data(data_with_labels)
            
            
        self.add_to_feature_sets_from_data(data_with_labels)
        self.finalise_feature_sets()
        
        tr_features = self.extract_features(data_with_labels, feature)
        
        if self.additional_features:
            """
            Additional features: (for each document)
            Positive sentiment lexicon counts 
            Negative sentiment lexicon counts
            Negated word counts
            Added as columns to existing feature matrix
            """
            self.get_sentiment_lexicons()
            tr_features = self.add_additional_features(data_with_labels, tr_features, self.negation)
            
        tr_targets = self.get_targets(data_with_labels)
        self.train_model_on_features(tr_features, tr_targets)

    def reset_feature_sets(self):
        self.unigrams = set()
        self.bigrams = set()
        self.trigrams = set()
    
    def get_pos_tags(self, data):
        tagged_data = []
        for document, label in data:
            new_document = []
            for sentence in document:
                tagged_sentence = nltk.pos_tag(sentence)
                new_document.append(tagged_sentence)
            new_entry = (new_document, label)
            tagged_data.append(new_entry)
        return tagged_data

    def lemmatise_data(self, data):
        """
        POS has been retieved as in Penn form
        We convert to WordNet form by taking first letter of Penn POS
        """
        lemmatizer = nltk.stem.WordNetLemmatizer()
        for document, label in data:
            for sentence in document:
                for index, token in enumerate(sentence):
                    word = token[0]
                    penn_pos = token[1]
                    if penn_pos[0] == 'N':
                        wn_pos = nltk.corpus.wordnet.NOUN
                    elif penn_pos[0] == 'J':
                        wn_pos = nltk.corpus.wordnet.ADJ
                    elif penn_pos[0] == 'V':
                        wn_pos = nltk.corpus.wordnet.VERB
                    elif penn_pos[0] == 'R':
                        wn_pos = nltk.corpus.wordnet.ADV
                    else:
                        sentence[index] = lemmatizer.lemmatize(word)
                        continue
                    sentence[index] = lemmatizer.lemmatize(word, wn_pos)
        return data
    
    def add_negation_to_data(self, data):
        """
        if 'no' or 'not' or "...n't" are found, 
        words are negated until end of sentence.
        """
        self.doc2negate = []
        for document, label in data:
            negate_total = 0
            for sentence in document:
                negate = False
                for index, token in enumerate(sentence):
                    if token in ('not', 'no') or (token[-3:] == "n't"):
                        negate = True
                        continue
                    if token == '.':
                        negate = False
                    if negate:
                        sentence[index] = 'NOT_' + token
                        negate_total += 1
            self.doc2negate.append(negate_total)
    
    def remove_stopwords_from_data(self, data):
        for document, label in data:
            for sentence in document:
                stopword_indices = []
                for index, token in enumerate(sentence):
                    if token in self.stopwords:
                        stopword_indices.append(index)
                stopword_indices.reverse()
                # we delete stopwords in place, by deleting the latest index first.
                for index in stopword_indices:
                    del sentence[index]
    
    def get_stopwords(self):
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
    
    def get_sentiment_lexicons(self):
        """
        We form lists of positive and negative sentiment lexicons.
        We used data from MPQA: http://mpqa.cs.pitt.edu/lexicons/subj_lexicon/
        """
        sentiment_lexicon_data = "data/sentiment_lexicons.tff"
    
        self.positive_lexicons = []
        self.negative_lexicons = []
        with open(sentiment_lexicon_data, 'r') as f:
            while len(f.readline()) != 0:
                toks = f.readline().split()
                word = toks[2].split("=")[1]
                polarity = toks[-1].split("=")[1]
                if polarity in ("positive", "negative"):
                    if polarity == 'positive':
                        self.positive_lexicons.append(word)
                    if polarity == 'negative':
                        self.negative_lexicons.append(word)
        
    def add_to_feature_sets_from_data(self, data):
        """
        Parses tokens in data and adds them to each feature set.
        """
        for document, label in data:
            for sentence in document:
                sentence.insert(0, '<s>')
                sentence.append('</s>')
                prev_token = None
                for index, token in enumerate(sentence):
                    self.unigrams.add(token)
                    if index > 0:
                        bigram = (prev_token, token)
                        self.bigrams.add(bigram)
                    if index > 1:
                        trigram = (prev_prev_token, prev_token, token)
                        self.trigrams.add(trigram)
                    prev_prev_token = prev_token
                    prev_token = token
                        
    def finalise_feature_sets(self):
        """
        Creates a dict for the feature sets for faster operations.
        """
        self.unigrams = list(self.unigrams)
        # create reverse map for fast token lookup
        self.unigram2index = {}
        for index, token in enumerate(self.unigrams):
            self.unigram2index[token] = index
            
        self.bigrams = list(self.bigrams)
        # create reverse map for fast token lookup
        self.bigram2index = {}
        for index, token in enumerate(self.bigrams):
            self.bigram2index[token] = index
            
        self.trigrams = list(self.trigrams)
        # create reverse map for fast token lookup
        self.trigram2index = {}
        for index, token in enumerate(self.trigrams):
            self.trigram2index[token] = index
        
        
    def extract_features(self, data, feature):
        raise NotImplementedError
    
    def get_targets(self, data, label2index = None):
        raise NotImplementedError
        
    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

In [33]:
class PolarityPredictorExtractFeatures(PolarityPredictorInit):
    
    def __init__(self, clip_counts = True, negation=False, remove_stopwords=False, lemmatise=False, additional_features=False, learning_model=MultinomialNB()):
        self.clip_counts = clip_counts
        self.negation = negation
        self.remove_stopwords = remove_stopwords
        self.model = learning_model
        self.lemmatise = lemmatise
        self.additional_features = additional_features
        
    def extract_features(self, data, ngram):
        """
        Creates features from the data.
        Depending on the input, a feature matrix for bag of words, bag of bigrams or bag of trigrams is created.
        """
        rows = len(data)
        feature_matrices = {}
        # Initialise a feature matrix with zeros
        feature_matrices['unigram'] = numpy.zeros((rows, len(self.unigrams)), dtype=numpy.int32)
        feature_matrices['bigram'] = numpy.zeros((rows, len(self.bigrams)), dtype=numpy.int32)
        feature_matrices['trigram'] = numpy.zeros((rows, len(self.trigrams)), dtype=numpy.int32)
        # populate feature matrix
        for row, item in enumerate(data):
            document, _ = item
            for sentence in document:
                if sentence[0] != '<s>':
                    sentence.insert(0, '<s>')
                if sentence[-1] != '</s>':
                    sentence.append('</s>')
                prev_token = None
                for idx, token in enumerate(sentence):
                    if ngram == 'unigram':
                        try:
                            unigram_index = self.unigram2index[token]
                        except KeyError:
                            continue
                        if self.clip_counts:
                            feature_matrices['unigram'][row, unigram_index] = 1
                        else:
                            feature_matrices['unigram'][row, unigram_index] += 1
                    if ngram == 'bigram':
                        if idx > 0:
                            bigram = (prev_token, token)
                            try:
                                bigram_index = self.bigram2index[bigram]
                            except KeyError:
                                continue
                            if self.clip_counts:
                                feature_matrices['bigram'][row, bigram_index] = 1
                            else:
                                feature_matrices['bigram'][row, bigram_index] += 1
                    if ngram == 'trigram':
                        if idx > 1:
                            trigram = (prev_prev_token, prev_token, token)
                            try:
                                trigram_index = self.trigram2index[trigram]
                            except KeyError:
                                continue
                            if self.clip_counts:
                                feature_matrices['trigram'][row, trigram_index] = 1
                            else:
                                feature_matrices['trigram'][row, trigram_index] += 1

                    prev_prev_token = prev_token
                    prev_token = token
        return feature_matrices[ngram]
        
    def add_additional_features(self, data, feature_matrix, negation):
        """
        Additional features representing number of positive lexicons, number of negative lexicons
        and number of negated words are added to the existing feature matrix built above.
        """
        self.doc2poslex = []
        self.doc2neglex = []
        for row, item in enumerate(data):
            document, _ = item
            doc_pos_tot = 0
            doc_neg_tot = 0
            for sentence in document:
                for token in sentence:
                    if token in self.positive_lexicons:
                        doc_pos_tot += 1
                    if token in self.negative_lexicons:
                        doc_neg_tot += 1
            self.doc2poslex.append(doc_pos_tot)
            self.doc2neglex.append(doc_neg_tot)
        if negation:
            additional_features = numpy.array((self.doc2negate, self.doc2poslex, self.doc2neglex)).T
        else:
            additional_features = numpy.array((self.doc2poslex, self.doc2neglex)).T
        new_feature_matrix = numpy.append(feature_matrix, additional_features, 1)
        return new_feature_matrix

In [34]:
class PolarityPredictorAssignTargets(PolarityPredictorExtractFeatures):
 
    def get_targets(self, data):
        ''' 
        Column vector with target labels is created.
        '''
        targets = numpy.zeros(len(data), dtype=numpy.int8)
        index = 0
        for _, label in data:
            if label == 'pos':
                targets[index] = 1
            index += 1
        return targets

    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

In [35]:
class PolarityPredictor(PolarityPredictorAssignTargets):

    def train_model_on_features(self, tr_features, tr_targets):
        self.model.fit(tr_features, tr_targets)
        
    def predict(self, data, feature, get_accuracy = False, get_confusion_matrix = False):
        
        if self.lemmatise:
            tagged_data = self.get_pos_tags(data)
            lemmatised_data = self.lemmatise_data(tagged_data)
            data = lemmatised_data
        
        if self.negation:
            self.add_negation_to_data(data)
        
        if self.remove_stopwords:
            self.remove_stopwords_from_data(data)
        
        features = self.extract_features(data, feature)
        
        if self.additional_features:
            self.get_sentiment_lexicons()
            features = self.add_additional_features(data, features, self.negation)
            
        y_pred = self.model.predict(features)
        
        labels = []
        for is_positive in y_pred:
            if is_positive:
                labels.append('pos')
            else:
                labels.append('neg')
                
        if get_accuracy or get_confusion_matrix:
            retval = []
            retval.append(labels)
            y_true = self.get_targets(data)
            if get_accuracy:
                retval.append(metrics.accuracy_score(y_true, y_pred))
            if get_confusion_matrix:
                retval.append(metrics.confusion_matrix(y_true, y_pred))
            return retval
        else:
            return labels

# Changes to Code

* We build bigram and trigram sets, along with vocab as before (renamed as the unigram set)
* We build feature matrices for these in the same way as was done with vocab previously
* We have added an option to use lemmatisation and negation
* We have added an option to remove stopwords from the data
* We have added '\<s>' and '\</s>' tags to the data 
* Slight refactor from initial code structure 

# Experiments

## Evaluation Table

Our model builds three different feature representations from the input data:
* Bag-of-Words (Unigrams)
* Bag-of-Bigrams
* Bag-of-Trigrams

The *PolarityPredictor* model takes in a *feature* parameter which specifies which feature of the above to use. There is also a *learning_model* parameter which specifies which particular learning model to use.

The model can also be tweaked to perform the following:
* Clip Counts in feature matrices
* Negation
* Removal of StopWords
* Lemmatisation

We plan to run many different experiments using different feature representations and different learning models.   Therefore, having an evaluation table which contains the details of each experiment and the corresponding evalation results will be useful. 
The following values will be recorded in the dataframe for each model:
* Average 10 Fold Cross-Validation Accuracy
* Root Mean Square Error (RMSE)
* Minimum Accuracy
* Maximum Accuracy

Below, we define table to store these.

In [36]:
evaluation_dataframes = {}
evaluation_dataframes['MultinomialNB'] = pd.DataFrame(columns=['name', 'learning_model', 'features', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])

In [37]:
def evaluate_model(model, splits, feature, verbose = False):
    accuracies = []
    fold = 0
    for tr_data, te_data in splits:
        if verbose:
            print('Evaluating fold %d of %d' %(fold+1, len(splits)))
            fold += 1
        model.train(tr_data, feature)
        _, accuracy = model.predict(te_data, feature, get_accuracy = True)
        accuracies.append(accuracy)
        if verbose:
            print('-->', accuracy)
    n = float(len(accuracies))
    avg = sum(accuracies) / n
    mse = sum([(x-avg)**2 for x in accuracies]) / n
    return (avg, mse**0.5, min(accuracies),
            max(accuracies))

def print_first_predictions(model, te_data, feature, n = 12):
    predictions = model.predict(te_data, feature)
    for i in range(n):
        document, label = te_data[i]
        prediction = predictions[i]
        print('%4d %s %s %s' %(i, label, prediction, get_document_preview(document),))

# 1. Multinomial NB

In [38]:
learning_model = MultinomialNB()

## 1.1 Baseline

We run the baseline approach as a functionality test. The settings used are listed in the cell below.

In [41]:
clip_counts = True
negation = False
remove_stopwords = True
lemmatise = True
additional_features = False
feature = 'unigram'

In [42]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, additional_features, learning_model)
model.train(splits[0][0], feature)

# baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)


labels, accuracy, confusion_matrix = model.predict(splits[0][1], feature, get_accuracy = True, get_confusion_matrix = True)

print(accuracy)
print(confusion_matrix)

0.825
[[85 15]
 [20 80]]


In [21]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
model.train(splits[0][0], feature)

baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['MultinomialNB'].loc[0] = ['baseline-NB-BoW-clip', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

Evaluating fold 1 of 10
--> 0.81
Evaluating fold 2 of 10
--> 0.835
Evaluating fold 3 of 10
--> 0.82
Evaluating fold 4 of 10
--> 0.83
Evaluating fold 5 of 10


KeyboardInterrupt: 

So, the baseline approach achieves an average accuracy score of 82.4% and has been to our evaluation table. The format of this dataframe can be seen below,

In [None]:
evaluation_dataframes

## 1.2 Parameter Experimentation

We will now run experiments using the MultinomialNB model with different parameters.


In [None]:
index = 1
for feature in ['bow', 'bob', 'bot']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                    avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                    evaluation_dataframes['MultinomialNB'].loc[evaluation_dataframes['MultinomialNB'].index.max()+1] = [f'param-exp-NB-{index}', learning_model, feature, clip_counts, 
                                                                      negation, remove_stopwords, lemmatise, avg, rmse, min_acc, max_acc]
                    index += 1

# 2. Logistic Regression

In [None]:
evaluation_dataframes['LogisticRegression'] = pd.DataFrame(columns=['name', 'learning_model', 'features', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])
from sklearn.linear_model import LogisticRegression
learning_model = LogisticRegression()

In [None]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
feature = 'bow' #bag of words

## 2.1 Baseline

In [None]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
model.train(splits[0][0], feature)

baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['LogisticRegression'].loc[0] = ['baseline-LR-BoW-clip', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

## 2.2 Parameter Experimentation

In [None]:
index = 1
for feature in ['bow', 'bob', 'bot']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                    avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                    evaluation_dataframes['LogisticRegression'].loc[evaluation_dataframes['LogisticRegression'].index.max()+1] = [f'param-exp-LR-{index}', learning_model, feature, clip_counts, 
                                                                      negation, remove_stopwords, lemmatise, avg, rmse, min_acc, max_acc]
                    index += 1

In [None]:
evaluation_dataframes['MultinomialNB']

In [None]:
evaluation_dataframes['LogisticRegression']

# 1. Model Name

In [None]:
# change below to suit model
evaluation_dataframes['LogisticRegression'] = pd.DataFrame(columns=['name', 'learning_model', 'features', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])
from sklearn.linear_model import LogisticRegression
learning_model = LogisticRegression()

In [None]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
feature = 'unigram'

## 1.1 Baseline