# Sentiment Polarity Prediction with Naive Bayes

This notebook contains a basic implementation of document-level sentiment analysis
for movie reviews with multinomial Naive Bayes and bag-of-words features
and of cross-validation.
* No special treatment of rare or unknown words. Unknown words in the test data are skipped.

We use the movie review polarity data set of Pang and Lee 2004 [A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts](https://www.aclweb.org/anthology/P04-1035/) in Version 2.0 available from http://www.cs.cornell.edu/People/pabo/movie-review-data (section "Sentiment polarity datasets"). This dataset contains 1000 positive and 1000 negative reviews, each tokenised, sentence-split (one sentence per line) and lowercased. Each review has been assigned to 1 of 10 cross-validation folds by the authors and this setup should be followed to compare with published results.


In [1]:
import os
import tarfile
import time
import urllib.request
import numpy
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
data_source = 'local-folder'
data_folder = os.path.join('data', 'txt_sentoken')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\carro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\carro\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [2]:
class PL04DataLoader_Part_1:
    
    def __init__(self):
        pass
    
    def get_labelled_dataset(self, fold = 0):
        ''' Compile a fold of the data set
        '''
        dataset = []
        for label in ('pos', 'neg'):
            for document in self.get_documents(
                fold = fold,
                label = label,
            ):
                dataset.append((document, label))
        return dataset
    
    def get_documents(self, fold = 0, label = 'pos'):
        ''' Enumerate the raw contents of all data set files.
            Args:
                data_dir: relative or absolute path to the data set folder
                fold: which fold to load (0 to n_folds-1)
                label: 'pos' or 'neg' to
                    select data with positive or negative sentiment
                    polarity
            Return:
                List of tokenised documents, each a list of sentences
                that in turn are lists of tokens
        '''
        raise NotImplementedError

In [3]:
class PL04DataLoader(PL04DataLoader_Part_1):
    
    def get_xval_splits(self):
        ''' Split data with labels for cross-validation
            returns a list of k pairs (training_data, test_data)
            for k cross-validation
        '''
        # load the folds
        folds = []
        for i in range(10):
            folds.append(self.get_labelled_dataset(
                fold = i
            ))
        # create training-test splits
        retval = []
        for i in range(10):
            test_data = folds[i]
            training_data = []
            for j in range(9):
                ij1 = (i+j+1) % 10
                assert ij1 != i
                training_data = training_data + folds[ij1]
            retval.append((training_data, test_data))
        return retval

In [4]:
class PL04DataLoaderFromStream(PL04DataLoader):
        
    def __init__(self, tgz_stream, **kwargs):
        super().__init__(**kwargs)
        self.data = {}
        counter = 0
        with tarfile.open(
            mode = 'r|gz',
            fileobj = tgz_stream
        ) as tar_archive:
            for tar_member in tar_archive:
                if counter == 2000:
                    break
                path_components = tar_member.name.split('/')
                filename = path_components[-1]
                if filename.startswith('cv') \
                and filename.endswith('.txt') \
                and '_' in filename:
                    label = path_components[-2]
                    fold = int(filename[2])
                    key = (fold, label)
                    if key not in self.data:
                        self.data[key] = []
                    f = tar_archive.extractfile(tar_member)
                    document = [
                        line.decode('utf-8').split()
                        for line in f.readlines()
                    ]
                    self.data[key].append(document)
                    counter += 1
            
    def get_documents(self, fold = 0, label = 'pos'):
        return self.data[(fold, label)]

## Read Data from the Web
This should run efficiently both on google colab and locally but has the disadvantage that the same data is downloaded each time the notebook is run.

In [5]:
class PL04DataLoaderFromURL(PL04DataLoaderFromStream):
    
    def __init__(self, data_url, **kwargs):
        with urllib.request.urlopen(data_url) as tgz_stream:
            super().__init__(tgz_stream, **kwargs)

## Read Data from a Local .tgz File

You manually download the .tgz once to a filesystem that can be accessed from the notebook, e.g. google drive on colab, and this notebook reads this file in one chunk. 

Note that if you are accessing files from google drive on colab, you will need to mount your drive and enter an authentication token:

```
from google.colab import drive
drive.mount('/content/drive')
```

You will also have to change your *data_tgz* or *data_folder* paths above so that they start with *'/content/drive/My Drive/'*

In [6]:
class PL04DataLoaderFromTGZ(PL04DataLoaderFromStream):
    
    def __init__(self, data_path, **kwargs):
        with open(data_path, 'rb') as tgz_stream:
            super().__init__(tgz_stream, **kwargs)

## Read Data from a Local Folder

Extract the .tgz to a local folder and only load the required files. This is usually the fastest option when storage is on a local SSD. On remote filesystems, however, this can be very slow.

In [7]:
class PL04DataLoaderFromFolder(PL04DataLoader):
    
    def __init__(self, data_dir, **kwargs):
        self.data_dir = data_dir
        super().__init__(**kwargs)
        
    def get_documents(self, fold = 0, label = 'pos'):
        # read folder contents
        path = os.path.join(self.data_dir, label)
        dir_entries = os.listdir(path)
        # must process entries in numeric order to
        # replicate order of original experiments
        dir_entries.sort()
        # check each entry and add to data if matching
        # selection criteria
        for filename in dir_entries:
            if filename.startswith('cv') \
            and filename.endswith('.txt'):
                if fold == int(filename[2]):
                    # correct fold
                    f = open(os.path.join(path, filename), 'rt')
                    # "yield" tells Python to return an iterator
                    # object that produces the yields of this
                    # function as elements without creating a
                    # full list of all elements
                    yield [line.split() for line in f.readlines()]
                    f.close()

In [8]:
if data_source == 'local-folder':
    data_loader = PL04DataLoaderFromFolder(data_folder)
elif data_source == 'local-tgz':
    data_loader = PL04DataLoaderFromTGZ(data_tgz)
elif data_source == 'web':
    data_loader = PL04DataLoaderFromURL(data_url)
else:
    raise ValueError('Unsupported data source %r' %data_source)

In [9]:
def get_document_preview(document, max_length = 72):
    s = []
    count = 0
    reached_limit = False
    for sentence in document:
        for token in sentence:
            if count + len(token) + len(s) > max_length:
                reached_limit = True
                break
            s.append(token)
            count += len(token)
        if reached_limit:
            break
    return '|'.join(s)
    
for label in 'pos neg'.split():
    print(f'== {label} ==')
    print('doc sentences start of first sentence')
    for index, document in enumerate(data_loader.get_documents(
        label = label
    )):
        print('%3d %7d   %s' %(
            index, len(document), get_document_preview(document)
        ))
        if index == 4:
            break

== pos ==
doc sentences start of first sentence
  0      25   films|adapted|from|comic|books|have|had|plenty|of|success|,|whether
  1      39   every|now|and|then|a|movie|comes|along|from|a|suspect|studio|,|with
  2      19   you've|got|mail|works|alot|better|than|it|deserves|to|.|in|order|to|make
  3      42   "|jaws|"|is|a|rare|film|that|grabs|your|attention|before|it|shows|you|a
  4      25   moviemaking|is|a|lot|like|being|the|general|manager|of|an|nfl|team|in
== neg ==
doc sentences start of first sentence
  0      35   plot|:|two|teen|couples|go|to|a|church|party|,|drink|and|then|drive|.
  1      13   the|happy|bastard's|quick|movie|review|damn|that|y2k|bug|.|it's|got|a
  2      23   it|is|movies|like|these|that|make|a|jaded|movie|viewer|thankful|for|the
  3      19   "|quest|for|camelot|"|is|warner|bros|.|'|first|feature-length|,
  4      37   synopsis|:|a|mentally|unstable|man|undergoing|psychotherapy|saves|a|boy


## Create Training-Test Splits for Cross-Validation

In [10]:
splits = data_loader.get_xval_splits()

print('tr-size te-size (number of documents)')
for xval_tr_data, xval_te_data in splits:
    print('%7d %7d' %(len(xval_tr_data), len(xval_te_data)))

tr-size te-size (number of documents)
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200
   1800     200


# Interface for Sentiment Polarity Predictor
Let's define a base class to clarify how we plan to use polarity predictors. Its functions will have to be implemented in sub-classes.

In [12]:
class PolarityPredictorInterface:

    def train(self, data_with_labels):
        raise NotImplementedError
        
    def predict(self, data):
        raise NotImplementedError

In [13]:
class PolarityPredictorInit(PolarityPredictorInterface):
    
    def train(self, data_with_labels, feature):
        """
        Function which trains model. Extracts features from extract_features function
        (changes for different features). Gets targets also and passes both to training function.
        """
        
        # Initialise vocab set object
        self.reset_feature_sets()
        # negate first then remove stop words? probably makes most sense
        if self.lemmatise:
            self.lemmatise_data(data_with_labels)
        
        if self.negation:
            self.add_negation_to_data(data_with_labels)
            
        if self.remove_stopwords:
            self.get_stopwords()
            self.remove_stopwords_from_data(data_with_labels)
            
        # Populate with the data
        self.add_to_feature_sets_from_data(data_with_labels)
        
        self.finalise_vocab()
        tr_features = self.extract_features(data_with_labels, feature)
        tr_targets = self.get_targets(data_with_labels)
        self.train_model_on_features(tr_features, tr_targets)

    def reset_feature_sets(self):
        """
        Initialises a set to hold each of the feature sets.
        """
        self.vocab = set()
        self.bigrams = set()
        self.trigrams = set()

    def lemmatise_data(self, data):
        """
        Function which lemmatises each token in the dataset.
        """
        lemmatizer = nltk.stem.WordNetLemmatizer()
        for document, label in data:
            for sentence in document:
                for index, token in enumerate(sentence):
                    sentence[index] = lemmatizer.lemmatize(token)
    
    def add_negation_to_data(self, data):
        """
        Function which negates words which follow not, no or 'n't'.
        """
        for document, label in data:
            for sentence in document:
                negate = False
                for index, token in enumerate(sentence):
                    if token in ('not', 'no') or (token[-3:] == "n't"):
                        negate = True
                        continue
                    if token == '.':
                        negate = False
                    if negate:
                        sentence[index] = 'NOT_' + token
    
    def remove_stopwords_from_data(self, data):
        """
        Function which removes stop words from the dataset.
        """
        for document, label in data:
            for sentence in document:
                stopword_indices = []
                for index, token in enumerate(sentence):
                    if token in self.stopwords:
                        stopword_indices.append(index)
                stopword_indices.reverse()
                for index in stopword_indices:
                    del sentence[index]
    
    def get_stopwords(self):
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        
    def add_to_feature_sets_from_data(self, data):
        """
        Parses tokens in data and adds them to each feature set.
        """
        for document, label in data:
            for sentence in document:
#                 sentence.insert(0, '<s>')
#                 sentence.append('</s>')
                prev_token = None
                for index, token in enumerate(sentence):
                    self.vocab.add(token)
                    if index > 0:
                        bigram = (prev_token, token)
                        self.bigrams.add(bigram)
                    if index > 1:
                        trigram = (prev_prev_token, prev_token, token)
                        self.trigrams.add(trigram)
                    prev_prev_token = prev_token
                    prev_token = token
                        
    def finalise_vocab(self):
        """
        Creates a dict for the feature sets for faster operations.
        """
        self.vocab = list(self.vocab)
        # create reverse map for fast token lookup
        self.vocab2index = {}
        for index, token in enumerate(self.vocab):
            self.vocab2index[token] = index
            
        self.bigrams = list(self.bigrams)
        # create reverse map for fast token lookup
        self.bigram2index = {}
        for index, token in enumerate(self.bigrams):
            self.bigram2index[token] = index
            
        self.trigrams = list(self.trigrams)
        # create reverse map for fast token lookup
        self.trigram2index = {}
        for index, token in enumerate(self.trigrams):
            self.trigram2index[token] = index
        
        
    def extract_features(self, data, feature):
        raise NotImplementedError
    
    def get_targets(self, data, label2index = None):
        raise NotImplementedError
        
    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

In [13]:
class PolarityPredictorExtractFeatures(PolarityPredictorInit):
    
    def __init__(self, clip_counts = True, negation=False, remove_stopwords=False, lemmatise=False, learning_model=MultinomialNB()):
        self.clip_counts = clip_counts
        self.negation = negation
        self.remove_stopwords = remove_stopwords
        self.model = learning_model
        self.lemmatise = lemmatise
        
    def extract_features(self, data, ngram):
        """
        Creates features from the data. This implementation creates a dict which contains the relevant feature
        matrices for different feature implementations.
        """
        feature_matrices = {}            
#         for feature in ['bow', 'bob', 'bot']:
        rows = len(data)
        # Initialise a feature matrix with zeros
        feature_matrices['bow'] = numpy.zeros((rows, len(self.vocab)), dtype=numpy.int32)
        feature_matrices['bob'] = numpy.zeros((rows, len(self.bigrams)), dtype=numpy.int32)
        feature_matrices['bot'] = numpy.zeros((rows, len(self.trigrams)), dtype=numpy.int32)
        # populate feature matrix
        for row, item in enumerate(data):
            document, _ = item
            for sentence in document:
#                     if sentence[0] != '<s>':
#                         sentence.insert(0, '<s>')
#                     if sentence[-1] != '</s>':
#                         sentence.append('</s>')
                prev_token = None
                for idx, token in enumerate(sentence):
                    # word
                    try:
                        bow_index = self.vocab2index[token]
                    except KeyError:
                        continue
                    if self.clip_counts:
                        feature_matrices['bow'][row, bow_index] = 1
                    else:
                        feature_matrices['bow'][row, bow_index] += 1
                    # bigram
                    if idx > 0:
                        bigram = (prev_token, token)
                        try:
                            bob_index = self.bigram2index[bigram]
                        except KeyError:
                            continue
                        if self.clip_counts:
                            feature_matrices['bob'][row, bob_index] = 1
                        else:
                            feature_matrices['bob'][row, bob_index] += 1
                    # trigram
                    if idx > 1:
                        trigram = (prev_prev_token, prev_token, token)
                        try:
                            bot_index = self.trigram2index[trigram]
                        except KeyError:
                            continue
                        if self.clip_counts:
                            feature_matrices['bot'][row, bot_index] = 1
                        else:
                            feature_matrices['bot'][row, bot_index] += 1

                    prev_prev_token = prev_token
                    prev_token = token
        if ngram == 'bow':
            return feature_matrices['bow']
        if ngram == 'bob':
            return feature_matrices['bob']
        if ngram == 'bot':
            return feature_matrices['bot']

In [14]:
class PolarityPredictorAssignTargets(PolarityPredictorExtractFeatures):
 
    def get_targets(self, data):
        ''' create column vector with target labels
        '''
        # prepare target vector
        targets = numpy.zeros(len(data), dtype=numpy.int8)
        index = 0
        for _, label in data:
            if label == 'pos':
                targets[index] = 1
            index += 1
        return targets

    def train_model_on_features(self, tr_features, tr_targets):
        raise NotImplementedError

In [15]:
class PolarityPredictor(PolarityPredictorAssignTargets):

    def train_model_on_features(self, tr_features, tr_targets):
        # pass numpy array to sklearn to train 
        self.model.fit(tr_features, tr_targets)
        
    def predict(self, data, feature, get_accuracy = False, get_confusion_matrix = False):
        if self.lemmatise:
            self.lemmatise_data(data)
        if self.negation:
            self.add_negation_to_data(data)
        if self.remove_stopwords:
            self.remove_stopwords_from_data(data)

        # Extract features from unseen data
        features = self.extract_features(data, feature)
        # use numpy to get predictions
        y_pred = self.model.predict(features)
        # restore labels
        labels = []
        for is_positive in y_pred:
            if is_positive:
                labels.append('pos')
            else:
                labels.append('neg')
        if get_accuracy or get_confusion_matrix:
            retval = []
            retval.append(labels)
            y_true = self.get_targets(data)
            if get_accuracy:
                retval.append(
                    metrics.accuracy_score(y_true, y_pred)
                )
            if get_confusion_matrix:
                retval.append(
                    metrics.confusion_matrix(y_true, y_pred)
                )
            return retval
        else:
            return labels

# Experiments

## Evaluation Table

Our model builds three different feature representations from the input data:
* Bag-of-Words (Unigrams)
* Bag-of-Bigrams
* Bag-of-Trigrams

The *PolarityPredictor* model takes in a *feature* parameter which specifies which feature set of the above to use. There is also a *learning_model* parameter which specifies which particular learning model to use.

The model can also be tweaked to perform the following:
* Clip Counts in feature matrices
* Negation
* Removal of StopWords
* Lemmatisation

We plan to run many different experiments using different feature representations and different learning models.   Therefore, having an evaluation table which contains the details of each experiment and the corresponding evalation results will be useful. 
The following values will be recorded in the dataframe for each model:
* Average 10 Fold Cross-Validation Accuracy
* Root Mean Square Error (RMSE)
* Minimum Accuracy
* Maximum Accuracy

Below, we define table to store these.

In [16]:
evaluation_dataframes = {}
evaluation_dataframes['MultinomialNB'] = pd.DataFrame(columns=['name', 'learning_model', 'features', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])

In [17]:
def evaluate_model(model, splits, feature, verbose = False):
    accuracies = []
    fold = 0
    for tr_data, te_data in splits:
        if verbose:
            print('Evaluating fold %d of %d' %(fold+1, len(splits)))
            fold += 1
        model.train(tr_data, feature)
        _, accuracy = model.predict(te_data, feature, get_accuracy = True)
        accuracies.append(accuracy)
        if verbose:
            print('-->', accuracy)
    n = float(len(accuracies))
    avg = sum(accuracies) / n
    mse = sum([(x-avg)**2 for x in accuracies]) / n
    return (avg, mse**0.5, min(accuracies),
            max(accuracies))

def print_first_predictions(model, te_data, feature, n = 12):
    predictions = model.predict(te_data, feature)
    for i in range(n):
        document, label = te_data[i]
        prediction = predictions[i]
        print('%4d %s %s %s' %(i, label, prediction, get_document_preview(document),))

# 1. Multinomial NB

In [18]:
learning_model = MultinomialNB()

## 1.1 Baseline

We run the baseline approach as a functionality test. The settings used are listed in the cell below.

In [19]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
feature = 'bow' #bag of words

In [20]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
model.train(splits[0][0], feature)

baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['MultinomialNB'].loc[0] = ['baseline-NB-BoW-clip', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

Evaluating fold 1 of 10
--> 0.795
Evaluating fold 2 of 10
--> 0.84
Evaluating fold 3 of 10
--> 0.84
Evaluating fold 4 of 10
--> 0.825
Evaluating fold 5 of 10
--> 0.835
Evaluating fold 6 of 10
--> 0.83
Evaluating fold 7 of 10
--> 0.84
Evaluating fold 8 of 10
--> 0.845
Evaluating fold 9 of 10
--> 0.785
Evaluating fold 10 of 10
--> 0.855


So, the baseline approach achieves an average accuracy score of 82.4% and has been to our evaluation table. The format of this dataframe can be seen below,

In [21]:
evaluation_dataframes

{'MultinomialNB':                    name   learning_model features clip_counts negation  \
 0  baseline-NB-BoW-clip  MultinomialNB()      bow        True    False   
 
   remove_stopwords lemmatise  avg_cv_acc      rmse  min_acc  max_acc  
 0            False     False       0.829  0.021071    0.785    0.855  }

## 1.2 Parameter Experimentation

We will now run experiments using the MultinomialNB model with different parameters.


In [22]:
index = 1
for feature in ['bow', 'bob', 'bot']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                    avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                    evaluation_dataframes['MultinomialNB'].loc[evaluation_dataframes['MultinomialNB'].index.max()+1] = [f'param-exp-NB-{index}', learning_model, feature, clip_counts, 
                                                                      negation, remove_stopwords, lemmatise, avg, rmse, min_acc, max_acc]
                    index += 1

# 2. Logistic Regression

In [23]:
evaluation_dataframes['LogisticRegression'] = pd.DataFrame(columns=['name', 'learning_model', 'features', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])

from sklearn.linear_model import LogisticRegression
learning_model = LogisticRegression()

In [24]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
feature = 'bow' #bag of words

## 2.1 Baseline

In [25]:
model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
model.train(splits[0][0], feature)

baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc = evaluate_model(model, splits, feature, verbose = True)
evaluation_dataframes['LogisticRegression'].loc[0] = ['baseline-LR-BoW-clip', learning_model, feature, clip_counts, negation, remove_stopwords, lemmatise, baseline_avg, baseline_rmse, baseline_min_acc, baseline_max_acc]

Evaluating fold 1 of 10
--> 0.82
Evaluating fold 2 of 10
--> 0.845
Evaluating fold 3 of 10
--> 0.865
Evaluating fold 4 of 10
--> 0.84
Evaluating fold 5 of 10
--> 0.875
Evaluating fold 6 of 10
--> 0.855
Evaluating fold 7 of 10
--> 0.855
Evaluating fold 8 of 10
--> 0.845
Evaluating fold 9 of 10
--> 0.83
Evaluating fold 10 of 10
--> 0.9


## 2.2 Parameter Experimentation

In [26]:
index = 1
for feature in ['bow', 'bob', 'bot']:
    for clip_counts in (True, False):
        for negation in (True, False):
            for remove_stopwords in (True, False):
                for lemmatise in (True, False):
                    model = PolarityPredictor(clip_counts, negation, remove_stopwords, lemmatise, learning_model)
                    avg, rmse, min_acc, max_acc = evaluate_model(model, splits, feature, verbose = False)
                    evaluation_dataframes['LogisticRegression'].loc[evaluation_dataframes['LogisticRegression'].index.max()+1] = [f'param-exp-LR-{index}', learning_model, feature, clip_counts, 
                                                                      negation, remove_stopwords, lemmatise, avg, rmse, min_acc, max_acc]
                    index += 1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [28]:
evaluation_dataframes['MultinomialNB']

Unnamed: 0,name,learning_model,features,clip_counts,negation,remove_stopwords,lemmatise,avg_cv_acc,rmse,min_acc,max_acc
0,baseline-NB-BoW-clip,MultinomialNB(),bow,True,False,False,False,0.829,0.021071,0.785,0.855
1,param-exp-NB-1,MultinomialNB(),bow,True,True,True,True,0.8245,0.020427,0.775,0.85
2,param-exp-NB-2,MultinomialNB(),bow,True,True,True,False,0.8245,0.020427,0.775,0.85
3,param-exp-NB-3,MultinomialNB(),bow,True,True,False,True,0.8245,0.020427,0.775,0.85
4,param-exp-NB-4,MultinomialNB(),bow,True,True,False,False,0.8245,0.020427,0.775,0.85
5,param-exp-NB-5,MultinomialNB(),bow,True,False,True,True,0.8245,0.020427,0.775,0.85
6,param-exp-NB-6,MultinomialNB(),bow,True,False,True,False,0.8245,0.020427,0.775,0.85
7,param-exp-NB-7,MultinomialNB(),bow,True,False,False,True,0.8245,0.020427,0.775,0.85
8,param-exp-NB-8,MultinomialNB(),bow,True,False,False,False,0.8245,0.020427,0.775,0.85
9,param-exp-NB-9,MultinomialNB(),bow,False,True,True,True,0.809,0.034191,0.745,0.85


In [29]:
evaluation_dataframes['LogisticRegression']

Unnamed: 0,name,learning_model,features,clip_counts,negation,remove_stopwords,lemmatise,avg_cv_acc,rmse,min_acc,max_acc
0,baseline-LR-BoW-clip,LogisticRegression(),bow,True,False,False,False,0.853,0.021817,0.82,0.9
1,param-exp-LR-1,LogisticRegression(),bow,True,True,True,True,0.853,0.021817,0.82,0.9
2,param-exp-LR-2,LogisticRegression(),bow,True,True,True,False,0.853,0.021817,0.82,0.9
3,param-exp-LR-3,LogisticRegression(),bow,True,True,False,True,0.853,0.021817,0.82,0.9
4,param-exp-LR-4,LogisticRegression(),bow,True,True,False,False,0.853,0.021817,0.82,0.9
5,param-exp-LR-5,LogisticRegression(),bow,True,False,True,True,0.853,0.021817,0.82,0.9
6,param-exp-LR-6,LogisticRegression(),bow,True,False,True,False,0.853,0.021817,0.82,0.9
7,param-exp-LR-7,LogisticRegression(),bow,True,False,False,True,0.853,0.021817,0.82,0.9
8,param-exp-LR-8,LogisticRegression(),bow,True,False,False,False,0.853,0.021817,0.82,0.9
9,param-exp-LR-9,LogisticRegression(),bow,False,True,True,True,0.8385,0.017037,0.82,0.87


# 3. Decision Trees

In [11]:
evaluation_dataframes['DecisionTree'] = pd.DataFrame(columns=['name', 'learning_model', 'features', 'clip_counts', 'negation', 'remove_stopwords', 'lemmatise', 'avg_cv_acc', 'rmse', 'min_acc', 'max_acc'])

from sklearn.tree import DecisionTreeClassifier
learning_model = DecisionTreeClassifier()

NameError: name 'evaluation_dataframes' is not defined

In [None]:
clip_counts = True
negation = False
remove_stopwords = False
lemmatise = False
feature = 'bow' #bag of words