# Naive Bayes Classification with MPQA Dataset
<hr>

We will build a text classification model using Naive Bayes on the Opinion Polarity Detection subtask of the MPQA Dataset. Since there is no standard train/test split for this dataset, we will use 10-Fold Cross Validation (CV). 

## Load the library

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import random
from nltk.corpus import stopwords, twitter_samples
# from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import KFold
from nltk.stem import PorterStemmer
from string import punctuation
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
import time

%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False
# nltk.download('twitter_samples')

## Load the Dataset

In [3]:
# Load the pickled MPQA corpus from a path relative to this notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
corpus = pd.read_pickle('../0_data/MPQA/MPQA.pkl')
# Store the labels as integers
corpus.label = corpus.label.astype(int)
print(corpus.shape)
# Display the dataframe as the cell output
corpus

(10606, 3)


Unnamed: 0,sentence,label,split
0,complaining,0,train
1,failing to support,0,train
2,desperately needs,0,train
3,many years of decay,0,train
4,no quick fix,0,train
...,...,...,...
10601,urged,1,train
10602,strictly abide,1,train
10603,hope,1,train
10604,strictly abide,1,train


In [4]:
# Column dtypes and non-null counts
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10606 entries, 0 to 10605
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  10606 non-null  object
 1   label     10606 non-null  int32 
 2   split     10606 non-null  object
dtypes: int32(1), object(2)
memory usage: 207.3+ KB


In [5]:
# Class balance: number of rows per label
corpus.groupby( by='label').count()

Unnamed: 0_level_0,sentence,split
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,7294,7294
1,3312,3312


In [7]:
# Separate the sentences and the labels into two parallel lists
sentences, labels = list(corpus.sentence), list(corpus.label)

## Raw Number of Vocabulary

In [8]:
# Build the raw vocabulary (before any cleaning/stemming) for a first inspection
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# word_index maps each word seen by the tokenizer to an integer index
vocab_raw = tokenizer.word_index
print('\nThe vocabulary size: {}\n'.format(len(vocab_raw)))
# Prints the whole word -> index mapping
print(vocab_raw)


The vocabulary size: 6234



<!--## Split Dataset-->

# Data Preprocessing
<hr>

## Define `clean_doc` function

In [9]:
# NOTE: the next two lines rebind the imported `stopwords` name to the
# English stop-word list that clean_doc filters against
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
# Porter stemmer instance shared by every clean_doc call
stemmer = PorterStemmer()
    
def clean_doc(doc):
    '''
    Turn a raw sentence into a list of stemmed tokens.

    Input:
        doc: a string
    Output:
        tokens: a list of stemmed tokens with punctuation, stop words and
                empty strings removed
    '''
    # lower-case before tokenizing: NLTK's English stop words are lower-case,
    # so a capitalised stop word such as "The" would otherwise pass the filter
    # below and only be lower-cased afterwards by the stemmer
    tokens = doc.lower().split()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(punctuation))
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if w not in stopwords]
    # drop tokens that became empty once their punctuation was stripped
    tokens = [word for word in tokens if len(word) >= 1]
    # Stem the token
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

## Define `count_docs` function

In [10]:
def count_docs(data, docs, ys):
    '''
    Count, per class, how many times each cleaned token occurs.

    Input:
        data: a dictionary mapping token -> per-class count array; it is
              updated in place (pass {} to start from scratch)
        docs: a list of sentences
        ys: a list with the label of each sentence
    Output:
        result: the same dictionary, where data[token][i] is the number of
                occurrences of token in sentences whose label is the i-th
                smallest distinct value in ys
    '''
    # One-hot encode the labels with NumPy (columns ordered by sorted label).
    # This avoids OneHotEncoder(sparse=False), whose `sparse` argument was
    # renamed to `sparse_output` and no longer exists in recent scikit-learn.
    classes, class_idx = np.unique(np.asarray(ys).ravel(), return_inverse=True)
    y_onehot = np.eye(len(classes))[class_idx]

    for doc, y in zip(docs, y_onehot):
        # For each word
        for word in clean_doc(doc):
            if word in data:
                # already seen: add this sentence's class indicator
                data[word] = data[word] + y
            else:
                # first occurrence: store a copy so the entry never aliases y_onehot
                data[word] = y.copy()

    return data

## Build Frequencies Dictionary

In [11]:
# Build the freqs dictionary for later uses
freqs = count_docs({}, sentences, labels)
freqs

{'complain': array([11.,  2.]),
 'fail': array([33.,  3.]),
 'support': array([ 26., 238.]),
 'desper': array([19.,  0.]),
 'need': array([12.,  5.]),
 'mani': array([18.,  2.]),
 'year': array([18.,  3.]),
 'decay': array([2., 0.]),
 'quick': array([3., 2.]),
 'fix': array([2., 1.]),
 'happi': array([9., 8.]),
 'long': array([20.,  8.]),
 'pain': array([8., 1.]),
 'nobodi': array([3., 0.]),
 'complic': array([7., 1.]),
 'process': array([7., 1.]),
 'decad': array([3., 1.]),
 'dramat': array([6., 0.]),
 'econom': array([13.,  9.]),
 'declin': array([14.,  0.]),
 'suffer': array([26.,  0.]),
 'would': array([113.,  75.]),
 'sure': array([7., 3.]),
 'higher': array([3., 2.]),
 'still': array([16., 14.]),
 'forbidden': array([1., 0.]),
 'intox': array([1., 0.]),
 'far': array([26.,  6.]),
 'perfect': array([1., 1.]),
 'sour': array([1., 0.]),
 'relat': array([5., 7.]),
 'fell': array([2., 1.]),
 'mount': array([3., 0.]),
 'rivalri': array([1., 0.]),
 'hostil': array([14.,  0.]),
 'nation'

In [12]:
# convert the freqs dictionary to nested list
def freqs_to_df(freqs, train_y):
    '''
    Tabulate a frequencies dictionary as a dataframe indexed by word.

    input:
        freqs: a frequencies dictionary (ex: {'simplist': array([15.,  4.]),
                                              'silli': array([64., 20.]), . . })
        train_y: labels for data; their sorted distinct values name the columns
    output:
        a dataframe with one row per word (the index, named 'word') and one
        count column per distinct label
    '''
    # 'word' first, then one column per distinct label
    header = ['word', *np.unique(train_y)]

    # one record per dictionary entry: the word followed by its per-class counts
    records = [[term, *counts] for term, counts in freqs.items()]

    table = pd.DataFrame(records, columns=header)
    return table.set_index('word')

In [13]:
# Tabulate the counts: one row per token, one column per label
freqs_df = freqs_to_df(freqs, labels)
freqs_df

Unnamed: 0_level_0,0,1
word,Unnamed: 1_level_1,Unnamed: 2_level_1
complain,11.0,2.0
fail,33.0,3.0
support,26.0,238.0
desper,19.0,0.0
need,12.0,5.0
...,...,...
nich,0.0,2.0
fairer,0.0,2.0
window,0.0,3.0
soften,0.0,1.0


In [14]:
# Sum of the label-0 column: total token occurrences counted for label 0
freqs_df[0].sum()

14903.0

# Training and Testing the Model

## Build Training Function

In [15]:
def train_naive_bayes(freq_df, train_x, train_y):
    '''
    Estimate the Naive Bayes parameters from per-class word counts.

    Input:
        freq_df: a pandas dataframe indexed by word with one column of raw
                 counts per class; it is left unmodified
        train_x: a list of training sentences (not used by the computation;
                 kept so existing calls keep working)
        train_y: the labels corresponding to train_x (list or numpy array)
    Output:
        logprior: a numpy array holding log P(class) for each distinct label,
                  ordered by sorted label value
        loglikelihood: a dataframe shaped like freq_df holding
                       log P(word | class) with add-one (Laplace) smoothing
    '''
    # V: the number of unique words in the vocabulary
    V = len(freq_df.index)

    ########################################################################################
    # Part 1: Calculate the log prior probability for each class

    # accept lists as well as arrays
    train_y = np.asarray(train_y).astype(int)

    # D: the number of documents
    D = len(train_y)

    # documents per class; counting with np.unique works for any label values,
    # not only labels that happen to be 0..K-1
    _, class_counts = np.unique(train_y, return_counts=True)
    logprior = np.log(class_counts / D)

    ########################################################################################
    # Part 2: Calculate the smoothed log likelihood of each word given each class

    # (count + 1) / (total count of the class + V), computed on a new frame so
    # that the caller's count table is not overwritten with probabilities
    smoothed = (freq_df + 1) / (freq_df.sum(axis=0) + V)
    loglikelihood = np.log(smoothed)

    ########################################################################################

    return logprior, loglikelihood

In [16]:
# Convert the label list to a NumPy array before training
labels = np.array(labels)
# Fit on the full corpus: per-class log priors and per-word log likelihoods
logprior, loglikelihood = train_naive_bayes(freqs_df, sentences, labels)
print(logprior)
loglikelihood

[-0.37436779 -1.16386764]


Unnamed: 0_level_0,0,1
word,Unnamed: 1_level_1,Unnamed: 2_level_1
complain,-7.370755,-8.107620
fail,-6.329301,-7.819938
support,-6.559825,-3.729768
desper,-6.859930,-9.206232
need,-7.290713,-7.414472
...,...,...
nich,-9.855662,-8.107620
fairer,-9.855662,-8.107620
window,-9.855662,-7.819938
soften,-9.855662,-8.513085


## Build Testing Function

In [17]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score one sentence against every class and pick the best one.

    input:
        tweet: a string
        logprior: array with the log prior of each class
        loglikelihood: a dataframe indexed by word with one column of
                       log likelihoods per class
    output:
        probs: for each class, logprior plus the summed log likelihood of the
               words of the tweet that are found in the dataframe
        y_hat: position of the largest entry of probs
    '''
    # keep only the cleaned tokens the model knows about
    known_tokens = [token for token in clean_doc(tweet)
                    if token in loglikelihood.index]

    class_scores = []
    for label in loglikelihood.columns:
        total = 0
        for token in known_tokens:
            total += loglikelihood.loc[token, label]
        class_scores.append(total)

    probs = logprior + class_scores
    return probs, np.argmax(probs)

In [18]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of tweets
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    output:
        accuracy: (# of tweets classified correctly)/total # of tweets
    """
    # initial accuracy
    acc = 0
    
    # initialize an empty list for storing the predictions
    y_hats = []

    for tweet in test_x:
        
        _ , y = naive_bayes_predict(tweet, logprior, loglikelihood)
        
#         y_hat = np.argmax(probs)
        
        # update the y_hats
        y_hats.append(y)
        
#     Error: the mean absolute values between y_hats and test_y
    error = np.mean(np.abs(np.array(y_hats)-np.array(test_y)))
        
#     Accuracy is 1 - error
    acc = 1-error
    return acc    

## KFold CV

In [19]:
###############################################
# Training and Testing using the same Dataset #
###############################################
# NOTE: the model is scored on the very sentences it was trained on, so the
# number printed at the end is a training accuracy, not a held-out estimate.

# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), np.array(list(corpus.label))

# Build the freqs dictionary for later uses
freqs = count_docs({}, sentences, labels)

# Turn the frequencies dictionary into dataframe
freqs_df = freqs_to_df(freqs, labels)
print(freqs_df.head())

# Retrieve the logprior and loglikelihood
logprior, loglikelihood = train_naive_bayes(freqs_df, sentences, labels)
print(logprior)
print(loglikelihood.head())

# Accuracy over the same sentences/labels that were used for training
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(sentences, labels, logprior, loglikelihood)))

             0      1
word                 
complain  11.0    2.0
fail      33.0    3.0
support   26.0  238.0
desper    19.0    0.0
need      12.0    5.0
[-0.37436779 -1.16386764]
                 0         1
word                        
complain -7.370755 -8.107620
fail     -6.329301 -7.819938
support  -6.559825 -3.729768
desper   -6.859930 -9.206232
need     -7.290713 -7.414472
Naive Bayes accuracy = 0.8995


In [20]:
#######################################################
# Training and Testing using 10-fold Cross Validation #
#######################################################

# prepare cross validation
# `shuffle` has to be passed by keyword (it is keyword-only in current
# scikit-learn); the fixed random_state makes the folds, and therefore the
# reported accuracies, reproducible from run to run
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), list(corpus.label)

acc_list = []
# kfold.split() yields the train and test indices of each fold
for train, test in kfold.split(sentences):
    train_x = [sentences[i] for i in train]
    train_y = np.array([labels[i] for i in train])

    test_x = [sentences[i] for i in test]
    test_y = np.array([labels[i] for i in test])

    # Build the freqs dictionary from the training fold only
    freqs = count_docs({}, train_x, train_y)

    # Turn the frequencies dictionary into dataframe
    freqs_df = freqs_to_df(freqs, train_y)

    # Retrieve the logprior and loglikelihood
    logprior, loglikelihood = train_naive_bayes(freqs_df, train_x, train_y)
    print('loglikelihood:\n{}'.format(loglikelihood.head()))
    print('logprior: {}'.format(logprior))

    # Score the held-out fold
    acc = test_naive_bayes(test_x, test_y, logprior, loglikelihood)
    print("Naive Bayes test accuracy = %0.4f\n" %(acc))

    acc_list.append(acc)

acc_list = np.array(acc_list)
print()
print('The test ccuracy for each training:\n{}'.format(acc_list))
print('The mean of the test accuracy: ', acc_list.mean())



loglikelihood:
                 0         1
word                        
complain -7.276902 -8.030193
fail     -6.296073 -7.742511
support  -6.670766 -3.748908
desper   -6.928595 -9.128805
need     -7.459224 -7.337045
logprior: [-0.37777453 -1.15640567]
Naive Bayes test accuracy = 0.8360

loglikelihood:
                 0         1
word                        
complain -7.459684 -8.038727
fail     -6.361072 -8.038727
support  -6.584216 -3.771363
quick    -8.375975 -8.038727
fix      -8.663657 -8.444192
logprior: [-0.3779274  -1.15607272]
Naive Bayes test accuracy = 0.8379

loglikelihood:
                 0         1
word                        
complain -7.365870 -8.433812
fail     -6.329778 -7.740664
support  -6.467929 -3.728796
desper   -6.873394 -9.126959
need     -7.278859 -7.517521
logprior: [-0.37746886 -1.15707189]
Naive Bayes test accuracy = 0.8577

loglikelihood:
                0         1
word                       
fail    -6.424464 -8.021694
support -6.578614 -3.749668
des