# Naive Bayes Classification with SUBJ Dataset
<hr>

We will build a text classification model using Naive Bayes on the Subjectivity (SUBJ) dataset. Since there is no standard train/test split for this dataset, we will evaluate with 10-fold cross-validation (CV).

## Load the library

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import random
from nltk.corpus import stopwords, twitter_samples
# from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import KFold
from nltk.stem import PorterStemmer
from string import punctuation
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
import time

%config IPCompleter.greedy=True
%config IPCompleter.use_jedi=False
# nltk.download('twitter_samples')

## Load the Dataset

In [3]:
# Load the pickled SUBJ corpus into a dataframe.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
corpus = pd.read_pickle('../0_data/SUBJ/SUBJ.pkl')
# Cast the label column to integers.
corpus.label = corpus.label.astype(int)
print(corpus.shape)
# Bare last expression so the notebook renders the frame.
corpus

(10000, 3)


Unnamed: 0,sentence,label,split
0,"smart and alert , thirteen conversations about...",0,train
1,"color , musical bounce and warm seas lapping o...",0,train
2,it is not a mass market entertainment but an u...,0,train
3,a light hearted french film about the spiritua...,0,train
4,my wife is an actress has its moments in looki...,0,train
...,...,...,...
9995,"in the end , they discover that balance in lif...",1,train
9996,a counterfeit 1000 tomin bank note is passed i...,1,train
9997,enter the beautiful and mysterious secret agen...,1,train
9998,after listening to a missionary from china spe...,1,train


In [4]:
# Summarise the columns, their non-null counts and dtypes.
corpus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  10000 non-null  object
 1   label     10000 non-null  int32 
 2   split     10000 non-null  object
dtypes: int32(1), object(2)
memory usage: 195.4+ KB


In [5]:
# Count the rows for each label value to check the class balance.
corpus.groupby( by='label').count()

Unnamed: 0_level_0,sentence,split
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,5000,5000
1,5000,5000


In [6]:
# Separate the sentences and the labels into two parallel Python lists
sentences, labels = list(corpus.sentence), list(corpus.label)

## Raw Number of Vocabulary

In [7]:
# Build the raw vocabulary for a first inspection (no cleaning applied yet)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
vocab_raw = tokenizer.word_index
print('\nThe vocabulary size: {}\n'.format(len(vocab_raw)))
# Show only a small sample: printing the whole word index floods the notebook output
print(dict(list(vocab_raw.items())[:20]))


The vocabulary size: 21322



<!--## Split Dataset-->

# Data Preprocessing
<hr>

## Define `clean_doc` function

In [8]:
from nltk.corpus import stopwords
# NOTE: the next line rebinds `stopwords` from the NLTK corpus reader to a plain
# list of English stop words; the import is repeated here so the cell can be re-run.
stopwords = stopwords.words('english')
stemmer = PorterStemmer()
# Matches any single character from string.punctuation; compiled once, not per call.
_PUNCT_RE = re.compile('[%s]' % re.escape(punctuation))


def clean_doc(doc):
    '''
    Turn a raw sentence into a list of stemmed tokens.

    Steps, in order: split on whitespace, strip punctuation characters from each
    token, drop stop words and tokens left empty by the stripping, stem the rest.
    Non-alphabetic tokens (e.g. numbers) are kept.

    Input:
        doc: a string
    Output:
        tokens: a list of stemmed tokens
    '''
    # A set makes each membership test O(1); it is rebuilt per call so that any
    # later change to the module-level `stopwords` list is still honoured.
    stop_set = set(stopwords)
    # Punctuation is stripped BEFORE the stop-word check, exactly as before.
    stripped = (_PUNCT_RE.sub('', raw) for raw in doc.split())
    kept = [tok for tok in stripped if tok and tok not in stop_set]
    return [stemmer.stem(tok) for tok in kept]

## Define `count_docs` function

In [9]:
def count_docs(data, docs, ys):
    '''
    Count, per class, how often each cleaned token occurs in the documents.

    Input:
        data: a dictionary mapping each token to its per-class count vector; it is
              updated in place and also returned (pass {} to start from scratch)
        docs: a list of sentences
        ys: a list with the class label of each sentence, aligned with docs
    Output:
        result: the same dictionary; each value is a float array with one slot per
                distinct label found in ys, in sorted label order
    '''
    # One-hot encode the labels with NumPy. This yields the same sorted-category
    # layout the previous OneHotEncoder call produced, without depending on the
    # encoder's `sparse` keyword (renamed in newer scikit-learn releases).
    classes, class_idx = np.unique(np.asarray(ys).ravel(), return_inverse=True)
    y_onehot = np.eye(len(classes))[class_idx]

    for doc, y in zip(docs, y_onehot):
        for word in clean_doc(doc):
            if word in data:
                # Build a new array rather than updating in place
                data[word] = data[word] + y
            else:
                # Copy so the stored vector is not a view into y_onehot
                data[word] = y.copy()

    return data

## Build Frequencies Dictionary

In [10]:
# Build the freqs dictionary for later uses
freqs = count_docs({}, sentences, labels)
freqs

{'smart': array([31., 10.]),
 'alert': array([5., 0.]),
 'thirteen': array([2., 2.]),
 'convers': array([8., 9.]),
 'one': array([409., 375.]),
 'thing': array([123.,  89.]),
 'small': array([34., 61.]),
 'gem': array([10.,  3.]),
 'color': array([37., 13.]),
 'music': array([62., 48.]),
 'bounc': array([3., 0.]),
 'warm': array([19.,  6.]),
 'sea': array([12., 19.]),
 'lap': array([2., 0.]),
 'island': array([ 7., 28.]),
 'shore': array([3., 7.]),
 'enough': array([161.,  26.]),
 'scienc': array([9., 3.]),
 'send': array([ 8., 27.]),
 'home': array([ 35., 122.]),
 'think': array([68., 52.]),
 'mass': array([2., 9.]),
 'market': array([8., 4.]),
 'entertain': array([129.,   7.]),
 'uncompromis': array([6., 3.]),
 'attempt': array([34., 59.]),
 'artist': array([26., 50.]),
 'anoth': array([69., 64.]),
 'light': array([37., 25.]),
 'heart': array([88., 59.]),
 'french': array([29., 16.]),
 'film': array([914., 226.]),
 'spiritu': array([10., 14.]),
 'quest': array([ 8., 24.]),
 'fashion'

In [11]:
# Convert the freqs dictionary to a word-indexed dataframe
def freqs_to_df(freqs, train_y):
    '''
    Lay the per-word count vectors out as a table.

    input:
        freqs: a frequencies dictionary (ex: {'simplist': array([15.,  4.]),
                                              'silli': array([64., 20.]), . . })
        train_y: labels for data; their sorted unique values name the count columns
    output:
        a dataframe indexed by 'word' with one count column per unique label
    '''
    # Header: the word column followed by one column per distinct label
    header = ['word', *np.unique(train_y)]

    # One record per dictionary entry: the word, then its counts
    records = [[token, *counts] for token, counts in freqs.items()]

    table = pd.DataFrame(records, columns=header)
    return table.set_index('word')

In [25]:
# Convert the frequencies dictionary into a word-indexed dataframe and display it
freqs_df = freqs_to_df(freqs, labels)
freqs_df

Unnamed: 0_level_0,0,1
word,Unnamed: 1_level_1,Unnamed: 2_level_1
smart,31.0,10.0
alert,5.0,0.0
thirteen,2.0,2.0
convers,8.0,9.0
one,409.0,375.0
...,...,...
sculpt,0.0,1.0
tomin,0.0,1.0
bazaar,0.0,1.0
schmitt,0.0,1.0


In [26]:
# Sum of the column labelled 0 over all words
freqs_df[0].sum()

58780.0

# Training and Testing the Model

## Build Training Function

In [14]:
def train_naive_bayes(freq_df, train_x, train_y):
    '''
    Estimate the Naive Bayes parameters from per-class word counts.

    Input:
        freq_df: a pandas dataframe indexed by word with one count column per class
                 (as built by freqs_to_df); it is NOT modified
        train_x: a list of training sentences (unused; kept so existing calls still work)
        train_y: the labels corresponding to the training sentences (list or numpy array)
    Output:
        logprior: numpy array with the log prior of each class, in sorted label order
        loglikelihood: dataframe shaped like freq_df holding the log of the
                       Laplace-smoothed probability of each word given each class
    '''
    train_y = np.asarray(train_y)

    # V: the number of unique words in the vocabulary
    V = len(freq_df.index)

    ########################################################################################
    # Part 1: log prior of each class

    # D: the number of documents
    D = len(train_y)

    # Documents per class, ordered by sorted label value. Counting with np.unique
    # avoids using the label value as an array position, which only works when the
    # labels happen to be 0..k-1.
    _, class_counts = np.unique(train_y, return_counts=True)
    logprior = np.log(class_counts / D)

    ########################################################################################
    # Part 2: Laplace-smoothed log likelihood of each word given each class

    # (count + 1) / (total words in class + V), computed column-wise into a NEW
    # frame so the caller's count table keeps its raw counts.
    smoothed = (freq_df + 1) / (freq_df.sum(axis=0) + V)
    loglikelihood = np.log(smoothed)

    ########################################################################################

    return logprior, loglikelihood

In [23]:
# Convert the labels to a numpy array before training
labels = np.array(labels)
# Fit on the whole corpus and inspect the resulting parameters
logprior, loglikelihood = train_naive_bayes(freqs_df, sentences, labels)
print(logprior)
loglikelihood

[-0.69314718 -0.69314718]


Unnamed: 0_level_0,0,1
word,Unnamed: 1_level_1,Unnamed: 2_level_1
smart,-7.737548,-8.869015
alert,-9.411525,-11.266910
thirteen,-10.104672,-10.168298
convers,-9.006059,-8.964325
one,-5.187127,-5.337321
...,...,...
sculpt,-11.203284,-10.573763
tomin,-11.203284,-10.573763
bazaar,-11.203284,-10.573763
schmitt,-11.203284,-10.573763


## Build Testing Function

In [27]:
def naive_bayes_predict(tweet, logprior, loglikelihood):
    '''
    Score one sentence against every class and pick the best one.

    input:
        tweet: a string
        logprior: array-like with the log prior of each class, ordered like the
                  columns of loglikelihood
        loglikelihood: a dataframe indexed by word with one log-likelihood column per class
    output:
        probs: numpy array with one score per class: the log prior plus the sum of
               the log likelihoods of the sentence's words (words missing from the
               loglikelihood index are ignored)
        y_hat: position of the highest-scoring class within loglikelihood.columns
    '''
    # process the tweet to get the list of words
    words = clean_doc(tweet)

    # Keep only in-vocabulary words; repeated words stay repeated so each
    # occurrence contributes to the score.
    vocab = loglikelihood.index
    known = [word for word in words if word in vocab]

    if known:
        # One row selection summed down each class column replaces a scalar
        # lookup per (word, class) pair.
        word_scores = loglikelihood.loc[known].sum(axis=0).to_numpy(dtype=float)
    else:
        word_scores = np.zeros(len(loglikelihood.columns))

    # np.asarray guards against a plain-list logprior being concatenated instead of added
    probs = np.asarray(logprior) + word_scores
    y_hat = np.argmax(probs)

    return probs, y_hat

In [28]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Measure classification accuracy on a labelled set of sentences.

    input:
        test_x: A list of sentences
        test_y: the corresponding labels for the list of sentences
        logprior: the logprior
        loglikelihood: a dataframe with the loglikelihoods for each word
    output:
        accuracy: (# of sentences classified correctly)/total # of sentences
    raises:
        ValueError: if test_x and test_y differ in length
    """
    test_x = list(test_x)
    test_y = np.asarray(test_y)
    if len(test_x) != len(test_y):
        raise ValueError(
            'test_x and test_y must have the same length, got {} and {}'.format(
                len(test_x), len(test_y)))

    # Predicted class for every sentence
    y_hats = np.array(
        [naive_bayes_predict(tweet, logprior, loglikelihood)[1] for tweet in test_x])

    # Fraction of exact matches. The previous 1 - mean(|y_hat - y|) equals this
    # only when the labels are 0/1; exact matching is also right for more classes.
    acc = np.mean(y_hats == test_y)
    return acc

## KFold CV

In [29]:
###############################################
# Training and Testing using the same Dataset #
###############################################
# The model is scored on the very sentences it was fitted on, so the figure
# printed below is a training accuracy, not a held-out estimate.

# Separate the sentences and the labels (labels as a numpy array this time)
sentences, labels = list(corpus.sentence), np.array(list(corpus.label))

# Build the freqs dictionary (word -> per-class counts) from the whole corpus
freqs = count_docs({}, sentences, labels)

# Turn the frequencies dictionary into dataframe
freqs_df = freqs_to_df(freqs, labels)
print(freqs_df.head())

# Retrieve the logprior and loglikelihood
logprior, loglikelihood = train_naive_bayes(freqs_df, sentences, labels)
print(logprior)
print(loglikelihood.head())

# Score the fitted model on the same sentences used for training
print("Naive Bayes accuracy = %0.4f" %
      (test_naive_bayes(sentences, labels, logprior, loglikelihood)))

              0      1
word                  
smart      31.0   10.0
alert       5.0    0.0
thirteen    2.0    2.0
convers     8.0    9.0
one       409.0  375.0
[-0.69314718 -0.69314718]
                  0          1
word                          
smart     -7.737548  -8.869015
alert     -9.411525 -11.266910
thirteen -10.104672 -10.168298
convers   -9.006059  -8.964325
one       -5.187127  -5.337321
Naive Bayes accuracy = 0.9524


In [30]:
#######################################################
# Training and Testing using 10-fold Cross Validation #
#######################################################

# prepare cross validation
# shuffle is passed by keyword (scikit-learn declares it keyword-only) and the
# shuffle is seeded so the folds, and therefore the accuracies, are reproducible.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Separate the sentences and the labels
sentences, labels = list(corpus.sentence), list(corpus.label)

acc_list = []
# kfold.split() yields the train/test index arrays of each fold
for train, test in kfold.split(sentences):
    train_x = [sentences[i] for i in train]
    train_y = np.array([labels[i] for i in train])
    test_x = [sentences[i] for i in test]
    test_y = np.array([labels[i] for i in test])

    # Build the freqs dictionary from the training fold only
    freqs = count_docs({}, train_x, train_y)

    # Turn the frequencies dictionary into dataframe
    freqs_df = freqs_to_df(freqs, train_y)

    # Retrieve the logprior and loglikelihood
    logprior, loglikelihood = train_naive_bayes(freqs_df, train_x, train_y)
    print('loglikelihood:\n{}'.format(loglikelihood.head()))
    print('logprior: {}'.format(logprior))

    # Score on the held-out fold
    acc = test_naive_bayes(test_x, test_y, logprior, loglikelihood)
    print("Naive Bayes test accuracy = %0.4f\n" %(acc))

    acc_list.append(acc)

acc_list = np.array(acc_list)
print()
print('The test accuracy for each training:\n{}'.format(acc_list))
print('The mean of the test accuracy: ', acc_list.mean())



loglikelihood:
                  0          1
word                          
smart     -7.812783  -8.973126
alert     -9.499182 -11.170351
thirteen -10.415473 -10.071738
convers   -9.029178  -8.973126
one       -5.179031  -5.344351
logprior: [-0.69336943 -0.69292498]
Naive Bayes test accuracy = 0.9120

loglikelihood:
                  0          1
word                          
smart     -7.742896  -8.869243
alert     -9.318432 -11.171829
thirteen -10.011579 -10.073216
convers   -9.318432  -9.092387
one       -5.185936  -5.357698
logprior: [-0.69159283 -0.69470395]
Naive Bayes test accuracy = 0.9040

loglikelihood:
                  0          1
word                          
smart     -7.808037  -8.875399
alert     -9.494436 -11.177984
thirteen -10.005262 -10.079372
convers   -8.906649  -8.980759
one       -5.193077  -5.372849
logprior: [-0.69894171 -0.68738603]
Naive Bayes test accuracy = 0.9120

loglikelihood:
                  0          1
word                          
smart     -