In [9]:
import pandas as pd
import numpy as np
import string
import random

In [10]:
def load_doc(filename):
    # open the file under read mode
    file = open(filename, 'r')
    content = file.read()
    file.close()
    return content

# Split into sentences from content as every sentence is separated by a newline character
def sentence(content):
    sentences = []
    sentences = list(content.split("\n"))
    return sentences
    

# Load the document
name = 'dataset_NB.txt'
content = load_doc(name)
sentences = sentence(content)

In [11]:
# Text preprocessing
lower_case_sentences = []
for i in sentences:
    lower_case_sentences.append(i.lower())

without_punctuations = []
for i in lower_case_sentences:
    without_punctuations.append(''.join(c for c in i if c not in string.punctuation))

clean_data = []
for i in without_punctuations:
    sub = i.split(',')
    sub1 = [sub[0][0:len(sub)-2].rstrip(),sub[0][-1]]
    clean_data.append(sub1)

In [12]:
df = pd.DataFrame(clean_data, columns =['Statement', 'Sentiment'])
df.head()

Unnamed: 0,Statement,Sentiment
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


In [16]:
# Creating a list to store the F-score and accuracy for each fold iterations
accur_values = []
count_total = 0

# Split a dataset into 7 folds
def cross_validation(df, n_folds):
    df_split = list()
    df_copy = list(df)
    fold_size = int(len(df) / 7)
    for i in range (n_folds):
        fold = []
        while len(fold) < fold_size:
            index = random.randrange(0,len(df_copy))
            fold.append(df_copy.pop(index))
        df_split.append(fold)
    return df_split

# Building vocabulary of model
def words_frequency(train_df):
    train_sentences = train_df['Statement'].values
    train_sentences_list = train_sentences.tolist()
    all_words_train = []
    for i in train_sentences_list:
        all_words_train.extend(i.split(' '))
    vocab,count = np.unique(np.array(all_words_train),return_counts=True)
    return (vocab,count)

# Calculating likelihood probability P(d|C)
# Writing a function for a given class C = 1,0
def posterior_prob(train_df,vocab,count,words_test,prob_class, class_count):
    posterior_prob = list()
    #Calculations for test data in row i
    for i in words_test:
        likelihood_prob = 1
        word_test_array = np.array(i)
        vocab_test,count_test = np.unique(word_test_array,return_counts=True)
        #j returns the elements of the iterable list i
        for j in i:
            try:
                index = list(vocab).index(j)
                # Here likelihood probability is returned for the ith row of test data
                likelihood_prob *= ((count[index] + 1) / (np.sum(count) + np.sum(count_total) + 1))
            except ValueError:
                likelihood_prob *= ((0 + 1) / (np.sum(count) + np.sum(count_total) + 1))
            
        # Return the probability P(d|C)*P(C)
        posterior = prob_class*likelihood_prob
        posterior_prob.append(posterior)
    return posterior_prob


def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0



In [17]:
def Naive_Bayes(train_set,test_set):
    train_df = pd.DataFrame(train_set, columns =['Statement', 'Sentiment'])
    test_df = pd.DataFrame(test_set,columns =['Statement', 'Sentiment'])
    train_df_positive = train_df.loc[train_df['Sentiment']=='1']
    train_df_negative = train_df.loc[train_df['Sentiment']=='0']


    #Setting the positive sentiment and negative sentiment vocab and frequency
    vocab_positive, count_positive = words_frequency(train_df_positive)
    vocab_negative, count_negative = words_frequency(train_df_negative)
    vocab_total, count_total = words_frequency(train_df)


    #Gives the probability P(C) or prior probability
    # no. of sentiment values is the same as the no. of reviews in train_set
    train_sentiments = train_df['Sentiment'].values
    sentiment,count = np.unique(train_sentiments,return_counts = True)

    positive_review_count = count[1]
    negative_review_count = count[0]

    prob_positive = positive_review_count / (positive_review_count + negative_review_count)
    prob_negative = negative_review_count / (positive_review_count + negative_review_count)

    # extracting the words from the test_set
    test_sentences = test_df['Statement'].values
    test_sentences_list = test_sentences.tolist()
    words_test = []
    for i in test_sentences_list:
        words_test.append(i.split(' '))


            
    posterior_prob_positive = posterior_prob(train_df,vocab_positive,count_positive,words_test,prob_positive,positive_review_count)
    posterior_prob_negative = posterior_prob(train_df,vocab_negative,count_negative,words_test,prob_negative,negative_review_count)

    test_predict = list()

    #predict the Sentiment
    for i in range (len(test_set)):
        if posterior_prob_positive[i] > posterior_prob_negative[i]:
            test_predict.append("1")
        else:
            test_predict.append("0")
        
    test_df['Predicted Sentiment'] = test_predict 
    accuracy = accuracy_metric(test_df['Sentiment'], test_df['Predicted Sentiment'])
    accur_values.append(accuracy)



folds = cross_validation(clean_data, 7)
for fold in folds:
    train_set = list(folds)
    train_set.remove(fold)
    train_set = sum(train_set, [])
    test_set = list()
    for row in fold:
        row_copy = list(row)
        test_set.append(row_copy)
    Naive_Bayes(train_set, test_set)
        
accuracy = np.array(accur_values)
print('\nAccuracy')
print(np.mean(accuracy),' +/- ',np.std(accuracy))
accur_values



Accuracy
81.48893360160966  +/-  3.5052819698560054


[82.3943661971831,
 86.61971830985915,
 83.80281690140845,
 75.35211267605634,
 83.80281690140845,
 78.87323943661971,
 79.5774647887324]