## Adding the directory of the package to the system path

In [1]:
from config import *
append_path('../../')

## Importing data science libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score
from m_learn.utility.model_selection import k_fold_cross_validation
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Reading the data

In [3]:
data = pd.read_csv("./../../data/sentiment_train.txt", delimiter='\t', names  = ['sentiment', 'text'])

In [4]:
data.shape

(6918, 2)

In [5]:
data.head(10)

Unnamed: 0,sentiment,text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...
5,1,that's not even an exaggeration ) and at midni...
6,1,"I loved the Da Vinci Code, but now I want some..."
7,1,"i thought da vinci code was great, same with k..."
8,1,The Da Vinci Code is actually a good movie...
9,1,I thought the Da Vinci Code was a pretty good ...


# Feature Engineering

In [6]:
# extracting stopwords
sw = stopwords.words('english')
# creating a stemmer object
stemmer = SnowballStemmer("english")

In [7]:
def text_preprocessing(text):
    
    '''a function which carries out the neccessary text processing'''
    
    import string
    
    # remove the punctuation marks from the text
    text = str(text)
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    
    # removing the stop words from the text
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = " ".join(text)
    
    # stemming the text
    text = [stemmer.stem(word) for word in text.split()]
    text = " ".join(text)    
    
    # returning the processed text
    return text

In [8]:
# carrying out text preprocessing
data['text'] = data['text'].apply(text_preprocessing)
# binarizing the sentiment field
data['sentiment'] = data['sentiment'].apply(lambda x: 1 if x == 1 else -1)

In [9]:
data.head(10)

Unnamed: 0,sentiment,text
0,1,da vinci code book awesom
1,1,first clive cussler ive ever read even book li...
2,1,like da vinci code lot
3,1,like da vinci code lot
4,1,like da vinci code ultimat didnt seem hold
5,1,that even exagger midnight went walmart buy da...
6,1,love da vinci code want someth better differ
7,1,thought da vinci code great kite runner
8,1,da vinci code actual good movi
9,1,thought da vinci code pretti good book


# Extracting ti-id representation of the formatted message

In [10]:
# creating the object of tfid vectorizer
tfid_vectorizer = TfidfVectorizer("english")
# fitting the vectorizer using the text data set
tfid_vectorizer.fit(data['text'])
# extracting the tfid representation matrix of the text data
tfid_matrix = tfid_vectorizer.transform(data['text'])



In [11]:
# collecting the tfid matrix in pandas dataframe
array = tfid_matrix.todense()
df = pd.DataFrame(array)
df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Training different naive bayes classifier models and accessing the performance

In [12]:
# adding the output to the dataframe
df['output'] = data['sentiment']
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1793,1794,1795,1796,1797,1798,1799,1800,1801,output
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [13]:
# features and output of the models
features = df.columns.tolist()
output = 'output'
features.remove(output)

In [14]:
# shuffling the data
df = df.sample(frac=1).reset_index(drop=True)

## Training and accessing the performance of Gaussian naive bayes classifier

In [15]:
# creating the model object
model1 = GaussianNB()

In [16]:
# carrying out k-fold cross validation to calculate the accuracy
k = 5
accuracy = k_fold_cross_validation(k, model1, df, features, output, accuracy_score)
print("Accuracy: ", round(accuracy,4))

Accuracy:  0.9643


## Training and accessing the performance of Multinomial  naive bayes classifier

In [17]:
# creating the model object
model2 = MultinomialNB()

In [18]:
# carrying out k-fold cross validation to calculate the accuracy
k = 5
accuracy = k_fold_cross_validation(k, model2, df, features, output, accuracy_score)
print("Accuracy: ", round(accuracy,4))

Accuracy:  0.9814


## Training and accessing the performance of Bernoulli naive bayes classifier

In [19]:
# creating the model object
model3 = BernoulliNB()

In [20]:
# carrying out k-fold cross validation to calculate the accuracy
k = 5
accuracy = k_fold_cross_validation(k, model3, df, features, output, accuracy_score)
print("Accuracy: ", round(accuracy,4))

Accuracy:  0.9851
