In [109]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_csv("head.csv")
clf_label = "sentiment_category" # can be changed with topic_category
X = df.the_document_tokens
if(clf_label == "sentiment_category"):
    y = df.sentiment_category
elif(clf_label == "topic_category"):
    y = df.topic_category


In [110]:
print(type(X))
print(type(y))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)
print(type(X_train))
type(X_train)

<class 'pandas.core.series.Series'>


pandas.core.series.Series

In [62]:
#vectorizer = CountVectorizer(stop_words='english')
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train) 
X_test = vectorizer.transform(X_test)

In [63]:
feature_names = vectorizer.get_feature_names()
len(feature_names)
#len(vectorizer.vocabulary_)

42110

In [64]:
X_train = pd.DataFrame(X_train.toarray(), columns=feature_names) # converting Series to Dataframe
X_test = pd.DataFrame(X_test.toarray(), columns=feature_names) # converting Series to Dataframe

# Understanding the data

In [65]:
X_train[clf_label+"_label"] = np.array(y_train, dtype='object')

In [33]:
print(X_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9531 entries, 0 to 9530
Columns: 42157 entries, 00 to sentiment_category_label
dtypes: int64(42156), object(1)
memory usage: 3.0+ GB
None


In [34]:
categories = [category for category in X_train.sentiment_category_label.unique()]

In [35]:
X_trains = {}
for sent_category in categories:
    X_trains[sent_category] = X_train[X_train['sentiment_category_label']==sent_category]
    X_trains[sent_category] = X_trains[sent_category].iloc[: , :-1]


In [36]:
len(X_trains["neg"]), len(X_trains["pos"]) # number of documents in negative and positive cases in training data

(4711, 4820)

In [37]:
max_freq_in_negative_case = {} # created to store most freqeunt words in negative labeled documents
for word in X_trains["neg"]:
    freq = X_trains["neg"][word].sum() # sum of the specific word in negative case 
    max_freq_in_negative_case[word] = freq

In [38]:
max_freq_in_negative_case = dict(sorted(max_freq_in_negative_case.items(), key=lambda item: item[1], reverse=True))
count = 1
for key in max_freq_in_negative_case:
    print(key,"->freq: ",max_freq_in_negative_case[key])
    if(count == 10): break
    count+=1

the ->freq:  32382
to ->freq:  16716
and ->freq:  16181
of ->freq:  13062
it ->freq:  12480
is ->freq:  10421
this ->freq:  9633
that ->freq:  8048
in ->freq:  7705
for ->freq:  6060


In [39]:
max_freq_in_positive_case = {} # created to store most freqeunt words in positive labeled documents
for word in X_trains["pos"]:
    freq = X_trains["pos"][word].sum() # sum of the specific word in positive case 
    max_freq_in_positive_case[word] = freq

In [40]:
max_freq_in_positive_case = dict(sorted(max_freq_in_positive_case.items(), key=lambda item: item[1], reverse=True)) 
count = 1
for key in max_freq_in_positive_case:
    print(key,"->freq: ",max_freq_in_positive_case[key])
    if(count == 10): break
    count+=1

the ->freq:  33460
and ->freq:  18732
to ->freq:  15864
of ->freq:  14113
it ->freq:  11862
is ->freq:  11772
this ->freq:  9305
in ->freq:  8668
that ->freq:  7057
for ->freq:  6593


In [41]:
difference = {} # to store freqeuency difference of common words in positive and negative cases
for key in max_freq_in_negative_case.keys(): 
    difference[key] = abs(max_freq_in_positive_case[key] - max_freq_in_negative_case[key])

In [42]:
difference = dict(sorted(difference.items(), key=lambda item: item[1], reverse=True))
count = 1
for key in difference:
    print(key,"->freq: ",difference[key])
    if(count == 10): break
    count+=1

and ->freq:  2551
not ->freq:  1881
was ->freq:  1470
is ->freq:  1351
great ->freq:  1190
the ->freq:  1078
of ->freq:  1051
that ->freq:  991
in ->freq:  963
they ->freq:  888


In [43]:
y_train = np.array(y_train, dtype='object')
y_test = np.array(y_test, dtype='object')

In [44]:
print(y_train)

['pos' 'neg' 'neg' ... 'neg' 'neg' 'pos']


# Implementing Naive Bayes

In [45]:
# Naive Bayes Classifier (Works for both sentiment category and topic category)
class NaiveBayesClassifier: 
    # outcomesPD is X, y is y_train -> dtype = 'object'
    def fit(self, X, y_train, clf_label):
        (unique, counts) = np.unique(y_train, return_counts=True) # count the unique number of classes in the category label
        frequencies = np.asarray((unique, counts)).T 
        self.freq = {} # to store the frequency of the categories seperately
        for category, frequency in frequencies:
            self.freq[category] = frequency  # store the freqeuncy for each category
        total_examples = sum(self.freq.values()) # total number of documents in the dataset
        prob_category = {} # to store the category probabilities
        for category in self.freq.keys():
            prob_category[category] = self.freq[category] / total_examples # to calculate the probability of the category class
        print(prob_category) # print the probabilities of each category seperately
        number_of_word_in_category_case = {} # to store the number of word in the specific category
        outcomesCategoryPD = {} # will be used in filtering the categories from the training data
        prob_word_given_category = {} # category, word
        for category in self.freq.keys(): 
            outcomesCategoryPD[category] = X[X[clf_label]==category] # filtering operation
            number_of_word_in_category_case[category] = outcomesCategoryPD[category].iloc[: , :-1].values.sum() # total number of word is specific category case
            prob_word_given_category[category] = {} # to store the conditional probability of each word for each class
        vocabularies = [word for word in X.iloc[: , :-1]] # vocabularies in training data
        vocab_size = len(vocabularies) # vocabulary size
        prior_category = {} # to store the prior probability of any category
        posterior_category = {} # to store the posterior information for each category
        for category in self.freq.keys():
            for word in vocabularies:
                count_specific_word_in_category_case = outcomesCategoryPD[category][word].sum() + 1 # count the specific word in the specific category
                prob_word_given_category[category][word] = np.log(count_specific_word_in_category_case / (number_of_word_in_category_case[category]+vocab_size)) # calculate the conditional probability using Laplace Smooting
                prior_category[category] = np.log(prob_category[category]) # store the prior probability for each category
                posterior_category[category] = prob_word_given_category[category][word] + prior_category[category]
        return prior_category, prob_word_given_category,vocabularies,outcomesCategoryPD # return the prior category, conditional probabilities, vocabularies
    
    def predict(self, X): # X is X_test -> outcomesTestPD
        test_views = []
        for test_index in X.index:
            test_features = X.loc[test_index]
            test_features = test_features[test_features > 0]
            test_view = [feature for feature in test_features.to_frame().T.columns]
            test_views.append(test_view)
        test_views
        print(len(test_views[0]))
        y_pred = np.array([]).astype(np.object)
        for view in test_views:
            posterior_for_category = {}
            for category in self.freq.keys():
                posterior_for_category[category] = prior_category[category]
                for token in view:
                    if(token in vocabularies):
                        posterior_for_category[category] += prob_word_given_category[category][token] # sum all the log likelihood with the prior probability 
            max_key = max(posterior_for_category, key=posterior_for_category.get) # return argmax of the predicted class
            y_pred = np.append(y_pred, max_key) # append the predicted class to y_pred 
        return y_pred # return y_pred

In [46]:
def calculate_accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [47]:
nb_clf_for_categories = NaiveBayesClassifier()

In [48]:
prior_category, prob_word_given_category,vocabularies,outcomesCategoryPD = nb_clf_for_categories.fit(X_train,y_train,clf_label+"_label")

{'neg': 0.49428181722799286, 'pos': 0.5057181827720071}


In [49]:
y_pred = nb_clf_for_categories.predict(X_test)

43


In [50]:
accuracy = calculate_accuracy(y_test, y_pred)

In [51]:
print(accuracy)

0.8271086865295846


In [54]:
#prob_word_given_category["neg"]["like"], prob_word_given_category["pos"]["like"]
#prob_word_given_category["neg"]["book"], prob_word_given_category["pos"]["book"]
cond_neg = dict(sorted(prob_word_given_category["pos"].items(), key=lambda item: item[1], reverse=True))
count = 1
for key in cond_neg:
    print("P(word='{}'|{}) = {}".format(key,"label = neg",cond_neg[key]))
    if(count == 10): break
    count+=1

P(word='the'|label = neg) = -2.9500401714536966
P(word='and'|label = neg) = -3.5301340785184228
P(word='to'|label = neg) = -3.6963053279621856
P(word='of'|label = neg) = -3.813253540418706
P(word='it'|label = neg) = -3.986996440318378
P(word='is'|label = neg) = -3.9946119789012844
P(word='this'|label = neg) = -4.229761399574888
P(word='in'|label = neg) = -4.300667309110841
P(word='that'|label = neg) = -4.5062590277442665
P(word='for'|label = neg) = -4.574260608347805


In [55]:
cond_pos = dict(sorted(prob_word_given_category["neg"].items(), key=lambda item: item[1], reverse=True))
count = 1
for key in cond_pos:
    print("P(word='{}'|{}) = {}".format(key,"label = pos",cond_pos[key]))
    if(count == 10): break
    count+=1

P(word='the'|label = pos) = -2.9816102355855114
P(word='to'|label = pos) = -3.6428176636065976
P(word='and'|label = pos) = -3.6753443159414436
P(word='of'|label = pos) = -3.8894600228226652
P(word='it'|label = pos) = -3.935036341390775
P(word='is'|label = pos) = -4.115324872840069
P(word='this'|label = pos) = -4.193945321119062
P(word='that'|label = pos) = -4.373695969215495
P(word='in'|label = pos) = -4.417244583123126
P(word='for'|label = pos) = -4.6573690263579435


In [56]:
def printConfusionMatrix(y_pred, y_test):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [57]:
from sklearn.metrics import classification_report, confusion_matrix
printConfusionMatrix(y_pred, y_test)

[[1024  179]
 [ 233  947]]
              precision    recall  f1-score   support

         neg       0.81      0.85      0.83      1203
         pos       0.84      0.80      0.82      1180

    accuracy                           0.83      2383
   macro avg       0.83      0.83      0.83      2383
weighted avg       0.83      0.83      0.83      2383



In [58]:
y_train,y_test

(array(['pos', 'neg', 'neg', ..., 'neg', 'neg', 'pos'], dtype=object),
 array(['pos', 'pos', 'pos', ..., 'pos', 'neg', 'neg'], dtype=object))

# Calculate Accuracy With The Sentiment Categories

In [66]:
nb_clf_for_categories = NaiveBayesClassifier()

In [67]:
prior_category, prob_word_given_category,vocabularies,outcomesCategoryPD = nb_clf_for_categories.fit(X_train,y_train,clf_label+"_label")

{'books': 0.1656699192109957, 'camera': 0.1692372258944497, 'dvd': 0.1685027804007974, 'health': 0.16650928548945546, 'music': 0.16692896862868534, 'software': 0.1631518203756164}


In [68]:
y_pred = nb_clf_for_categories.predict(X_test)

23


In [69]:
accuracy = calculate_accuracy(y_test, y_pred)

In [70]:
print(accuracy)

0.9139739823751574


In [71]:
def printConfusionMatrix(y_pred, y_test):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [72]:
from sklearn.metrics import classification_report, confusion_matrix
printConfusionMatrix(y_pred, y_test)

[[373   2  28   3   2  13]
 [  0 374   0   3   0   9]
 [ 20  10 335   1  18  10]
 [  1  24   0 375   3  10]
 [  5   0  13   2 386   3]
 [  6  10   7   1   1 335]]
              precision    recall  f1-score   support

       books       0.92      0.89      0.90       421
      camera       0.89      0.97      0.93       386
         dvd       0.87      0.85      0.86       394
      health       0.97      0.91      0.94       413
       music       0.94      0.94      0.94       409
    software       0.88      0.93      0.91       360

    accuracy                           0.91      2383
   macro avg       0.91      0.91      0.91      2383
weighted avg       0.92      0.91      0.91      2383



In [73]:
y_train,y_test

(4935    camera
 8918    health
 9083    camera
 4090       dvd
 9025     music
          ...  
 3109    camera
 525      books
 526     camera
 9688     music
 8308     books
 Name: topic_category, Length: 9531, dtype: object,
 6794      camera
 1273         dvd
 7185       music
 5616       music
 1467       music
           ...   
 4965    software
 5643      health
 788       health
 5357      camera
 5841         dvd
 Name: topic_category, Length: 2383, dtype: object)

# TF-IDF (using TfidfVectorizer)

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS)

In [94]:
X_train = vectorizer.fit_transform(X_train) 
X_test = vectorizer.transform(X_test)

In [95]:
feature_names = vectorizer.get_feature_names()
print(len(feature_names))

42041


In [96]:
X_train = pd.DataFrame(X_train.toarray(), columns=feature_names)
X_test = pd.DataFrame(X_test.toarray(), columns=feature_names)

In [97]:
X_train[clf_label+"_label"] = np.array(y_train, dtype='object')

In [98]:
y_train = np.array(y_train, dtype='object')
y_test = np.array(y_test, dtype='object')

In [99]:
nb = NaiveBayesClassifier()

In [100]:
prior_category, prob_word_given_category,vocabularies,outcomesCategoryPD = nb.fit(X_train,y_train,clf_label+"_label")

{'neg': 0.499213094113944, 'pos': 0.500786905886056}


In [101]:
y_pred = nb.predict(X_test)

23


In [102]:
accuracy = calculate_accuracy(y_test, y_pred)

In [103]:
print(accuracy)

0.8271086865295846


In [104]:
most_strong_words_in_positive = dict(sorted(prob_word_given_category["pos"].items(), key=lambda item: item[1], reverse=True))
most_strong_words_in_negative = dict(sorted(prob_word_given_category["neg"].items(), key=lambda item: item[1], reverse=True))

In [105]:
count = 1
for key in most_strong_words_in_negative:
    print("P(word='{}'|{}) = {}".format(key,"label = neg",most_strong_words_in_negative[key]))
    if(count == 10): break
    count+=1

P(word='book'|label = neg) = -6.46425909660478
P(word='camera'|label = neg) = -6.548271851220741
P(word='like'|label = neg) = -6.588081767342898
P(word='just'|label = neg) = -6.5908443940197285
P(word='did'|label = neg) = -6.736892387922794
P(word='product'|label = neg) = -6.819979919052741
P(word='does'|label = neg) = -6.826173979942325
P(word='good'|label = neg) = -6.842000250115771
P(word='movie'|label = neg) = -6.859846665545958
P(word='time'|label = neg) = -6.947305302631093


In [106]:
count = 1
for key in most_strong_words_in_positive:
    print("P(word='{}'|{}) = {}".format(key,"label = pos",most_strong_words_in_positive[key]))
    if(count == 10): break
    count+=1

P(word='great'|label = pos) = -6.253003063526483
P(word='book'|label = pos) = -6.5281013220232875
P(word='camera'|label = pos) = -6.589902030510466
P(word='good'|label = pos) = -6.6496049358115155
P(word='use'|label = pos) = -6.699341458611662
P(word='like'|label = pos) = -6.75624635668573
P(word='love'|label = pos) = -6.84533057465018
P(word='just'|label = pos) = -6.85897955980858
P(word='easy'|label = pos) = -6.905323251994461
P(word='really'|label = pos) = -6.9339179411360865


In [107]:
def printConfusionMatrix(y_pred, y_test):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [108]:
from sklearn.metrics import classification_report, confusion_matrix
printConfusionMatrix(y_pred, y_test)

[[988 168]
 [244 983]]
              precision    recall  f1-score   support

         neg       0.80      0.85      0.83      1156
         pos       0.85      0.80      0.83      1227

    accuracy                           0.83      2383
   macro avg       0.83      0.83      0.83      2383
weighted avg       0.83      0.83      0.83      2383



# TF-IDF (using CountVectorizer)

In [112]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text  import TfidfTransformer

In [113]:
vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)  #instantiate CountVectorizer()

In [114]:
word_count_vector = vectorizer.fit_transform(X_train) # this steps generates word counts for the words in the documets 
#X_test = vectorizer.transform(X_test)

In [115]:
#Compute the IDF values
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [116]:
#tfidf_transformer.fit(word_count_vector)

In [117]:
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=vectorizer.get_feature_names(),columns=["idf_weights"])

In [118]:
# sort ascending 
df_idf.sort_values(by=['idf_weights']).head() #Notice that the words ‘just’ and ‘like’ have the lowest IDF values. This is expected as these words appear in each and every document in our collection. The lower the IDF value of a word, the less unique it is to any particular document.

Unnamed: 0,idf_weights
like,2.27821
just,2.309193
good,2.455347
great,2.504655
time,2.583242


In [119]:
#  Compute the TFIDF score for the documents
X_train=vectorizer.transform(X_train) 
 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(X_train) # to compute the tf-idf scores for documents

In [120]:
feature_names = vectorizer.get_feature_names()  # get feature names
first_document_vector=tf_idf_vector[0] #get tfidf vector for first document 
shape = first_document_vector.shape
tf_idf_vector

<9531x42102 sparse matrix of type '<class 'numpy.float64'>'
	with 466288 stored elements in Compressed Sparse Row format>

In [121]:
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False).head()

Unnamed: 0,tfidf
quot,0.620988
force,0.375275
revelation,0.251256
cd,0.228315
3rd,0.221466


In [122]:
sorted_vals = df.sort_values(by=["tfidf"],ascending=False)
most_weighted_words = {}
for index in sorted_vals.index:
    features = sorted_vals.loc[index]
    if(features["tfidf"] > 0):
        most_weighted_words[features.name] = features["tfidf"]
print(most_weighted_words.keys())

dict_keys(['quot', 'force', 'revelation', 'cd', '3rd', 'new', 'tone', 'sets', 'field', 'recording', 'heart', 'collection', 'add', 'track', 'wonderful', 'years'])


In [123]:
X_train = pd.DataFrame(X_train.toarray(), columns=feature_names) #convert series to dataframe

In [124]:
X_train[clf_label+"_label"] = np.array(y_train, dtype='object')

In [125]:
y_train = np.array(y_train, dtype='object')
y_test = np.array(y_test, dtype='object')

In [126]:
nb = NaiveBayesClassifier()
X_train.head()

Unnamed: 0,00,000,0003,000mb,007,00am,00pm,01,02,03,...,½lan,½sistance,½ttbullar,½ve,áron,árpád,ça,énnui,único,sentiment_category_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos


In [127]:
prior_category, prob_word_given_category,vocabularies,outcomesCategoryPD = nb.fit(X_train,y_train,clf_label+"_label")

{'neg': 0.49585562900010494, 'pos': 0.5041443709998951}


In [128]:
X_test = vectorizer.transform(X_test)

In [129]:
X_test = pd.DataFrame(X_test.toarray(), columns=feature_names)
X_test.head()

Unnamed: 0,00,000,0003,000mb,007,00am,00pm,01,02,03,...,½kerlund,½lan,½sistance,½ttbullar,½ve,áron,árpád,ça,énnui,único
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [130]:
y_pred = nb.predict(X_test)

8


In [131]:
accuracy = calculate_accuracy(y_test, y_pred)

In [132]:
print(accuracy)

0.8287872429710449


# Bigram

In [7]:
vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS,ngram_range=(2, 2))
X_train = vectorizer.fit_transform(X_train) 
X_test = vectorizer.transform(X_test)

In [8]:
feature_names = vectorizer.get_feature_names()
len(feature_names)

425585

In [9]:
X_train = pd.DataFrame(X_train.toarray(), columns=feature_names)
X_test = pd.DataFrame(X_test.toarray(), columns=feature_names)

In [10]:
X_train

Unnamed: 0,00 00,00 22,00 320,00 50,00 8x,00 avoid,00 believe,00 bit,00 bot,00 bucks,...,áron feels,áron map,áron organize,áron solace,áron tamás,áron undergoes,árpád sopsits,ça claque,énnui wanna,único que
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X_train = X_train.iloc[:,:42031]
X_train[clf_label+"_label"] = np.array(y_train, dtype='object')

In [12]:
X_test = X_test.iloc[:,:42031]
X_test

Unnamed: 0,00 00,00 22,00 320,00 50,00 8x,00 avoid,00 believe,00 bit,00 bot,00 bucks,...,bobby short,bobby valentino,bobby vee,bobby wanzer,boc sound,boca raton,boccherini adagio,boccherini vivaldi,bocelli fans,bocelli simply
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2379,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2380,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
y_train = np.array(y_train, dtype='object')
y_test = np.array(y_test, dtype='object')
X_train

Unnamed: 0,00 00,00 22,00 320,00 50,00 8x,00 avoid,00 believe,00 bit,00 bot,00 bucks,...,bobby valentino,bobby vee,bobby wanzer,boc sound,boca raton,boccherini adagio,boccherini vivaldi,bocelli fans,bocelli simply,sentiment_category_label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
9527,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,pos
9528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg
9529,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,neg


In [17]:
nb_clf_for_categories = NaiveBayesClassifier()

In [18]:
prior_category, prob_word_given_category,vocabularies,outcomesCategoryPD = nb_clf_for_categories.fit(X_train,y_train,clf_label+"_label")

{'neg': 0.49606547056971984, 'pos': 0.5039345294302802}


In [19]:
y_pred = nb_clf_for_categories.predict(X_test)

1


In [20]:
accuracy = calculate_accuracy(y_test, y_pred)

In [21]:
print(accuracy)

0.5971464540495174


In [22]:
def printConfusionMatrix(y_pred, y_test):
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [23]:
from sklearn.metrics import classification_report, confusion_matrix
printConfusionMatrix(y_pred, y_test)

[[466 720]
 [240 957]]
              precision    recall  f1-score   support

         neg       0.66      0.39      0.49      1186
         pos       0.57      0.80      0.67      1197

    accuracy                           0.60      2383
   macro avg       0.62      0.60      0.58      2383
weighted avg       0.62      0.60      0.58      2383



In [24]:
cond_pos = dict(sorted(prob_word_given_category["pos"].items(), key=lambda item: item[1], reverse=True))
count = 1
for key in cond_pos:
    print("P(word='{}'|{}) = {}".format(key,"label = pos",cond_pos[key]))
    if(count == 10): break
    count+=1

P(word='battery life'|label = pos) = -7.152834257809046
P(word='10 years'|label = pos) = -8.00632408844417
P(word='absolutely love'|label = pos) = -8.050775851015004
P(word='big fan'|label = pos) = -8.050775851015004
P(word='black white'|label = pos) = -8.050775851015004
P(word='20 years'|label = pos) = -8.19737932520688
P(word='adobe photoshop'|label = pos) = -8.19737932520688
P(word='beginning end'|label = pos) = -8.19737932520688
P(word='blood pressure'|label = pos) = -8.19737932520688
P(word='20 minutes'|label = pos) = -8.251446546477155


In [25]:
cond_neg = dict(sorted(prob_word_given_category["neg"].items(), key=lambda item: item[1], reverse=True))
count = 1
for key in cond_neg:
    print("P(word='{}'|{}) = {}".format(key,"label = neg",cond_neg[key]))
    if(count == 10): break
    count+=1

P(word='amazon com'|label = neg) = -7.8809206237447995
P(word='anti virus'|label = neg) = -7.8809206237447995
P(word='blood pressure'|label = neg) = -7.8809206237447995
P(word='10 minutes'|label = neg) = -7.960963331418336
P(word='15 minutes'|label = neg) = -8.003522945837132
P(word='big fan'|label = neg) = -8.003522945837132
P(word='10 years'|label = neg) = -8.047974708407965
P(word='big mistake'|label = neg) = -8.14328488821229
P(word='better quality'|label = neg) = -8.194578182599841
P(word='20 minutes'|label = neg) = -8.248645403870118
