# LAB 09 - NIAVE BAYES' NETWORK

In [1]:


from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.

X = ['offer secret', 'click secret link', 'secret sports link', 'play sports today', 'went play sports', 'secret sports event', 'sports today', 'sports costs money']
Y = [1,1,1,0,0,0,0,0] # review labels. 1 indicate spam, 0 non-spam

vectorizer = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 100) 
# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
X = vectorizer.fit_transform(X)

# Numpy arrays are easy to work with, so convert the result to an 
# array
X = X.toarray()
print (X.shape)

print (vectorizer.vocabulary_) 

(8, 11)
{'offer': 5, 'secret': 7, 'click': 0, 'link': 3, 'sports': 8, 'play': 6, 'today': 9, 'went': 10, 'event': 2, 'costs': 1, 'money': 4}


In [2]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=0.000001) # alpha=0 means no laplace smoothing
clf.fit(X, np.array(Y))

test_reviews = ['sports', 'secret secret', 'today secret']
# bag of word representation
tX = vectorizer.transform(test_reviews).toarray()
# prediction
print(clf.predict(tX))

print (clf.predict_proba(tX)) 


[0 1 0]
[[8.26446251e-01 1.73553749e-01]
 [5.70208051e-02 9.42979195e-01]
 [9.99997244e-01 2.75623594e-06]]


In [11]:
import sklearn
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB

train = pd.read_csv('labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)

def review_to_words(raw_review):
    #1. Remove HTML
    review_bs_obj = BeautifulSoup(raw_review)
    review = review_bs_obj.get_text()
    #2. Remove non letters
    review = re.sub('[^A-Za-z]+',' ', review)
    #3. Convert to lowercase and split it into words
    review_words = review.lower().split()
    #4. Remove stops words
    stops = set(stopwords.words('english'))
    review_words = [w for w in review_words if not w in stops] 
    #5. Joint back and return the joined sentence
    review_sentence = " ".join(review_words)
    return review_sentence


vectorizer = sklearn.feature_extraction.text.CountVectorizer(analyzer='word',
                                                            tokenizer=None,
                                                            preprocessor=None,
                                                            stop_words=None,
                                                            max_features=3000)
train_size = 22000
print("\nTraining data size: ", train_size)
X_train = train['review'][:train_size]
Y_train = train['sentiment'][:train_size]
X_test = train['review'][train_size:]
Y_test = train['sentiment'][train_size:]

training_X = vectorizer.fit_transform(X_train).toarray()
testing_X = vectorizer.fit_transform(X_test).toarray()

clf = MultinomialNB(alpha=5)
clf = clf.fit(training_X, Y_train)

predictions_train = clf.predict(training_X) 
predictions_test = clf.predict(testing_X)

compare_train = predictions_train == Y_train
compare_test = predictions_test == Y_test
print("Accuracy achieved on training data: ", compare_train.mean() * 100, "%")
print("Accuracy achieved on testing data: ", compare_test.mean() * 100, "%")

[nltk_data] Downloading package stopwords to C:\Users\Hamna
[nltk_data]     Moiz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Training data size:  22000
Accuracy achieved on training data:  83.47272727272727 %
Accuracy achieved on testing data:  52.36666666666666 %


| VocSize |Alpha  |Accuracy            |
|---------|-------|--------------------|
|3000     |0.00001|52.2 %              |
|3000     |5      |52.36666666666666 % |
|5000     |5      |57.699999999999996 %|
|5000     |0.00001|57.199999999999996 %|

