## BernoulliNB Classifier

#### Importing data

In [3]:
import pickle
from nltk.corpus import stopwords
from string import punctuation
import re

x_train, y_train, x_test, y_test = pickle.load(open("data/sklearn-data.pickle", "rb")).values()


'\n# Training on smaller dataset\nn = 100000\nx_train = x_train[:n]\ny_train = y_train[:n]\n'

#### Preprocessing

In [4]:
# Removing special characters
for index, review in enumerate(x_train):
    x_train[index] = re.sub('['+punctuation+']', '', review)
    
for index, review in enumerate(x_test):
    x_test[index] = re.sub('['+punctuation+']', '', review)

In [34]:
from collections import Counter

stop_words = set(stopwords.words('english'))

# Checking number of unique words
words = [word for sentence in x_train for word in sentence.split() if word not in stop_words and word not in punctuation]
print(len(Counter(words).keys())) 

30630


#### Creating a HashingVectorizer for the word-features

In [5]:
from sklearn.feature_extraction.text import HashingVectorizer

stop_words = set(stopwords.words('english'))
vectorizer = HashingVectorizer(norm="l1",n_features=2**28, ngram_range=(1,3),
                               stop_words=stop_words, binary=False, alternate_sign=False )


corpus = [
    "My name is Henrik pannekake",
    "I am Henrik",
]

x_train_coded = vectorizer.fit_transform(x_train)
x_test_coded = vectorizer.fit_transform(x_test)

# Testing
print(vectorizer.fit_transform(corpus).toarray())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Naive Bayes classifier for multivariate Bernoulli models

In [6]:
from sklearn.naive_bayes import BernoulliNB
classifier_nb = BernoulliNB(alpha=1.0e-10)

# Fitting classifier
classifier_nb.fit(x_train_coded, y_train)

BernoulliNB(alpha=1e-10, binarize=0.0, class_prior=None, fit_prior=True)

In [7]:
from sklearn.metrics import accuracy_score

# Predicting test data and checking accuracy
preds_nb = classifier_nb.predict(x_test_coded)
print(f"Accuracy of classifier is {accuracy_score(preds_nb, y_test)}")

Accuracy of classifier is 0.9084640843343957


### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

classifier_dt = DecisionTreeClassifier()
classifier_dt.fit(x_train_coded, y_train)

In [None]:
# Predicting test data and checking accuracy
preds_dt = classifier_dt.predict(x_test_coded)
print(f"Accuracy of classifier is {accuracy_score(preds_dt, y_test)}")