# Chapter 6 - Other Popular Machine Learning Methods
## Segment 5 - Naive Bayes Classifiers

In [2]:
import numpy as np
import pandas as pd
import urllib
import sklearn

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [3]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Naive Bayes
### Using Naive Bayes to predict spam

In [4]:
# Download the UCI Spambase data and parse it into a 2-D float array.
# The submodule is imported explicitly: a bare `import urllib` does not
# guarantee that `urllib.request` has been loaded.
import urllib.request

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Open the HTTP response in a context manager so the connection is closed
# as soon as parsing finishes, even if np.loadtxt raises.
with urllib.request.urlopen(url) as raw_data:
    dataset = np.loadtxt(raw_data, delimiter=',')

# Show the first record as a quick sanity check of the parse.
print(dataset[0])

[  0.      0.64    0.64    0.      0.32    0.      0.      0.      0.
   0.      0.      0.64    0.      0.      0.      0.32    0.      1.29
   1.93    0.      0.96    0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.      0.      0.
   0.      0.      0.      0.      0.      0.      0.778   0.      0.
   3.756  61.    278.      1.   ]


In [5]:
# Feature matrix: columns 0..47 of every row (only the first 48 variables).
# Target vector: the last column, where spam=1 and nonspam=0.
X, y = dataset[:, :48], dataset[:, -1]

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [7]:
# Baseline: Bernoulli naive Bayes.
# `binarize` is a numeric threshold, not an on/off switch: feature values
# above the threshold are mapped to 1 and everything else to 0. The threshold
# is spelled out as 1.0 because a boolean True would simply be treated as the
# number 1, which hides what the parameter actually does.
BernNB = BernoulliNB(binarize=1.0)
BernNB.fit(X_train, y_train)
print(BernNB)

# y_expect is the ground truth that the predictions are scored against.
y_expect = y_test
y_pred = BernNB.predict(X_test)

# Fraction of held-out records classified correctly.
print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=True)
0.8577633007600435


In [8]:
# Multinomial naive Bayes on the same train/test split. No binarization
# threshold is involved for this model.
MultiNB = MultinomialNB().fit(X_train, y_train)  # fit() returns the estimator itself
print(MultiNB)

# Score against y_expect, which was set to y_test in an earlier cell.
y_pred = MultiNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

MultinomialNB()
0.8816503800217155


In [9]:
# Third variant: Gaussian naive Bayes, trained and scored the same way as
# the previous models.
GausNB = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator itself
print(GausNB)

# Predict on the held-out rows and report accuracy against y_expect.
y_pred = GausNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

GaussianNB()
0.8197611292073833


In [10]:
# Revisit Bernoulli naive Bayes with a lower binarization threshold.
# 0.1 was found by trial and error; change the value to see how the
# accuracy responds.
# NOTE(review): the threshold was chosen by looking at test-set accuracy, so
# this score is presumably optimistic -- confirm with a separate validation split.
BernNB = BernoulliNB(binarize=0.1).fit(X_train, y_train)  # fit() returns the estimator itself
print(BernNB)

y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=0.1)
0.9109663409337676


In [11]:
# Hyperparameter tuning to improve accuracy: grid-search GaussianNB's
# `var_smoothing` with cross-validation on the training split.
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PowerTransformer  # NOTE(review): not used in this cell

# Estimator to tune; `var_smoothing` is a GaussianNB parameter.
model = GaussianNB()

# Number of cross-validation folds handed to GridSearchCV.
cv_method = 5

# 100 candidate values spaced logarithmically from 1e0 down to 1e-9.
params_NB = {'var_smoothing': np.logspace(0, -9, num=100)}

gridsearch = GridSearchCV(estimator=model, param_grid=params_NB, cv=cv_method, verbose=1, scoring='accuracy')

# Search on the training data only, so the test split stays untouched
# for the final accuracy check below.
gridsearch.fit(X_train, y_train)

print(gridsearch.best_params_)
print(gridsearch.best_score_)  # mean cross-validated accuracy of the best candidate

# GridSearchCV refits the best candidate on the full training split by
# default, so it can predict directly.
print(accuracy_score(y_test, gridsearch.predict(X_test)))