# Machine Learning com Naïve Bayes

## Multinomial...

In [3]:
import numpy as np
import csv
import random
np.set_printoptions(precision=6)

class MultinomialNB(object):
    """Multinomial Naive Bayes classifier with additive smoothing.

    Parameters
    ----------
    alpha : float, default 1.0
        Value added to every per-class feature total before normalising
        (additive smoothing), so no feature gets probability zero.

    Attributes set by ``fit``
    -------------------------
    classes_ : ndarray
        Sorted distinct labels found in ``y``; row ``k`` of the arrays below
        refers to ``classes_[k]``.
    class_log_prior_ : ndarray, shape (n_classes,)
        Log of the fraction of training rows in each class.
    feature_log_prob_ : ndarray, shape (n_classes, n_features)
        Log of the smoothed per-class feature proportions.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and per-class feature log-probabilities.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Non-negative feature values (counts or frequencies).
        y : array-like, shape (n_samples,)
            Class label of each row of ``X``.

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        count_sample = X.shape[0]

        # Remember the labels so predict() can report them instead of indices.
        self.classes_ = np.unique(y)

        # One sub-matrix per class, in the same order as classes_.
        separated = [X[y == c] for c in self.classes_]

        class_sizes = np.array([len(rows) for rows in separated], dtype=float)
        self.class_log_prior_ = np.log(class_sizes / count_sample)

        # Per-class feature totals plus smoothing, normalised row by row.
        count = np.array([rows.sum(axis=0) for rows in separated]) + self.alpha
        self.feature_log_prob_ = np.log(count / count.sum(axis=1, keepdims=True))
        return self

    def predict_log_proba(self, X):
        """Return the per-class score of every row of ``X``.

        The score of a row ``x`` for class ``c`` is
        ``class_log_prior_[c] + sum_i x[i] * feature_log_prob_[c, i]``.
        These are joint log-scores: they are not normalised to sum to one
        across classes.

        Returns
        -------
        list of ndarray
            One array of shape (n_classes,) per input row.
        """
        return [(self.feature_log_prob_ * x).sum(axis=1) + self.class_log_prior_
                for x in np.asarray(X, dtype=float)]

    def predict(self, X):
        """Return the most likely class label for every row of ``X``.

        Returns
        -------
        ndarray
            Labels taken from ``classes_`` (not positions in it); empty when
            ``X`` has no rows.
        """
        scores = np.asarray(self.predict_log_proba(X))
        if scores.size == 0:
            # argmax over axis=1 is undefined without rows.
            return self.classes_[:0]
        # Translate the winning column index back into the actual label.
        return self.classes_[np.argmax(scores, axis=1)]
    
def loadCsv(filename):
    """Read a purely numeric CSV file into a list of rows of floats.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    list of list of float
        One inner list per non-empty line of the file.

    Raises
    ------
    ValueError
        If any field cannot be converted with ``float``.
    """
    # The context manager closes the file even if a conversion fails;
    # newline="" is how the csv module expects text files to be opened.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields [] for blank lines (e.g. a trailing newline);
        # skip them so every returned row has real values.
        return [[float(value) for value in row]
                for row in csv.reader(handle) if row]
 
def splitDataset(dataset, splitRatio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    ``int(len(dataset) * splitRatio)`` rows are drawn one at a time, without
    replacement, using the module-level ``random`` generator. The input
    sequence itself is left untouched.

    Returns
    -------
    list
        ``[drawn_rows, remaining_rows]``; the remaining rows keep their
        original relative order.
    """
    remaining = list(dataset)
    wanted = int(len(dataset) * splitRatio)
    drawn = []
    for _ in range(wanted):
        # Pick a random position among the rows not drawn yet.
        position = random.randrange(len(remaining))
        drawn.append(remaining.pop(position))
    return [drawn, remaining]
    
def getAccuracy(testSet, predictions):
    """Return the percentage of rows whose last element equals its prediction.

    Parameters
    ----------
    testSet : sequence of sequences
        Rows whose final element is the true class.
    predictions : sequence
        Predicted class for each row, in the same order as ``testSet``.

    Returns
    -------
    float
        Share of matching rows, on a 0-100 scale.
    """
    total = len(testSet)
    hits = sum(1 for position in range(total)
               if testSet[position][-1] == predictions[position])
    return (hits / float(total)) * 100.0
    

### Leitura...

In [4]:
# Input file and share of rows used for training.
filename = './spambase.data'
splitRatio = 0.67

# splitDataset draws rows with the global `random` generator; seeding it
# makes the split (and every number derived from it) repeat on each
# Restart & Run All.
randomSeed = 42
random.seed(randomSeed)

dataset = loadCsv(filename)
trainingSet, testSet = splitDataset(dataset, splitRatio)
print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset), len(trainingSet), len(testSet)))

Split 4601 rows into train = 3082 and test = 1519 rows


## Executando o algoritmo

In [5]:
# Feature matrix: every training row, converted to a NumPy array.
# NOTE(review): each row is kept whole, so column 57 -- which is read as the
# class label just below -- is also part of the features. Confirm whether
# that is intended.
x = np.array(trainingSet)

# Class vector y: column 57 of every training row.
y = np.array([exemplo[57] for exemplo in trainingSet])

# Train the classifier.
nb = MultinomialNB().fit(x, y)

### Fazendo uma predição...
Calcular a probabilidade de um dado exemplo pertencer a cada uma das classes...e, assim, identificar a classe com maior probabilidade.

In [6]:
# One hand-written example with 58 values, laid out like the dataset rows.
# Runs of zeros are written as [0] * n to keep the row readable.
teste = np.array([
    [0, 0.64, 0.64, 0, 0.32]
    + [0] * 6
    + [0.64, 0, 0, 0, 0.32, 0, 1.29, 1.93, 0, 0.96]
    + [0] * 30
    + [0.778, 0, 0, 3.756, 61, 278, 1]
])

# Classify the example and show the result.
resultado = nb.predict(teste)
print('Prediction: {0}'.format(resultado))

Prediction: [1]


### Predições para todos os exemplos do conjunto de teste e acurácia

In [7]:
# True class (column 57) of every test row.
# NOTE(review): this list is not used below in this cell; getAccuracy reads
# the label from the last element of each testSet row instead.
resultados = [exemplo[57] for exemplo in testSet]

# Predict a class for every test row.
# NOTE(review): the rows are passed whole, i.e. including column 57.
predicoes = nb.predict(testSet)

# Percentage of test rows predicted correctly.
acuracia = getAccuracy(testSet, predicoes)
print('Accuracy: {0}%'.format(acuracia))

Accuracy: 84.39763001974984%
