In [1]:
import sklearn.datasets
import re
from sklearn.cross_validation import train_test_split
import numpy as np



In [2]:
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [3]:
trainset = sklearn.datasets.load_files(container_path = 'local', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['kerajaan', 'pembangkang']
201
201


In [4]:
vocabulary = list(set(' '.join(trainset.data).split()))
len(vocabulary)

1737

In [5]:
# calculate IDF
idf = {}
for i in vocabulary:
    idf[i] = 0
    for k in trainset.data:
        if i in k.split():
            idf[i] += 1
    idf[i] = np.log(idf[i] / len(trainset.data))

# calculate TF
X = np.zeros((len(trainset.data),len(vocabulary)))
for no, i in enumerate(trainset.data):
    for text in i.split():
        X[no, vocabulary.index(text)] += 1
    for text in i.split():
        # calculate TF * IDF
        X[no, vocabulary.index(text)] = X[no, vocabulary.index(text)] * idf[text]
        
X = np.abs(X)

In [6]:
train_X, test_X, train_Y, test_Y = train_test_split(X, trainset.target, test_size = 0.2)

In [7]:
class GaussianNB:
    def __init__(self, epsilon):
        self.EPSILON = epsilon

    def fit(self, X, y):
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.model = np.array([np.c_[np.mean(i, axis=0)+self.EPSILON, np.std(i, axis=0)+self.EPSILON] for i in separated])

    def _prob(self, x, mean, std):
        exponent = np.exp(-((x - mean)**2 / (2 * std**2))+self.EPSILON)
        return np.log((exponent / (np.sqrt(2 * np.pi) * std))+self.EPSILON)

    def predict_log_proba(self, X):
        log_proba = [[sum(self._prob(i, *s) for s, i in zip(summaries, x)) for summaries in self.model] for x in X]
        return [i/np.sum(i) for i in log_proba]

    def predict(self, X):
        return np.argmax(self.predict_log_proba(X), axis=1)

    def score(self, X, y):
        return sum(self.predict(X) == y) / len(y)

In [8]:
gaussian_bayes = GaussianNB(1e-8)
gaussian_bayes.fit(train_X, train_Y)

## accuracy training

In [9]:
gaussian_bayes.score(train_X, train_Y)

0.48125

## accuracy testing

In [10]:
gaussian_bayes.score(test_X, test_Y)

0.5853658536585366

In [11]:
gaussian_bayes.predict_log_proba(test_X)

[array([0.39327101, 0.60672899]),
 array([0.39263689, 0.60736311]),
 array([0.39179131, 0.60820869]),
 array([0.38721061, 0.61278939]),
 array([0.38833738, 0.61166262]),
 array([0.38840505, 0.61159495]),
 array([0.38683972, 0.61316028]),
 array([0.39020765, 0.60979235]),
 array([0.39029644, 0.60970356]),
 array([0.38749508, 0.61250492]),
 array([0.39025201, 0.60974799]),
 array([0.39241201, 0.60758799]),
 array([0.38840757, 0.61159243]),
 array([0.38883126, 0.61116874]),
 array([0.38506275, 0.61493725]),
 array([0.38782718, 0.61217282]),
 array([0.38773943, 0.61226057]),
 array([0.38692275, 0.61307725]),
 array([0.38791895, 0.61208105]),
 array([0.38972045, 0.61027955]),
 array([0.3915118, 0.6084882]),
 array([0.38675418, 0.61324582]),
 array([0.39001741, 0.60998259]),
 array([0.38830252, 0.61169748]),
 array([0.38751443, 0.61248557]),
 array([0.38995648, 0.61004352]),
 array([0.39218534, 0.60781466]),
 array([0.38756023, 0.61243977]),
 array([0.38868535, 0.61131465]),
 array([0.386858