In [1]:
import sklearn.datasets
import re
from sklearn.cross_validation import train_test_split
import numpy as np



In [2]:
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [3]:
trainset = sklearn.datasets.load_files(container_path = 'local', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['kerajaan', 'pembangkang']
201
201


In [4]:
vocabulary = list(set(' '.join(trainset.data).split()))
len(vocabulary)

1737

In [5]:
# calculate IDF
idf = {}
for i in vocabulary:
    idf[i] = 0
    for k in trainset.data:
        if i in k.split():
            idf[i] += 1
    idf[i] = np.log(idf[i] / len(trainset.data))

# calculate TF
X = np.zeros((len(trainset.data),len(vocabulary)))
for no, i in enumerate(trainset.data):
    for text in i.split():
        X[no, vocabulary.index(text)] += 1
    for text in i.split():
        # calculate TF * IDF
        X[no, vocabulary.index(text)] = X[no, vocabulary.index(text)] * idf[text]
        
X = np.abs(X)

In [6]:
train_X, test_X, train_Y, test_Y = train_test_split(X, trainset.target, test_size = 0.2)

In [7]:
class MultinomialNB:
    def __init__(self, epsilon):
        self.EPSILON = epsilon

    def fit(self, X, y):
        count_sample = X.shape[0]
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.class_log_prior_ = [np.log((len(i) / count_sample)+self.EPSILON) for i in separated]
        count = np.array([np.array(i).sum(axis=0) for i in separated])
        self.feature_log_prob_ = np.log((count / count.sum(axis=1)[np.newaxis].T)+self.EPSILON)

    def predict_log_proba(self, X):
        log_proba = [(self.feature_log_prob_ * x).sum(axis=1) + self.class_log_prior_ for x in X]
        return [1-(i/np.sum(i)) for i in log_proba]

    def predict(self, X):
        return np.argmax(self.predict_log_proba(X), axis=1)

In [8]:
multinomial_bayes = MultinomialNB(1e-8)
multinomial_bayes.fit(train_X, train_Y)

## accuracy training

In [9]:
np.mean(train_Y == multinomial_bayes.predict(train_X))

1.0

## accuracy testing

In [10]:
np.mean(test_Y == multinomial_bayes.predict(test_X))

0.9024390243902439

In [11]:
multinomial_bayes.predict_log_proba(test_X)

[array([0.4160699, 0.5839301]),
 array([0.55938444, 0.44061556]),
 array([0.59284875, 0.40715125]),
 array([0.62516137, 0.37483863]),
 array([0.44937446, 0.55062554]),
 array([0.45295124, 0.54704876]),
 array([0.44677705, 0.55322295]),
 array([0.53576261, 0.46423739]),
 array([0.47820352, 0.52179648]),
 array([0.46795305, 0.53204695]),
 array([0.52081375, 0.47918625]),
 array([0.42479686, 0.57520314]),
 array([0.48283977, 0.51716023]),
 array([0.41845241, 0.58154759]),
 array([0.50002556, 0.49997444]),
 array([0.55858497, 0.44141503]),
 array([0.55340458, 0.44659542]),
 array([0.42416387, 0.57583613]),
 array([0.55213442, 0.44786558]),
 array([0.53164622, 0.46835378]),
 array([0.39331215, 0.60668785]),
 array([0.41795026, 0.58204974]),
 array([0.60649046, 0.39350954]),
 array([0.46393913, 0.53606087]),
 array([0.43016626, 0.56983374]),
 array([0.59933077, 0.40066923]),
 array([0.4685191, 0.5314809]),
 array([0.64641473, 0.35358527]),
 array([0.51236647, 0.48763353]),
 array([0.55986553