In [1]:
import sklearn.datasets
import re
from sklearn.cross_validation import train_test_split
import numpy as np



In [2]:
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string

def separate_dataset(trainset):
    datastring = []
    datatarget = []
    for i in range(len(trainset.data)):
        data_ = trainset.data[i].split('\n')
        data_ = list(filter(None, data_))
        for n in range(len(data_)):
            data_[n] = clearstring(data_[n])
        datastring += data_
        for n in range(len(data_)):
            datatarget.append(trainset.target[i])
    return datastring, datatarget

In [3]:
trainset = sklearn.datasets.load_files(container_path = 'local', encoding = 'UTF-8')
trainset.data, trainset.target = separate_dataset(trainset)
print (trainset.target_names)
print (len(trainset.data))
print (len(trainset.target))

['kerajaan', 'pembangkang']
201
201


In [4]:
vocabulary = list(set(' '.join(trainset.data).split()))
len(vocabulary)

1737

In [5]:
idf = np.zeros((len(vocabulary)))
for no, i in enumerate(vocabulary):
    for s in trainset.data:
        if i in s.split():
            idf[no] += 1
    idf[no] = np.log(len(trainset.data) / idf[no])
idf

array([5.30330491, 5.30330491, 5.30330491, ..., 5.30330491, 4.61015773,
       5.30330491])

In [6]:
tf = np.zeros((len(trainset.data),len(vocabulary)))
for no, s in enumerate(trainset.data):
    for w in s.split():
        tf[no, vocabulary.index(w)] += 1
tfidf = tf * idf

tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
label = np.array(trainset.target)

In [8]:
store_total_tfidf = {}
for l in np.unique(label):
    store_total_tfidf[l] = tfidf[np.where(label == l)[0]].sum()
store_total_tfidf

{0: 8359.847501116172, 1: 5170.803704701069}

In [9]:
prob = np.zeros([len(trainset.target_names)])
counts = np.unique(label, return_counts = True)
for i in range(len(counts[0])):
    prob[counts[0][i]] = counts[1][i] / label.shape[0]
prob

array([0.49751244, 0.50248756])

In [10]:
trained_tfidf = np.zeros((len(store_total_tfidf), tfidf.shape[1]))
trained_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
trained_tfidf[0] = np.log((tfidf[np.where(label == 0)[0]].sum(axis=0) + 1) / (store_total_tfidf[0] + tfidf.shape[1]))
trained_tfidf[1] = np.log((tfidf[np.where(label == 1)[0]].sum(axis=0) + 1) / (store_total_tfidf[1] + tfidf.shape[1]))
trained_tfidf

array([[-7.37890444, -7.37890444, -7.37890444, ..., -7.37890444,
        -6.89560107, -9.21997853],
       [-8.84040702, -8.84040702, -8.84040702, ..., -8.84040702,
        -8.84040702, -6.99933294]])

In [12]:
def softmax(x):
    mx = np.max(x, axis=-1, keepdims=True)
    top = np.exp(x - mx)
    bottom = np.sum(top, axis = 1, keepdims = True)
    return top / bottom

In [13]:
new_result = tfidf.dot(trained_tfidf.T) + np.log(prob)
softmax(new_result)[:10]

array([[1.13435465e-63, 1.00000000e+00],
       [1.16575968e-45, 1.00000000e+00],
       [1.23479225e-81, 1.00000000e+00],
       [1.20714646e-38, 1.00000000e+00],
       [1.19345584e-41, 1.00000000e+00],
       [4.00001684e-70, 1.00000000e+00],
       [5.06942386e-73, 1.00000000e+00],
       [1.49247236e-43, 1.00000000e+00],
       [3.29223786e-40, 1.00000000e+00],
       [1.87686650e-32, 1.00000000e+00]])

In [14]:
train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)

In [15]:
class MultinomialNB:
    def __init__(self):
        pass

    def fit(self, X, Y):
        self.label = np.array(Y)
        self.store_total = {}
        for l in np.unique(label):
            self.store_total[l] = X[np.where(self.label == l)[0]].sum()
        
        self.prob = np.zeros([len(self.store_total)])
        counts = np.unique(self.label, return_counts = True)
        for i in range(len(counts[0])):
            self.prob[counts[0][i]] = counts[1][i] / self.label.shape[0]
        
        self.log_prob = np.zeros((len(self.store_total), X.shape[1]))
        
        for no, l in enumerate(np.unique(label)):
            top = X[np.where(self.label == 0)[0]].sum(axis=0) + 1
            bottom = self.store_total[l] + X.shape[1]
            self.log_prob[no] = np.log(top / bottom)

    def predict_proba(self, X):
        new_result = X.dot(self.log_prob.T) + np.log(self.prob)
        return softmax(new_result)
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis = 1)

In [16]:
multinomial_bayes = MultinomialNB()
multinomial_bayes.fit(train_X, train_Y)

## accuracy training

In [17]:
np.mean(train_Y == multinomial_bayes.predict(train_X))

0.49375

## accuracy testing

In [18]:
np.mean(test_Y == multinomial_bayes.predict(test_X))

0.5365853658536586

In [19]:
multinomial_bayes.predict_proba(test_X)

array([[1.32087101e-09, 9.99999999e-01],
       [1.18502362e-14, 1.00000000e+00],
       [9.38632721e-20, 1.00000000e+00],
       [4.55619638e-06, 9.99995444e-01],
       [4.07506407e-18, 1.00000000e+00],
       [3.33260593e-16, 1.00000000e+00],
       [1.26882112e-17, 1.00000000e+00],
       [1.94283519e-09, 9.99999998e-01],
       [2.63618910e-17, 1.00000000e+00],
       [4.21985803e-10, 1.00000000e+00],
       [7.13460635e-08, 9.99999929e-01],
       [3.76005742e-09, 9.99999996e-01],
       [1.39409848e-07, 9.99999861e-01],
       [5.47949491e-16, 1.00000000e+00],
       [8.56484791e-17, 1.00000000e+00],
       [3.59546649e-07, 9.99999640e-01],
       [2.05689487e-15, 1.00000000e+00],
       [1.52764013e-09, 9.99999998e-01],
       [1.16722178e-15, 1.00000000e+00],
       [4.29137996e-10, 1.00000000e+00],
       [1.33689912e-10, 1.00000000e+00],
       [5.58334180e-08, 9.99999944e-01],
       [2.15378196e-16, 1.00000000e+00],
       [2.37915138e-08, 9.99999976e-01],
       [5.058130