## Naive Bayes and unstructured text

$ h_{map} = argmax_{h \in H} P(D | h) P(h) $

#### News Classification

I. Remove all stop words (depends on type of data though)

In [153]:
from nltk.corpus import stopwords
def getStopWord(l=100):
    return list(filter(lambda x: len(x) <= l, stopwords.words('english')))

# returns 57 stopwords

II. Class for training data

In [154]:
from collections import defaultdict, Counter
import os, math
class BayesTextTrain(object):
    def __init__(self, tpath = "/data/ch7_unstructured/20news-bydate/20news-bydate-train/"):
        self.join = os.path.join
        self.vocab = defaultdict(int)
        self.classVocab = defaultdict(lambda: defaultdict)
        self.stopwords = set(getStopWord())
        self.stopwords.add('')
        self.train_path = tpath
        self.test_path = "/data/ch7_unstructured/20news-bydate/20news-bydate-test/" # lazy
        self.classes = BayesTextTrain._getfiles("."+self.train_path) # 20 classes
        self.priors = defaultdict(lambda: defaultdict(int))
        self.matrix = defaultdict(lambda: defaultdict(int))
        self.acc = [0,0] # correct and wrong
        
    @staticmethod
    def _getfiles(directory):
        return os.listdir(directory)
    
    
    def _filterContent(self,c):
        c = c.lower().split("lines:")[-1].split()
        def trimit(w):
            ew = set("/\<>&=!?^+*()[]{}-.%")
            return filter(lambda x: x not in ew, w)
        c = map(lambda x: trimit(x), filter(lambda x: x not in self.stopwords and "@" not in x and len(x) > 2, c))
        return c

    def _countClass(self, clas):
        classCounter = defaultdict(int)
        pth = os.getcwd()+self.join(self.train_path, clas)
        files = BayesTextTrain._getfiles(pth)
        for afile in files:
            filepth = self.join(pth, afile)
            with open(filepth, 'r+') as f:
                content = self._filterContent(f.read())
                for w in content: classCounter[w]+=1
        
        self.classVocab[clas] = classCounter
        for k,v in classCounter.items(): self.vocab[k] += v
    
    def train(self):
        for clas in self.classes: self._countClass(clas)
        for clas in self.classes: self._calcProbClass(clas)
        
    def _calcProbClass(self,clas):
        n = len(self.vocab)
        vocab = self.classVocab[clas]
        for key, val in vocab.items():
            self.priors[clas][key] = (val+1)/float(self.vocab[key]+n)
            cclas = clas
            if clas != cclas:
                print clas
                cclas = clas
                
    def classify(self,clas, article):
        pth = os.getcwd()+self.join(self.test_path, clas+"/"+str(article))
        with open(pth,'r+') as f:
            l = map(lambda x: [0,x], self.classes)
            content = self._filterContent(f.read())
            for word in content:
                for c in l:
                    if self.priors[c[1]][word] == 0: continue
                    c[0] += math.log(self.priors[c[1]][word])
        return min(l)[-1]
    
    def test(self):
        for clas in self.classes:
            pth = self.join(os.getcwd()+"/"+self.test_path,clas)
            files = self._getfiles(pth)
            for f in files:
                predictClass = self.classify(clas, f)
                if (clas == predictClass): self.acc[0] += 1
                else: self.acc[1] += 1
                self.matrix[clas][predictClass] += 1
        
    
    def accuracy(self):
        print self.acc[0]/float(sum(self.acc))

In [155]:
test = BayesTextTrain()

In [156]:
test.train() # takes ~2 min

In [157]:
test.test()

#### Accuracy and confusion matrix

In [158]:
test.accuracy()

0.603956452469


In [159]:
for clas in test.classes:
    print ""
    for clas2 in test.classes:
        n = test.matrix[clas][clas2]
        print str(n) + (3-len(str(n)))*" ",


189 2   2   3   0   1   3   0   7   0   0   5   0   4   10  19  1   9   8   56  
5   187 12  14  12  34  25  7   3   3   5   16  20  10  29  2   1   2   1   1   
4   27  121 40  24  25  51  3   9   1   1   10  10  23  25  0   5   2   11  2   
1   12  60  160 34  6   69  2   2   1   0   3   32  4   4   0   0   0   0   2   
4   28  10  13  191 3   54  6   12  2   5   8   20  16  11  0   0   0   2   0   
2   52  17  17  7   228 12  3   3   1   2   10  6   7   15  5   0   2   4   2   
6   12  13  20  21  1   225 13  10  3   11  6   17  7   10  1   4   4   3   3   
2   7   1   2   0   2   29  255 39  0   2   4   8   9   17  2   9   0   3   5   
2   3   0   0   1   2   13  23  334 0   2   0   2   5   0   0   6   2   2   1   
1   4   0   1   1   1   13  3   11  286 21  2   3   13  5   3   6   4   14  5   
1   2   1   1   0   0   22  0   7   15  321 2   2   3   7   2   2   3   5   3   
5   8   5   4   5   4   16  1   2   0   2   263 7   4   11  3   28  10  10  8   
2   15  8   14  9   1   39 