In [3]:
import collections
import re
import matplotlib.pyplot as plt
import numpy as np
import itertools
import os

In [23]:
# List the contents of the bbc dataset directory (category folders plus README.TXT).
!ls bbc

business  entertainment  politics  README.TXT  sport  tech


In [44]:
# Build a "<directory>/<file name>" path for every file found while walking the bbc tree.
arquivos = [pasta + '/' + nome for pasta, _subpastas, nomes in os.walk("bbc") for nome in nomes]

In [45]:
# Split the paths by their second '/'-separated component (the category folder):
# 'tech' on one side, the four other categories on the other.
arquivosTech = [caminho for caminho in arquivos if caminho.split('/')[1] == 'tech']
arquivosNTech = [
    caminho
    for caminho in arquivos
    if caminho.split('/')[1] in ('business', 'entertainment', 'politics', 'sport')
]

In [53]:
def readDoc(file):
    """Return the whole content of the text file at ``file``, decoded as Latin-1."""
    with open(file, mode='r', encoding='latin-1') as origem:
        conteudo = origem.read()
    return conteudo

In [242]:
# TODO: decide whether digits should really be stripped (they currently are).
def tokenize(text):
    """Split ``text`` into lowercase tokens.

    The text is lowercased, the characters ``, . + =`` are replaced by spaces,
    ASCII digits are deleted, and the result is split on runs of whitespace.

    Args:
        text: the string to tokenize.

    Returns:
        A list of non-empty token strings. Text with no token content
        (empty, whitespace only, digits/punctuation only) gives ``[]``.
    """
    text = text.lower()
    text = re.sub(r'[,.+=]', ' ', text)
    text = re.sub(r'[0-9]', '', text)
    # str.split() with no separator collapses whitespace runs (newlines included),
    # ignores leading/trailing whitespace and never yields an empty token, so an
    # empty document no longer contributes a spurious '' entry to the vocabulary.
    return text.split()

In [243]:
# TODO: remove stop words?
def Train(docsTrue, docsFalse):
    """Fit a two-class Naive Bayes model with add-one smoothing on token counts.

    Args:
        docsTrue: sequence of document strings belonging to the "True" class.
        docsFalse: sequence of document strings belonging to the "False" class.

    Returns:
        A dict with the keys
        ``'logPrioriT'`` / ``'logPrioriF'``: log of each class's share of the documents,
        ``'likelyT'`` / ``'likelyF'``: dict mapping every vocabulary token to
        ``log((count_in_class + 1) / (tokens_in_class + len(V)))``,
        ``'V'``: the set of all tokens seen in either class.

    Raises:
        ZeroDivisionError: if both document sequences are empty.
    """
    total = len(docsTrue) + len(docsFalse)
    logPrioriT = np.log(len(docsTrue) / total)
    logPrioriF = np.log(len(docsFalse) / total)

    # Tokenize each document exactly once; the vocabulary is then just the
    # union of the tokens counted for the two classes.
    counterT = collections.Counter(tok for doc in docsTrue for tok in tokenize(doc))
    counterF = collections.Counter(tok for doc in docsFalse for tok in tokenize(doc))
    V = set(counterT) | set(counterF)

    # Add-one smoothing: every vocabulary token gets one extra count per class,
    # hence len(V) is added to each class's total token count.
    denT = sum(counterT.values()) + len(V)
    denF = sum(counterF.values()) + len(V)

    # Counter returns 0 for tokens absent from a class, so unseen tokens get 1/den.
    likelyhoodT = {v: np.log((counterT[v] + 1) / denT) for v in V}
    likelyhoodF = {v: np.log((counterF[v] + 1) / denF) for v in V}

    return {'logPrioriT': logPrioriT, 'logPrioriF': logPrioriF, 'likelyT': likelyhoodT, 'likelyF': likelyhoodF, 'V': V}

In [244]:
# Fit the model on the full corpus; docsTrue/docsFalse are assigned in the loading cell (In [84]) shown further down.
modelo = Train(docsTrue=docsTrue, docsFalse=docsFalse)

In [245]:
def Classify(modelo, doc):
    """Classify ``doc`` with a model dict produced by ``Train``.

    Each class score starts at the class log-prior and accumulates the class
    log-likelihood of every token of ``doc`` that belongs to the vocabulary;
    tokens outside the vocabulary are ignored.

    Args:
        modelo: dict with the keys 'logPrioriT', 'logPrioriF', 'likelyT',
            'likelyF' and 'V'.
        doc: the document string to classify.

    Returns:
        The result of ``np.argmax([classF, classT])``: 1 when the "True" class
        scores higher, 0 otherwise (a tie resolves to 0, the first maximum).
    """
    likelyT = modelo['likelyT']
    likelyF = modelo['likelyF']
    V = modelo['V']

    classT = modelo['logPrioriT']
    classF = modelo['logPrioriF']

    # Tokenize once and update both scores in the same pass; each score still
    # receives its terms in document order.
    for w in tokenize(doc):
        if w in V:
            classT += likelyT[w]
            classF += likelyF[w]

    return np.argmax([classF, classT])

In [246]:
def CrossValidation(docsTrue, docsFalse, k=10):
    """Generate the ``k`` (train, test) splits of a k-fold cross-validation.

    The documents are labelled (1 for ``docsTrue``, 0 for ``docsFalse``),
    shuffled with ``np.random.shuffle`` and partitioned into exactly ``k``
    contiguous folds whose sizes differ by at most one, so every document lands
    in exactly one test fold and none is dropped.

    Args:
        docsTrue: sequence of documents of the positive class.
        docsFalse: sequence of documents of the negative class.
        k: number of folds. With more folds than documents some folds are empty.

    Yields:
        ``k`` tuples ``(train, test)``; both are lists of ``(doc, label)``
        pairs, ``test`` being fold ``i`` and ``train`` all the other folds.

    Raises:
        ZeroDivisionError: if ``k`` is 0 (raised when iteration starts).
    """
    docs = [(x, 1) for x in docsTrue]
    docs = docs + [(x, 0) for x in docsFalse]
    np.random.shuffle(docs)

    # The first `extra` folds take one additional document. Unlike slicing with
    # a rounded fold size, this always produces exactly k folds that cover docs.
    base, extra = divmod(len(docs), k)

    grupos = []
    inicio = 0
    for i in range(k):
        fim = inicio + base + (1 if i < extra else 0)
        grupos.append(docs[inicio:fim])
        inicio = fim

    for i in range(k):
        treino = [doc for z in range(k) if z != i for doc in grupos[z]]
        yield (treino, grupos[i])

In [247]:
# Scratch check of set difference: removes 5 from {0, ..., 9}.
set(range(10)).difference({5})

{0, 1, 2, 3, 4, 6, 7, 8, 9}

In [248]:
def Experimento(setup):
    """Train on one split and classify its held-out part.

    Args:
        setup: indexable whose element 0 is the training data and element 1 the
            test data, each an iterable of ``(doc, label)`` pairs with label 1 or 0.

    Returns:
        A list of ``(predicted, label)`` pairs, one per test example, where
        ``predicted`` is the value returned by ``Classify``.
    """
    exemplosTreino, exemplosTeste = setup[0], setup[1]

    # Separate the training documents by label.
    positivos, negativos = [], []
    for exemplo in exemplosTreino:
        if exemplo[1] == 1:
            positivos.append(exemplo[0])
        if exemplo[1] == 0:
            negativos.append(exemplo[0])

    classificador = Train(positivos, negativos)

    pares = []
    for texto, rotulo in exemplosTeste:
        pares.append((Classify(classificador, texto), rotulo))
    return pares
    

In [249]:
def Score(r):
    """Compute precision, recall and F1 for the positive class (label 1).

    Args:
        r: iterable of ``(predicted, actual)`` pairs with values 0 or 1.

    Returns:
        A tuple ``(precision, recall, f1)``. A metric whose denominator is zero
        (no predicted positives, no actual positives, or precision + recall == 0)
        is reported as 0.0 instead of raising ZeroDivisionError or yielding nan.
    """
    tp = fp = fn = 0
    for x in r:
        previsto, real = x[0], x[1]
        if previsto == 1 and real == 1:
            tp += 1
        elif previsto == 1 and real == 0:
            fp += 1
        elif previsto == 0 and real == 1:
            fn += 1
        # True negatives are not needed by any of the three metrics.

    prec = tp / (tp + fp) if (tp + fp) else 0.0
    rev = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * prec * rev / (prec + rev) if (prec + rev) else 0.0

    return (prec, rev, f1)

In [251]:
# 20-fold cross-validation: print the tuple returned by Score for every fold.
setups = CrossValidation(docsTrue, docsFalse, k=20)

for s in setups:
    train, test = s
    r = Experimento((train, test))
    print(Score(r))

(0.9629629629629629, 1.0, 0.9811320754716981)
(1.0, 0.9444444444444444, 0.9714285714285714)
(1.0, 1.0, 1.0)
(1.0, 1.0, 1.0)
(1.0, 1.0, 1.0)
(0.9583333333333334, 1.0, 0.9787234042553191)
(0.85, 1.0, 0.9189189189189189)
(0.95, 1.0, 0.9743589743589743)
(1.0, 0.9473684210526315, 0.972972972972973)
(0.9583333333333334, 1.0, 0.9787234042553191)
(0.9583333333333334, 1.0, 0.9787234042553191)
(1.0, 1.0, 1.0)
(1.0, 0.9411764705882353, 0.9696969696969697)
(0.8333333333333334, 1.0, 0.9090909090909091)
(0.9545454545454546, 1.0, 0.9767441860465117)
(1.0, 0.9444444444444444, 0.9714285714285714)
(0.875, 1.0, 0.9333333333333333)
(0.8823529411764706, 0.967741935483871, 0.923076923076923)
(0.9285714285714286, 0.9285714285714286, 0.9285714285714286)
(0.9523809523809523, 1.0, 0.975609756097561)


10

In [84]:
# Read the article texts: tech files feed docsTrue, all other categories feed docsFalse.
docsTrue = list(map(readDoc, arquivosTech))
docsFalse = list(map(readDoc, arquivosNTech))

In [85]:
# Fit on the full corpus and display the returned model dict.
Train(docsTrue=docsTrue, docsFalse=docsFalse)

{'logPrioriT': -1.7119585786383464, 'logPrioriF': -0.1990754012407757}