
# 10장 문서 분류 (Document Classification)

# 11-1 나이브 베이즈 분류(Naive Bayes Classifier)

## 1.1 직접구현

### Naive Bayes Classifier

In [1]:
# Toy corpus of [text, label] pairs; label 1 marks spam, label 0 marks normal mail.
training_set = [
    ['me free lottery', 1],
    ['free get free you', 1],
    ['you free scholarship', 0],
    ['free to contact me', 0],
    ['you won award', 0],
    ['you ticket lottery', 1],
]

### 토큰 빈도수 및 문서별 토큰수 계산 (확률 계산을 위한 준비)

![대체 텍스트](https://wikimedia.org/api/rest_v1/media/math/render/svg/98f086c560aa2f66650060277dda4f90e54e30c0)

In [28]:
from collections import defaultdict

# Total number of tokens per class: label 0 is normal, anything else is spam.
# Despite the "doccnt" names these are token totals, not document counts.
doccnt0 = sum(len(entry[0].split()) for entry in training_set if entry[1] == 0)
doccnt1 = sum(len(entry[0].split()) for entry in training_set if entry[1] != 0)

print(doccnt0, doccnt1)


10 10


In [45]:
# Per-token frequency table: every token maps to a two-slot count list,
# created as [0, 0] the first time the token is looked up.
wordfreq = defaultdict(lambda : [0, 0])

# Goal: walk through each sentence token by token and add 1 to one slot when
# the sentence is spam, or add 1 to the other slot when it is normal.

In [46]:
# Look up every training token once; because wordfreq is a defaultdict, each
# lookup creates the [0, 0] entry for that token. The resulting list is only
# the cell's displayed value.
[wordfreq[token] for sent in training_set for token in sent[0].split()]

[[0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0]]

In [47]:
wordfreq['me'][0]

0

In [None]:
# training_set의 문장을 하나씩 순회하면서
# split으로 토큰을 나누고
# 그 문장의 라벨이 1(스팸)이면 wordfreq[token]의 첫 번째 원소(인덱스 0)에, 0(정상)이면 두 번째 원소(인덱스 1)에 +1씩 더하자

In [48]:
# Accumulate per-token counts. In this section slot 0 collects spam
# occurrences (label == 1) and slot 1 collects everything else.
for entry in training_set:
    slot = 0 if entry[1] == 1 else 1
    for token in entry[0].split():
        wordfreq[token][slot] += 1

In [67]:
wordfreq

defaultdict(<function __main__.<lambda>()>,
            {'me': [1, 1],
             'free': [3, 2],
             'lottery': [2, 0],
             'get': [1, 0],
             'you': [2, 2],
             'scholarship': [0, 1],
             'to': [0, 1],
             'contact': [0, 1],
             'won': [0, 1],
             'award': [0, 1],
             'ticket': [1, 0],
             0: [0, 0]})

### Training : 토큰별 조건부 확률 계산 

In [53]:
k = 0.5  # smoothing constant added to every count during training

# Probability table with the same two-slot layout as wordfreq.
wordprobs = defaultdict(lambda : [0, 0])
# Look up every training token once so that wordprobs holds a [0, 0] entry for
# each vocabulary word before the training loop fills it in.
[wordprobs[token] for sent in training_set for token in sent[0].split() ]


[[0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0]]

In [90]:
# Convert raw counts into smoothed conditional probabilities P(token | class).
# Slot layout follows this section's counting loop: slot 0 holds the spam
# figure (normalised by doccnt1, the spam token total) and slot 1 holds the
# normal figure (normalised by doccnt0, the normal token total).
for key in wordprobs:
    spam_count, normal_count = wordfreq[key]
    # Adding k to the numerator (and 2*k to the denominator) keeps every
    # probability strictly positive, so math.log() is defined when classifying.
    wordprobs[key][0] = (spam_count + k) / (doccnt1 + 2*k)
    wordprobs[key][1] = (normal_count + k) / (doccnt0 + 2*k)
        

In [71]:
# Hand check of a single smoothed probability.
# NOTE(review): the denominator here adds k once (10 + 0.5), while the training
# loop divides by doccnt + 2*k, so this value does not match wordprobs entries.
(1+0.5) / (10+0.5)

0.14285714285714285

In [91]:
wordprobs

defaultdict(<function __main__.<lambda>()>,
            {'me': [0.13636363636363635, 0.13636363636363635],
             'free': [0.22727272727272727, 0.3181818181818182],
             'lottery': [0.045454545454545456, 0.22727272727272727],
             'get': [0.045454545454545456, 0.13636363636363635],
             'you': [0.22727272727272727, 0.22727272727272727],
             'scholarship': [0.13636363636363635, 0.045454545454545456],
             'to': [0.13636363636363635, 0.045454545454545456],
             'contact': [0.13636363636363635, 0.045454545454545456],
             'won': [0.13636363636363635, 0.045454545454545456],
             'award': [0.13636363636363635, 0.045454545454545456],
             'ticket': [0.045454545454545456, 0.13636363636363635]})

In [92]:
### For smooth (numerically stable) computation, work with the log of the spam / normal priors
import math

# A class prior is the share of training *documents* in that class.
# doccnt0 / doccnt1 are token totals, which match the document share only when
# both classes happen to contain the same number of tokens.
num_docs = len(training_set)
num_spam_docs = sum(1 for entry in training_set if entry[1] == 1)
num_normal_docs = num_docs - num_spam_docs

spam = num_spam_docs / num_docs
normal = num_normal_docs / num_docs

logspam = math.log(spam)
lognon = math.log(normal)

logspam, lognon

(-0.6931471805599453, -0.6931471805599453)

### Classify : 신규 텍스트가 주어졌을 때 확률 계산

In [93]:
import math

doc = "free lottery"

tokens = doc.split()  # tokenise the new document on whitespace, as in training

log_prob_spam, log_prob_non = 0, 0

# Sum log-likelihoods instead of multiplying probabilities.
# Slot 0 of wordprobs is read as the spam probability, slot 1 as the normal one.
for token in tokens:
    # wordprobs is a defaultdict: looking up an unseen token would insert
    # [0, 0] into the vocabulary and math.log(0) would raise ValueError,
    # so tokens that were never seen in training are skipped.
    if token not in wordprobs:
        continue
    spam_prob, normal_prob = wordprobs[token]
    log_prob_spam += math.log(spam_prob)
    log_prob_non += math.log(normal_prob)

# Add the log class priors computed earlier.
sum_log_prob_spam = log_prob_spam + logspam
sum_log_prob_non = log_prob_non + lognon

# Back to probability space so the two scores can be normalised to sum to 1.
pnormal = math.exp(sum_log_prob_non)
pspam = math.exp(sum_log_prob_spam)


print("정상확률 : {}".format(pnormal / (pnormal + pspam)))
print("스팸확률 : {}".format(pspam / (pnormal + pspam)))

정상확률 : 0.8749999999999999
스팸확률 : 0.12500000000000008


In [80]:
sum_log_prob_spam ,sum_log_prob_non 

(-3.226843994517378, -5.172754143572691)

# 선생님의 방식

### Naive Bayes Classifier

In [83]:
# Toy corpus of [text, label] pairs; label 1 marks spam, label 0 marks normal mail.
training_set = [
    ['me free lottery', 1],
    ['free get free you', 1],
    ['you free scholarship', 0],
    ['free to contact me', 0],
    ['you won award', 0],
    ['you ticket lottery', 1],
]

In [95]:
from collections import defaultdict

# Per-token frequency table; the label (0 = normal, 1 = spam) is used directly
# as the slot index, so each entry reads [normal_count, spam_count].
wordfreq = defaultdict(lambda: [0, 0])

for text, label in training_set:
    for token in text.split():
        wordfreq[token][label] += 1

print(wordfreq)

# Token totals per class, obtained by summing each slot over the vocabulary.
doccnt0 = sum(counts[0] for counts in wordfreq.values())
doccnt1 = sum(counts[1] for counts in wordfreq.values())

print('doccnt0 : {}'.format(doccnt0))
print('doccnt1 : {}'.format(doccnt1))

defaultdict(<function <lambda> at 0x000001FB9D9786A8>, {'me': [1, 1], 'free': [2, 3], 'lottery': [0, 2], 'get': [0, 1], 'you': [2, 2], 'scholarship': [1, 0], 'to': [1, 0], 'contact': [1, 0], 'won': [1, 0], 'award': [1, 0], 'ticket': [0, 1]})
doccnt0 : 10
doccnt1 : 10


In [96]:
## defaultdict demo: a missing key is created from the factory on first access
test_dict = defaultdict(lambda: 0)
test_dict['free'] = test_dict['free'] + 1
test_dict

defaultdict(<function __main__.<lambda>()>, {'free': 1})

### Training : 토큰별 조건부 확률 계산

In [105]:
k = 0.5  # additive (Laplace) smoothing constant

# The smoothed denominators do not depend on the token, so compute them once.
denom0 = 2 * k + doccnt0
denom1 = 2 * k + doccnt1

# Same layout as wordfreq: slot 0 = P(token | normal), slot 1 = P(token | spam).
wordprobs = defaultdict(lambda: [0, 0])

for token, counts in wordfreq.items():
    wordprobs[token] = [(counts[0] + k) / denom0, (counts[1] + k) / denom1]

wordprobs


defaultdict(<function __main__.<lambda>()>,
            {'me': [0.13636363636363635, 0.13636363636363635],
             'free': [0.22727272727272727, 0.3181818181818182],
             'lottery': [0.045454545454545456, 0.22727272727272727],
             'get': [0.045454545454545456, 0.13636363636363635],
             'you': [0.22727272727272727, 0.22727272727272727],
             'scholarship': [0.13636363636363635, 0.045454545454545456],
             'to': [0.13636363636363635, 0.045454545454545456],
             'contact': [0.13636363636363635, 0.045454545454545456],
             'won': [0.13636363636363635, 0.045454545454545456],
             'award': [0.13636363636363635, 0.045454545454545456],
             'ticket': [0.045454545454545456, 0.13636363636363635]})

### Classify : 신규 텍스트가 주어졌을 때 확률 계산


In [107]:
import math

doc = "free lottery"
tokens = doc.split()

log_prob1 = log_prob0 = 0.0

# Walk the vocabulary rather than the document: a word contributes its
# log-likelihood once if it occurs in the document, however often it is
# repeated, and document tokens outside the vocabulary are ignored.
for word, (prob0, prob1) in wordprobs.items():
    if word in tokens:
        log_prob0 += math.log(prob0)
        log_prob1 += math.log(prob1)

# Class priors are the share of training *documents* per label.
# doccnt0 / doccnt1 are token totals and equal the document share only when
# both classes contain the same number of tokens.
num_docs = len(training_set)
num_docs1 = sum(1 for _, label in training_set if label == 1)
num_docs0 = num_docs - num_docs1

log_prob0 += math.log(num_docs0 / num_docs)
log_prob1 += math.log(num_docs1 / num_docs)

# Back to probability space so the two scores can be normalised.
prob0 = math.exp(log_prob0)
prob1 = math.exp(log_prob1)

print(prob0, prob1)

print("정상확률 : {}".format(prob0 / (prob0 + prob1)*100))
print("스팸확률 : {}".format(prob1 / (prob0 + prob1)*100))

0.00516528925619835 0.03615702479338842
정상확률 : 12.500000000000009
스팸확률 : 87.49999999999999


# 근데 사실.. scikit-learn이 다 제공해줘

In [110]:
from sklearn.datasets import fetch_20newsgroups



In [114]:
# Fetch the 'train' subset of the 20 newsgroups dataset with shuffling enabled.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)

In [116]:
# Inspect the dataset: the list of category names, then the raw text of the
# first document and the target value stored for it.
print(twenty_train.target_names)
print(twenty_train.data[0])
print(twenty_train.target[0])


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have o

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer