In [1]:
# 직접 구현

import pandas as pd
import numpy as np

In [2]:
# Load the toy mail corpus; later cells read its 'mail' and 'label' columns
# (label 1 is treated as positive, anything else as negative).
csv_path = './naivebayes_example.csv'
naive = pd.read_csv(csv_path)
naive

Unnamed: 0,mail,label
0,i love you,1
1,love happy weekend,1
2,bore work job,0
3,i hate you,0
4,bore weekend,0
5,happy together,1


In [3]:
# Convert the DataFrame to a NumPy array so that each row can be unpacked
# as a (mail, label) pair when iterating.
training_set = naive.to_numpy()
print(training_set)

[['i love you' 1]
 ['love happy weekend' 1]
 ['bore work job' 0]
 ['i hate you' 0]
 ['bore weekend' 0]
 ['happy together' 1]]


In [4]:
# Count how often each token occurs in positive and in negative mails.

from collections import defaultdict

# token -> [count in positive mails, count in negative mails]
wordfreq = defaultdict(lambda: [0,0])
for mail_text, label in training_set:
    # Slot 0 collects positive (label == 1) counts, slot 1 everything else.
    slot = 0 if label == 1 else 1
    for token in mail_text.split():
        wordfreq[token][slot] += 1

wordfreq

defaultdict(<function __main__.<lambda>()>,
            {'i': [1, 1],
             'love': [2, 0],
             'you': [1, 1],
             'happy': [2, 0],
             'weekend': [1, 1],
             'bore': [0, 2],
             'work': [0, 1],
             'job': [0, 1],
             'hate': [0, 1],
             'together': [1, 0]})

In [5]:
# Split the per-token counts into one list per class (same token order as wordfreq).

positive_tk = [int(pos_count) for pos_count, _neg_count in wordfreq.values()]
negative_tk = [int(neg_count) for _pos_count, neg_count in wordfreq.values()]
print(positive_tk)
print(negative_tk)

[1, 2, 1, 2, 1, 0, 0, 0, 0, 1]
[1, 0, 1, 0, 1, 2, 1, 1, 1, 0]


In [6]:
# Total number of token occurrences observed in each class.
positive_tk_nm, negative_tk_nm = sum(positive_tk), sum(negative_tk)

print(positive_tk_nm, negative_tk_nm)

8 8


In [7]:
# Per-token conditional probabilities with add-one smoothing:
# (count in class + 1) / (total tokens in class + vocabulary size).

vocab_size = len(wordfreq)
pos_denominator = positive_tk_nm + vocab_size
neg_denominator = negative_tk_nm + vocab_size

# token -> [P(token | positive), P(token | negative)]
wordprobs = defaultdict(lambda:[0,0])
for token, (pos_count, neg_count) in wordfreq.items():
    wordprobs[token] = [
        (pos_count + 1) / pos_denominator,
        (neg_count + 1) / neg_denominator,
    ]
wordprobs

defaultdict(<function __main__.<lambda>()>,
            {'i': [0.1111111111111111, 0.1111111111111111],
             'love': [0.16666666666666666, 0.05555555555555555],
             'you': [0.1111111111111111, 0.1111111111111111],
             'happy': [0.16666666666666666, 0.05555555555555555],
             'weekend': [0.1111111111111111, 0.1111111111111111],
             'bore': [0.05555555555555555, 0.16666666666666666],
             'work': [0.05555555555555555, 0.1111111111111111],
             'job': [0.05555555555555555, 0.1111111111111111],
             'hate': [0.05555555555555555, 0.1111111111111111],
             'together': [0.1111111111111111, 0.05555555555555555]})

In [8]:
# Tokenize a new, unseen mail whose class probabilities are computed next.

import math

doc = 'happy weekend'
# Whitespace tokenization, matching how the training mails were split.
tokens = doc.split()
tokens

['happy', 'weekend']

In [9]:
# Accumulate the scores in log space (sum of logs instead of a product of
# probabilities); both class scores start at 0.
log_prob1 = log_prob0 = 0.0

# Walk the whole vocabulary and add the log-likelihood of every word that
# appears in the new mail. Index 0 is the positive class, index 1 the negative.
for word, (prob1,prob0) in wordprobs.items():
    if word in tokens:
        log_prob1 +=math.log(prob1)
        log_prob0 +=math.log(prob0)

# Add the log class priors. Both priors must be divided by the same total
# (positive + negative); the previous negative prior divided by
# negative + negative, which is always 0.5 regardless of the data.
# NOTE(review): these priors are token-count shares (8/16 here). The labels give
# a document-count share of 3/6, which happens to be equal; confirm which is intended.
total_tk_nm = positive_tk_nm + negative_tk_nm
log_prob1 += math.log(positive_tk_nm/total_tk_nm)
log_prob0 += math.log(negative_tk_nm/total_tk_nm)

# Back to (unnormalized) probabilities.
prob1 = math.exp(log_prob1)
prob0 = math.exp(log_prob0)

print('prob1:', prob1)
print('prob0:', prob0)

# Normalize the two scores so they sum to 100%.
print('happy와 weekend가 새로운 메일에 포함되어 있을 경우, 긍정확룰과 부정확률')
print('긍정확률: {}%'.format(prob1/(prob1+prob0)*100))
print('부정확률: {}%'.format(prob0/(prob1+prob0)*100))

prob1: 0.009259259259259257
prob0: 0.0030864197530864183
happy와 weekend가 새로운 메일에 포함되어 있을 경우, 긍정확룰과 부정확률
긍정확률: 75.00000000000001%
부정확률: 24.999999999999996%


In [10]:
# scikit-learn version: pull the raw mails and their labels out of the DataFrame.

X_train, Y_train = (list(naive[column]) for column in ('mail', 'label'))
print(X_train)
print(Y_train)

['i love you', 'love happy weekend', 'bore work job', 'i hate you', 'bore weekend', 'happy together']
[1, 1, 0, 0, 0, 1]


In [11]:
!pip install sklearn



In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# CountVectorizer's default token_pattern only keeps tokens of two or more
# word characters, which would silently drop "i" from the vocabulary. The
# hand-rolled version above uses str.split() and does count "i", so keep
# one-character tokens here too; otherwise the two models are built on
# different vocabularies.
cv = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
# Learn the vocabulary and build the document-term count matrix.
X_train_counts = cv.fit_transform(X_train)
# Create the multinomial naive Bayes classifier and fit it on the counts.
clf = MultinomialNB().fit(X_train_counts, Y_train)

In [18]:
# Vectorize the new mail once with the fitted vocabulary and reuse it below.
new_mail_counts = cv.transform(['happy weekend'])
# Predicted class label.
print(clf.predict(new_mail_counts))
# Class probabilities; columns follow the order of clf.classes_.
print(clf.predict_proba(new_mail_counts))

[1]
[[0.25 0.75]]
