# Naive Bayes

$P(스팸메일|광고) = \frac{P(광고|스팸메일)  P(스팸메일)}{       P(광고)           }$

$P(B|A) = \frac{P(A|B)P(B)}{P(A)}$

In [4]:
import math, sys
from konlpy.tag import Okt, Mecab

In [81]:
class BayesianFilter22:
    def __init__(self):
        self.words = set()
        self.word_dict = {}
        self.category_dict ={}
    
    def split(self, text):
        results = []
        mec = Mecab()
        malist = mec.pos(text)
#         okt = Okt()
#         malist = okt.pos(text, norm=True, stem= True)
#         print("malist: \n", malist)
        for word in malist:
            if not word[1] in ['Josa', "Eomi", "Punctuation"]:
                results.append(word[0])
#         print("result: \n", results)
        return results
    
    def inc_word(self, word, category):
        if not category in self.word_dict:
            self.word_dict[category] ={}
        if not word in self.word_dict[category]:
            self.word_dict[category][word] = 0 
        self.word_dict[category][word] += 1
        self.words.add(word)
    
    def inc_category(self, category):
        if not category in self.category_dict:
            self.category_dict[category] = 0
        self.category_dict[category] += 1
    
    def fit(self, text, category):
        word_list = self.split(text)
        for word in word_list:
            self.inc_word(word, category)
        self.inc_category(category)
    
    def score(self, words, category):
        score = math.log(self.category_prob(category))
        for word in words:
            score += math.log(self.word_prob(word, category))
        return score
   
    def predict(self, text):
        best_category = None
        max_score = -sys.maxsize
        words = self.split(text)
        score_list = []
        for category in self.category_dict.keys():
            score = self.score(words, category)
            score_list.append((category, score))
            if score > max_score:
                max_score = score
                best_category = category
        return best_category, score_list
        
    def get_word_count(self, word, category):
        if word in self.word_dict[category]:
            print(self.word_dict)
            return self.word_dict[category][word]
        else:
            return 0
    
    def category_prob(self, category):
        sum_categories = sum(self.category_dict.values())
        category_v = self.category_dict[category]
        return category_v / sum_categories
    
    def word_prob(self, word, category):
        n = self.get_word_count(word, category) + 1
        d = sum(self.word_dict[category].values()) + len(self.words)
        return n / d

In [82]:
bf = BayesianFilter22()

In [83]:
bf.fit("파격 세일 - 오늘까지만 30% 할인", "광고") 
bf.fit("쿠폰 선물 & 무료 배송", "광고")
bf.fit("백화점 세일", "광고")
bf.fit("봄과 함께 찾아온 따뜻한 신제품 소식", "광고") 
bf.fit("인기 제품 기간 한정 세일", "광고")
bf.fit("오늘 일정 확인", "중요")
bf.fit("프로젝트 진행 상황 보고","중요")
bf.fit("계약 잘 부탁드립니다","중요")
bf.fit("회의 일정이 등록되었습니다.","중요")
bf.fit("오늘 일정이 없습니다.","중요")
# 예측을 한다.
pre, scorelist = bf.predict("인기") 
print("결과 =", pre)
print(scorelist)

{'광고': {'세일': 3, '따뜻': 1, '한': 1, '선물': 1, '만': 1, '소식': 1, '%': 1, '기간': 1, '무료': 1, '백화점': 1, '한정': 1, '쿠폰': 1, '과': 1, '&': 1, '함께': 1, '30': 1, '-': 1, '파격': 1, '인기': 1, '배송': 1, '까지': 1, '봄': 1, '신': 1, '오늘': 1, '할인': 1, '찾아온': 1, '제품': 2}, '중요': {'계약': 1, '회의': 1, '고': 1, '등록': 1, '프로젝트': 1, '부탁드립니다': 1, '진행': 1, '잘': 1, '상황': 1, '이': 2, '보': 1, '일정': 3, '었': 1, '오늘': 2, '습니다': 2, '없': 1, '되': 1, '.': 2, '확인': 1}}
결과 = 광고
[('광고', -4.31748811353631), ('중요', -4.941642422609305)]


In [77]:
mec = Mecab()
malist = mec.pos("아버지가방에들어가신다")
malist

[('아버지', 'NNG'),
 ('가', 'JKS'),
 ('방', 'NNG'),
 ('에', 'JKB'),
 ('들어가', 'VV'),
 ('신다', 'EP+EC')]

In [52]:
-sys.maxsize

-9223372036854775807

### 다층 퍼셉트론

In [84]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from sklearn import metrics
import json
import numpy as np
import logging
logging.disable(logging.WARNING)

In [85]:
max_words = 56681
nb_classes =  9

In [86]:
def build_model():
    model = Sequential()
    model.add(Dense(512, input_shape=(max_words,)))
    model.add(Activation("relu"))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation("softmax"))
    model.compile(loss="categorical_crossentropy", optimizer= 'adam', metrics=['accuracy'])
    return model

In [101]:
data = json.load(open('/Users/lesson6_mac/AtomData/python/신문파일/data.json'))
X = np.array(data['X'])
Y = np.array(data['Y'])


In [102]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y)
Y_train = np_utils.to_categorical(Y_train, nb_classes)
print(len(X_train),len(Y_train))

6314 6314


In [103]:
model = KerasClassifier(build_fn=build_model, epochs=20, batch_size=64)
model.fit(X_train, Y_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1c309c5e10>

In [104]:
y = model.predict(X_test)
ac_score = metrics.accuracy_score(Y_test, y)
cl_report = metrics.classification_report(Y_test, y)
print("정답률 = ", ac_score)
print("리포트 = ", cl_report)

정답률 =  0.8907363420427553
리포트 =                precision    recall  f1-score   support

           0       0.96      0.93      0.94       338
           1       0.85      0.89      0.87       355
           2       0.88      0.90      0.89       342
           3       0.90      0.84      0.87       362
           4       0.92      0.89      0.90       355
           5       0.85      0.90      0.87       353

    accuracy                           0.89      2105
   macro avg       0.89      0.89      0.89      2105
weighted avg       0.89      0.89      0.89      2105

