# [4.5] 나이브 베이즈 (Naïve Bayes) 알고리즘

- 지도학습 알고리즘 > '분류'에 이용; 확률 기반 알고리즘

## 예제2) 베르누이 나이브 베이즈를 활용한 스팸 메일 분류

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

### Data 가져오기

In [2]:
# Labelled training titles as (title, is_spam) pairs: three spam, three not.
_labelled_titles = (
    ('free game only today', True),
    ('cheapest flight deal', True),
    ('limited time offer only today only today', True),
    ('today meeting schedule', False),
    ('your flight schedule attached', False),
    ('your credit card statement', False),
)

# One record per title, using the column names the later cells read.
email_list = [
    {'email title': title, 'spam': is_spam}
    for title, is_spam in _labelled_titles
]

df = pd.DataFrame(email_list)
df

Unnamed: 0,email title,spam
0,free game only today,True
1,cheapest flight deal,True
2,limited time offer only today only today,True
3,today meeting schedule,False
4,your flight schedule attached,False
5,your credit card statement,False


### Data 가공하기


베르누이 나이브 베이즈 알고리즘은 '0' 또는 '1'로 Data의 특징이 표현됐을 때 사용하는 모델이다.


현재 Data에는 스팸 여부가 True/False의 bool 형식으로 저장되어 있다. 이를 True는 1로, False는 0으로 매핑한 뒤 'label'이라는 새 Column에 저장하였다.

In [3]:
# Encode the boolean spam flag as an integer class label: True -> 1, False -> 0.
label_by_flag = {True: 1, False: 0}
df['label'] = df['spam'].map(label_by_flag)

In [4]:
# Split the frame into model input (titles) and target (integer labels).
df_x, df_y = df['email title'], df['label']

In [5]:
# binary=True turns each feature into a 0/1 "word present" flag rather than a
# count, so a title that repeats a word still gets 1 for it.
cv = CountVectorizer(binary = True)
# Learn the vocabulary from the training titles and encode them in one step.
x_traincv = cv.fit_transform(df_x)

In [6]:
# Convert the encoded titles to a plain NumPy array so they can be inspected.
encoded_input = x_traincv.toarray()
encoded_input

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]])

In [7]:
# Word -> column index mapping learned by the vectorizer during fitting.
cv.vocabulary_

{'free': 6,
 'game': 7,
 'only': 11,
 'today': 15,
 'cheapest': 2,
 'flight': 5,
 'deal': 4,
 'limited': 8,
 'time': 14,
 'offer': 10,
 'meeting': 9,
 'schedule': 12,
 'your': 16,
 'attached': 0,
 'credit': 3,
 'card': 1,
 'statement': 13}

In [8]:
# Map the first training row back to the words it contains.
# inverse_transform expects a 2-D (n_samples, n_features) input; current
# scikit-learn rejects a bare 1-D row, so pass a one-row slice instead.
cv.inverse_transform(encoded_input[:1])

[array(['free', 'game', 'only', 'today'], dtype='<U9')]

In [9]:
# List the vocabulary in column order.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it. tolist()
# keeps the result a plain list of str, as before.
(cv.get_feature_names_out().tolist()
 if hasattr(cv, 'get_feature_names_out')
 else cv.get_feature_names())

['attached',
 'card',
 'cheapest',
 'credit',
 'deal',
 'flight',
 'free',
 'game',
 'limited',
 'meeting',
 'offer',
 'only',
 'schedule',
 'statement',
 'time',
 'today',
 'your']

### 모델 학습하기

In [10]:
bnb = BernoulliNB()
# The labels come from mapping True/False to 1/0, so they are already
# integers; the cast only makes the expected dtype explicit.
y_train = df_y.astype('int')
# Train on the binary word-presence features.
bnb.fit(x_traincv, y_train)

BernoulliNB()

### 테스트 Data 가공하기

In [11]:
# Held-out titles as (title, is_spam) pairs, in the same layout as the
# training data.
_labelled_test_titles = (
    ('free flight offer', True),
    ('hey traveler free flight deal', True),
    ('limited free game offer', True),
    ('today flight schedule', False),
    ('your credit card attached', False),
    ('free credit card offer only today', False),
)

test_email_list = [
    {'email title': title, 'spam': is_spam}
    for title, is_spam in _labelled_test_titles
]

test_df = pd.DataFrame(test_email_list)
# Same True -> 1 / False -> 0 encoding that was applied to the training labels.
test_df['label'] = test_df['spam'].map({True: 1, False: 0})

test_x, test_y = test_df['email title'], test_df['label']

# Encode the test titles with the already-fitted vectorizer (transform, not
# fit_transform) so they share the training vocabulary and column order.
x_testcv = cv.transform(test_x)

### 테스트

In [12]:
# Predict a class label for each encoded test title.
predictions = bnb.predict(x_testcv)

In [13]:
# Fraction of test titles whose predicted label equals the true label.
accuracy_score(test_y, predictions)

0.8333333333333334

## 예제3) 다항분포 나이브 베이즈를 활용한 영화 리뷰 분류

In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

### Data 가공하기

In [15]:
# Training reviews grouped by sentiment; the positives come first so the row
# order of the resulting frame is positive x5 followed by negative x5.
_positive_reviews = [
    'this is great great movie. I will watch again',
    'I like this movie',
    'amazing movie in this year',
    'cool my boyfriend also said the movie is cool',
    'awesome of the awesome movie ever',
]
_negative_reviews = [
    'shame I wasted money and time',
    'regret on this move. I will never never what movie from this director',
    'I do not like this movie',
    'I do not like actors in this movie',
    'boring boring sleeping movie',
]

review_list = (
    [{'movie_review': text, 'type': 'positive'} for text in _positive_reviews]
    + [{'movie_review': text, 'type': 'negative'} for text in _negative_reviews]
)

df = pd.DataFrame(review_list)
df

Unnamed: 0,movie_review,type
0,this is great great movie. I will watch again,positive
1,I like this movie,positive
2,amazing movie in this year,positive
3,cool my boyfriend also said the movie is cool,positive
4,awesome of the awesome movie ever,positive
5,shame I wasted money and time,negative
6,regret on this move. I will never never what m...,negative
7,I do not like this movie,negative
8,I do not like actors in this movie,negative
9,boring boring sleeping movie,negative


In [16]:
# Encode sentiment as an integer class label: positive -> 1, negative -> 0.
label_by_sentiment = {'positive': 1, 'negative': 0}
df['label'] = df['type'].map(label_by_sentiment)

In [17]:
# Split the frame into model input (review text) and target (integer labels).
df_x, df_y = df['movie_review'], df['label']

In [18]:
# Default CountVectorizer (no binary=True): features are per-word counts, so a
# repeated word yields a value greater than 1.
cv = CountVectorizer()
# Learn the vocabulary from the training reviews and encode them in one step.
x_traincv = cv.fit_transform(df_x)

# Convert the encoded reviews to a plain NumPy array so they can be inspected.
encoded_input = x_traincv.toarray()
encoded_input

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2,
        0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
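        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])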

In [19]:
# Word -> column index mapping learned by the vectorizer during fitting.
cv.vocabulary_

{'this': 30,
 'is': 15,
 'great': 13,
 'movie': 19,
 'will': 35,
 'watch': 33,
 'again': 1,
 'like': 16,
 'amazing': 3,
 'in': 14,
 'year': 36,
 'cool': 8,
 'my': 20,
 'boyfriend': 7,
 'also': 2,
 'said': 26,
 'the': 29,
 'awesome': 5,
 'of': 23,
 'ever': 11,
 'shame': 27,
 'wasted': 32,
 'money': 17,
 'and': 4,
 'time': 31,
 'regret': 25,
 'on': 24,
 'move': 18,
 'never': 21,
 'what': 34,
 'from': 12,
 'director': 9,
 'do': 10,
 'not': 22,
 'actors': 0,
 'boring': 6,
 'sleeping': 28}

In [20]:
# Map the first training review back to the words it contains.
# inverse_transform expects a 2-D (n_samples, n_features) input; current
# scikit-learn rejects a bare 1-D row, so pass a one-row slice instead.
cv.inverse_transform(encoded_input[:1])

[array(['again', 'great', 'is', 'movie', 'this', 'watch', 'will'],
       dtype='<U9')]

In [21]:
# List the vocabulary in column order.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on releases that predate it. tolist()
# keeps the result a plain list of str, as before.
(cv.get_feature_names_out().tolist()
 if hasattr(cv, 'get_feature_names_out')
 else cv.get_feature_names())

['actors',
 'again',
 'also',
 'amazing',
 'and',
 'awesome',
 'boring',
 'boyfriend',
 'cool',
 'director',
 'do',
 'ever',
 'from',
 'great',
 'in',
 'is',
 'like',
 'money',
 'move',
 'movie',
 'my',
 'never',
 'not',
 'of',
 'on',
 'regret',
 'said',
 'shame',
 'sleeping',
 'the',
 'this',
 'time',
 'wasted',
 'watch',
 'what',
 'will',
 'year']

### 모델 학습하기

In [22]:
mnb = MultinomialNB()
# Cast the labels to int explicitly before fitting.
y_train = df_y.astype('int')
# Train on the word-count features.
mnb.fit(x_traincv, y_train)

MultinomialNB()

### 테스트 Data 가공하기

In [23]:
# Held-out reviews grouped by sentiment; the positives come first so the row
# order of the resulting frame is positive x5 followed by negative x5.
_positive_test_reviews = [
    'great great great movie ever',
    'I like this amazing movie',
    'my boyfriend said great movie ever',
    'cool cool cool',
    'awesome boyfriend said cool movie ever',
]
_negative_test_reviews = [
    'shame shame shame',
    'awesome director shame movie boring movie',
    'do not like this movie',
    'I do not like this boring movie',
    'aweful terrible boring movie',
]

test_feedback_list = (
    [{'movie_review': text, 'type': 'positive'} for text in _positive_test_reviews]
    + [{'movie_review': text, 'type': 'negative'} for text in _negative_test_reviews]
)

test_df = pd.DataFrame(test_feedback_list)
# Same positive -> 1 / negative -> 0 encoding applied to the training labels.
test_df['label'] = test_df['type'].map({'positive': 1, 'negative': 0})

test_x, test_y = test_df['movie_review'], test_df['label']

### 테스트

In [24]:
# Encode the test reviews with the already-fitted vectorizer (transform, not
# fit_transform) so they share the training vocabulary and column order.
x_testcv = cv.transform(test_x)
# Predict a class label for each encoded test review.
predictions = mnb.predict(x_testcv)

In [25]:
# Fraction of test reviews whose predicted label equals the true label.
accuracy_score(test_y, predictions)

1.0