### 나이브 베이즈(Navie Bayes)
- 확률 기반 머신러닝 분류 알고리즘의 대표적
- 나이브 베이즈 분류 알고리즘은 데이터를 나이브(단순)하게 독립적인 사건으로 가정
- 이 독립적인 사건들을 베이즈 이론에 대입시켜 가장 높은 확률 레이블로 분류를 실행하는 알고리즘.

### 베이즈 이론

$P(A|B) = P(B|A) * P(A) \div P(B)$

P(A|B) : 어떤 사건 B가 일어났을때 사건 A가 일어날 확률   
P(B|A) : 어떤 사건 A가 일어났을때 사건 B가 일어날 확률  
P(A) : 어떤 사건 A가 일어날 확률

<img src="../data/naive.png" style = "width: 30%;">


<b>B사건이 일어났을때 A사건이 일어날 확률? </b>
- P(A|B) = P(B|A) * P(A) / P(B) 
- P(A|B) = 3 / (3 + 2) = 0.6

<b>A사건이 일어났을때 B사건이 일어날 확률? </b>
- P(B|A) = 3/ (7+3) = 0.3

0.3 * 10 / 5 = 0.6

In [101]:
import pandas as pd

In [102]:
pd.DataFrame([
    [0, 1, 3],
    [2, 2, 2],
],
index=['주문함', '주문안함'],
columns=['아침', '점심', '저녁']
)

Unnamed: 0,아침,점심,저녁
주문함,0,1,3
주문안함,2,2,2


# 치킨집에서 저녁에 손님이 주문을 할 때 맥주를 주문할 확률
p(주문|저녁) = P(저녁 | 주문) * P(주문) / P(저녁)


---
### 가우시안 나이브 베이즈를 이용한 붓꽃 분류

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [104]:
df = pd.read_csv('../data/iris.csv')
df.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [105]:
csv_data = df.iloc[:,:-1]
csv_label = df.iloc[:,-1]

In [106]:
train_data, test_data, train_target, test_target = train_test_split(
    csv_data,
    csv_label,
    random_state=42,
    test_size=0.15,
    stratify=csv_label
)

In [107]:
print(train_data.shape)
print(test_data.shape)

(127, 4)
(23, 4)


In [108]:
model = GaussianNB()

model.fit(train_data, train_target)


In [109]:
print(model.score(train_data, train_target), model.score(test_data, test_target))

0.968503937007874 0.9130434782608695


---
###  베르누이 나이브베이즈를 활용한 스팸 분류

In [110]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB


In [111]:
df = pd.read_csv('../data/email_train.csv')
df

Unnamed: 0,email title,spam
0,free game only today,True
1,cheapest flight deak,True
2,limited time offer only today only today,True
3,today meeting schedule,False
4,your flight schedule attached,False
5,your credit card statement,False


### spam의 값을 숫자로 변환

In [112]:
df['label'] =  df['spam'].map(lambda x: 1 if x else 0)
df


Unnamed: 0,email title,spam,label
0,free game only today,True,1
1,cheapest flight deak,True,1
2,limited time offer only today only today,True,1
3,today meeting schedule,False,0
4,your flight schedule attached,False,0
5,your credit card statement,False,0


In [113]:
df_x = df['email title']
df_y = df['label']

In [114]:
cv = CountVectorizer(binary=True)
x_traincv = cv.fit_transform(df_x)
x_traincv

<6x17 sparse matrix of type '<class 'numpy.int64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [115]:
encodded_input = x_traincv.toarray()
encodded_input

array([[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1]])

In [116]:
# 벡터로 인코딩된 이메일 제목에 어떤 단어가 있는지 궁금하다면
cv.inverse_transform(encodded_input[0].reshape(1, -1))

[array(['free', 'game', 'only', 'today'], dtype='<U9')]

In [117]:
cv.get_feature_names_out()

array(['attached', 'card', 'cheapest', 'credit', 'deak', 'flight', 'free',
       'game', 'limited', 'meeting', 'offer', 'only', 'schedule',
       'statement', 'time', 'today', 'your'], dtype=object)

### 베르누이 나이브 베이즈 분류


In [118]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
bnb = BernoulliNB()
y_train = df_y.astype(int)
bnb.fit(x_traincv, y_train)

In [120]:
test_df = pd.read_csv('../data/email_test.csv')
test_df

Unnamed: 0,email title,spam
0,free flight offer,True
1,hey traveler free flight deal,True
2,limited free game iffer,True
3,today flight schedule,False
4,your credit card attached,False
5,free credit card offer only today,False


In [121]:
test_df['label'] =  test_df['spam'].map(lambda x: 1 if x else 0)
test_df

Unnamed: 0,email title,spam,label
0,free flight offer,True,1
1,hey traveler free flight deal,True,1
2,limited free game iffer,True,1
3,today flight schedule,False,0
4,your credit card attached,False,0
5,free credit card offer only today,False,0


In [122]:
test_x = df['email title']
test_y = df['label']

In [123]:
test_cv = cv.transform(test_x)

In [124]:
incoded_test_input = test_cv.toarray()

In [125]:
from sklearn.metrics import accuracy_score


In [126]:
pred = bnb.predict(test_cv)

In [127]:
accuracy_score(test_y, pred)

1.0

---
### 다항분포 나이브베이즈 영화리뷰 감정 분류

In [128]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [129]:
df = pd.read_csv('../data/naive_movie.csv')
df.head()

Unnamed: 0,movie_review,type
0,this is great great movie. I will watch again,positive
1,I like this movie,positive
2,amazing movie in this year,positive
3,cool my boyfriend also said the movie is cool,positive
4,awesome of the awesome movie ever,positive


In [130]:
df['label'] = df['type'].map(lambda x: x == 'positive').astype(int)
df.head()

Unnamed: 0,movie_review,type,label
0,this is great great movie. I will watch again,positive,1
1,I like this movie,positive,1
2,amazing movie in this year,positive,1
3,cool my boyfriend also said the movie is cool,positive,1
4,awesome of the awesome movie ever,positive,1


In [131]:
df_x = df['movie_review']
df_y = df['label']

In [132]:
cv = CountVectorizer()
x_traincv = cv.fit_transform(df_x)
encodded_input = x_traincv.toarray()

In [133]:
encodded_input

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 2,
        0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0

In [134]:
cv.get_feature_names_out()

array(['actors', 'again', 'also', 'amazing', 'and', 'awesome', 'boring',
       'boyfriend', 'cool', 'director', 'do', 'ever', 'from', 'great',
       'in', 'is', 'like', 'money', 'move', 'movie', 'my', 'never', 'not',
       'of', 'on', 'regret', 'said', 'shame', 'sleeping', 'the', 'this',
       'time', 'wasted', 'watch', 'what', 'will', 'year'], dtype=object)

In [136]:
cv.inverse_transform(encodded_input[[0]])

[array(['again', 'great', 'is', 'movie', 'this', 'watch', 'will'],
       dtype='<U9')]

### 다항분포 나이브베이즈 분류
: 다항분포 나이브베이즈로 영화 리뷰를 긍정적 평가인지, 부정적 평가인지 분류

In [137]:
mnb = MultinomialNB()
y_train = df_y.astype(int)
mnb.fit(x_traincv, y_train)

In [142]:
test_df = pd.read_csv('../data/naive_movie_test.csv')
test_df['label'] = test_df['type'].map(lambda x : x == 'positive').astype(int)
test_x = test_df['movie_review']
test_y = test_df['label']

In [144]:
x_testcv = cv.transform(test_x)
pred = mnb.predict(x_testcv)

In [145]:
accuracy_score(test_y, pred)

1.0