##  베이즈 추정(Bayesian Estimation)
추론 대상의 사전 확률과 추가적인 정보를 기반으로 해당 대상의 사후 확률을 추론하는 통계적 방법

## $p(\theta|X) = \frac{p(\theta, X)}{p(X)} = \frac{p(X|\theta)p(\theta)}{p(X)}$


In [9]:
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

In [3]:
# Toy "play outside?" dataset: 14 observations, one entry per day.
# Weather outlook for each day.
weather = [
    'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
    'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
]
# Temperature category for each day.
temp = [
    'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
]
# Target: whether play happened on that day.
play = [
    'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
    'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
]

###  데이터 전처리 (인코딩)

In [6]:
# Create the label encoder.
le = preprocessing.LabelEncoder()

# Convert each weather category string into an integer code.
weather_encoded = le.fit_transform(weather)
print(weather_encoded)

[2 2 0 1 1 1 0 2 2 1 2 0 0 1]


In [11]:
# Encode the temperature feature and the target with the same encoder.
# NOTE: every fit_transform call refits `le`, so after this cell it only
# holds the mapping for `play`.
temp_encoded = le.fit_transform(temp)
label = le.fit_transform(play)

print('temp: ', temp_encoded)
print('play: ', label)

temp:  [1 1 1 2 0 0 0 2 0 2 2 2 1 2]
play:  [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [8]:
# Pair up the two encoded features as (weather, temp) tuples, one per sample.
features = list(zip(weather_encoded, temp_encoded))
print(features)

[(2, 1), (2, 1), (0, 1), (1, 2), (1, 0), (1, 0), (0, 0), (2, 2), (2, 0), (1, 2), (2, 2), (0, 2), (0, 1), (1, 2)]


In [13]:
# Train a Gaussian naive Bayes classifier on the encoded samples.
# NOTE(review): the features are integer category codes, which GaussianNB
# treats as continuous values.
model = GaussianNB().fit(features, label)

# Query sample [0, 2]: weather code 0 ('Overcast') and temp code 2 ('Mild'),
# per the encodings printed above.
predicted = model.predict([[0, 2]])
print("Predicted Value: ", predicted)

Predicted Value:  [1]


### Label이 여러 개인 나이브 베이즈

In [26]:
from sklearn import datasets

# Load the wine dataset from sklearn.datasets.
wine = datasets.load_wine()

In [33]:
# Show the feature names and the class (target) names of the dataset.
print("Features: ", wine.feature_names)
print("Label: ", wine.target_names)

Features:  ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
Label:  ['class_0' 'class_1' 'class_2']


In [34]:
# Shape of the feature matrix: (number of samples, number of features).
wine.data.shape

(178, 13)

In [35]:
# Peek at the first five rows of the feature matrix.
wine.data[0:5]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
        3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
        1.185e+03],
       [1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02, 3.850e+00,
        3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
        1.480e+03],
       [1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02, 2.800e+00,
        2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
        7.350e+02]])

In [36]:
# Integer class label of every sample.
wine.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [37]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Hold out 30% of the samples for evaluation; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    wine.data,
    wine.target,
    test_size=0.3,
    random_state=109,
)

# Fit Gaussian naive Bayes on the training split, then predict the test split.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Fraction of test samples whose predicted class matches the true class.
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.9074074074074074


### 나이브 베이즈의 장단점

### 장점

- 간단하고 빠르며 정확하다
- Computation cost가 작다
- 큰 데이터 셋에 적합
- 연속형보다 이산형 데이터에서 성능이 좋다
- Multiple class 예측을 위해서도 사용할 수 있다

### 단점

- feature 간에 독립성이 있어야 한다 (실제 데이터에서 모든 feature가 서로 독립인 경우는 드물다). 이는 매우 치명적인 단점이다.
