# 붓꽃 분류(Naive Bayes)
- 통계적 분류기
- 주어진 데이터가 특정 클래스에 속하는지를 조건부 확률을 통해서 예측
- 텍스트 데이터처럼 희소한 고차원 데이터에서도 높은 정확도와 빠른 속도 제공
- 적용 분야: 스팸 메일 분류, 문서 주제 분류, 네트워크 침입 탐지

## 패키지 로딩

In [1]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB # Naive Bayes variant for continuous-valued features
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

## 데이터 로딩 및 분할

In [5]:
# return_X_y=True hands back the feature matrix and the target vector as a pair.
x, y = load_iris(return_X_y=True)
# Hold out 20% of the samples for testing. stratify=y keeps the class ratio
# the same in both splits; random_state=10 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

## 모델 생성

In [6]:
# Fit a Gaussian Naive Bayes classifier on the training split.
model = GaussianNB()
model.fit(x_train, y_train)

## 모델 평가

In [7]:
# Predict the held-out samples and report the fraction classified correctly.
y_hat = model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_hat)
print(f'정확도:{test_accuracy:.3f}')

정확도:1.000


In [8]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.
print(confusion_matrix(y_test, y_hat))

[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]


# [문제] 독버섯 분류하기

## 패키지 로딩

In [78]:
# MultinomialNB models discrete, count-like features; it is applied below to the
# encoded categorical columns. (scikit-learn also provides CategoricalNB for
# label-encoded categorical features.)
from sklearn.naive_bayes import MultinomialNB # Naive Bayes variant for discrete features
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
import numpy as np
import pandas as pd

## 데이터 로딩

In [79]:
# Load the mushroom dataset; the path is relative to the current working directory.
df = pd.read_csv('./datasets/mushrooms.csv')
print(df.shape)
# NOTE(review): display() is not imported in this file; presumably it is
# supplied by the IPython/Jupyter session this notebook runs in.
display(df.head())

(8124, 23)


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,poisonous,convex,smooth,brown,yes,pungent,free,close,narrow,black,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,yes,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,yes,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,yes,pungent,free,close,narrow,brown,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses


In [80]:
# Show how many rows fall into each value of the target column 'type'.
print(df['type'].value_counts())

type
edible       4208
poisonous    3916
Name: count, dtype: int64


In [81]:
# Print a per-column summary (non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   type                      8124 non-null   object
 1   cap_shape                 8124 non-null   object
 2   cap_surface               8124 non-null   object
 3   cap_color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill_attachment           8124 non-null   object
 7   gill_spacing              8124 non-null   object
 8   gill_size                 8124 non-null   object
 9   gill_color                8124 non-null   object
 10  stalk_shape               8124 non-null   object
 11  stalk_root                8124 non-null   object
 12  stalk_surface_above_ring  8124 non-null   object
 13  stalk_surface_below_ring  8124 non-null   object
 14  stalk_color_above_ring  

## 데이터 분리 및 인코딩

### 라벨 인코딩

In [58]:
from sklearn.preprocessing import LabelEncoder


def _encode_column(col):
    """Return integer codes for one column, using a fresh LabelEncoder.

    DataFrame.apply passes each column in as a Series, one at a time.
    """
    return LabelEncoder().fit_transform(col)


# NOTE: this rebinds df -- including the 'type' target column -- to integer
# codes. The one-hot cell further down maps the string labels
# 'edible'/'poisonous', so it expects the un-encoded df: run only one of the
# two encoding paths.
df = df.apply(_encode_column)

### 원-핫 인코딩

In [88]:
# One-hot path. Expects df to still hold the raw string categories:
# get_dummies only expands object/string/category columns by default, and the
# label mapping below has no entries for integer codes (unmapped values -> NaN).
x = pd.get_dummies(df.drop(columns='type'))
# Binary target: edible -> 0, poisonous -> 1.
y = df['type'].map({'edible':0, 'poisonous':1})

In [89]:
# Check the size of the feature matrix after encoding.
print(x.shape)

(8124, 117)


In [84]:
# Preview the first rows of the encoded feature matrix.
display(x.head())

Unnamed: 0,cap_shape_bell,cap_shape_conical,cap_shape_convex,cap_shape_flat,cap_shape_knobbed,cap_shape_sunken,cap_surface_fibrous,cap_surface_grooves,cap_surface_scaly,cap_surface_smooth,...,population_scattered,population_several,population_solitary,habitat_grasses,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods
0,False,False,True,False,False,False,False,False,False,True,...,True,False,False,False,False,False,False,True,False,False
1,False,False,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,False,False,True,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [85]:
# Label-encoding path only: skip this cell when using the one-hot encoded
# features, because it rebinds x and y straight from df.
x = df.drop(columns='type')
y = df['type']

In [90]:
# 80/20 train/test split, stratified on the target so both splits keep the
# same class ratio; random_state=10 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

## 모델 생성

In [91]:
# Fit a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(x_train, y_train)

## 모델 평가

In [92]:
# Hard predictions for the held-out split, summarised as a confusion matrix
# (rows: true class, columns: predicted class).
y_hat = model.predict(x_test)
cm = confusion_matrix(y_test, y_hat)
print(cm)

print(f'정확도:{accuracy_score(y_test, y_hat):.3f}')
# ROC AUC needs a score rather than a hard label: take column 1 of
# predict_proba, the probability of the second class (label 1).
positive_scores = model.predict_proba(x_test)[:, 1]
print(f'AUC:{roc_auc_score(y_test, positive_scores):.3f}')

[[841   1]
 [ 74 709]]
정확도:0.954
AUC:0.997
