# 붓꽃 분류
- Naive Bayes Classifier
  - 텍스트 데이터처럼 희소하고 차원이 높은 데이터에서 높은 정확도와 빠른 속도를 제공
  - 적용분야: 스팸 메일 분류, 문서(주제) 분류, 컴퓨터 네트워크 침입자 분류 등

## 패키지 로딩

In [1]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

## 데이터 로딩 및 분할

In [2]:
# Load the iris features and labels, then hold out 30% of the samples for
# testing. Stratifying on y keeps the class proportions equal in both splits.
x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
    stratify=y,
)

## 모델 생성

In [3]:
# Fit a Gaussian naive Bayes classifier on the training split.
model = GaussianNB()  # naive Bayes variant used when the explanatory variables are continuous
model.fit(x_train, y_train)

## 모델 평가

In [4]:
# Predict the held-out samples and print the confusion matrix
# (rows: actual class, columns: predicted class).
y_hat = model.predict(x_test)
con_mat = confusion_matrix(y_true=y_test, y_pred=y_hat)
print(con_mat)

[[15  0  0]
 [ 0 15  0]
 [ 0  0 15]]


# [실습] 독버섯 분류
- 설명변수가 범주형 자료
- 종속변수: type

In [73]:
# Load the mushroom dataset
import pandas as pd
import numpy as np

df = pd.read_csv('./dataset/mushrooms.csv')

# Check the dimensions and preview the first 10 rows.
# display() is not defined in this file; it is the notebook's rich-output helper.
print(df.shape)
display(df.head(10))

(8124, 23)


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,poisonous,convex,smooth,brown,yes,pungent,free,close,narrow,black,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,yes,almond,free,close,broad,black,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,yes,anise,free,close,broad,brown,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,yes,pungent,free,close,narrow,brown,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,tapering,equal,smooth,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses
5,edible,convex,scaly,yellow,yes,almond,free,close,broad,brown,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,black,numerous,grasses
6,edible,bell,smooth,white,yes,almond,free,close,broad,gray,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,black,numerous,meadows
7,edible,bell,scaly,white,yes,anise,free,close,broad,brown,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,brown,scattered,meadows
8,poisonous,convex,scaly,white,yes,pungent,free,close,narrow,pink,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,black,several,grasses
9,edible,bell,smooth,yellow,yes,almond,free,close,broad,gray,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,black,scattered,meadows


In [32]:
# One-hot encode every explanatory variable.
# The dependent variable ('type') is excluded because it must remain a single
# column. Predictors are selected by name rather than by position, so the
# target is never encoded by accident if 'type' is not the first column.
df_encoding = pd.get_dummies(df, columns=df.columns.drop('type'))
df_encoding

Unnamed: 0,type,cap_shape_bell,cap_shape_conical,cap_shape_convex,cap_shape_flat,cap_shape_knobbed,cap_shape_sunken,cap_surface_fibrous,cap_surface_grooves,cap_surface_scaly,cap_surface_smooth,cap_color_brown,cap_color_buff,cap_color_cinnamon,cap_color_gray,cap_color_green,cap_color_pink,cap_color_purple,cap_color_red,cap_color_white,cap_color_yellow,bruises_no,bruises_yes,odor_almond,odor_anise,odor_creosote,odor_fishy,odor_foul,odor_musty,odor_none,odor_pungent,odor_spicy,gill_attachment_attached,gill_attachment_free,gill_spacing_close,gill_spacing_crowded,gill_size_broad,gill_size_narrow,gill_color_black,gill_color_brown,gill_color_buff,gill_color_chocolate,gill_color_gray,gill_color_green,gill_color_orange,gill_color_pink,gill_color_purple,gill_color_red,gill_color_white,gill_color_yellow,stalk_shape_enlarging,stalk_shape_tapering,stalk_root_bulbous,stalk_root_club,stalk_root_equal,stalk_root_missing,stalk_root_rooted,stalk_surface_above_ring_fibrous,stalk_surface_above_ring_scaly,stalk_surface_above_ring_silky,stalk_surface_above_ring_smooth,stalk_surface_below_ring_fibrous,stalk_surface_below_ring_scaly,stalk_surface_below_ring_silky,stalk_surface_below_ring_smooth,stalk_color_above_ring_brown,stalk_color_above_ring_buff,stalk_color_above_ring_cinnamon,stalk_color_above_ring_gray,stalk_color_above_ring_orange,stalk_color_above_ring_pink,stalk_color_above_ring_red,stalk_color_above_ring_white,stalk_color_above_ring_yellow,stalk_color_below_ring_brown,stalk_color_below_ring_buff,stalk_color_below_ring_cinnamon,stalk_color_below_ring_gray,stalk_color_below_ring_orange,stalk_color_below_ring_pink,stalk_color_below_ring_red,stalk_color_below_ring_white,stalk_color_below_ring_yellow,veil_type_partial,veil_color_brown,veil_color_orange,veil_color_white,veil_color_yellow,ring_number_none,ring_number_one,ring_number_two,ring_type_evanescent,ring_type_flaring,ring_type_large,ring_type_none,ring_type_pendant,spore_print_color_black,spore_print_color_brown,spore_print_col
or_buff,spore_print_color_chocolate,spore_print_color_green,spore_print_color_orange,spore_print_color_purple,spore_print_color_white,spore_print_color_yellow,population_abundant,population_clustered,population_numerous,population_scattered,population_several,population_solitary,habitat_grasses,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods
0,poisonous,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
1,edible,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,True,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
2,edible,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,True,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False
3,poisonous,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,True,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
4,edible,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,edible,False,False,False,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
8120,edible,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False
8121,edible,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
8122,poisonous,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False


In [29]:
from sklearn.preprocessing import LabelEncoder

# Replace the string class labels in 'type' with integer codes.
# The output below shows poisonous -> 1 and edible -> 0.
encoded = LabelEncoder().fit_transform(df_encoding['type'])
df_encoding['type'] = encoded
df_encoding

Unnamed: 0,type,cap_shape_bell,cap_shape_conical,cap_shape_convex,cap_shape_flat,cap_shape_knobbed,cap_shape_sunken,cap_surface_fibrous,cap_surface_grooves,cap_surface_scaly,cap_surface_smooth,cap_color_brown,cap_color_buff,cap_color_cinnamon,cap_color_gray,cap_color_green,cap_color_pink,cap_color_purple,cap_color_red,cap_color_white,cap_color_yellow,bruises_no,bruises_yes,odor_almond,odor_anise,odor_creosote,odor_fishy,odor_foul,odor_musty,odor_none,odor_pungent,odor_spicy,gill_attachment_attached,gill_attachment_free,gill_spacing_close,gill_spacing_crowded,gill_size_broad,gill_size_narrow,gill_color_black,gill_color_brown,gill_color_buff,gill_color_chocolate,gill_color_gray,gill_color_green,gill_color_orange,gill_color_pink,gill_color_purple,gill_color_red,gill_color_white,gill_color_yellow,stalk_shape_enlarging,stalk_shape_tapering,stalk_root_bulbous,stalk_root_club,stalk_root_equal,stalk_root_missing,stalk_root_rooted,stalk_surface_above_ring_fibrous,stalk_surface_above_ring_scaly,stalk_surface_above_ring_silky,stalk_surface_above_ring_smooth,stalk_surface_below_ring_fibrous,stalk_surface_below_ring_scaly,stalk_surface_below_ring_silky,stalk_surface_below_ring_smooth,stalk_color_above_ring_brown,stalk_color_above_ring_buff,stalk_color_above_ring_cinnamon,stalk_color_above_ring_gray,stalk_color_above_ring_orange,stalk_color_above_ring_pink,stalk_color_above_ring_red,stalk_color_above_ring_white,stalk_color_above_ring_yellow,stalk_color_below_ring_brown,stalk_color_below_ring_buff,stalk_color_below_ring_cinnamon,stalk_color_below_ring_gray,stalk_color_below_ring_orange,stalk_color_below_ring_pink,stalk_color_below_ring_red,stalk_color_below_ring_white,stalk_color_below_ring_yellow,veil_type_partial,veil_color_brown,veil_color_orange,veil_color_white,veil_color_yellow,ring_number_none,ring_number_one,ring_number_two,ring_type_evanescent,ring_type_flaring,ring_type_large,ring_type_none,ring_type_pendant,spore_print_color_black,spore_print_color_brown,spore_print_col
or_buff,spore_print_color_chocolate,spore_print_color_green,spore_print_color_orange,spore_print_color_purple,spore_print_color_white,spore_print_color_yellow,population_abundant,population_clustered,population_numerous,population_scattered,population_several,population_solitary,habitat_grasses,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods
0,1,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
1,0,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,True,True,False,False,False,False,False,False,False,False,False,True,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
2,0,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,True,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False
3,1,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,True,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False
4,0,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,True,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,False,False,False,False,True,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
8120,0,False,False,True,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False
8121,0,False,False,False,True,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,True,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,True,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False
8122,1,False,False,False,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,False,True,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False


In [76]:
# Label-encode every column of df (including 'type'), overwriting each column in place.
# NOTE(review): the same encoder object is re-fitted on every column, so after the
# loop it only reflects the last column. The original string values in df are also
# overwritten; reload the CSV before running cells that expect the raw categories.
encoder = LabelEncoder()
for column in df.columns:
    df[column] = encoder.fit_transform(df[column])
df

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,2,3,0,1,7,1,0,1,0,0,2,3,3,7,7,0,2,1,4,0,3,4
1,0,2,3,9,1,0,1,0,0,0,0,1,3,3,7,7,0,2,1,4,1,2,0
2,0,0,3,8,1,1,1,0,0,1,0,1,3,3,7,7,0,2,1,4,1,2,2
3,1,2,2,8,1,7,1,0,1,1,0,2,3,3,7,7,0,2,1,4,0,3,4
4,0,2,3,3,0,6,1,1,0,0,1,2,3,3,7,7,0,2,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,4,3,0,0,6,0,0,0,11,0,3,3,3,4,4,0,1,1,4,2,1,1
8120,0,2,3,0,0,6,0,0,0,11,0,3,3,3,4,4,0,0,1,4,2,4,1
8121,0,3,3,0,0,6,0,0,0,1,0,3,3,3,4,4,0,1,1,4,2,1,1
8122,1,4,2,0,0,3,1,0,1,2,1,3,3,2,7,7,0,2,1,0,7,4,1


In [33]:
# Separate the dependent / explanatory variables.
# Both are taken from df_encoding so the one-hot features and the
# integer-encoded target come from the same frame, instead of relying on df
# having been label-encoded in place by another cell.
x = df_encoding.drop('type', axis=1)
y = df_encoding['type']

# Fixed seed, as in the other splits of this notebook, so the evaluation
# below is reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=10, stratify=y
)

In [34]:
# Build the model
from sklearn.naive_bayes import MultinomialNB

# Fit multinomial naive Bayes on the training split.
model = MultinomialNB()
model.fit(x_train, y_train)

In [38]:
# Evaluation: predict the test split, then report the confusion matrix
# (rows: actual class, columns: predicted class) and the accuracy.
y_hat = model.predict(x_test)

matrix = confusion_matrix(y_true=y_test, y_pred=y_hat)
print(matrix)
print(f'정확도: {accuracy_score(y_test, y_hat):.3f}')

[[1258    5]
 [ 125 1050]]
정확도: 0.947


In [50]:
from sklearn.metrics import roc_auc_score

# Column 1 of predict_proba is used as the positive-class probability.
# NOTE(review): presumably class 1 = poisonous; confirm against model.classes_.
pred_proba_positive = model.predict_proba(x_test)[:,1]
print(f'AUC: {roc_auc_score(y_test, pred_proba_positive)}')

AUC: 0.9978443759370631


# [답안]

## 패키지로딩

In [53]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, roc_auc_score, roc_curve
import pandas as pd
import numpy as np

## 데이터 로딩 및 확인

In [95]:
# Load the mushroom dataset, then check its dimensions and first rows.
df = pd.read_csv('./dataset/mushrooms.csv')
print(df.shape)
display(df.head())

(8124, 23)


Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,poisonous,convex,smooth,brown,yes,pungent,free,close,narrow,black,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,yes,almond,free,close,broad,black,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,yes,anise,free,close,broad,brown,enlarging,club,smooth,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,yes,pungent,free,close,narrow,brown,enlarging,equal,smooth,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,tapering,equal,smooth,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses


In [52]:
# Summarise the column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   type                      8124 non-null   object
 1   cap_shape                 8124 non-null   object
 2   cap_surface               8124 non-null   object
 3   cap_color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill_attachment           8124 non-null   object
 7   gill_spacing              8124 non-null   object
 8   gill_size                 8124 non-null   object
 9   gill_color                8124 non-null   object
 10  stalk_shape               8124 non-null   object
 11  stalk_root                8124 non-null   object
 12  stalk_surface_above_ring  8124 non-null   object
 13  stalk_surface_below_ring  8124 non-null   object
 14  stalk_color_above_ring  

## 데이터 인코딩 및 분리

### 라벨 인코딩

In [79]:
# Separate the explanatory variables (x) from the dependent variable 'type' (y).
x = df.drop('type', axis=1)
y = df['type']

- scikit-learn LabelEncoder를 이용한 label encoding

In [80]:
from sklearn.preprocessing import LabelEncoder

def _label_encode(col):
    """Return the integer codes for one column, using a fresh LabelEncoder."""
    return LabelEncoder().fit_transform(col)


# apply() passes each column of x to the helper, which label-encodes it.
x = x.apply(_label_encode)
x.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,2,3,0,1,7,1,0,1,0,0,2,3,3,7,7,0,2,1,4,0,3,4
1,2,3,9,1,0,1,0,0,0,0,1,3,3,7,7,0,2,1,4,1,2,0
2,0,3,8,1,1,1,0,0,1,0,1,3,3,7,7,0,2,1,4,1,2,2
3,2,2,8,1,7,1,0,1,1,0,2,3,3,7,7,0,2,1,4,0,3,4
4,2,3,3,0,6,1,1,0,0,1,2,3,3,7,7,0,2,1,0,1,0,0


- pandas map을 이용한 label encoding

In [56]:
def labeling(col):
    """Build a value -> integer code mapping for one column.

    The column's unique values are sorted, then numbered from 0 in that order.
    """
    ordered_values = np.sort(col.unique())
    return dict(zip(ordered_values, range(len(ordered_values))))

# Start again from the raw categorical columns of df so the mapping is applied
# to the original values, not to integer codes left in x by an earlier
# encoding cell (where this step would be a no-op).
# apply() hands each column to the lambda, which maps every value through the
# dictionary built by labeling() for that column.
x = df.drop('type', axis=1).apply(lambda col: col.map(labeling(col)))

In [57]:
# Preview the first rows of the label-encoded features.
x.head()

Unnamed: 0,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,stalk_shape,stalk_root,stalk_surface_above_ring,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,2,3,0,1,7,1,0,1,0,0,2,3,3,7,7,0,2,1,4,0,3,4
1,2,3,9,1,0,1,0,0,0,0,1,3,3,7,7,0,2,1,4,1,2,0
2,0,3,8,1,1,1,0,0,1,0,1,3,3,7,7,0,2,1,4,1,2,2
3,2,2,8,1,7,1,0,1,1,0,2,3,3,7,7,0,2,1,4,0,3,4
4,2,3,3,0,6,1,1,0,0,1,2,3,3,7,7,0,2,1,0,1,0,0


In [58]:
# Hold out 20% of the rows for testing, stratified on the class label.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

# Print the shape of every split as a sanity check.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(6499, 22) (1625, 22) (6499,) (1625,)


### 원-핫 인코딩

In [96]:
# Rebuild x and y from df before applying one-hot encoding.
x = df.drop('type', axis=1)
y = df['type']

In [97]:
# Shapes before encoding.
print(x.shape, y.shape)

# One-hot encode the features; the output below shows 22 columns becoming 117.
x = pd.get_dummies(x)
print(x.shape)

(8124, 22) (8124,)
(8124, 117)


In [98]:
# Encode the target as integers (edible -> 0, poisonous -> 1).
# Mapping from df['type'] rather than from y keeps this cell safe to re-run:
# mapping an already-encoded y a second time would turn every value into NaN.
y = df['type'].map({'edible':0, 'poisonous':1})

In [99]:
# Preview the encoded target.
y.head()

0    1
1    0
2    0
3    1
4    0
Name: type, dtype: int64

In [100]:
# Hold out 20% of the rows for testing, stratified on the class label.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

# Print the shape of every split as a sanity check.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(6499, 117) (1625, 117) (6499,) (1625,)


## 모델 생성

In [101]:
# Fit multinomial naive Bayes on the one-hot encoded training split.
model = MultinomialNB()
model.fit(x_train, y_train)

## 모델 평가

In [102]:
model.classes_   # shows the list of class values the model has to distinguish

array([0, 1], dtype=int64)

In [103]:
# Compare the first seven actual labels with the corresponding predictions.
y_hat = model.predict(x_test)
print('실제값:', y_test.iloc[:7].to_numpy())
print('예측값:', y_hat[:7])

실제값: [0 0 1 0 0 1 0]
예측값: [0 0 1 0 0 1 0]


In [105]:
# With string labels, confusion_matrix(..., labels=['poisonous', 'edible']) would
# specify which class values are used as negative and positive.
cf_mat = confusion_matrix(y_test, y_hat)
print(cf_mat)

print(f'정확도: {accuracy_score(y_test, y_hat):.3f}')
# With string labels, precision_score needs pos_label (e.g. pos_label='edible')
# to say which label counts as positive; it is called without pos_label here
# because the target is already encoded as 0/1.
print(f"정밀도: {precision_score(y_test, y_hat):.3f}")
print(f'AUC: {roc_auc_score(y_test, model.predict_proba(x_test)[:,1]):.3f}')

# Author's note: the features are nominal rather than ordinal, so one-hot
# encoding is more accurate than label encoding.

[[841   1]
 [ 74 709]]
정확도: 0.954
정밀도: 0.999
AUC: 0.997
