# D1211_WORK_김현우 
#### 독버섯 감지

[1] 모듈 로딩 및 데이터 준비 <HR>

In [1]:
# ===================================================================================================================================
# [1-1] 모듈 로딩
# ===================================================================================================================================

# 분석
import pandas as pd
import numpy as np

# 시각화
import seaborn as sns
import koreanize_matplotlib
import matplotlib.pyplot as plt

# 머신러닝 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# ===================================================================================================================================
# [1-2] 데이터 준비
# ===================================================================================================================================

FILE_NAME = '../Data/mushrooms.csv'
dataDF = pd.read_csv(FILE_NAME)
mushDF = dataDF.copy()

[2] 데이터 확인 및 전처리 <HR>

In [2]:
# ===================================================================================================================================
# 컬럼 이름 의미
# ===================================================================================================================================
# class                     : 독성 여부 (e = edible 식용, p = poisonous 독버섯)​
# cap-shape                 : 갓 모양 (b 종, c 뾰족, x 볼록, f 평평, k 혹, s 움푹)​
# cap-surface               : 갓 표면 (f 섬유질, g 홈이 있는, y 비늘, s 매끈)​
# cap-color                 : 갓 색깔 (n 갈색, b 크림색, c 시나몬, g 회색, r 초록, p 분홍, u 보라, e 빨강, w 흰색, y 노랑 등)​
# bruises                   : 멍/손상 여부 (t 멍 듦, f 멍 없음)​
# odor                      : 냄새 (a 없음, l 아몬드, c 고리버섯 냄새, y 생연유, f 썩은, m 곰팡이, n 무취, p 고약한, s 매운 등)​

# ===================================================================================================================================
# 주름(gill) 관련 컬럼
# ===================================================================================================================================
# gill-attachment           : 주름이 대에 붙는 형태 (a 붙음, f 자유, d 내려옴, n 멀어짐 등)​
# gill-spacing              : 주름 간격 (c 촘촘, w 빽빽, d 성긴)​
# gill-size                 : 주름 크기 (b 넓음, n 좁음)​
# gill-color                : 주름 색깔 (k 검정, n 갈색, b 크림, h 초콜릿, g 회색, r 초록, o 주황, p 분홍, u 보라, e 빨강, w 흰색, y 노랑 등)​

# ===================================================================================================================================
# 대(stalk) 관련 컬럼
# ===================================================================================================================================
# stalk-shape               : 대 모양 (e 부풀어오름, t 가늘어짐)​
# stalk-root                : 대 밑부분 형태 (b 방망이, c 곤봉, u 컵, e 균사발, z 뿌리, r 뿌리줄기, ? 미상)​
# stalk-surface-above-ring  : 턱받이(링) 위쪽 대 표면 (f 섬유질, y 비늘, k 비단결, s 매끈 등)​
# stalk-surface-below-ring  : 턱받이 아래쪽 대 표면 (같은 코드: f,y,k,s)​
# stalk-color-above-ring    : 턱받이 위쪽 대 색깔 (n 갈색, b 크림, c 시나몬, g 회색, o 주황, p 분홍, e 빨강, w 흰색, y 노랑 등)​
# stalk-color-below-ring    : 턱받이 아래쪽 대 색깔 (위와 동일 코드)​

# ===================================================================================================================================
# 베일·링·포자 관련 컬럼
# ===================================================================================================================================
# veil-type                 : 베일 타입 (p 부분 베일, u 전체 베일이었으나 실제 데이터는 거의 p만 존재)​
# veil-color                : 베일 색깔 (n 갈색, o 주황, w 흰색, y 노랑)​
# ring-number               : 링(턱받이) 개수 (n 없음, o 하나, t 두 개)​
# ring-type                 : 링 형태 (c 거미줄형, e 사라지는, f 벌어지는, l 큼, n 없음, p 늘어뜨려진, s 덮개형, z 띠 모양)​
# spore-print-color         : 포자 무늬 색 (k 검정, n 갈색, b 크림, h 초콜릿, r 초록, o 주황, u 보라, w 흰색, y 노랑)​

# ===================================================================================================================================
# 개체수·서식지 컬럼
# ===================================================================================================================================
# population                : 개체 분포 (a 매우 많음, c 군집, n 다수, s 산발적, v 몇 개, y 단독)​
# habitat                   : 서식 환경 (g 풀밭, l 낙엽, m 초원, p 길가, u 도시, w 폐기물장, d 숲)​

In [3]:
# ===================================================================================================================================
# [2-1] 데이터 확인 - 모든 컬럼들이 범주형
# ===================================================================================================================================
# 전체 데이터 확인
display(mushDF.head(3))

# 데이터 타입 확인
print(mushDF.info())    # 모든 컬럼들이 범주형 데이터   

# class 컬럼 확인
print(mushDF["class"].value_counts())

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [4]:
# ===================================================================================================================================
# [2-2] 결측치, 이상치, 중복값 확인
# ===================================================================================================================================
print(f"중복값 : {mushDF.duplicated().sum()}\n")    # 중복값 X
print(f"결측치 : {mushDF.isnull().sum().sum()}\n")  # 결측치 X

# 전부 범주형이라 이상치 확인하기 힘들어 이상한 값이나 특이값 정도를 찾도록 함
for col in mushDF.columns:
    print(mushDF[col].value_counts(), '\n')

# -> 확인 결과 
mushDF['stalk-root'] = mushDF['stalk-root'].replace('?', "unknown")
print(mushDF['stalk-root'].value_counts())

중복값 : 0

결측치 : 0

class
e    4208
p    3916
Name: count, dtype: int64 

cap-shape
x    3656
f    3152
k     828
b     452
s      32
c       4
Name: count, dtype: int64 

cap-surface
y    3244
s    2556
f    2320
g       4
Name: count, dtype: int64 

cap-color
n    2284
g    1840
e    1500
y    1072
w    1040
b     168
p     144
c      44
u      16
r      16
Name: count, dtype: int64 

bruises
f    4748
t    3376
Name: count, dtype: int64 

odor
n    3528
f    2160
s     576
y     576
a     400
l     400
p     256
c     192
m      36
Name: count, dtype: int64 

gill-attachment
f    7914
a     210
Name: count, dtype: int64 

gill-spacing
c    6812
w    1312
Name: count, dtype: int64 

gill-size
b    5612
n    2512
Name: count, dtype: int64 

gill-color
b    1728
p    1492
w    1202
n    1048
g     752
h     732
u     492
k     408
e      96
y      86
o      64
r      24
Name: count, dtype: int64 

stalk-shape
t    4608
e    3516
Name: count, dtype: int64 

stalk-root
b    3776
?    2480


[3] 인코딩 <hr>
- 순서가 중요한 값들이 아니라 원래는 onehot encoder를 사용하려고 함.
- 근데 랜덤포레스트 모델은 데이터의 크기를 계산하지 않아서 문제 X
- Mushrooms 데이터는 컬럼 종류가 워낙 많아서 onehot encoder를 쓰면 컬럼이 너무 늘어남
- 그래서 편의상 label encoder 사용하기로 함 

In [5]:
label_encoders = {}  

for col in mushDF.columns:
    lbEncoder = LabelEncoder()
    mushDF[col] = lbEncoder.fit_transform(mushDF[col])
    label_encoders[col] = lbEncoder   

display(mushDF)
display(label_encoders)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


{'class': LabelEncoder(),
 'cap-shape': LabelEncoder(),
 'cap-surface': LabelEncoder(),
 'cap-color': LabelEncoder(),
 'bruises': LabelEncoder(),
 'odor': LabelEncoder(),
 'gill-attachment': LabelEncoder(),
 'gill-spacing': LabelEncoder(),
 'gill-size': LabelEncoder(),
 'gill-color': LabelEncoder(),
 'stalk-shape': LabelEncoder(),
 'stalk-root': LabelEncoder(),
 'stalk-surface-above-ring': LabelEncoder(),
 'stalk-surface-below-ring': LabelEncoder(),
 'stalk-color-above-ring': LabelEncoder(),
 'stalk-color-below-ring': LabelEncoder(),
 'veil-type': LabelEncoder(),
 'veil-color': LabelEncoder(),
 'ring-number': LabelEncoder(),
 'ring-type': LabelEncoder(),
 'spore-print-color': LabelEncoder(),
 'population': LabelEncoder(),
 'habitat': LabelEncoder()}

[4] 피쳐(x)와 타겟(y) 분리 <hr>

In [6]:
# ===================================================================================================================================
# [4-1] 타겟 분리 - 타겟은 class컬럼(식용버섯, 독버섯)이 확실
# ===================================================================================================================================

targetSR = mushDF['class']
print(targetSR)

# ===================================================================================================================================
# [4-2] 크래머V 통계량에 따른 상위 4개의 피쳐 구하기

# 클래스와의 피어슨 상관계수 확인? 
# → 각 컬럼들이 값들이 아닌 범주형이라 크기의 순서 X, 선형 관계를 그릴 수 없음
# → 따라서 범주형 변수 간의 연관성 강도를 측정하는 크래머V를 사용해 class 컬럼과 연관성 있는 피쳐를 선정하기로 함
# ===================================================================================================================================

# 1. 크래머V 함수 정의

from scipy.stats import chi2_contingency

def cramers_v(x, y):
    """두 범주형 변수의 연관성 계산"""
    # 교차표 만들기
    table = pd.crosstab(x, y)
    
    # 카이제곱 통계량 계산
    chi2, p_value, dof, expected = chi2_contingency(table)
    
    # 샘플 개수
    n = table.sum().sum()
    
    # 최소 차원
    k = min(table.shape) - 1
    
    if k == 0:
        return 0
    
    # 크래머V 계산
    cramers_v_value = np.sqrt(chi2 / (n * k))
    
    return float(cramers_v_value)

# 2. 각 피처와 클래스의 크래머 V 계산
results = []

for col in mushDF.columns:
    if col != 'class':  # class 제외
        v_value = cramers_v(mushDF[col], targetSR)
        results.append({'Feature': col, 'Cramers_V': v_value})

# 3. 결과를 DataFrame으로 만들고 정렬
cramersDF = pd.DataFrame(results)
cramersDF = cramersDF.sort_values('Cramers_V', ascending=False)

print(cramersDF)

# 4. 상위 4개 피처 선택
top_4 = cramersDF.head(4)['Feature'].tolist()
print(f"\n상위 4개 피처:\n{top_4}")

# ===================================================================================================================================
# [4-3] 피쳐 분리

# odor                      : 냄새        (a 없음, l 아몬드, c 고리버섯 냄새, y 생연유, f 썩은, m 곰팡이, n 무취, p 고약한, s 매운 등)​
# spore-print-color         : 포자 무늬 색 (k 검정, n 갈색, b 크림, h 초콜릿, r 초록, o 주황, u 보라, w 흰색, y 노랑)​
# gill-color                : 주름 색깔    (k 검정, n 갈색, b 크림, h 초콜릿, g 회색, r 초록, o 주황, p 분홍, u 보라, e 빨강, w 흰색, y 노랑 등)​
# ring-type                 : 링 형태      (c 거미줄형, e 사라지는, f 벌어지는, l 큼, n 없음, p 늘어뜨려진, s 덮개형, z 띠 모양)​
# ===================================================================================================================================

featureDF = mushDF[top_4]
display(featureDF)

# ===================================================================================================================================

0       1
1       0
2       0
3       1
4       0
       ..
8119    0
8120    0
8121    0
8122    1
8123    0
Name: class, Length: 8124, dtype: int64
                     Feature  Cramers_V
4                       odor   0.971005
19         spore-print-color   0.752645
8                 gill-color   0.680830
18                 ring-type   0.603271
11  stalk-surface-above-ring   0.587944
12  stalk-surface-below-ring   0.574837
7                  gill-size   0.539758
13    stalk-color-above-ring   0.524850
14    stalk-color-below-ring   0.514725
3                    bruises   0.501280
20                population   0.487376
21                   habitat   0.440136
10                stalk-root   0.406805
6               gill-spacing   0.348052
0                  cap-shape   0.245571
2                  cap-color   0.218427
17               ring-number   0.214772
1                cap-surface   0.196925
16                veil-color   0.153421
5            gill-attachment   0.128424
9         

Unnamed: 0,odor,spore-print-color,gill-color,ring-type
0,6,2,4,4
1,0,3,4,4
2,3,3,5,4
3,6,2,5,4
4,5,3,4,0
...,...,...,...,...
8119,5,0,11,4
8120,5,0,11,4
8121,5,0,5,4
8122,8,7,0,0


[5] 학습용 / 테스트용 데이터셋 분할 <hr>
(x: featurue, y: target)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(
    featureDF,
    targetSR,
    random_state=42,
    test_size=0.2,
    stratify=targetSR)

print(f'[TRAIN] x_train:{x_train.shape},  y_train:{y_train.shape}')
print(f'[TEST]  x_test:{x_test.shape} ,  y_test:{y_test.shape}')

[TRAIN] x_train:(6499, 4),  y_train:(6499,)
[TEST]  x_test:(1625, 4) ,  y_test:(1625,)


[6] 배깅 - 랜덤포레스트 모델 생성 및 학습, 비교<hr>

In [8]:
# ===================================================================================================================================
# 1. 초기모델 생성
# ===================================================================================================================================
rfModel = RandomForestClassifier(
    n_estimators=100,           # 나무 개수 
    max_depth=None,             # 최대 깊이 
    min_samples_split=2,        # 분할 최소 샘플
    min_samples_leaf=1,         # 리프 노드 최소 샘플
    random_state=42,           
    n_jobs=-1,                  
    class_weight='balanced'     # 불균형 클래스 처리
)

# ===================================================================================================================================
# 2. 초기모델 학습
# ===================================================================================================================================
rfModel.fit(x_train, y_train)

# ===================================================================================================================================
# 3. 초기모델 성능
# ===================================================================================================================================
print(f"랜덤포레스트 모델 score : {rfModel.score(x_test, y_test)}")

랜덤포레스트 모델 score : 0.9932307692307693


[7] 교차 검증 <hr>
StratifiedKFold

In [9]:
skf = StratifiedKFold(
    n_splits=5, 
    shuffle=True, 
    random_state=42)

crossVAL = cross_val_score(
    rfModel, 
    x_train, y_train,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1    
)   

print(f"fold별 교차검증 score : {crossVAL}")
print(f"평균                 : {crossVAL.mean():.4f}")
print(f"표준편차              : {crossVAL.std():.4f}")

fold별 교차검증 score : [0.99384615 0.99538462 0.99692308 0.99307692 0.99538106]
평균                 : 0.9949
표준편차              : 0.0013


[8] 하이퍼파라미터 튜닝 <hr>

In [10]:
# ===================================================================================================================================
# [8-1] test할 하이퍼파라미터셋
# ===================================================================================================================================

param_dist = {
    'n_estimators'     : [50, 100, 200, 300],    # 나무 개수
    'max_depth'        : [5, 10, 15, 20, None],  # 최대 깊이
    'min_samples_split': [2, 5, 10],             # 분할 최소 샘플
    'min_samples_leaf' : [1, 2, 4]}              # 리프 최소 샘플


# ===================================================================================================================================
# [8-2] best 하이퍼파라미터 찾기
# ===================================================================================================================================

random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced'),
    param_distributions=param_dist,
    cv=skf,           
    scoring='accuracy'  # 정확도 기준
)

random_search.fit(x_train, y_train)
print(f"최적 파라미터: {random_search.best_params_}")
print(f"최고 점수   : {random_search.best_score_:.4f}")

최적 파라미터: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 20}
최고 점수   : 0.9949


[9] 최종 모델 생성 <hr>
최적의 하이퍼파라미터
- max_depth=15 
- min_samples_split=5
- min_samples_leaf=2
- n_estimators=300

In [11]:
# ===================================================================================================================================
# 1. 최종모델 생성
# ===================================================================================================================================
final_rfModel = RandomForestClassifier(
    **random_search.best_params_,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced'
)

# ===================================================================================================================================
# 2. 최종모델 학습
# ===================================================================================================================================
final_rfModel.fit(x_train, y_train)

# ===================================================================================================================================
# 3. 최종모델 성능
# ===================================================================================================================================

print(f"랜덤포레스트 모델 score : {final_rfModel.score(x_test, y_test)}")

랜덤포레스트 모델 score : 0.9932307692307693


[10] 예측 모델 생성 <hr>

In [None]:
# # ===================================================================================================================================
# # 버섯의 냄새/포자 색/주름 색/링 형태 ==> 버섯 독성 예측
# # ===================================================================================================================================
"""
냄새 (odor) 입력값:
  a(아몬드)  c(크레오소트)  f(악취)  l(감초)  m(곰팡이)
  n(냄새없음) p(자극적)     s(향신료) y(생선)

포자 색 (spore-print-color) 입력값:
  b(검정)  h(초콜릿) k(검정) n(갈색) o(주황)
  r(초록)  u(보라)   w(흰색) y(노랑)

주름 색 (gill-color) 입력값:
  b(담황)  e(빨강)  g(회색)   h(초콜릿) k(검정)  n(갈색)
  o(주황)  p(분홍)  r(초록)   u(보라)   w(흰색)  y(노랑)

링 형태 (ring-type) 입력값:
  e(사라짐) f(벌어짐) l(큼) n(없음) p(늘어짐)
"""
# ===================================================================================================================================

in_data = input("버섯의 냄새/포자 색/주름 색/링 형태 입력(예:p k k p):").strip().split()
print(f'new_data : {in_data}', '\n')

# DF 형식으로 저장
newDF = pd.DataFrame([in_data], columns=['odor', 'spore-print-color', 'gill-color', 'ring-type'])

# 인풋 데이터 encode
for col in newDF.columns:
    newDF[col] = label_encoders[col].transform(newDF[col])

# display(newDF)

# 예측 진행
pre_target = final_rfModel.predict(newDF)
proba_ = final_rfModel.predict_proba(newDF)  # 예측 확률

# 클래스 0, 1 확률
edible_prob = proba_[0][0] * 100      # 식용(0)일 확률
poisonous_prob = proba_[0][1] * 100   # 독버섯(1)일 확률

print(f"\n[예측 확률]")
print(f" 식용버섯일 확률: {edible_prob}%")
print(f" 독성버섯일 확률: {poisonous_prob}%")

result = '식용 (Edible)' if pre_target[0] == 0 else '독성 (Poisonous)'

print(f'\n냄새: {in_data[0]}, 포자 색: {in_data[1]} 주름 색: {in_data[2]} 링 형태: {in_data[3]} 버섯은 {result} 입니다.')

In [None]:
print(dataDF[['class', 'odor', 'spore-print-color', 'gill-color', 'ring-type']])

     class odor spore-print-color gill-color ring-type
0        p    p                 k          k         p
1        e    a                 n          k         p
2        e    l                 n          n         p
3        p    p                 k          n         p
4        e    n                 n          k         e
...    ...  ...               ...        ...       ...
8119     e    n                 b          y         p
8120     e    n                 b          y         p
8121     e    n                 b          n         p
8122     p    y                 w          b         e
8123     e    n                 o          y         p

[8124 rows x 5 columns]


[11] 성능 평가 <hr>

In [None]:
# ====================================================================
# 성능 평가 (메트릭 계산)
# ====================================================================
pre_target_test = final_rfModel.predict(x_test)
proba_test = final_rfModel.predict_proba(x_test)

result = classification_report(y_test, pre_target_test)
print(result)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       842
           1       1.00      0.99      0.99       783

    accuracy                           0.99      1625
   macro avg       0.99      0.99      0.99      1625
weighted avg       0.99      0.99      0.99      1625

