In [1]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 불필요한 경고가 뜨지 않게..
import warnings
warnings.filterwarnings('ignore')

# 데이터 전처리 알고리즘 (비지도 학습)
# 문자열 데이터를 숫자로 변환한다.
from sklearn.preprocessing import LabelEncoder
# 표준편차를 기반으로 표준화 한다.
# 잘못된 학습을 정상화 시키는 목적으로 사용한다.
from sklearn.preprocessing import StandardScaler
# 최소값을 0, 최대값1로 하는 표준화
from sklearn.preprocessing import MinMaxScaler


# 데이터를 학습용과 검증으로 나눈다.
from sklearn.model_selection import train_test_split
# 교차 검증
from sklearn.model_selection import cross_val_score

# 평가 함수
# 정확도 평가 함수
from sklearn.metrics import accuracy_score
# mse 평가 함수
from sklearn.metrics import mean_squared_error

# 학습 알고리즘 - 분류
# 최 근접 이웃
# 학습시 : 주어진 데이터를 저장만 한다.
# 예측 : 주변의 데이터를 보고 가장 많은 결과로 결정한다.
from sklearn.neighbors import KNeighborsClassifier
# 선형
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# 트리
# 학습시 : 주어진 데이터를 가지고 질문들을 생성한다.
# 예측시 : 질문을 통해 최종 결과를 예측한다.
from sklearn.tree import DecisionTreeClassifier
# 앙상블 - 다수의 알고리즘이 던지는 결과를 취합하여 최종 결과를 결정한다.
# 트리들을 사용한다.
# 학습시 - 데이터를 랜덤하게 섞어서 80%를 추출한다.
# 이렇게 추출된 데이터를 트리의 개수만큼 생성하여 각 트리들에게 주고
# 학습을 수행한다.
# 예측시 - 각 트리가 던지는 결과를 취합하여 다수결의 원칙으로 최종 결과를
# 결정한다.
from sklearn.ensemble import RandomForestClassifier
# 부스팅
# 앙상블 알고리즘이 잘못 예측한 데이터를 다시 학습하는 방식
# 학습과 예측 원리를 앙상블과 동일하다.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# 학습 알고리즘 - 회귀
# 최 근접 이웃
from sklearn.neighbors import KNeighborsRegressor
# 선형
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
# 트리
from sklearn.tree import DecisionTreeRegressor
# 앙상블
from sklearn.ensemble import RandomForestRegressor
# 부스팅
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# 딥러닝
import tensorflow as tf

# 딥러닝 모델의 각 층을 관리하는 객체
from tensorflow.keras.models import Sequential
# 선형회귀를 수행하는 은닉층
from tensorflow.keras.layers import Dense
# 활성화 함수를 관리하는 것
from tensorflow.keras.layers import Activation

# 조기중단
from tensorflow.keras.callbacks import EarlyStopping

# 성능이 개선되면 모델을 파일로 자동 저장한다
from tensorflow.keras.callbacks import ModelCheckpoint

# 저장된 딥러닝모델 불러오기 .h5파일
from tensorflow.keras.models import load_model


# 원핫 인코딩 하는 함수
from tensorflow.keras.utils import to_categorical

# Module used to serialize Python objects to a file (and load them back)
import pickle

### 데이터를 불러온다

In [2]:
# Read the mushroom dataset; the 'class' column holds the target label.
csv_path = 'data/mushrooms.csv'
df1 = pd.read_csv(csv_path)
df1

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8031,p,k,y,e,f,f,f,c,n,b,...,k,p,p,p,w,o,e,w,v,p
8032,p,x,s,n,f,f,f,c,n,b,...,s,w,w,p,w,o,e,w,v,d
8033,p,k,y,n,f,f,f,c,n,b,...,s,p,w,p,w,o,e,w,v,l
8034,e,k,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l


### 데이터 전처리

In [3]:
# Count the missing values in each column.
df1.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [4]:
# Inspect the column dtypes and non-null counts.
df1.info()
# Many columns are of object dtype; they are label-encoded in a for-loop, with the encoders kept in a dictionary.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8036 entries, 0 to 8035
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8036 non-null   object
 1   cap-shape                 8036 non-null   object
 2   cap-surface               8036 non-null   object
 3   cap-color                 8036 non-null   object
 4   bruises                   8036 non-null   object
 5   odor                      8036 non-null   object
 6   gill-attachment           8036 non-null   object
 7   gill-spacing              8036 non-null   object
 8   gill-size                 8036 non-null   object
 9   gill-color                8036 non-null   object
 10  stalk-shape               8036 non-null   object
 11  stalk-root                8036 non-null   object
 12  stalk-surface-above-ring  8036 non-null   object
 13  stalk-surface-below-ring  8036 non-null   object
 14  stalk-color-above-ring  

In [5]:
# Column names of the frame; one encoder is fitted per column.
columns = df1.columns

# Maps column name -> fitted encoder.
encoder_dict = dict()

columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [6]:
# Label-encode every column of df1 in place and keep the fitted encoder per column.
for c1 in columns:
    # Re-run guard: this loop overwrites df1[c1] with integer codes. If the cell is
    # executed again, fitting a fresh encoder on those codes would replace the
    # original value -> code mapping in encoder_dict (which is pickled later).
    # A column that is already numeric and already has an encoder is left alone.
    if c1 in encoder_dict and pd.api.types.is_numeric_dtype(df1[c1]):
        continue
    # Create a LabelEncoder for this column
    encoder1 = LabelEncoder()
    # Fit on the column values and replace them with their integer codes
    df1[c1] = encoder1.fit_transform(df1[c1])
    # Store the fitted encoder under the column name
    encoder_dict[c1] = encoder1

In [7]:
df1 # all object-dtype columns have been converted to numeric codes

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8031,1,3,3,2,0,2,1,0,1,0,...,1,6,6,0,2,1,0,7,4,4
8032,1,5,2,4,0,2,1,0,1,0,...,2,7,7,0,2,1,0,7,4,0
8033,1,3,3,4,0,2,1,0,1,0,...,2,6,7,0,2,1,0,7,4,2
8034,0,3,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2


In [8]:
encoder_dict # one LabelEncoder is stored under each column name

{'class': LabelEncoder(),
 'cap-shape': LabelEncoder(),
 'cap-surface': LabelEncoder(),
 'cap-color': LabelEncoder(),
 'bruises': LabelEncoder(),
 'odor': LabelEncoder(),
 'gill-attachment': LabelEncoder(),
 'gill-spacing': LabelEncoder(),
 'gill-size': LabelEncoder(),
 'gill-color': LabelEncoder(),
 'stalk-shape': LabelEncoder(),
 'stalk-root': LabelEncoder(),
 'stalk-surface-above-ring': LabelEncoder(),
 'stalk-surface-below-ring': LabelEncoder(),
 'stalk-color-above-ring': LabelEncoder(),
 'stalk-color-below-ring': LabelEncoder(),
 'veil-type': LabelEncoder(),
 'veil-color': LabelEncoder(),
 'ring-number': LabelEncoder(),
 'ring-type': LabelEncoder(),
 'spore-print-color': LabelEncoder(),
 'population': LabelEncoder(),
 'habitat': LabelEncoder()}

In [9]:
# Separate the target column ('class') from the input features.
y = df1['class']
X = df1.drop(columns='class')

display(X, y)

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,5,2,4,1,6,1,0,1,4,0,...,2,7,7,0,2,1,4,2,3,5
1,5,2,9,1,0,1,0,0,4,0,...,2,7,7,0,2,1,4,3,2,1
2,0,2,8,1,3,1,0,0,5,0,...,2,7,7,0,2,1,4,3,2,3
3,5,3,8,1,6,1,0,1,5,0,...,2,7,7,0,2,1,4,2,3,5
4,5,2,3,0,5,1,1,0,4,1,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8031,3,3,2,0,2,1,0,1,0,1,...,1,6,6,0,2,1,0,7,4,4
8032,5,2,4,0,2,1,0,1,0,1,...,2,7,7,0,2,1,0,7,4,0
8033,3,3,4,0,2,1,0,1,0,1,...,2,6,7,0,2,1,0,7,4,2
8034,3,2,4,0,5,0,0,0,5,0,...,2,5,5,0,1,1,4,0,1,2


0       1
1       0
2       0
3       1
4       0
       ..
8031    1
8032    1
8033    1
8034    0
8035    0
Name: class, Length: 8036, dtype: int32

In [10]:
# Standardize the input features so that every column has a comparable value range.
scaler1 = StandardScaler()
# Fit on the features taken straight from df1 instead of on X itself: X is
# overwritten with the scaled array below, so fitting on X would, on a re-run of
# this cell, fit the scaler on already-standardized values and the scaler that is
# pickled later would no longer match the raw encoded inputs.
# NOTE(review): the scaler is fitted on all rows, including those that later serve
# as validation folds in cross_val_score.
X = scaler1.fit_transform(df1.drop('class', axis=1))
X

array([[ 1.02521414,  0.1416885 , -0.20116403, ..., -0.66413842,
        -0.51759105,  2.026306  ],
       [ 1.02521414,  0.1416885 ,  1.75734595, ..., -0.24250654,
        -1.3162396 , -0.29318334],
       [-2.08839918,  0.1416885 ,  1.36564395, ..., -0.24250654,
        -1.3162396 ,  0.86656133],
       ...,
       [-0.22023119,  0.95266344, -0.20116403, ...,  1.44402098,
         0.2810575 ,  0.286689  ],
       [-0.22023119,  0.1416885 , -0.20116403, ..., -1.50740218,
        -2.11488816,  0.286689  ],
       [-2.08839918, -1.48026136,  1.36564395, ...,  1.44402098,
        -1.3162396 , -0.29318334]])

### 학습

In [11]:
# Candidate classifiers, all with default hyper-parameters.
# The tree-based and ensemble estimators draw random numbers while fitting, so a
# fixed random_state is passed to make the cross-validation scores repeatable
# from run to run.
model1 = KNeighborsClassifier()
model2 = LogisticRegression()
model3 = SVC()
model4 = DecisionTreeClassifier(random_state=42)
model5 = RandomForestClassifier(random_state=42)
model6 = AdaBoostClassifier(random_state=42)
model7 = GradientBoostingClassifier(random_state=42)

In [12]:
# Run 10-fold cross-validation for every candidate model, scored by accuracy.
# cross_val_score(estimator, inputs, targets, scoring metric, number of folds)
# returns one accuracy value per fold.
candidates = (model1, model2, model3, model4, model5, model6, model7)
r1, r2, r3, r4, r5, r6, r7 = [
    cross_val_score(estimator, X, y, scoring='accuracy', cv=10)
    for estimator in candidates
]

In [13]:
# Print the mean cross-validation accuracy of each model, in order model1..model7.
for fold_scores in (r1, r2, r3, r4, r5, r6, r7):
    print(fold_scores.mean())
# NOTE(review): in the recorded output most models score around 0.95, gradient
# boosting (r7) has the highest mean and logistic regression (r2) the lowest.

0.949230342682602
0.877924666827754
0.9543360408418678
0.9487320557858279
0.9557059038555664
0.9562040358605477
0.9606872239053796


In [14]:
# Display the decision tree estimator created earlier.
model4

DecisionTreeClassifier()

In [15]:
# Train the selected model on the full data set.
# random_state is fixed so that the tree that gets persisted is the same on every run.
# NOTE(review): in the recorded cross-validation output the decision tree is not the
# highest-scoring model (gradient boosting is) - confirm this choice is intentional.
best_model = DecisionTreeClassifier(random_state=42)
best_model.fit(X, y)

DecisionTreeClassifier()

In [16]:
# Persist everything needed at prediction time into one file, in this order:
# the encoder dictionary, the fitted scaler, then the trained model.
with open('train_model.dat', 'wb') as fp:
    for artifact in (encoder_dict, scaler1, best_model):
        pickle.dump(artifact, fp)

In [17]:
# Read the saved objects back to verify the file; objects come out in the
# same order in which they were dumped.
with open('train_model.dat', 'rb') as fp:
    a1, a2, a3 = [pickle.load(fp) for _ in range(3)]

In [18]:
# Print the loaded objects in load order: the encoder dictionary, the scaler,
# and the decision tree model - confirming the save worked.
for loaded in (a1, a2, a3):
    print(loaded)

{'class': LabelEncoder(), 'cap-shape': LabelEncoder(), 'cap-surface': LabelEncoder(), 'cap-color': LabelEncoder(), 'bruises': LabelEncoder(), 'odor': LabelEncoder(), 'gill-attachment': LabelEncoder(), 'gill-spacing': LabelEncoder(), 'gill-size': LabelEncoder(), 'gill-color': LabelEncoder(), 'stalk-shape': LabelEncoder(), 'stalk-root': LabelEncoder(), 'stalk-surface-above-ring': LabelEncoder(), 'stalk-surface-below-ring': LabelEncoder(), 'stalk-color-above-ring': LabelEncoder(), 'stalk-color-below-ring': LabelEncoder(), 'veil-type': LabelEncoder(), 'veil-color': LabelEncoder(), 'ring-number': LabelEncoder(), 'ring-type': LabelEncoder(), 'spore-print-color': LabelEncoder(), 'population': LabelEncoder(), 'habitat': LabelEncoder()}
StandardScaler()
DecisionTreeClassifier()
