In [16]:
# 기본
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import shutil

# 경고 뜨지 않게...
import warnings
warnings.filterwarnings('ignore')

# 그래프 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
# plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['font.size'] = 16
plt.rcParams['figure.figsize'] = 20, 10
plt.rcParams['axes.unicode_minus'] = False
# 매직명령어 => 쥬피터노트북에서 그래프 삽입 기능 
%matplotlib inline
# 글꼴 선명화 
%config InlineBackend.figure_format = 'retina'

# 랜덤 모듈
import random

# 학습 모델 저장 및 복원
import pickle

# 딥러닝 라이브러리
import tensorflow as tf
# 신경망 모델을 관리하는 객체
from tensorflow.keras.models import Sequential
# 선형 회귀 레이어
from tensorflow.keras.layers import Dense
# 활성화 함수를 정의하는 객체
from tensorflow.keras.layers import Activation
# 원핫 인코딩을 수행하는 함수
from tensorflow.keras.utils import to_categorical

# 저장된 학습모델을 복원한다.
from tensorflow.keras.models import load_model

# 모델을 자동 저장한다.
from tensorflow.keras.callbacks import ModelCheckpoint
# 성능이 더이상 좋아지지 않을 경우 중단 시킨다.
from tensorflow.keras.callbacks import EarlyStopping

# 평가함수
# 분류용
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

# 회귀용
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# 표준화
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# 문자열 => 숫자
from sklearn.preprocessing import LabelEncoder

# 전체데이터를 학습용과 검증으로 나눈다.
from sklearn.model_selection import train_test_split

# 랜덤시드 설정
# 데이터를 랜덤하게 섞거나 가중치를 랜덤하게 설정하는 등..
# 작업에서 랜덤을 적용하는 경우가 더러 있다.
# 이에, 시드를 고정시킨다.
random_seed = 1
np.random.seed(random_seed)
random.seed(random_seed)
tf.random.set_seed(random_seed)

# 현재 프로젝트에서 GPU 메모리 사용을 필요한 만큼만 쓸 수 있도록 한다.
# 컴퓨터에 있는 GPU 정보들을 가져온다.

gpus = tf.config.experimental.list_physical_devices('GPU')
# gpu가 있다면...
if len(gpus) > 0 :
    try :
        for gpu in gpus :
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e :
        print(e)

In [17]:
# 데이터를 읽어온다.
df1 = pd.read_csv('./data/mushrooms.csv')
df1.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [18]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [19]:
df1.isna().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [20]:
# 문자열 => 숫자
# 컬럼 이름을 가져온다.
# 작업을 하기전에 문자열이 저장되어 있는 컬럼만 추출해서 해주세요
# c1 = df1.columns
# 컬럼의 수만큼 반복한다.
# for c2 in c1 :
    # 결과 데이터 컬럼이면 pass
    # if c2 == 'class':
        # continue
    # df1[c2] = df1[c2].factorize()[0]    
# df1

In [21]:
# LabelEncoder
# 미래에 발생되는 데이터에 대한 예측을 수행해야 한다면
# 여기서 사용한 encoder들을 파일로 저장해야 한다.
# 문자열로 구송된 컬럼들만 추출해서 반복문 돌려서 저장한다.
encoder_dict = {}

# 문자열 컬럼의 수만큼 반복한다.
c1 = df1.columns

for c2 in c1 :
    encoder = LabelEncoder()
    df1[c2] = encoder.fit_transform(df1[c2])
    # 딕셔너리에 담는다.
    encoder_dict[c2] = encoder

display(encoder_dict)
df1

{'class': LabelEncoder(),
 'cap-shape': LabelEncoder(),
 'cap-surface': LabelEncoder(),
 'cap-color': LabelEncoder(),
 'bruises': LabelEncoder(),
 'odor': LabelEncoder(),
 'gill-attachment': LabelEncoder(),
 'gill-spacing': LabelEncoder(),
 'gill-size': LabelEncoder(),
 'gill-color': LabelEncoder(),
 'stalk-shape': LabelEncoder(),
 'stalk-root': LabelEncoder(),
 'stalk-surface-above-ring': LabelEncoder(),
 'stalk-surface-below-ring': LabelEncoder(),
 'stalk-color-above-ring': LabelEncoder(),
 'stalk-color-below-ring': LabelEncoder(),
 'veil-type': LabelEncoder(),
 'veil-color': LabelEncoder(),
 'ring-number': LabelEncoder(),
 'ring-type': LabelEncoder(),
 'spore-print-color': LabelEncoder(),
 'population': LabelEncoder(),
 'habitat': LabelEncoder()}

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,3,2,4,0,5,0,0,0,11,...,2,5,5,0,1,1,4,0,1,2
8120,0,5,2,4,0,5,0,0,0,11,...,2,5,5,0,0,1,4,0,4,2
8121,0,2,2,4,0,5,0,0,0,5,...,2,5,5,0,1,1,4,0,1,2
8122,1,3,3,4,0,8,1,0,1,0,...,1,7,7,0,2,1,0,7,4,2


In [22]:
# 입력과 결과로 나눈다.
X = df1.drop(['class'], axis=1)
y = df1['class']

In [23]:
len(y.value_counts())

2

In [24]:
# 표준화
scaler1 = StandardScaler()
X = scaler1.fit_transform(X)
display(X)

array([[ 1.02971224,  0.14012794, -0.19824983, ..., -0.67019486,
        -0.5143892 ,  2.03002809],
       [ 1.02971224,  0.14012794,  1.76587407, ..., -0.2504706 ,
        -1.31310821, -0.29572966],
       [-2.08704716,  0.14012794,  1.37304929, ..., -0.2504706 ,
        -1.31310821,  0.86714922],
       ...,
       [-0.8403434 ,  0.14012794, -0.19824983, ..., -1.50964337,
        -2.11182722,  0.28570978],
       [-0.21699152,  0.95327039, -0.19824983, ...,  1.42842641,
         0.28432981,  0.28570978],
       [ 1.02971224,  0.14012794, -0.19824983, ...,  0.16925365,
        -2.11182722,  0.28570978]])

In [25]:
# 2진 분류 옵션들
out_nodes = 1
loss_function = 'binary_crossentropy'
activation_function = 'sigmoid'

# 다중 분류 옵션들
# out_nodes = len(y.value_counts())
# loss_function = 'categorical_crossentropy'
# activation_function = 'softmax'

# 회귀
# 출력 결과를 하나만 뽑아서 Series로 나왔다면...
# 1로 설정한다.
# if type(y) is pd.core.series.Series:
#     out_nodes = 1
# else : 
#     out_nodes = y.shape[1]
    
# loss_function = 'mean_squared_error'

In [26]:
# 입력노드의 개수
# 입력 데이터 행 하나의 컬럼의 개수
input_size = X.shape[1]
input_size

22

In [27]:
# 신경망 설계
model = Sequential()

model.add(Dense(60, input_dim=input_size))
model.add(Activation('relu'))

model.add(Dense(40))
model.add(Activation('relu'))

model.add(Dense(20))
model.add(Activation('relu'))

model.add(Dense(out_nodes))
model.add(Activation(activation_function))

In [28]:
# 모델 컴파일
model.compile(loss=loss_function, optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 60)                1380      
                                                                 
 activation (Activation)     (None, 60)                0         
                                                                 
 dense_1 (Dense)             (None, 40)                2440      
                                                                 
 activation_1 (Activation)   (None, 40)                0         
                                                                 
 dense_2 (Dense)             (None, 20)                820       
                                                                 
 activation_2 (Activation)   (None, 20)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                

In [29]:
# 학습모델을 저장할 경로
path = './model/25'

# 만약 폴더가 있다면 삭제한다.
if os.path.isdir(path):
    shutil.rmtree(path)

# 폴더를 생성한다.
os.makedirs(os.path.join(path))

PermissionError: [WinError 32] 다른 프로세스가 파일을 사용 중이기 때문에 프로세스가 액세스 할 수 없습니다: './model/25\\2204-1.8353015548200347e-05.h5'

In [30]:
path1 = path + '/{epoch}-{val_loss}.h5'
path2 = path + '/best_model.h5'

# 저장콜백
call1 = ModelCheckpoint(filepath=path1, monitor='val_loss', save_best_only=True)
call2 = ModelCheckpoint(filepath=path2, monitor='val_loss', save_best_onlu=True)

In [31]:
# 조기 중단
call3 = EarlyStopping(monitor='val_loss', patience=300)

In [32]:
# 학습과 검증데이터로 나눈다.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [33]:
from tensorflow.keras.callbacks import TensorBoard
callbacks = [EarlyStopping(patience=3, monitor='val_loss'), TensorBoard(log_dir='./logs', histogram_freq=1)]

In [None]:
# 학습
# validation_split : 주어진 데이터에서 사용할 검증데이터 비율
history = model.fit(X_train, y_train, epochs=200000, batch_size=10000,
                    validation_data=[X_test, y_test], callbacks=[call1, call2, call3, callbacks])

Epoch 1/200000
Epoch 2/200000
Epoch 3/200000
Epoch 4/200000
Epoch 5/200000
Epoch 6/200000
Epoch 7/200000
Epoch 8/200000
Epoch 9/200000
Epoch 10/200000
Epoch 11/200000
Epoch 12/200000
Epoch 13/200000
Epoch 14/200000
Epoch 15/200000
Epoch 16/200000
Epoch 17/200000
Epoch 18/200000
Epoch 19/200000
Epoch 20/200000
Epoch 21/200000
Epoch 22/200000
Epoch 23/200000
Epoch 24/200000
Epoch 25/200000
Epoch 26/200000
Epoch 27/200000
Epoch 28/200000
Epoch 29/200000
Epoch 30/200000
Epoch 31/200000
Epoch 32/200000
Epoch 33/200000
Epoch 34/200000
Epoch 35/200000
Epoch 36/200000
Epoch 37/200000
Epoch 38/200000
Epoch 39/200000
Epoch 40/200000
Epoch 41/200000
Epoch 42/200000
Epoch 43/200000
Epoch 44/200000
Epoch 45/200000
Epoch 46/200000
Epoch 47/200000
Epoch 48/200000
Epoch 49/200000
Epoch 50/200000
Epoch 51/200000
Epoch 52/200000
Epoch 53/200000
Epoch 54/200000
Epoch 55/200000
Epoch 56/200000
Epoch 57/200000
Epoch 58/200000
Epoch 59/200000
Epoch 60/200000
Epoch 61/200000
Epoch 62/200000
Epoch 63/200000
E

In [51]:
# 모델을 가져온다.
best_model = load_model(path + '/best_model.h5')
best_model

<keras.engine.sequential.Sequential at 0x1c39ee88400>

In [36]:
a1 = best_model.evaluate(X_train, y_train)
print(f'손실률 : {a1[0]}')
print(f'정확도 : {a1[1]}')

손실률 : 2.8269027119165457e-09
정확도 : 1.0


In [35]:
a1 = best_model.evaluate(X_test, y_test)
print(f'손실률 : {a1[0]}')
print(f'정확도 : {a1[1]}')

손실률 : 1.579514830041262e-08
정확도 : 1.0


In [54]:
# 저장
result_obj = {
    'encoder' : encoder_dict,
    'scaler' : scaler1,
}

with open(path + '/result_obj.dat', 'wb') as fp :
    pickle.dump(result_obj, fp)

In [2]:
%load_ext tensorboard

In [3]:
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 5580), started 7:25:07 ago. (Use '!kill 5580' to kill it.)

In [55]:
with open(path + '/result_obj.dat', 'rb') as fp :
    result_obj2 = pickle.load(fp)
result_obj2

{'encoder': {'class': LabelEncoder(),
  'cap-shape': LabelEncoder(),
  'cap-surface': LabelEncoder(),
  'cap-color': LabelEncoder(),
  'bruises': LabelEncoder(),
  'odor': LabelEncoder(),
  'gill-attachment': LabelEncoder(),
  'gill-spacing': LabelEncoder(),
  'gill-size': LabelEncoder(),
  'gill-color': LabelEncoder(),
  'stalk-shape': LabelEncoder(),
  'stalk-root': LabelEncoder(),
  'stalk-surface-above-ring': LabelEncoder(),
  'stalk-surface-below-ring': LabelEncoder(),
  'stalk-color-above-ring': LabelEncoder(),
  'stalk-color-below-ring': LabelEncoder(),
  'veil-type': LabelEncoder(),
  'veil-color': LabelEncoder(),
  'ring-number': LabelEncoder(),
  'ring-type': LabelEncoder(),
  'spore-print-color': LabelEncoder(),
  'population': LabelEncoder(),
  'habitat': LabelEncoder()},
 'scaler': StandardScaler()}