### 데이터셋 준비

In [76]:
# 필요한 모델 임포트
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from tensorflow import keras 
from tensorflow.keras import models, layers, activations, initializers, losses, optimizers, metrics, utils

import keras_tuner as kt
import numpy as np
import IPython

In [5]:
import pandas as pd
import numpy as np

In [6]:
from google.colab import files
uploaded = files.upload()

Saving diabetes_data.csv to diabetes_data.csv


In [7]:
# 당뇨병 데이터 불러오기
df = pd.read_csv('diabetes_data.csv')

In [8]:
# feature와 target 분리
target = df[['Diabetes']]
data = df.drop(columns=['Diabetes'])

In [72]:
# train_test 분리
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=33)

### 데이터 전처리

In [10]:
# Pipeline 을 위한 전처리 모듈 불러오기
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
x_train.head(3)

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP
50498,13.0,0.0,1.0,1.0,32.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0
44300,10.0,0.0,1.0,1.0,22.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,10.0,20.0,0.0,0.0,1.0
49497,8.0,1.0,1.0,1.0,35.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,1.0


In [67]:
# 파이프라인


# 숫자칼럼들만 모아서 전처리
numeric_features = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', ]
numeric_transformer = StandardScaler() # cf) RobustScaler MinMaxScaler

# 범주칼럼들만 모아서 전처리 - 건강하지 않은 날짜들을 다 범주형 칼럼으로 보는 경우
categorical_features = ['Age', 'Sex', 'HighChol', 'CholCheck', 'Smoker',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'DiffWalk',
       'Stroke', 'HighBP',]
categorical_transformer = OneHotEncoder(categories='auto', 
                                        handle_unknown='ignore',)
# 전처리 객체 생성
preprocessor = ColumnTransformer(
    transformers=[ # List of (name, transformer, column(s))
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


# 파이프라인 돌리기
preprocessor_pipe = Pipeline(steps=[('preprocessor', preprocessor)]) # preprocessing-only
preprocessor_pipe.fit(x_train)

# 트랜스폼
train_data_transformed_more_numerics = preprocessor_pipe.transform(x_train)
test_data_transformed_more_numerics = preprocessor_pipe.transform(x_test)

# to dense로 재할당
#train_data_transformed_more_numerics = train_data_transformed_more_numerics.todense()
#test_data_transformed_more_numerics = test_data_transformed_more_numerics.todense() 

In [73]:
# target 데이터 원핫인코딩
y_train = utils.to_categorical(y_train)
y_test = utils.to_categorical(y_test)

In [28]:
# 전처리된 데이터 shape
test_data_transformed_more_numerics.shape

(14139, 41)

### 하이퍼 모델 생성

In [41]:
# hyper model 만들기
def build_hyper_model(hp):
    
    model = models.Sequential()
    model.add(layers.Dense(input_dim=41, units=256, activation='elu', kernel_initializer=initializers.he_normal()))
        
    # Tune the number of hidden layer (Choose an optimal value between 1~3)
    for layer_num in range(hp.Int('num_layers', min_value=1, max_value=3)): 
      
        # Tune the number of perceptrons in a dense layer (Choose an optimal value between 32~512) 
        hp_units = hp.Int('units_' + str(layer_num), min_value=32, max_value=512, step=32) # 32:512 & step 32, all parameter names should be unique (we name the inner parameters 'units_' + str(i))
        hp_activations = hp.Choice('activation_' + str(layer_num), values=['relu', 'elu'])

        model.add(layers.Dense(units = hp_units, activation = hp_activations))

    model.add(layers.Dense(2, activation='softmax')) # class 10 : 0~9

    # Tune the learning rate for the optimizer (Choose an optimal value from 0.01, 0.001, or 0.0001)
    hp_learning_rate = hp.Choice('learning_rate', values = [1e-2, 1e-3, 1e-4]) 
    
    model.compile(optimizer = keras.optimizers.Adam(learning_rate = hp_learning_rate),
                loss = keras.losses.CategoricalCrossentropy(), # use sparse c.c when our labels are looks like "1" (single integer), not "[1,0,0]" (one-hot vector) (@ http://j.mp/2XS0jmv)
                metrics = ['accuracy'])
    
    return model

In [42]:
# 베이즈 서치 객체 생성
tuner = kt.BayesianOptimization(build_hyper_model,
                                objective = 'val_accuracy', # Hyper-params tuning을 위한 목적함수 설정 (metric to minimize or maximize)
                                max_trials = 10, # 서로 다른 Hyper-params 조합으로 시도할 총 Trial 횟수 설정
                                directory = 'test_prac_dir', # Path to the working directory
                                project_name = 'diabetes_hyper_1') # Name to use as directory name for files saved by this Tuner

In [43]:
# 튜닝 선택지 요약
tuner.search_space_summary()

Search space summary
Default search space size: 8
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 3, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
activation_0 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'elu'], 'ordered': False}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
activation_1 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'elu'], 'ordered': False}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
activation_2 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'elu'], 'ordered': False}


In [None]:
# 베이즈 서치 실행
tuner.search(train_data_transformed_more_numerics, y_train, epochs=10, validation_data = (test_data_transformed_more_numerics, y_test)) # epochs == learning epoch for training a single model(epoch for each trial) 


In [47]:
# 실행결과
tuner.results_summary(num_trials=3) # Show "n" best trial results

Results summary
Results in test_prac_dir/diabetes_hyper_1
Showing 3 best trials
Objective(name="val_accuracy", direction="max")

Trial 09 summary
Hyperparameters:
num_layers: 1
units_0: 256
activation_0: elu
learning_rate: 0.01
units_1: 512
activation_1: relu
units_2: 448
activation_2: elu
Score: 0.7542259097099304

Trial 05 summary
Hyperparameters:
num_layers: 1
units_0: 96
activation_0: relu
learning_rate: 0.0001
units_1: 416
activation_1: relu
units_2: 480
activation_2: elu
Score: 0.754155158996582

Trial 06 summary
Hyperparameters:
num_layers: 3
units_0: 320
activation_0: elu
learning_rate: 0.001
units_1: 96
activation_1: relu
units_2: 448
activation_2: relu
Score: 0.7538015246391296


In [52]:
top3_models = tuner.get_best_hyperparameters(num_trials=3)
top3_models[0].space

[Int(name: 'num_layers', min_value: 1, max_value: 3, step: 1, sampling: linear, default: 1),
 Int(name: 'units_0', min_value: 32, max_value: 512, step: 32, sampling: linear, default: 32),
 Choice(name: 'activation_0', values: ['relu', 'elu'], ordered: False, default: relu),
 Choice(name: 'learning_rate', values: [0.01, 0.001, 0.0001], ordered: True, default: 0.01),
 Int(name: 'units_1', min_value: 32, max_value: 512, step: 32, sampling: linear, default: 32),
 Choice(name: 'activation_1', values: ['relu', 'elu'], ordered: False, default: relu),
 Int(name: 'units_2', min_value: 32, max_value: 512, step: 32, sampling: linear, default: 32),
 Choice(name: 'activation_2', values: ['relu', 'elu'], ordered: False, default: relu)]

In [55]:
top3_models[0].values

{'num_layers': 1,
 'units_0': 256,
 'activation_0': 'elu',
 'learning_rate': 0.01,
 'units_1': 512,
 'activation_1': 'relu',
 'units_2': 448,
 'activation_2': 'elu'}

In [77]:
# 가장 좋은 모델 평가 결과
top3model = tuner.get_best_models(num_models=3)
top_model = top3model[0]
top_model.summary()

results = top_model.evaluate(test_data_transformed_more_numerics, y_test)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               10752     
                                                                 
 dense_1 (Dense)             (None, 256)               65792     
                                                                 
 dense_2 (Dense)             (None, 2)                 514       
                                                                 
Total params: 77,058
Trainable params: 77,058
Non-trainable params: 0
_________________________________________________________________


In [78]:
results

[0.513058066368103, 0.7542259097099304]

In [None]:
# 실제 모델 보다 정확도가 올라감

In [None]:
# end of file