<a href="https://colab.research.google.com/github/in3der/HCI/blob/main/hci_example2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 사례2: 당뇨병 발병 예측하기
## 목표
- 딥러닝과 그리드 탐색 방법을 사용해 당뇨병 발병을 예측하는 사례를 소개한다.
- 또한 딥러닝의 개념과 최적화를 위한 파라미터를 소개하고, 적절한 파라미터를 선택하는 방법도 다룬다.

## 순서
- 준비하기
- 케라스 모델 설계
- 그리드 탐색 실행하기
- 드롭아웃 규제로 과적합 줄이기
- 최적의 초매개변수 찾기
- 최적 초매개변수를 사용해 예측하기


In [None]:
# 준비하기
import pandas as pd
import numpy as np
import sklearn
from tensorflow import keras

In [None]:
# Download the CSV and attach explicit column names via `names`.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = [
    'n_pregnant',
    'glucose_concentration',
    'blood_pressure (mm Hg)',
    'skin_thickness (mm)',
    'serum_insulin (mm U/ml)',
    'BMI',
    'pedigree_function',
    'age',
    'class',
]
df = pd.read_csv(url, names=names)

# Summary statistics with one row per column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
n_pregnant,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
glucose_concentration,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
blood_pressure (mm Hg),768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
skin_thickness (mm),768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
serum_insulin (mm U/ml),768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
pedigree_function,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
class,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [None]:
# Data preprocessing
# Zeros in these measurement columns are treated as missing values.
cols = ['glucose_concentration', 'blood_pressure (mm Hg)', 'skin_thickness (mm)', 'serum_insulin (mm U/ml)', 'BMI']
# Assign the replaced columns back to the frame. Calling
# `df[col].replace(..., inplace=True)` acts on an intermediate Series, which
# newer pandas versions warn about and, under Copy-on-Write, no longer
# propagates to `df`.
df[cols] = df[cols].replace(0, np.nan)
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
n_pregnant,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
glucose_concentration,763.0,121.686763,30.535641,44.0,99.0,117.0,141.0,199.0
blood_pressure (mm Hg),733.0,72.405184,12.382158,24.0,64.0,72.0,80.0,122.0
skin_thickness (mm),541.0,29.15342,10.476982,7.0,22.0,29.0,36.0,99.0
serum_insulin (mm U/ml),394.0,155.548223,118.775855,14.0,76.25,125.0,190.0,846.0
BMI,757.0,32.457464,6.924988,18.2,27.5,32.3,36.6,67.1
pedigree_function,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
class,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [None]:
# Drop every row that still has a missing value in any column, in place,
# then summarise what is left.
df.dropna(axis=0, how='any', inplace=True)
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
n_pregnant,392.0,3.30102,3.211424,0.0,1.0,2.0,5.0,17.0
glucose_concentration,392.0,122.627551,30.860781,56.0,99.0,119.0,143.0,198.0
blood_pressure (mm Hg),392.0,70.663265,12.496092,24.0,62.0,70.0,78.0,110.0
skin_thickness (mm),392.0,29.145408,10.516424,7.0,21.0,29.0,37.0,63.0
serum_insulin (mm U/ml),392.0,156.056122,118.84169,14.0,76.75,125.5,190.0,846.0
BMI,392.0,33.086224,7.027659,18.2,28.4,33.2,37.1,67.1
pedigree_function,392.0,0.523046,0.345488,0.085,0.26975,0.4495,0.687,2.42
age,392.0,30.864796,10.200777,21.0,23.0,27.0,36.0,81.0
class,392.0,0.331633,0.471401,0.0,0.0,0.0,1.0,1.0


In [None]:
# Work with the cleaned frame as a plain NumPy array from here on.
dataset = df.to_numpy()
print(dataset.shape)

(392, 9)


In [None]:
# Data preprocessing
# Split the dataset into features and labels; normalization comes next.
# Columns 0-7 are the features, column 8 is the label cast to int.
X, Y = dataset[:, :8], dataset[:, 8].astype(int)

In [None]:
# Shapes first, then the first five entries of each array.
for arr in (X, Y):
  print(arr.shape)
for arr in (X, Y):
  print(arr[:5])

(392, 8)
(392,)
[[1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
  2.100e+01]
 [0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
  3.300e+01]
 [3.000e+00 7.800e+01 5.000e+01 3.200e+01 8.800e+01 3.100e+01 2.480e-01
  2.600e+01]
 [2.000e+00 1.970e+02 7.000e+01 4.500e+01 5.430e+02 3.050e+01 1.580e-01
  5.300e+01]
 [1.000e+00 1.890e+02 6.000e+01 2.300e+01 8.460e+02 3.010e+01 3.980e-01
  5.900e+01]]
[0 1 1 1 1]


In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the full feature matrix X.
# NOTE(review): the standardized X is later fed to cross-validated grid
# search, so the scaling statistics are computed from rows that also end up
# in validation folds — consider fitting the scaler inside a Pipeline if
# that leakage matters.
scaler = StandardScaler().fit(X)

In [None]:
# Apply the fitted scaler, then view the result as a DataFrame to check the
# per-column statistics.
X_standardized = scaler.transform(X)
data = pd.DataFrame(data=X_standardized)
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,392.0,-9.063045e-18,1.001278,-1.029213,-0.717427,-0.40564,0.529718,4.271153
1,392.0,1.132881e-17,1.001278,-2.161731,-0.766596,-0.117696,0.660984,2.445459
2,392.0,-4.531523e-16,1.001278,-3.739001,-0.694164,-0.053146,0.587873,3.151946
3,392.0,1.087565e-16,1.001278,-2.108484,-0.775531,-0.013844,0.747843,3.223325
4,392.0,1.064908e-16,1.001278,-1.196867,-0.668179,-0.257445,0.285988,5.81299
5,392.0,1.631348e-16,1.001278,-2.120941,-0.667678,0.01621,0.57187,4.846172
6,392.0,1.8126090000000003e-17,1.001278,-1.269525,-0.734091,-0.213147,0.475164,5.497667
7,392.0,1.110223e-16,1.001278,-0.968299,-0.771985,-0.379357,0.504056,4.921123


In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.11.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.11.0


In [None]:
# 케라스 모델 정의
import tensorflow as tf
from tensorflow import keras
import scikeras

from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV, KFold

In [None]:
seed = 6
# Seeds NumPy's global generator only.
# NOTE(review): Keras weight initialization is presumably driven by
# TensorFlow's own RNG, which this call does not seed — confirm, and seed it
# as well if reproducible runs are required.
np.random.seed(seed)

def create_model():
  """Build and compile the baseline binary classifier.

  The network takes 8 input features and stacks
  Dense(8, relu) -> Dense(4, relu) -> Dense(1, sigmoid).

  Returns:
    A compiled ``keras.models.Sequential`` model using Adam
    (learning rate 0.01), binary cross-entropy loss and the accuracy metric.
  """
  model = keras.models.Sequential()
  # Declare the input shape with an explicit Input object instead of the
  # legacy `input_dim` layer argument.
  model.add(keras.Input(shape=(8,)))
  model.add(keras.layers.Dense(8, kernel_initializer='normal', activation='relu'))
  model.add(keras.layers.Dense(4, kernel_initializer='normal', activation='relu'))
  model.add(keras.layers.Dense(1, activation='sigmoid'))
  adam = keras.optimizers.Adam(learning_rate=0.01)

  model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
model = create_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 4)                 36        
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 113 (452.00 Byte)
Trainable params: 113 (452.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# Run the grid search
# verbose=0 keeps the per-epoch training log of every CV fit out of the cell
# output (consistent with the later searches). random_state seeds TensorFlow
# for each fit, which np.random.seed alone does not do.
model = KerasClassifier(model=create_model, random_state=seed, verbose=0)

In [None]:
# Candidate values for the two training hyperparameters being searched.
batch_size = [10, 20, 40]
epochs = [10, 50, 100]

# Keys are the estimator parameter names GridSearchCV will set.
param_grid = {'batch_size': batch_size, 'epochs': epochs}

In [None]:
# Shuffled K-fold splitter with a fixed seed so the folds are the same on
# every run.
kfold = KFold(shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, verbose=0)
grid_results = grid.fit(X_standardized, Y)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/5



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 2

In [None]:
# Report the best score and parameters, then one line per grid point in the
# form "mean (std) with: params". The stray markdown asterisks around
# "using" were removed so the message matches the other report cells.
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
cv_results = grid_results.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
  print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.7858487504057124, **using** {'batch_size': 10, 'epochs': 10}
0.7858487504057124 (0.04512673258748606) with: {'batch_size': 10, 'epochs': 10}
0.7832521908471275 (0.02039079519140098) with: {'batch_size': 10, 'epochs': 50}
0.7527101590392731 (0.03731035633616516) with: {'batch_size': 10, 'epochs': 100}
0.7654008438818565 (0.044558926235447015) with: {'batch_size': 20, 'epochs': 10}
0.7552418046088932 (0.02824698497143789) with: {'batch_size': 20, 'epochs': 50}
0.7425186627718273 (0.030116278340449568) with: {'batch_size': 20, 'epochs': 100}
0.7578383641674782 (0.04656027978298349) with: {'batch_size': 40, 'epochs': 10}
0.7679324894514767 (0.04235457459467241) with: {'batch_size': 40, 'epochs': 50}
0.7806556312885428 (0.043639464096071535) with: {'batch_size': 40, 'epochs': 100}


In [None]:
# Dropout regularization
seed = 6
# Seeds NumPy's global generator only.
# NOTE(review): TensorFlow's RNG is presumably not affected by this call —
# confirm, and seed it as well if reproducible runs are required.
np.random.seed(seed)

def create_model(learn_rate=0.01, dropout_rate=0.0):
  """Build and compile the classifier with tunable learning rate and dropout.

  Architecture: 8 inputs -> Dense(8, relu) -> Dropout -> Dense(4, relu)
  -> Dropout -> Dense(1, sigmoid).

  Args:
    learn_rate: Learning rate passed to the Adam optimizer. Defaults to
      0.01, the value used by the baseline model.
    dropout_rate: Rate used by both Dropout layers. Defaults to 0.0
      (no dropout).

  Returns:
    A compiled ``keras.models.Sequential`` model with binary cross-entropy
    loss and the accuracy metric.
  """
  model = keras.models.Sequential()
  # Declare the input shape with an explicit Input object instead of the
  # legacy `input_dim` layer argument.
  model.add(keras.Input(shape=(8,)))
  model.add(keras.layers.Dense(8, kernel_initializer='normal', activation='relu'))
  model.add(keras.layers.Dropout(dropout_rate))
  model.add(keras.layers.Dense(4, kernel_initializer='normal', activation='relu'))
  model.add(keras.layers.Dropout(dropout_rate))
  model.add(keras.layers.Dense(1, activation='sigmoid'))
  adam = keras.optimizers.Adam(learning_rate=learn_rate)

  model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Fixed training settings for this search; random_state seeds TensorFlow for
# each fit, which np.random.seed alone does not do.
model = KerasClassifier(
    model=create_model,
    epochs=50,
    batch_size=40,
    random_state=seed,
    verbose=0,
)

In [None]:
# Candidate learning rates and dropout rates.
learn_rate = [0.001, 0.01, 0.1]
dropout_rate = [0.0, 0.1, 0.2]

# The `model__` prefix routes each value to the matching create_model argument.
param_grid = {'model__learn_rate': learn_rate, 'model__dropout_rate': dropout_rate}

# Shuffled K-fold splitter with a fixed seed.
kfold = KFold(shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, verbose=0)
grid_results = grid.fit(X_standardized, Y)

In [None]:
# Pull the per-candidate arrays out of cv_results_ once, then print the best
# result followed by one "mean (std) with: params" line per grid point.
cv_results = grid_results.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
for mean, stdev, param in zip(means, stds, params):
  print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.7755598831548198, using {'model__dropout_rate': 0.2, 'model__learn_rate': 0.01}
0.7629016553067187 (0.03728106518287333) with: {'model__dropout_rate': 0.0, 'model__learn_rate': 0.001}
0.7730931515741641 (0.04142285693854927) with: {'model__dropout_rate': 0.0, 'model__learn_rate': 0.01}
0.7577409931840312 (0.054966224409328475) with: {'model__dropout_rate': 0.0, 'model__learn_rate': 0.1}
0.7629016553067185 (0.042878011515018986) with: {'model__dropout_rate': 0.1, 'model__learn_rate': 0.001}
0.7578059071729958 (0.04108760660363957) with: {'model__dropout_rate': 0.1, 'model__learn_rate': 0.01}
0.7604673807205453 (0.04585381926194726) with: {'model__dropout_rate': 0.1, 'model__learn_rate': 0.1}
0.7680298604349236 (0.045545340391602004) with: {'model__dropout_rate': 0.2, 'model__learn_rate': 0.001}
0.7755598831548198 (0.04419040955956239) with: {'model__dropout_rate': 0.2, 'model__learn_rate': 0.01}
0.7273287893541058 (0.047009158296643504) with: {'model__dropout_rate': 0.2, 'model_

In [None]:
# Finding the optimal hyperparameters
seed = 6
# Seeds NumPy's global generator only.
# NOTE(review): TensorFlow's RNG is presumably not affected by this call —
# confirm, and seed it as well if reproducible runs are required.
np.random.seed(seed)

def create_model(init='normal', activation='relu'):
  """Build and compile the classifier with tunable initializer and activation.

  Architecture: 8 inputs -> Dense(8) -> Dropout(0.2) -> Dense(4)
  -> Dropout(0.2) -> Dense(1, sigmoid).

  Args:
    init: Kernel initializer used by both hidden Dense layers. Defaults to
      'normal', the value used by the baseline model.
    activation: Activation used by both hidden Dense layers. Defaults to
      'relu', the value used by the baseline model.

  Returns:
    A compiled ``keras.models.Sequential`` model using Adam
    (learning rate 0.01), binary cross-entropy loss and the accuracy metric.
  """
  model = keras.models.Sequential()
  # Declare the input shape with an explicit Input object instead of the
  # legacy `input_dim` layer argument.
  model.add(keras.Input(shape=(8,)))
  model.add(keras.layers.Dense(8, kernel_initializer=init, activation=activation))
  model.add(keras.layers.Dropout(0.2))
  model.add(keras.layers.Dense(4, kernel_initializer=init, activation=activation))
  model.add(keras.layers.Dropout(0.2))
  model.add(keras.layers.Dense(1, activation='sigmoid'))
  adam = keras.optimizers.Adam(learning_rate=0.01)

  model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Fixed training settings for this search; random_state seeds TensorFlow for
# each fit, which np.random.seed alone does not do.
model = KerasClassifier(
    model=create_model,
    epochs=50,
    batch_size=40,
    random_state=seed,
    verbose=0,
)

In [None]:
# Candidate hidden-layer activations and kernel initializers.
activation = ['softmax', 'relu', 'tanh', 'linear']
init = ['uniform', 'normal', 'zero']

# The `model__` prefix routes each value to the matching create_model argument.
param_grid = {'model__activation': activation, 'model__init': init}

# Shuffled K-fold splitter with a fixed seed.
kfold = KFold(shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, verbose=0)
grid_results = grid.fit(X_standardized, Y)

In [None]:
# Pull the per-candidate arrays out of cv_results_ once, then print the best
# result followed by one "mean (std) with: params" line per grid point.
cv_results = grid_results.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
for mean, stdev, param in zip(means, stds, params):
  print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.7858812074001946, using {'model__activation': 'linear', 'model__init': 'normal'}
0.7755274261603375 (0.04138176425052657) with: {'model__activation': 'softmax', 'model__init': 'uniform'}
0.762836741317754 (0.045654844199930336) with: {'model__activation': 'softmax', 'model__init': 'normal'}
0.7653683868873742 (0.03434011371057473) with: {'model__activation': 'softmax', 'model__init': 'zero'}
0.7782538136968516 (0.03717971031180021) with: {'model__activation': 'relu', 'model__init': 'uniform'}
0.7807205452775072 (0.03277889706065267) with: {'model__activation': 'relu', 'model__init': 'normal'}
0.6685167153521585 (0.02715733189031009) with: {'model__activation': 'relu', 'model__init': 'zero'}
0.7755274261603375 (0.04501439958760607) with: {'model__activation': 'tanh', 'model__init': 'uniform'}
0.7627718273287893 (0.03443905832896611) with: {'model__activation': 'tanh', 'model__init': 'normal'}
0.6685167153521585 (0.02715733189031009) with: {'model__activation': 'tanh', 'model__in

In [None]:
# Optimizing the number of neurons
seed = 6
# Seeds NumPy's global generator only.
# NOTE(review): TensorFlow's RNG is presumably not affected by this call —
# confirm, and seed it as well if reproducible runs are required.
np.random.seed(seed)

def create_model(neuron1=8, neuron2=4):
  """Build and compile the classifier with tunable hidden-layer widths.

  Architecture: 8 inputs -> Dense(neuron1, linear) -> Dropout(0.2)
  -> Dense(neuron2, linear) -> Dropout(0.2) -> Dense(1, sigmoid).

  Args:
    neuron1: Number of units in the first hidden layer. Defaults to 8,
      the width used by the earlier models.
    neuron2: Number of units in the second hidden layer. Defaults to 4,
      the width used by the earlier models.

  Returns:
    A compiled ``keras.models.Sequential`` model using Adam
    (learning rate 0.01), binary cross-entropy loss and the accuracy metric.
  """
  model = keras.models.Sequential()
  # Declare the input shape with an explicit Input object instead of the
  # legacy `input_dim` layer argument.
  model.add(keras.Input(shape=(8,)))
  # NOTE(review): the recorded activation/initializer search output reports
  # 'normal' as the best initializer, while 'uniform' is hard-coded here —
  # confirm which one is intended.
  model.add(keras.layers.Dense(neuron1, kernel_initializer='uniform', activation='linear'))
  model.add(keras.layers.Dropout(0.2))
  model.add(keras.layers.Dense(neuron2, kernel_initializer='uniform', activation='linear'))
  model.add(keras.layers.Dropout(0.2))
  model.add(keras.layers.Dense(1, activation='sigmoid'))
  adam = keras.optimizers.Adam(learning_rate=0.01)

  model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [None]:
# Fixed training settings for this search; random_state seeds TensorFlow for
# each fit, which np.random.seed alone does not do.
model = KerasClassifier(
    model=create_model,
    epochs=50,
    batch_size=40,
    random_state=seed,
    verbose=0,
)

In [None]:
# Candidate widths for the first and second hidden layers.
neuron1 = [4, 8, 16]
neuron2 = [2, 4, 8]

# The `model__` prefix routes each value to the matching create_model argument.
param_grid = {'model__neuron1': neuron1, 'model__neuron2': neuron2}

# Shuffled K-fold splitter with a fixed seed.
kfold = KFold(shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, verbose=0)
grid_results = grid.fit(X_standardized, Y)

In [None]:
# Pull the per-candidate arrays out of cv_results_ once, then print the best
# result followed by one "mean (std) with: params" line per grid point.
cv_results = grid_results.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
for mean, stdev, param in zip(means, stds, params):
  print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Best: 0.7909444985394352, using {'model__neuron1': 8, 'model__neuron2': 2}
0.7884128529698149 (0.02965908718747858) with: {'model__neuron1': 4, 'model__neuron2': 2}
0.7858812074001946 (0.03434980631906684) with: {'model__neuron1': 4, 'model__neuron2': 4}
0.7807854592664718 (0.03432461825861244) with: {'model__neuron1': 4, 'model__neuron2': 8}
0.7909444985394352 (0.029783497399735404) with: {'model__neuron1': 8, 'model__neuron2': 2}
0.7807854592664718 (0.03235254208581953) with: {'model__neuron1': 8, 'model__neuron2': 4}
0.7833171048360921 (0.03308105024972066) with: {'model__neuron1': 8, 'model__neuron2': 8}
0.7833171048360921 (0.027789215433780337) with: {'model__neuron1': 16, 'model__neuron2': 2}
0.7833171048360921 (0.03308105024972066) with: {'model__neuron1': 16, 'model__neuron2': 4}
0.7858812074001946 (0.03434980631906684) with: {'model__neuron1': 16, 'model__neuron2': 8}


In [None]:
# Predict using the optimal hyperparameters
import numpy as np
# Predict with the grid search object from the neuron-count search.
# NOTE(review): X_standardized is the same data the search was fitted on, so
# these are predictions on the fitting data, not on held-out samples.
y_pred = grid.predict(X_standardized)

In [None]:
# Show the shape of the prediction array.
print(y_pred.shape)

(392,)


In [None]:
# First five predicted labels.
print(y_pred[:5])

[0 1 0 1 1]


In [None]:
# First five true labels, for comparison with the predictions.
print(Y[:5])

[0 1 1 1 1]


In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Print overall accuracy first, then the per-class report.
# NOTE(review): y_pred was produced from the same data the search was fitted
# on, so these scores describe fit on that data rather than held-out samples.
for metric in (accuracy_score, classification_report):
  print(metric(Y, y_pred))

0.7908163265306123
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       262
           1       0.74      0.57      0.64       130

    accuracy                           0.79       392
   macro avg       0.77      0.73      0.75       392
weighted avg       0.79      0.79      0.78       392



In [None]:
# Prediction for a single case
# Take the second row (by position) of the cleaned DataFrame and display it.
example = df.iloc[1]
print(example)

n_pregnant                   0.000
glucose_concentration      137.000
blood_pressure (mm Hg)      40.000
skin_thickness (mm)         35.000
serum_insulin (mm U/ml)    168.000
BMI                         43.100
pedigree_function            2.288
age                         33.000
class                        1.000
Name: 4, dtype: float64


In [None]:
# predict() is given a 2-D array, so turn the single standardized row into
# shape (1, n_features) first.
single_row = X_standardized[1].reshape(1, -1)
prediction = grid.predict(single_row)
print(prediction)

[1]
