In [2]:
import numpy as np
import pandas as pd

In [3]:
names = ['n_pregnant', 'glucose_concentration', 'blood_pressure (mm Hg)', 'skin_thickness (mm)', 'serum_insulin (mu U/ml)', 'BMI', 'pedigree_function', 'age', 'class']
df = pd.read_csv('data/pima-indians-diabetes.csv', names=names)

df.describe()

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# Data Preprocessing

In [4]:
df[df['blood_pressure (mm Hg)'] == 0]

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
7,10,115,0,0,0,35.3,0.134,29,0
15,7,100,0,0,0,30.0,0.484,32,1
49,7,105,0,0,0,0.0,0.305,24,0
60,2,84,0,0,0,0.0,0.304,21,0
78,0,131,0,0,0,43.2,0.27,26,1
81,2,74,0,0,0,0.0,0.102,22,0
172,2,87,0,23,0,28.9,0.773,25,0
193,11,135,0,0,0,52.3,0.578,40,1
222,7,119,0,0,0,25.2,0.209,37,0
261,3,141,0,0,0,30.0,0.761,27,1


### 먼저 결측값을 `NaN`(`np.nan`) 값으로 코딩한 후에, 결측값이 있는 행들을 제거하려고 한다.

In [6]:
columns = ['glucose_concentration', 'blood_pressure (mm Hg)', 'skin_thickness (mm)', 'serum_insulin (mu U/ml)', 'BMI']
for col in columns:
    df[col].replace(0, np.nan)

df.describe()

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### 결측값 제거

In [7]:
df.dropna(inplace=True)
df.describe()

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777,0.471401
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# 데이터프레임을 numpy 배열로 변환 
dataset = df.values
print(dataset.shape)

(392, 9)


In [9]:
X = dataset[:, 0:8]
Y = dataset[:, 8].astype(int)

print(X.shape)
print(Y.shape)

(392, 8)
(392,)


# 데이터 정규화

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# 훈련 데이터의 변환과 출력
X_standardized = scaler.fit_transform(X)

data = pd.DataFrame(X_standardized)
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,-9.063045e-18,1.132881e-17,-4.531523e-16,1.087565e-16,1.064908e-16,1.631348e-16,1.8126090000000003e-17,1.110223e-16
std,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278
min,-1.029213,-2.161731,-3.739001,-2.108484,-1.196867,-2.120941,-1.269525,-0.9682991
25%,-0.7174265,-0.7665958,-0.694164,-0.7755315,-0.6681786,-0.667678,-0.7340909,-0.771985
50%,-0.4056403,-0.1176959,-0.05314565,-0.01384444,-0.2574448,0.01621036,-0.2131475,-0.3793569
75%,0.5297185,0.6609841,0.5878727,0.7478426,0.2859877,0.5718696,0.4751644,0.5040564
max,4.271153,2.445459,3.151946,3.223325,5.81299,4.846172,5.497667,4.921123


# Keras Model Definition

## Import Library

In [14]:
from sklearn.model_selection import GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import Adam

In [16]:
def create_model():
    #  케라스 모델 정의 
    model = Sequential()
    model.add(Dense(8, input_dim = 8, 
                    kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, input_dim = 8, 
                    kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # 모델 컴파일 
    adam = Adam(learning_rate = 0.01)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    
    return model

model = create_model()
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 72        
                                                                 
 dense_4 (Dense)             (None, 4)                 36        
                                                                 
 dense_5 (Dense)             (None, 1)                 5         
                                                                 
Total params: 113
Trainable params: 113
Non-trainable params: 0
_________________________________________________________________
None


# Grid Search with scikit-learn

In [17]:
# 최적 배치 크기와 에포크를 정하기 위한 그리드 탐색

# 랜덤 시드 정의
seed = 6
np.random.seed(seed)

# 모델 생성
model = KerasClassifier(build_fn = create_model, verbose = 1)

# 그리드 탐색 매개변수 정의 
batch_size = [10, 20, 40]
epochs = [10, 50, 100]

# 그리드 탐색 매개변수를 딕셔너리로 만들기 
param_grid = dict(batch_size=batch_size, epochs=epochs)

# GridSearchCV 빌드와 적합
grid = GridSearchCV(estimator = model, param_grid = param_grid, 
                    cv = KFold(random_state=seed, shuffle=True), 
                    verbose = 10)
grid_results = grid.fit(X_standardized, Y)

# 결과 보고
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5; 1/9] START batch_size=10, epochs=10....................................
Epoch 1/10
Epoch 2/10
 1/32 [..............................] - ETA: 0s - loss: 0.6249 - accuracy: 0.9000

  model = KerasClassifier(build_fn = create_model, verbose = 1)
2024-10-15 19:53:26.450308: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[CV 1/5; 1/9] END .....batch_size=10, epochs=10;, score=0.709 total time=   0.4s
[CV 2/5; 1/9] START batch_size=10, epochs=10....................................
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[CV 2/5; 1/9] END .....batch_size=10, epochs=10;, score=0.684 total time=   0.3s
[CV 3/5; 1/9] START batch_size=10, epochs=10....................................
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[CV 3/5; 1/9] END .....batch_size=10, epochs=10;, score=0.744 total time=   0.3s
[CV 4/5; 1/9] START batch_size=10, epochs=10....................................
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[CV 4/5; 1/9] END .....batch_size=10, epochs=10;, score=0.859 total time=   0.3s
[CV 5/5; 1/

# Dropout

In [18]:
from keras.layers import Dropout

# 랜덤 시드 정의
seed = 6
np.random.seed(seed)

# 모델 정의 
def create_model(learn_rate, dropout_rate): # 학습률과 드롭아웃 비율을 인자로
    model = Sequential()
    model.add(Dense(8, input_dim = 8, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))        # 드롭아웃 레이어 추가 
    model.add(Dense(4, input_dim = 8, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(dropout_rate))        # 드롭아웃 레이어 추가   
    model.add(Dense(1, activation='sigmoid'))
    
    # 모델 컴파일 
    adam = Adam(learning_rate = learn_rate) # 학습률에 대한 변수 
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

# 모델 생성 
model = KerasClassifier(build_fn = create_model, epochs = 50, batch_size = 10, verbose = 0)

# 그리트 탐색 매개변수 정의
learn_rate = [0.001, 0.01, 0.1]
dropout_rate = [0.0, 0.1, 0.2]

# 그리드 탐색 매개변수를 딕셔너리로 변환 
param_grid = dict(learn_rate=learn_rate, dropout_rate=dropout_rate)

# GridSearchCV 빌드와 적합
grid = GridSearchCV(estimator = model, param_grid = param_grid, 
                    cv = KFold(random_state=seed, shuffle=True), 
                    verbose = 10)
grid_results = grid.fit(X_standardized, Y)

# 결과 보고
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

  model = KerasClassifier(build_fn = create_model, epochs = 50,


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5; 1/9] START dropout_rate=0.0, learn_rate=0.001..........................
[CV 1/5; 1/9] END dropout_rate=0.0, learn_rate=0.001;, score=0.785 total time=   0.6s
[CV 2/5; 1/9] START dropout_rate=0.0, learn_rate=0.001..........................
[CV 2/5; 1/9] END dropout_rate=0.0, learn_rate=0.001;, score=0.684 total time=   0.6s
[CV 3/5; 1/9] START dropout_rate=0.0, learn_rate=0.001..........................
[CV 3/5; 1/9] END dropout_rate=0.0, learn_rate=0.001;, score=0.782 total time=   0.6s
[CV 4/5; 1/9] START dropout_rate=0.0, learn_rate=0.001..........................
[CV 4/5; 1/9] END dropout_rate=0.0, learn_rate=0.001;, score=0.808 total time=   0.5s
[CV 5/5; 1/9] START dropout_rate=0.0, learn_rate=0.001..........................
[CV 5/5; 1/9] END dropout_rate=0.0, learn_rate=0.001;, score=0.821 total time=   0.6s
[CV 1/5; 2/9] START dropout_rate=0.0, learn_rate=0.01...........................
[CV 1/5; 2/9] END dropou

# Find Best hyperparameter

In [19]:
# activation, init 그리드 탐색 

# 랜덤 시드 지정
seed = 6
np.random.seed(seed)

# 모델 정의 
def create_model(activation, init):  # 인자 지정 
    # 모델 생성
    model = Sequential()
    model.add(Dense(8, input_dim = 8, kernel_initializer= init, activation= activation))
    model.add(Dense(4, input_dim = 8, kernel_initializer= init, activation= activation))
    model.add(Dense(1, activation='sigmoid'))
    
    # 모델 컴파일 
    adam = Adam(learning_rate = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    return model

# 모델 생성
model = KerasClassifier(build_fn = create_model, epochs = 100, 
                        batch_size = 20, verbose = 0)

# 그리드 탐색 매개변수 정의 
activation = ['softmax', 'relu', 'tanh', 'linear']
init = ['uniform', 'normal', 'zero']

# 그리드 탐색 매개변수를 딕셔너리로 변환 
param_grid = dict(activation = activation, init = init)

# GridSearchCV 빌드와 적합
grid = GridSearchCV(estimator = model, param_grid = param_grid, 
                    cv = KFold(random_state=seed, shuffle=True), verbose = 10)
grid_results = grid.fit(X_standardized, Y)

# 결과 보고 
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

  model = KerasClassifier(build_fn = create_model, epochs = 100,


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5; 1/12] START activation=softmax, init=uniform...........................
[CV 1/5; 1/12] END activation=softmax, init=uniform;, score=0.797 total time=   0.6s
[CV 2/5; 1/12] START activation=softmax, init=uniform...........................
[CV 2/5; 1/12] END activation=softmax, init=uniform;, score=0.696 total time=   0.6s
[CV 3/5; 1/12] START activation=softmax, init=uniform...........................
[CV 3/5; 1/12] END activation=softmax, init=uniform;, score=0.808 total time=   0.6s
[CV 4/5; 1/12] START activation=softmax, init=uniform...........................
[CV 4/5; 1/12] END activation=softmax, init=uniform;, score=0.808 total time=   0.6s
[CV 5/5; 1/12] START activation=softmax, init=uniform...........................
[CV 5/5; 1/12] END activation=softmax, init=uniform;, score=0.769 total time=   0.6s
[CV 1/5; 2/12] START activation=softmax, init=normal............................
[CV 1/5; 2/12] END activatio

# Optimize number of neurons

In [20]:
# 히든 레이어의 뉴런의 최적 개수를 찾기 위한 그리드 탐색 

# 랜덤 시드 설정 
seed = 6
np.random.seed(seed)

# 모델 정의 
def create_model(neuron1, neuron2):
    model = Sequential()
    model.add(Dense(neuron1, input_dim = 8, kernel_initializer= 'uniform', activation= 'linear'))
    model.add(Dense(neuron2, input_dim = neuron1, kernel_initializer= 'uniform', activation= 'linear'))
    model.add(Dense(1, activation='sigmoid'))
    
    # 모델 컴파일
    adam = Adam(learning_rate = 0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = adam, metrics = ['accuracy'])
    
    return model

# 모델 생성
model = KerasClassifier(build_fn = create_model, epochs = 100, batch_size = 32, verbose = 0)

# 그리드 탐색 매개변수 설정 
neuron1 = [4, 8, 16]
neuron2 = [2, 4, 8]

# 그리드 탐색 매개변수를 딕셔너리로 변화 
param_grid = dict(neuron1 = neuron1, neuron2 = neuron2)

# GridSearchCV 빌드와 적합
grid = GridSearchCV(estimator = model, 
                    param_grid = param_grid, 
                    cv = KFold(random_state=seed, shuffle=True), 
                    refit = True, 
                    verbose = 10)
grid_results = grid.fit(X_standardized, Y)

# 결과 보고
print("Best: {0}, using {1}".format(grid_results.best_score_, grid_results.best_params_))
means = grid_results.cv_results_['mean_test_score']
stds = grid_results.cv_results_['std_test_score']
params = grid_results.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print('{0} ({1}) with: {2}'.format(mean, stdev, param))

  model = KerasClassifier(build_fn = create_model,


Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5; 1/9] START neuron1=4, neuron2=2........................................
[CV 1/5; 1/9] END .........neuron1=4, neuron2=2;, score=0.785 total time=   0.6s
[CV 2/5; 1/9] START neuron1=4, neuron2=2........................................
[CV 2/5; 1/9] END .........neuron1=4, neuron2=2;, score=0.722 total time=   0.6s
[CV 3/5; 1/9] START neuron1=4, neuron2=2........................................
[CV 3/5; 1/9] END .........neuron1=4, neuron2=2;, score=0.795 total time=   0.6s
[CV 4/5; 1/9] START neuron1=4, neuron2=2........................................
[CV 4/5; 1/9] END .........neuron1=4, neuron2=2;, score=0.821 total time=   0.5s
[CV 5/5; 1/9] START neuron1=4, neuron2=2........................................
[CV 5/5; 1/9] END .........neuron1=4, neuron2=2;, score=0.782 total time=   0.6s
[CV 1/5; 2/9] START neuron1=4, neuron2=4........................................
[CV 1/5; 2/9] END .........neuron1=4, neuron2=4;,

# Predict

In [24]:
# 최적 초매개변수를 가지고 예측값 생성
y_pred = grid.predict(X_standardized)

print(y_pred.shape)

(392, 1)


In [22]:
from sklearn.metrics import classification_report, accuracy_score
print(accuracy_score(Y, y_pred))
print(classification_report(Y, y_pred))

0.7831632653061225
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       262
           1       0.72      0.57      0.64       130

    accuracy                           0.78       392
   macro avg       0.76      0.73      0.74       392
weighted avg       0.78      0.78      0.78       392



In [25]:
prediction = grid.predict(X_standardized[1].reshape(1, -1))
print(prediction)

[[1]]
