# 피마인디언의 당뇨병 예측 (DNN 2진분류)

In [1]:
import tensorflow as tf
import pandas as pd
import seaborn as sns

In [2]:
# Load the Pima Indians diabetes CSV (path is relative to the working directory).
data_path = 'datas_dnn/pima-indians-diabetes3.csv'
df = pd.read_csv(data_path)
# Print column names, non-null counts and dtypes as a quick schema / missing-value check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pregnant   768 non-null    int64  
 1   plasma     768 non-null    int64  
 2   pressure   768 non-null    int64  
 3   thickness  768 non-null    int64  
 4   insulin    768 non-null    int64  
 5   bmi        768 non-null    float64
 6   pedigree   768 non-null    float64
 7   age        768 non-null    int64  
 8   diabetes   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
# Per-column summary statistics.
# NOTE(review): in the recorded output the minimum of plasma, pressure, thickness,
# insulin and bmi is 0 — presumably placeholders for missing measurements;
# confirm before training on the raw values.
df.describe()

Unnamed: 0,pregnant,plasma,pressure,thickness,insulin,bmi,pedigree,age,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Preview the first rows to sanity-check the raw values and column order.
df.head()

Unnamed: 0,pregnant,plasma,pressure,thickness,insulin,bmi,pedigree,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# Features: every column from 'pregnant' through 'age' (.loc label slices include both endpoints).
X = df.loc[:, 'pregnant':'age']
# Target: the 'diabetes' column, used as the binary label.
y = df['diabetes']

In [8]:
# Check the feature matrix dimensions: (rows, feature columns).
X.shape

(768, 8)

In [10]:
# Check the target is one-dimensional, with one label per row of X.
y.shape

(768,)

# 모델 구조 설계
- 입력데이터(features): 8개
- 출력(target): 2진분류 (1개 노드)

In [11]:
# lib import
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

In [17]:
# Build the network as an ordered stack of layers: 8 inputs -> 12 -> 8 -> 1.
# Hidden-layer widths are hyperparameters to be tuned by experiment; a common
# heuristic is to shrink the width layer by layer, often using powers of two.
model = Sequential(
    [
        # shape is the per-sample feature shape (8 feature columns), given as a tuple.
        Input(shape=(8,)),
        Dense(12, activation='relu', name='Dense_1'),
        Dense(8, activation='relu', name='Dense_2'),
        # Single sigmoid unit: one output in (0, 1) for binary classification.
        Dense(1, activation='sigmoid', name='Dense_3'),
    ]
)
model.summary()

# 학습 방식 정의

In [18]:
# Configure training: binary cross-entropy loss (pairs with the sigmoid output),
# the Adam optimizer, and accuracy reported as the monitoring metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# 학습 진행

In [19]:
# Train on the full feature matrix and labels; the returned History object is
# kept in `history` for later inspection.
# NOTE(review): no validation_split / validation_data is passed, so the logged
# accuracy and loss are measured on the training data only.
history = model.fit(
    x=X,  # training features
    y=y,  # training labels
    batch_size=5,  # samples per gradient update (one iteration)
    epochs=100,  # number of full passes over the training data
    verbose='auto',
)

Epoch 1/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4948 - loss: 1.6391  
Epoch 2/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5260 - loss: 1.1800  
Epoch 3/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5807 - loss: 1.0061  
Epoch 4/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5964 - loss: 0.9462  
Epoch 5/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6107 - loss: 0.8339  
Epoch 6/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6068 - loss: 0.8025  
Epoch 7/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6328 - loss: 0.7513  
Epoch 8/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6146 - loss: 0.8188  
Epoch 9/100
[1m