## diabetes 예제 딥러닝 분류

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import tensorflow as tf
import kerastuner as kt
from keras.models import Sequential
from keras.layers import Dense,Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, confusion_matrix

In [34]:
file_path = './diabetes.csv'
data = pd.read_csv(file_path)

data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [35]:
# 데이터 확인
print("데이터 형태:", data.shape)
print("\n컬럼 정보:\n", data.columns)
print("\n클래스 분포:\n", data["Outcome"].value_counts())

# 결측치 확인
print("=== 결측치 현황 ===")
print(data.isnull().sum())

데이터 형태: (768, 9)

컬럼 정보:
 Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

클래스 분포:
 Outcome
0    500
1    268
Name: count, dtype: int64
=== 결측치 현황 ===
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [36]:
# 레이블 분포 확인
print("\n=== 레이블 분포 ===")
print(data['Outcome'].value_counts())


=== 레이블 분포 ===
Outcome
0    500
1    268
Name: count, dtype: int64


In [37]:
# 특성(X)과 레이블(y) 분리
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

# 훈련/테스트 세트 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터셋 Shape 확인
print("\n=== 데이터셋 Shape ===")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


=== 데이터셋 Shape ===
X_train shape: (614, 8)
X_test shape: (154, 8)
y_train shape: (614,)
y_test shape: (154,)


In [38]:
# 분류기 초기화
classifiers = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC()
}

# 모델 학습 및 평가
results = []
for name, clf in classifiers.items():
    clf.fit(X_train, y_train) # 모델 학습
    y_pred = clf.predict(X_test) # 테스트 데이터 예측
    acc = accuracy_score(y_test, y_pred) # 정확도 계산
    cm = confusion_matrix(y_test, y_pred) # 혼동 행렬 계산 (TP|FN|FP|TN)
    cr = classification_report(y_test, y_pred) # 분류 보고서
    f1 = f1_score(y_test, y_pred) # f1값 계산
    results.append((name, acc, cm, cr, f1)) # 결과를 리스트에 저장

# 결과 출력
print("\n=== 모델 성능 비교 ===")
for name, acc, cm, cr, f1 in results:
    print(f"\n======={name}=======")
    print(f"Accuracy: {acc:.4f}")
    print(f"f1_score: {f1}")
    print(cr)
    print("Confusion Matrix:")
    print(cm)


=== 모델 성능 비교 ===

Accuracy: 0.7273
f1_score: 0.6181818181818182
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        99
           1       0.62      0.62      0.62        55

    accuracy                           0.73       154
   macro avg       0.70      0.70      0.70       154
weighted avg       0.73      0.73      0.73       154

Confusion Matrix:
[[78 21]
 [21 34]]

Accuracy: 0.7532
f1_score: 0.6607142857142857
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

Confusion Matrix:
[[79 20]
 [18 37]]

Accuracy: 0.7468
f1_score: 0.6548672566371682
              precision    recall  f1-score   support

           0       0.81      0.79      0.80        99
    

In [39]:
# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # 훈련 데이터로 학습
X_test_scaled = scaler.transform(X_test)        # 테스트 데이터 변환

# 데이터셋 Shape 확인
print("\n=== 데이터셋 Shape ===")
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


=== 데이터셋 Shape ===
X_train_scaled shape: (614, 8)
X_test_scaled shape: (154, 8)
y_train shape: (614,)
y_test shape: (154,)


### 텐서플로우로 딥러닝

In [41]:
# MLP 신경망 정의
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),  # 입력층 → 은닉층1 (64개 뉴런)
    Dense(32, activation='relu'),  # 은닉층1 → 은닉층2 (32개 뉴런)
    Dense(1, activation='sigmoid')  # 출력층 (시그모이드 활성화)
])

# 모델 컴파일
model.compile(
    optimizer=Adam(learning_rate=0.001),  # Adam 옵티마이저 (학습률 0.001)
    loss='binary_crossentropy',  # 이진 교차 엔트로피 손실
    metrics=['accuracy']  # 정확도 추적
)

# 검증 세트 분리 (훈련 데이터의 20%)
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train_scaled, y_train, 
    test_size=0.2, 
    random_state=42
)

# 학습 (배치 크기 32, 50 에포크)
history = model.fit(
    X_train_sub, y_train_sub,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    shuffle=True,
    verbose=1  # 학습 과정 출력
)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.7347 - loss: 0.5418 - val_accuracy: 0.7805 - val_loss: 0.5300
Epoch 2/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7493 - loss: 0.5211 - val_accuracy: 0.7805 - val_loss: 0.4967
Epoch 3/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7665 - loss: 0.4903 - val_accuracy: 0.7642 - val_loss: 0.4776
Epoch 4/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7657 - loss: 0.4790 - val_accuracy: 0.7642 - val_loss: 0.4713
Epoch 5/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7813 - loss: 0.4598 - val_accuracy: 0.7642 - val_loss: 0.4740
Epoch 6/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7758 - loss: 0.4831 - val_accuracy: 0.7561 - val_loss: 0.4742
Epoch 7/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━

In [42]:
# 테스트 평가
test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)
y_pred_proba = model.predict(X_test_scaled)  # 확률 예측
y_pred_dl = (y_pred_proba > 0.5).astype(int).flatten()  # 0.5 기준 이진 분류

# 성능 지표 계산
f1_dl = f1_score(y_test, y_pred_dl)
print("\n=== 딥러닝 모델 (TensorFlow) ===")
print(f"F1 Score: {f1_dl:.4f}")
print(f"정확도: {test_acc:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dl))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 

=== 딥러닝 모델 (TensorFlow) ===
F1 Score: 0.6486
정확도: 0.7468
Confusion Matrix:
[[79 20]
 [19 36]]


### 파이토치로 딥러닝

In [44]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 검증 세트 분리 (스케일링된 데이터 사용)
X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# 데이터 텐서 변환 (스케일링된 데이터만 사용)
X_train_sub_tensor = torch.tensor(X_train_sub, dtype=torch.float32)
y_train_sub_tensor = torch.tensor(y_train_sub.values, dtype=torch.float32).view(-1, 1)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# DataLoader 생성 (배치 단위 데이터 처리)
# DataLoader 생성
train_dataset = TensorDataset(X_train_sub_tensor, y_train_sub_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) # 훈련 데이터 로더
val_loader = DataLoader(val_dataset, batch_size=32) # 검증 데이터 로더
test_loader = DataLoader(test_dataset, batch_size=32)

print("\n=== pytorch 데이터셋 Shape ===")
print("X_train_sub_tensor shape:", X_train_sub_tensor.shape)
print("y_train_sub_tensor shape:", y_train_sub_tensor.shape)
print("X_val_tensor shape:", X_val_tensor.shape)
print("y_val_tensor shape:", y_val_tensor.shape)
print("X_test_tensor shape:", X_test_tensor.shape)
print("y_test_tensor shape:", y_test_tensor.shape)
print("\n")

# 신경망 정의 (이진 분류기)
class HeartDiseaseClassifier(nn.Module):
    def __init__(self, input_dim):
        super(HeartDiseaseClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)  # 입력층 -> 은닉층 1 (64개 뉴런)
        self.fc2 = nn.Linear(64, 32)         # 은닉층 1 -> 은닉층 2 (32개 뉴런)
        self.fc3 = nn.Linear(32, 1)          # 은닉층 2 -> 출력층 (1개 뉴런)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))         # ReLU 활성화
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))      # 출력층에 시그모이드 적용
        return x

# 모델 초기화, 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = HeartDiseaseClassifier(input_dim=X_train_scaled.shape[1]).to(device) # 입력차원 자동 설정
criterion = nn.BCELoss() # 이진 교차 엔트로피 손실 함수
optimizer = optim.Adam(model.parameters(), lr=0.001) #Adam 옵티마이저

# 학습 루프
num_epochs = 50
for epoch in range(num_epochs):
    # 훈련 단계
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad() # 기울기 초기화
        outputs = model(inputs) # 순전파
        loss = criterion(outputs, labels) # 손실 계산
        loss.backward() # 역전파
        optimizer.step() # 가중치 업데이트
        train_loss += loss.item() #손실 누적
    
    # 검증 단계
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad(): # 기울기 계산 비활
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item() # 검증 손실 계산
            predicted = (outputs > 0.5).float() # 0.5를 기준으로 예측 클래스 결정
            correct += (predicted == labels).sum().item() # 정확히 예측한 개수
            total += labels.size(0) # 전체 데이터 개수
    
    # 에포크당 결과 출력
    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss/len(train_loader):.4f} | "
          f"Val Loss: {val_loss/len(val_loader):.4f} | "
          f"Val Acc: {correct/total:.4f}")

# 테스트 평가
model.eval()
with torch.no_grad():
    y_pred_proba = model(X_test_tensor.to(device)) # 테스트 데이터 예측 확률
    y_pred_dl = (y_pred_proba > 0.5).float().cpu().numpy().flatten() # 이진 예측값으로 변환

# 성능 지표 (정확도, 혼동행렬, F1, Precision, Recall)
accuracy = accuracy_score(y_test, y_pred_dl)
cm_dl = confusion_matrix(y_test, y_pred_dl)
f1 = f1_score(y_test, y_pred_dl)
precision = precision_score(y_test, y_pred_dl)
recall = recall_score(y_test, y_pred_dl)

print("\n=== 딥러닝 모델 (PyTorch) ===")
print(f"정확도: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("Confusion Matrix:")
print(cm_dl)


=== pytorch 데이터셋 Shape ===
X_train_sub_tensor shape: torch.Size([491, 8])
y_train_sub_tensor shape: torch.Size([491, 1])
X_val_tensor shape: torch.Size([123, 8])
y_val_tensor shape: torch.Size([123, 1])
X_test_tensor shape: torch.Size([154, 8])
y_test_tensor shape: torch.Size([154, 1])


Epoch 1/50 | Train Loss: 0.6788 | Val Loss: 0.6539 | Val Acc: 0.6585
Epoch 2/50 | Train Loss: 0.6234 | Val Loss: 0.6013 | Val Acc: 0.6829
Epoch 3/50 | Train Loss: 0.5718 | Val Loss: 0.5619 | Val Acc: 0.6748
Epoch 4/50 | Train Loss: 0.5445 | Val Loss: 0.5307 | Val Acc: 0.7236
Epoch 5/50 | Train Loss: 0.5119 | Val Loss: 0.5098 | Val Acc: 0.7480
Epoch 6/50 | Train Loss: 0.4981 | Val Loss: 0.4904 | Val Acc: 0.7480
Epoch 7/50 | Train Loss: 0.4819 | Val Loss: 0.4770 | Val Acc: 0.7480
Epoch 8/50 | Train Loss: 0.4686 | Val Loss: 0.4669 | Val Acc: 0.7805
Epoch 9/50 | Train Loss: 0.4646 | Val Loss: 0.4605 | Val Acc: 0.7967
Epoch 10/50 | Train Loss: 0.4630 | Val Loss: 0.4558 | Val Acc: 0.8211
Epoch 11/50 | Train