In [4]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torchsummary import summary
import numpy as np

In [6]:
# 데이터 불러오기
data = pd.read_csv("C:/Users/LIM/OneDrive - gachon.ac.kr/바탕 화면/study/인공지능개론/diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
# 데이터 확인
print(data.columns.tolist())  #컬럼명 확인
data.dtypes            #데이터 타입 확인

['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [10]:
# 결측치 제거
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [12]:
# 레이블 갯수 확인
data['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [24]:
# 데이터 분리
X = data.drop(columns=['Outcome'])  #입력값 분리
X.head() #입력값 분리 확인

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [26]:
X = data.drop('Outcome', axis=1).values  #넘파이 변환

In [28]:
y = data["Outcome"]  #출력값 분리
y.head()  #출력값 분리 확인

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [30]:
y = data['Outcome'].values  #넘파이 변환

In [32]:
scaler = StandardScaler()  #정규화를 위한 객체 생성
X = scaler.fit_transform(X)  #정규화

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  #데이터 분할

In [38]:
# Pytorch Tensor로 변환
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

In [40]:
# 학습용 데이터셋과 데이터로더 생성
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [42]:
# shape 확인
X_train.shape, y_train.shape, X_test.shape, y_test.shape    

(torch.Size([614, 8]),
 torch.Size([614]),
 torch.Size([154, 8]),
 torch.Size([154]))

In [44]:
# 모델 정의
class diabetesDense(nn.Module):
    def __init__(self):
        super(diabetesDense, self).__init__()
        self.fc1 = nn.Linear(8, 16)
        self.fc2 = nn.Linear(16, 32)
        self.fc3 = nn.Linear(32, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        return x

# Initialize the model, loss function, and optimizer
model = diabetesDense()

In [46]:
# 학습용, 테스트용 데이터 준비
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [48]:
# 손실 함수와 최적화 알고리즘 설정
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [52]:
# 학습 파라미터 초기화
epochs = 30
train_losses = []
test_accuracies = []

In [56]:
# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # 모델을 GPU로 이동

diabetesDense(
  (fc1): Linear(in_features=8, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=1, bias=True)
)

In [60]:
# 학습 루프
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device).float(), labels.to(device).float()  # 데이터를 GPU로 이동 및 dtype 변환
        labels = labels.view(-1, 1)  # BCEWithLogitsLoss를 사용할 경우 labels 크기 맞추기

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Calculate average loss over an epoch
    train_losses.append(running_loss / len(train_dataloader))

    # Evaluate on test data
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs, labels = inputs.to(device).float(), labels.to(device).float()
            labels = labels.view(-1, 1)

            outputs = model(inputs)
            predicted = (outputs >= 0.5).float()  # 이진 분류에서는 0.5 기준으로 예측값 결정
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    test_accuracies.append(accuracy)

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {train_losses[-1]:.4f}, Accuracy: {accuracy:.2f}%")


Epoch 1/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 2/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 3/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 4/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 5/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 6/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 7/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 8/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 9/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 10/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 11/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 12/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 13/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 14/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 15/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 16/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 17/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 18/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 19/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 20/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 21/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 22/30, Loss: 0.0000, Accuracy: 31.17%
Epoch 23/30, Loss: 0.0000, Accuracy: 31.1

In [84]:
# 모델 평가 모드 전환
model.eval()

diabetesDense(
  (fc1): Linear(in_features=8, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=1, bias=True)
)

In [86]:
# GPU 사용 여부 확인
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # 모델을 GPU로 이동

diabetesDense(
  (fc1): Linear(in_features=8, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=32, bias=True)
  (fc4): Linear(in_features=32, out_features=1, bias=True)
)

In [88]:
# 결과 저장
all_labels = []
all_predictions = []

In [90]:
# 평가 루프
with torch.no_grad():
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device).float(), labels.to(device).float()  # 데이터 GPU로 이동 및 dtype 변환
        labels = labels.view(-1, 1)  # BCEWithLogitsLoss 사용 시 크기 맞추기

        outputs = model(inputs)  #예측값 계산

        # 이진 분류일 경우 확률 기준으로 예측
        predicted = (outputs >= 0.5).float()  # 0.5 기준으로 분류

        # 결과 누적
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

In [92]:
# numpy로 변환
all_labels = np.array(all_labels)
all_predictions = np.array(all_predictions)

In [94]:
# 지표 게산
conf_matrix = confusion_matrix(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')

In [96]:
# 특이도 계산
specificity = []
for i in range(conf_matrix.shape[0]):
    tn = conf_matrix.sum() - (conf_matrix[i, :].sum() + conf_matrix[:, i].sum() - conf_matrix[i, i])
    fp = conf_matrix[:, i].sum() - conf_matrix[i, i]
    specificity.append(tn / (tn + fp))

In [98]:
# 결과 출력
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'F1 Score: {f1:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'Specificity: {np.mean(specificity):.2f}')

Confusion Matrix:
[[ 3 96]
 [10 45]]
F1 Score: 0.20
Precision: 0.26
Recall: 0.31
Specificity: 0.42
