## diabetes 예제 딥러닝 회귀
Outcome 컬럼을 삭제한 뒤 BMI를 타깃으로 하는 회귀

In [82]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


In [83]:
# Read the diabetes CSV (path is relative to the current working directory)
# into a pandas DataFrame.
file_path = './diabetes.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Bare expression as the last statement of the cell: a notebook renders it
# as a preview of the DataFrame.
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [84]:
# Quick look at the loaded table: dimensions, column names, and the BMI column
# that is used as the regression target further down.
print("데이터 형태:", data.shape)
print("\n컬럼 정보:\n", data.columns)
# BMI is a continuous target, so value_counts() is a per-value frequency table,
# not a class distribution; the label now says so.
print("\nBMI 값별 빈도:\n", data["BMI"].value_counts())
# "\B" is not a valid escape sequence: it printed a literal backslash and
# triggers a warning on recent Python versions. "\n" was intended, matching
# the blank-line separators used by the prints above.
print("\nBMI 분포:\n", data["BMI"].describe())

데이터 형태: (768, 9)

컬럼 정보:
 Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

클래스 분포:
 BMI
32.0    13
31.6    12
31.2    12
0.0     11
32.4    10
        ..
36.7     1
41.8     1
42.6     1
42.8     1
46.3     1
Name: count, Length: 248, dtype: int64
\BMI 분포:
 count    768.000000
mean      31.992578
std        7.884160
min        0.000000
25%       27.300000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64


In [85]:
# Count NaN entries per column (DataFrame.isna is the pandas alias of isnull).
# NOTE(review): the BMI summary printed earlier shows 0.0 values; NaN counting
# does not flag those - confirm whether zeros encode missing measurements.
print("=== 결측치 현황 ===")
null_counts = data.isna().sum()
print(null_counts)

=== 결측치 현황 ===
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [86]:
# Remove the Outcome column; it is not used in this BMI regression.
data = data.drop(columns=['Outcome'])
print(data.columns)

# Split into the regression target (BMI) and the remaining feature columns.
y = data["BMI"]
X = data.drop(columns="BMI")

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler statistics are fitted on the training
# rows only and then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


In [87]:
# Report the shape of every array produced by the split/scaling step.
print("\n=== 데이터셋 Shape ===")
for label, array in (
    ("X_train_scaled", X_train_scaled),
    ("X_test_scaled", X_test_scaled),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", array.shape)



=== 데이터셋 Shape ===
X_train_scaled shape: (614, 7)
X_test_scaled shape: (154, 7)
y_train shape: (614,)
y_test shape: (154,)


In [88]:
# Baseline scikit-learn regressors, keyed by the name printed in the report.
# The tree-based estimators are randomised; seeding them (with the same seed
# as the train/test split) makes the reported metrics reproducible.
models = {
    "SVM": SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "KNN": KNeighborsRegressor(),
}

print("=== 머신러닝 모델 성능 ===")
for name, model in models.items():
    # Fit on the standardised training features, predict the held-out rows.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    # Regression metrics on the test set.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Per-model report.
    print(f"\n======={name}=======")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R²: {r2:.4f}")


=== 머신러닝 모델 성능 ===

RMSE: 7.27
MAE: 4.95
R²: 0.2565

RMSE: 10.56
MAE: 7.21
R²: -0.5674

RMSE: 7.04
MAE: 5.25
R²: 0.3024

RMSE: 7.23
MAE: 5.21
R²: 0.2651

RMSE: 7.21
MAE: 5.13
R²: 0.2685


### 텐서플로우 회귀 모델

In [90]:
# Fully connected regression network: input -> 64 -> 32 -> 1.
#
# The input shape is declared with an explicit Input object as the first
# Sequential layer. Passing `input_shape=` to the first Dense layer makes
# Keras emit a UserWarning at model construction.
#
# Alternative architecture (kept for reference) with batch normalisation
# and dropout after the first hidden layer:
# model = Sequential([
#     tf.keras.Input(shape=(X_train_scaled.shape[1],)),
#     Dense(64, activation='relu'),
#     BatchNormalization(),
#     Dropout(0.3),
#     Dense(32, activation='relu'),
#     Dense(1),
# ])
model = Sequential([
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),  # output layer: no activation, raw regression value
])

optimizer = Adam(learning_rate=0.001)

# Compile for regression: optimise MSE, additionally track MAE.
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['mae'],
)

# Stop once val_loss has not improved for 10 epochs and roll back to the
# weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,  # 20% of the training rows are used for validation
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)

# Keras-side loss (MSE) and MAE on the test set, then raw predictions
# flattened to a 1-D array.
test_loss, test_mae = model.evaluate(X_test_scaled, y_test, verbose=0)
y_pred_dl = model.predict(X_test_scaled).flatten()

# Same metrics as the scikit-learn baselines, computed with sklearn.
rmse_dl = np.sqrt(mean_squared_error(y_test, y_pred_dl))
mae_dl = mean_absolute_error(y_test, y_pred_dl)
r2_dl = r2_score(y_test, y_pred_dl)

print("\n=== 딥러닝 모델 (TensorFlow) ===")
print(f"RMSE: {rmse_dl:.2f}")
print(f"MAE: {mae_dl:.2f}")
print(f"R²: {r2_dl:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 1093.0327 - mae: 32.0018 - val_loss: 1093.2402 - val_mae: 32.2482
Epoch 2/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1050.6125 - mae: 31.5051 - val_loss: 1076.3490 - val_mae: 31.9767
Epoch 3/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 1060.3048 - mae: 31.4582 - val_loss: 1058.1602 - val_mae: 31.6850
Epoch 4/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 1018.8233 - mae: 30.9162 - val_loss: 1038.3901 - val_mae: 31.3612
Epoch 5/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 990.9546 - mae: 30.4784 - val_loss: 1015.4113 - val_mae: 30.9795
Epoch 6/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 973.9841 - mae: 30.2426 - val_loss: 987.1510 - val_mae: 30.4926
Epoch 7/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

### PyTorch 회귀 모델

In [104]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Convert the pandas train/test splits to float32 tensors.
# The raw (unscaled) X_train / X_test frames are used here on purpose.
# NOTE(review): the scikit-learn and TensorFlow cells train on X_train_scaled;
# confirm that comparing against an unscaled PyTorch model is intended.
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

# Targets become column vectors of shape (n, 1) so they line up with the
# single-unit model output.
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).unsqueeze(1)

# Pair features with targets and batch them; only the training loader shuffles.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

# Regression model definition
class RegressionModel(nn.Module):
    """Small multilayer perceptron that maps ``input_dim`` features to one value.

    Layout: Linear(input_dim, 64) -> ReLU -> Linear(64, 32) -> ReLU -> Linear(32, 1).
    The final layer has no activation, so the output is an unbounded real number.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Build the hidden stack layer by layer, then append the 1-unit head.
        layers = []
        in_features = input_dim
        for width in (64, 32):
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, 1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the network and return the prediction tensor."""
        return self.model(x)

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Build the network for the number of feature columns and move it to the device.
model = RegressionModel(input_dim=X_train.shape[1])
model = model.to(device)

# Mean squared error loss, optimised with Adam.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop: 50 passes over the training loader.
model.train()  # training mode

for epoch in range(50):
    running_loss = 0.0
    for features, targets in train_loader:
        # Move the mini-batch to the same device as the model.
        features = features.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()                   # clear gradients from the previous step
        batch_loss = criterion(model(features), targets)  # forward pass + loss
        batch_loss.backward()                   # backward pass
        optimizer.step()                        # parameter update

        running_loss += batch_loss.item()
    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader):.4f}")

# Evaluation on the test loader.
model.eval()  # evaluation mode
preds = []
actuals = []

# Gradients are not needed for inference.
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Only the inputs are moved to the model's device; the targets are
        # read directly as NumPy values from the loader's tensors.
        batch_pred = model(X_batch.to(device))

        # Bring predictions back to the CPU and store both sides as flat values.
        preds.extend(batch_pred.cpu().numpy().reshape(-1))
        actuals.extend(y_batch.numpy().reshape(-1))

# Regression metrics over all collected test predictions.
mse = mean_squared_error(actuals, preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(actuals, preds)
r2 = r2_score(actuals, preds)

print("\n=== 딥러닝 모델 성능 ===")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.4f}")

Epoch 1, Loss: 586.1363
Epoch 2, Loss: 133.9089
Epoch 3, Loss: 77.6408
Epoch 4, Loss: 70.8924
Epoch 5, Loss: 65.8519
Epoch 6, Loss: 69.5524
Epoch 7, Loss: 61.7202
Epoch 8, Loss: 57.6348
Epoch 9, Loss: 55.2084
Epoch 10, Loss: 53.0583
Epoch 11, Loss: 51.2878
Epoch 12, Loss: 51.0842
Epoch 13, Loss: 51.4703
Epoch 14, Loss: 46.3022
Epoch 15, Loss: 45.0804
Epoch 16, Loss: 44.6994
Epoch 17, Loss: 48.4845
Epoch 18, Loss: 43.2154
Epoch 19, Loss: 42.5472
Epoch 20, Loss: 41.7415
Epoch 21, Loss: 41.4282
Epoch 22, Loss: 39.1167
Epoch 23, Loss: 39.4584
Epoch 24, Loss: 43.0503
Epoch 25, Loss: 39.7018
Epoch 26, Loss: 40.9226
Epoch 27, Loss: 38.4746
Epoch 28, Loss: 44.6369
Epoch 29, Loss: 41.2977
Epoch 30, Loss: 40.3252
Epoch 31, Loss: 39.5333
Epoch 32, Loss: 38.9060
Epoch 33, Loss: 38.6953
Epoch 34, Loss: 39.7129
Epoch 35, Loss: 37.5243
Epoch 36, Loss: 44.0891
Epoch 37, Loss: 38.8220
Epoch 38, Loss: 39.0751
Epoch 39, Loss: 38.8287
Epoch 40, Loss: 38.5279
Epoch 41, Loss: 38.5770
Epoch 42, Loss: 38.2164