In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder

file_path = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv"
data = pd.read_csv(file_path)

data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [46]:
# 데이터 확인
print("데이터 형태:", data.shape)
print("\n컬럼 정보:\n", data.columns)

print("\n클래스 분포:\n", data["bmi"].value_counts())
print("\BMI 분포:\n", data["bmi"].describe())

데이터 형태: (1338, 7)

컬럼 정보:
 Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

클래스 분포:
 bmi
32.300    13
28.310     9
30.495     8
30.875     8
31.350     8
          ..
46.200     1
23.800     1
44.770     1
32.120     1
30.970     1
Name: count, Length: 548, dtype: int64
\BMI 분포:
 count    1338.000000
mean       30.663397
std         6.098187
min        15.960000
25%        26.296250
50%        30.400000
75%        34.693750
max        53.130000
Name: bmi, dtype: float64


In [47]:
# 결측치 확인
print("=== 결측치 현황 ===")
print(data.isnull().sum())

=== 결측치 현황 ===
age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [48]:
label_encoder = LabelEncoder()

# 인코딩할 컬럼 리스트
columns_to_encode = ['sex', 'region','smoker']  # 인코딩할 컬럼 리스트

# 특성(X)과 레이블(y) 분리
X = data.drop("bmi", axis=1)
y = data["bmi"]

# 훈련/테스트 세트 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ValueError: could not convert string to float: 'female'

In [None]:
# 데이터셋 Shape 확인
print("\n=== 데이터셋 Shape ===")
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

In [None]:
# 모델 구조 정의 (배치 정규화 및 드롭아웃 추가)
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    BatchNormalization(),  # 배치 정규화 추가
    Dropout(0.3),          # 드롭아웃 추가
    Dense(32, activation='relu'),
    Dense(1)  # 출력층
])


optimizer = Adam(learning_rate=0.001) # 학습률

# 모델 컴파일 (회귀 설정)
model.compile(
    optimizer=optimizer,
    loss='mse',  # 평균 제곱 오차
    metrics=['mae']  # 평균 절대 오차
)

# 조기 종료 콜백 추가
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2, #X_train_scaled의 20%를 검증 데이터로 분리
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],  # 조기 종료 적용  
    verbose=1
)

# 테스트 평가
test_loss, test_mae = model.evaluate(X_test_scaled, y_test, verbose=0)
y_pred_dl = model.predict(X_test_scaled).flatten()

# 성능 지표 계산
rmse_dl = np.sqrt(mean_squared_error(y_test, y_pred_dl))
mae_dl = mean_absolute_error(y_test, y_pred_dl)
r2_dl = r2_score(y_test, y_pred_dl)

print("\n=== 딥러닝 모델 (TensorFlow) ===")
print(f"RMSE: {rmse_dl:.2f}")
print(f"MAE: {mae_dl:.2f}")
print(f"R²: {r2_dl:.4f}")