## kc_house 예제 (MLP 딥러닝 회귀)

딥러닝 회귀: MSE(평균 제곱 오차) 지표 필수

In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.callbacks import EarlyStopping


### 데이터 로드 및 확인

In [46]:
# Read the King County house-sales CSV; the path is relative to the working directory.
file_path = './kc_house_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Bare trailing expression so the notebook renders a preview of the frame.
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [47]:
# Structural overview: dimensions first, then the column labels
frame_shape = df.shape
print("데이터 shape:", frame_shape)
column_labels = df.columns
print("\n컬럼 정보:\n", column_labels)

# Number of missing values in each column
print("=== 결측치 현황 ===")
missing_per_column = df.isnull().sum()
print(missing_per_column)

데이터 shape: (21613, 21)

컬럼 정보:
 Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')
=== 결측치 현황 ===
id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64


In [48]:
# Frequency table of the target column: how many rows share each exact price
print("\n==== 레이블 분포 ====")
price_counts = df['price'].value_counts()
print(price_counts)


==== 레이블 분포 ====
price
350000.0     172
450000.0     172
550000.0     159
500000.0     152
425000.0     150
            ... 
607010.0       1
1362500.0      1
298800.0       1
957500.0       1
402101.0       1
Name: count, Length: 4028, dtype: int64


### 데이터 전처리
#### 불필요한 컬럼 제거

In [50]:
# Remove the identifier and the raw date string; `columns=` targets columns
# explicitly instead of relying on axis=1.
df = df.drop(columns=['id', 'date'])
print(df.columns)

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')


#### 타겟 변수 로그 변환 (가격 분포 정규화)

In [52]:
# Append log(1 + price) as a new column; np.expm1 is the inverse used later
# when predictions are converted back to dollars.
df = df.assign(price_log=np.log1p(df['price']))

#### 특성과 타겟(레이블) 분리

In [54]:
# Features: every column except the raw price and its log transform
target_columns = ['price', 'price_log']
X = df.drop(columns=target_columns)

# Target: the log-transformed price
y = df['price_log']

#### 훈련 및 테스트 세트 분리

In [56]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [57]:
# Feature scaling (a regression target needs no one-hot encoding).
# The scaler is fitted on the training features only and then applied to both
# splits, so no statistics from the test set are used for fitting.
scaler_X = StandardScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

In [77]:
# Report the shape of every array that goes into training and evaluation
print("\n=== 데이터셋 Shape ===")
for label, data in (
    ("X_train_scaled shape:", X_train_scaled),
    ("X_test_scaled shape:", X_test_scaled),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, data.shape)



=== 데이터셋 Shape ===
X_train_scaled shape: (17290, 18)
X_test_scaled shape: (4323, 18)
y_train shape: (17290,)
y_test shape: (4323,)


### 5가지 회귀 모델 (RF, DT, LR, KNN, SVM) 학습 후 RMSE, MAE 및 R² 확인

In [60]:
# Baseline regressors, all trained on the scaled features and the log-price target.
# The tree-based models draw random numbers during fitting, so they get the same
# fixed seed as the train/test split; without it their scores change on every run.
models = {
    "SVM": SVR(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Linear Regression": LinearRegression(),
    "KNN": KNeighborsRegressor(),
}

# The true test prices are identical for every model, so the log transform is
# undone once here instead of inside the loop.
y_test_orig = np.expm1(y_test)

print("=== 머신러닝 모델 성능 ===")
for name, model in models.items():
    model.fit(X_train_scaled, y_train)      # train on the training split
    y_pred = model.predict(X_test_scaled)   # predict log-prices for the test split

    # Back-transform predictions from log(1 + price) to dollars
    y_pred_orig = np.expm1(y_pred)

    # Metrics are computed in the original price scale
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
    mae = mean_absolute_error(y_test_orig, y_pred_orig)
    r2 = r2_score(y_test_orig, y_pred_orig)

    # Report
    print(f"\n{name}:")
    print(f"RMSE: {rmse:,.0f} 달러")
    print(f"MAE: {mae:,.0f} 달러")
    print(f"R²: {r2:.4f}")

=== 머신러닝 모델 성능 ===

SVM:
RMSE: 184,106 달러
MAE: 84,188 달러
R²: 0.7758

Decision Tree:
RMSE: 187,468 달러
MAE: 101,570 달러
R²: 0.7675

Random Forest:
RMSE: 136,057 달러
MAE: 71,894 달러
R²: 0.8775

Linear Regression:
RMSE: 275,303 달러
MAE: 117,259 달러
R²: 0.4987

KNN:
RMSE: 182,232 달러
MAE: 92,274 달러
R²: 0.7803


### MLP 딥러닝 (텐서플로우)

In [73]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError  # MSE loss for regression

# Seed Python, NumPy and the Keras backend so that weight initialisation,
# dropout masks and batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Model definition (with batch normalization and dropout).
# The input shape is declared with an explicit Input layer; passing
# `input_shape=` to the first Dense layer makes recent Keras versions warn.
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    BatchNormalization(),  # batch normalization
    Dropout(0.3),          # dropout
    Dense(32, activation='relu'),
    Dense(1),              # output layer: one value, the predicted log-price
])

optimizer = Adam(learning_rate=0.001)  # learning rate

# Compile for regression
model.compile(
    optimizer=optimizer,
    loss=MeanSquaredError(),  # mean squared error
    metrics=['mae'],          # mean absolute error
)

# Early stopping: stop once val_loss has not improved for 10 epochs and
# roll the model back to the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,    # hold out 20% of X_train_scaled for validation
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],  # apply early stopping
    verbose=1,
)

# Predict on the test split and undo the log transform
y_pred_log = model.predict(X_test_scaled).flatten()
y_pred_orig = np.expm1(y_pred_log)  # predictions back in dollars
y_test_orig = np.expm1(y_test)      # true prices back in dollars

# Metrics in the original price scale
rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
mae = mean_absolute_error(y_test_orig, y_pred_orig)
r2 = r2_score(y_test_orig, y_pred_orig)

print("\n=== 딥러닝 모델 (TensorFlow) ===")
print(f"RMSE: {rmse:,.0f} 달러")
print(f"MAE: {mae:,.0f} 달러")
print(f"R²: {r2:.4f}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 112.6130 - mae: 9.7741 - val_loss: 3.7505 - val_mae: 1.5852
Epoch 2/50
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 17.2495 - mae: 3.0439 - val_loss: 5.4301 - val_mae: 1.8040
Epoch 3/50
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 12.7987 - mae: 2.6925 - val_loss: 1.6603 - val_mae: 1.1309
Epoch 4/50
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 2.1876 - mae: 1.1168 - val_loss: 0.2194 - val_mae: 0.3669
Epoch 5/50
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.2528 - mae: 0.3789 - val_loss: 0.0728 - val_mae: 0.2102
Epoch 6/50
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 0.1920 - mae: 0.3317 - val_loss: 0.0980 - val_mae: 0.2447
Epoch 7/50
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - loss: 0

In [75]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# Remove the leading "# " from each line to re-enable the PyTorch variant.
# # PyTorch regression model
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import DataLoader, TensorDataset

# # Convert the data to tensors
# X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)  # fix: use X_train_scaled
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
# X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)  # use the scaled data

# # Define the neural network
# # Improved model architecture
# class EnhancedHousePricePredictor(nn.Module):
#     def __init__(self, input_dim):
#         super().__init__()
#         self.fc1 = nn.Linear(input_dim, 256)  # more hidden units
#         self.bn1 = nn.BatchNorm1d(256)       # add batch normalization
#         self.dropout = nn.Dropout(0.3)       # add dropout (to reduce overfitting)
#         self.fc2 = nn.Linear(256, 128)
#         self.fc3 = nn.Linear(128, 64)
#         self.fc4 = nn.Linear(64, 1)          # output layer (regression: a single output)

#     def forward(self, x):
#         x = torch.relu(self.bn1(self.fc1(x)))
#         x = self.dropout(x)
#         x = torch.relu(self.fc2(x))
#         x = torch.relu(self.fc3(x))
#         return self.fc4(x)

# # Model setup
# model = EnhancedHousePricePredictor(input_dim=X_train.shape[1])
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
#     X_train_scaled, y_train, 
#     test_size=0.2, 
#     random_state=42
# )

# # DataLoader
# train_dataset = TensorDataset(
#     torch.tensor(X_train_sub, dtype=torch.float32),
#     torch.tensor(y_train_sub.values, dtype=torch.float32).view(-1, 1)
# )
# val_dataset = TensorDataset(
#     torch.tensor(X_val_sub, dtype=torch.float32),
#     torch.tensor(y_val_sub.values, dtype=torch.float32).view(-1, 1)
# )
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=64)

# # Training loop (with early stopping)
# num_epochs = 500
# best_val_loss = float('inf')
# patience = 10
# patience_counter = 0

# for epoch in range(num_epochs):
#     model.train()
#     train_loss = 0.0
#     for inputs, labels in train_loader:
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item()

#     # Compute the validation loss
#     model.eval()
#     val_loss = 0.0
#     with torch.no_grad():
#         for inputs, labels in val_loader:
#             outputs = model(inputs)
#             val_loss += criterion(outputs, labels).item()

#     train_loss /= len(train_loader)
#     val_loss /= len(val_loader)

#     # Early stopping
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         patience_counter = 0
#         torch.save(model.state_dict(), 'best_model.pth')  # save the best model
#     else:
#         patience_counter += 1
#         if patience_counter >= patience:
#             print(f"Early stopping at epoch {epoch+1}")
#             break

#     # Print progress every 50 epochs
#     if (epoch+1) % 50 == 0:
#         print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

# # Load the best model
# model.load_state_dict(torch.load('best_model.pth'))

# # Evaluate on the test set
# model.eval()
# with torch.no_grad():
#     y_pred_log = model(X_test_tensor).numpy().flatten()

# # Undo the log transform and evaluate
# y_pred_orig = np.expm1(y_pred_log)
# y_test_orig = np.expm1(y_test)

# print("\n=== 딥러닝 모델 성능 ===")
# print(f"RMSE: {np.sqrt(mean_squared_error(y_test_orig, y_pred_orig)):,.0f} 달러")
# print(f"MAE: {mean_absolute_error(y_test_orig, y_pred_orig):,.0f} 달러")
# print(f"R²: {r2_score(y_test_orig, y_pred_orig):.4f}")

Early stopping at epoch 34

=== 딥러닝 모델 성능 ===
RMSE: 258,990 달러
MAE: 103,407 달러
R²: 0.5563
