In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 데이터 로드 및 전처리
data = pd.read_csv("kc_house_data.csv")
# 데이터 확인
print("데이터 형태:", data.shape)
print("\n컬럼 정보:\n", data.columns)

# 1. 불필요한 컬럼 제거
data = data.drop(['id', 'date'], axis=1)

# 2. 범주형 변수 처리 (zipcode 원-핫 인코딩)
data['zipcode'] = data['zipcode'].astype(str)
data = pd.get_dummies(data, columns=['zipcode'])

# 3. 타겟 변수 로그 변환 (가격 분포 정규화)
data['price_log'] = np.log1p(data['price'])

# 4. 특징(X)과 타겟(y) 분리
X = data.drop(['price', 'price_log'], axis=1)
y = data['price_log']  # 로그 변환된 가격 사용

# 5. 데이터 스케일링
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

# 6. 훈련/테스트 세트 분할
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


데이터 형태: (21613, 21)

컬럼 정보:
 Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')


In [15]:
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

models = {
    "SVM": SVR(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Linear Regression": LinearRegression(),
    "KNN": KNeighborsRegressor()
}

print("=== 머신러닝 모델 성능 ===")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # 지수 변환으로 원래 가격 복원
    y_pred_orig = np.expm1(y_pred)
    y_test_orig = np.expm1(y_test)
    
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
    mae = mean_absolute_error(y_test_orig, y_pred_orig)
    r2 = r2_score(y_test_orig, y_pred_orig)
    
    print(f"\n{name}:")
    print(f"RMSE: {rmse:,.0f} 달러")
    print(f"MAE: {mae:,.0f} 달러")
    print(f"R²: {r2:.4f}")


=== 머신러닝 모델 성능 ===

SVM:
RMSE: 153,438 달러
MAE: 71,900 달러
R²: 0.8443

Decision Tree:
RMSE: 190,026 달러
MAE: 103,256 달러
R²: 0.7611

Random Forest:
RMSE: 137,459 달러
MAE: 72,293 달러
R²: 0.8750

Linear Regression:
RMSE: 224,358 달러
MAE: 81,322 달러
R²: 0.6670

KNN:
RMSE: 170,925 달러
MAE: 85,996 달러
R²: 0.8067


In [45]:
# PyTorch 회귀 모델
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# 데이터 텐서 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# 신경망 정의
# 모델 구조 개선
class EnhancedHousePricePredictor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)  # 은닉층 뉴런 수 증가
        self.bn1 = nn.BatchNorm1d(256)       # 배치 정규화 추가
        self.dropout = nn.Dropout(0.3)       # 드롭아웃 추가 (과적합 방지)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)          # 출력층 (회귀: 출력값 1개)

    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        return self.fc4(x)
        
# 모델 설정
model = EnhancedHousePricePredictor(input_dim=X_train.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# 학습 루프
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # 20에포크마다 결과 출력
    if (epoch+1) % 50 == 0:
        print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss/len(train_loader):.4f}")

# 테스트 평가
model.eval()
with torch.no_grad():
    y_pred_log = model(X_test_tensor).numpy().flatten()

# 결과 복원 및 평가
y_pred_orig = np.expm1(y_pred_log)  # 지수 변환
y_test_orig = np.expm1(y_test)

print("\n=== 딥러닝 모델 성능 ===")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test_orig, y_pred_orig)):,.0f} 달러")
print(f"MAE: {mean_absolute_error(y_test_orig, y_pred_orig):,.0f} 달러")
print(f"R²: {r2_score(y_test_orig, y_pred_orig):.4f}")

Epoch 50/500 | Train Loss: 0.1120
Epoch 100/500 | Train Loss: 0.0503
Epoch 150/500 | Train Loss: 0.0435
Epoch 200/500 | Train Loss: 0.0380
Epoch 250/500 | Train Loss: 0.0347
Epoch 300/500 | Train Loss: 0.0323
Epoch 350/500 | Train Loss: 0.0306
Epoch 400/500 | Train Loss: 0.0288
Epoch 450/500 | Train Loss: 0.0262
Epoch 500/500 | Train Loss: 0.0260

=== 딥러닝 모델 성능 ===
RMSE: 142,247 달러
MAE: 75,597 달러
R²: 0.8662
