In [1]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.5.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
Downloading pyaml-25.5.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize

   -------------------- ------------------- 1/2 [scikit-optimize]
   -------------------- ------------------- 1/2 [scikit-optimize]
   ---------------------------------------- 2/2 [scikit-optimize]

Successfully installed pyaml-25.5.0 scikit-optimize-0.10.2


In [3]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   - -------------------------------------- 5.2/150.0 MB 26.2 MB/s eta 0:00:06
   --- ------------------------------------ 13.4/150.0 MB 33.3 MB/s eta 0:00:05
   ----- ---------------------------------- 19.7/150.0 MB 32.2 MB/s eta 0:00:05
   ------ --------------------------------- 22.5/150.0 MB 27.8 MB/s eta 0:00:05
   ------- -------------------------------- 28.3/150.0 MB 27.9 MB/s eta 0:00:05
   --------- ------------------------------ 35.7/150.0 MB 29.0 MB/s eta 0:00:04
   ----------- ---------------------------- 43.3/150.0 MB 30.3 MB/s eta 0:00:04
   ------------- -------------------------- 50.9/150.0 MB 31.1 MB/s eta 0:00:04
   --------------- ------------------------ 57.1/150.0 MB 31.1 MB/s eta 0:00:03
   ---------------- ----------------------- 63.4/150.0 MB 31.4

In [5]:
!pip install lightgbm


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 10.9 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
import xgboost as xgb
import lightgbm as lgb
import warnings
import joblib
import os
import json
from datetime import datetime

# LightGBM 경고 메시지 무시
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')

# 1. 데이터 준비
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# EDA에서 추출된 주요 변수
main_features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    '1stFlrSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd',
    'GarageYrBlt', 'MasVnrArea', 'Fireplaces', 'BsmtFinSF1', 'LotFrontage',
    'WoodDeckSF', '2ndFlrSF', 'OpenPorchSF', 'HalfBath', 'LotArea',
    'Neighborhood', 'ExterQual', 'KitchenQual', 'BsmtQual', 'GarageType',
    'SaleCondition'
]

# 학습/검증 데이터 분리
X = train[main_features]
y = train['SalePrice']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 수치형/범주형 분리
numeric_feats = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_feats = X.select_dtypes(include=['object']).columns.tolist()

# 2. 전처리 파이프라인
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_feats),
    ('cat', categorical_transformer, categorical_feats)
])

# 3. 개별 모델 정의
base_models = {
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=1.0),
    'ElasticNet': ElasticNet(alpha=1.0, l1_ratio=0.5),
    'XGBoost': xgb.XGBRegressor(
        n_estimators=500,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        tree_method='hist',
        random_state=42
    ),
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=500,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    )
}

print('각 모델 트레이닝')

각 모델 트레이닝


In [15]:
individual_results = {}
trained_models = {}

for name, model in base_models.items():
    print(f"\nTraining {name}...")
    pipe = Pipeline([
        ('Preprocexsor', preprocessor),
        ('reg', model)
    ])

    pipe.fit(X_train, y_train)
    print(f"\nTraining {name} Completed...")

    y_val_pred = pipe.predict(X_val) # 해당 코드와 수요일 코드 비교 필요 -> 얼마나 간결해졌는가
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    print(f"{name} 모델 rmse : {val_rmse}")

    individual_results[name] = val_rmse
    trained_models[name] = pipe


Training Ridge...

Training Ridge Completed...
Ridge 모델 rmse : 32886.611682055016

Training Lasso...

Training Lasso Completed...
Lasso 모델 rmse : 32993.20265185749

Training ElasticNet...

Training ElasticNet Completed...
ElasticNet 모델 rmse : 37908.45726466044

Training XGBoost...


  model = cd_fast.enet_coordinate_descent(



Training XGBoost Completed...
XGBoost 모델 rmse : 27164.23648844193

Training LightGBM...

Training LightGBM Completed...
LightGBM 모델 rmse : 31217.708559019466




# 스태킹 앙상블 모델

In [24]:
## 원래 형태 딕셔너리
## 이를 변환해야함
## 원래 형태 리스트 타입으로 (내부는 튜플형태)

stacking_models = list(trained_models.values())
stacking_names = list(trained_models.keys())
estimators = list(zip(stacking_names, stacking_models))

stacking_regressor = StackingRegressor(
    estimators = estimators,
    final_estimator = Ridge(alpha = 1.0),
    cv = 5, n_jobs = -1
)

stacking_regressor.fit(X_train, y_train)

# 앙상블 모델 검증
y_val_pred = stacking_regressor.predict(X_val)
ensemble_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
ensemble_rmse



np.float64(28165.657335583746)

## 모델 테스트

In [25]:
X_test = test[main_features]
test_predictions = stacking_regressor.predict(X_test)
pd.DataFrame({
    'Id' : test['Id'],
    'Saleprice': test_predictions
})



Unnamed: 0,Id,Saleprice
0,1461,129808.364039
1,1462,164551.512067
2,1463,186657.585677
3,1464,190667.914572
4,1465,206828.676687
...,...,...
1454,2915,68641.914990
1455,2916,72949.239007
1456,2917,160327.211971
1457,2918,112886.382235
