In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('/Users/hyeom/Documents/GitHub/advanced_project/hye_project/for_machine_learning_2.csv')

In [3]:
df['price'].describe()

count    20560.000000
mean       157.974951
std        106.658402
min          7.000000
25%         80.000000
50%        129.000000
75%        204.000000
max        935.000000
Name: price, dtype: float64

In [11]:
df_outlier = pd.read_csv('/Users/hyeom/Documents/GitHub/advanced_project/jiwon_project/csv_files/preprocessing_filtered.csv')

In [17]:
count_900_up = df.loc[df['price'] >= 900, 'price'].count()

count_900_up

8

In [9]:

# 0) feature lists
cat_cols    = [
    'neigh_cluster_reduced','neighbourhood_group_cleansed',
    'room_type_ord','room_new_type_ord','room_structure_type','amen_grp',
    'description_length_group', 'name_length_group' # 새로 추가
]
num_cols    = [
    'latitude','longitude','accommodates','bath_score_mul',
    'amenities_cnt','review_scores_rating',
    'number_of_reviews','number_of_reviews_ltm','region_score_norm',
    'host_response_time_score', 'host_response_rate_score'                      # 새로 추가
]
bin_cols    = [
    'instant_bookable','is_long_term','host_is_superhost',
    'has_Air_conditioning','has_Wifi',
    'has_Bathtub','has_Carbon_monoxide_alarm','has_Elevator',
    'neighborhood_overview_exists'                                      # 새로 추가
]
other_flags = ['grp01_high','grp04_high']
features    = cat_cols + num_cols + bin_cols + other_flags

In [10]:
import numpy as np
import pandas as pd
import joblib

# 1) 모델 로드 (stacking된 최종 모델)
model = joblib.load('/Users/hyeom/Documents/GitHub/advanced_project/hye_project/03_MachineLearning/final_ensemble_model_2.pkl')

# 2) 로그 스케일 예측
pred_log = model.predict(df[features])

# 3) 실제 달러 단위로 변환
pred_price = np.expm1(pred_log)

# 4) 분포 요약 출력
print("최소 예측가: $", pred_price.min())
print("최대 예측가: $", pred_price.max())
print(pd.Series(pred_price).describe())


최소 예측가: $ 9.310559245450362
최대 예측가: $ 908.5222883798167
count    20560.000000
mean       153.985506
std         96.461188
min          9.310559
25%         83.935764
50%        124.533562
75%        214.910902
max        908.522288
dtype: float64


In [18]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import numpy as np
import pandas as pd

# 1) 실제값 설정 (로그 스케일, 달러 스케일)
y_true_log   = df['log_price']
y_true_price = df['price']

# 2) 예측값은 이미 계산된 pred_log, pred_price 사용

# 3) 지표 계산
r2         = r2_score(y_true_log, pred_log)
rmse_log   = np.sqrt(mean_squared_error(y_true_log, pred_log))
mae_log    = mean_absolute_error(y_true_log, pred_log)

rmse_usd   = np.sqrt(mean_squared_error(y_true_price, pred_price))
mae_usd    = mean_absolute_error(y_true_price, pred_price)

# MAPE는 sklearn >=0.24에서 제공, 없으면 수동 계산
mape       = mean_absolute_percentage_error(y_true_price, pred_price) * 100

# 4) 결과 출력
print("=== Final Ensemble Performance ===")
print(f"R² (log)   : {r2:.3f}")
print(f"RMSE (log) : {rmse_log:.3f}")
print(f"MAE  (log) : {mae_log:.3f}")
print(f"RMSE ($)   : ${rmse_usd:,.2f}")
print(f"MAE  ($)   : ${mae_usd:,.2f}")
print(f"MAPE       : {mape:.1f} %")

=== Final Ensemble Performance ===
R² (log)   : 0.899
RMSE (log) : 0.204
MAE  (log) : 0.148
RMSE ($)   : $36.81
MAE  ($)   : $23.06
MAPE       : 15.1 %


In [21]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error
)

# ── 0) 데이터·특성 준비 ──
X = df[features]
y_log   = df['log_price']
y_price = df['price']

# ── 1) 학습/검증 분할 ──
X_tr, X_val, y_tr_log, y_val_log, y_tr_price, y_val_price = train_test_split(
    X, y_log, y_price,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

# ── 2) 샘플 크기 출력 ──
print(f"Train samples: {X_tr.shape[0]} rows")
print(f"Validation samples: {X_val.shape[0]} rows\n")

# ── 3) 모델 로드 및 예측 ──
model = joblib.load(
    '/Users/hyeom/Documents/GitHub/advanced_project/'
    'hye_project/03_MachineLearning/final_ensemble_model_2.pkl'
)
pred_tr_log   = model.predict(X_tr)
pred_val_log  = model.predict(X_val)
pred_tr_price = np.expm1(pred_tr_log)
pred_val_price= np.expm1(pred_val_log)

# ── 4) 지표 계산 함수 ──
def log_metrics(y_true, y_pred):
    return (
        r2_score(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred)
    )

def usd_metrics(y_true, y_pred):
    return (
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred) * 100
    )

# ── 5) 학습/검증 지표 계산 ──
r2_tr, rmse_tr_log, mae_tr_log = log_metrics(y_tr_log, pred_tr_log)
r2_val, rmse_val_log, mae_val_log = log_metrics(y_val_log, pred_val_log)
rmse_tr_usd, mae_tr_usd, mape_tr = usd_metrics(y_tr_price, pred_tr_price)
rmse_val_usd, mae_val_usd, mape_val = usd_metrics(y_val_price, pred_val_price)

# ── 6) 결과 출력 ──
print("=== Train Performance ===")
print(f"R² (log)   : {r2_tr:.3f}")
print(f"RMSE (log) : {rmse_tr_log:.3f}")
print(f"MAE  (log) : {mae_tr_log:.3f}")
print(f"RMSE ($)   : ${rmse_tr_usd:,.2f}")
print(f"MAE  ($)   : ${mae_tr_usd:,.2f}")
print(f"MAPE       : {mape_tr:.1f} %\n")

print("=== Validation Performance ===")
print(f"R² (log)   : {r2_val:.3f}")
print(f"RMSE (log) : {rmse_val_log:.3f}")
print(f"MAE  (log) : {mae_val_log:.3f}")
print(f"RMSE ($)   : ${rmse_val_usd:,.2f}")
print(f"MAE  ($)   : ${mae_val_usd:,.2f}")
print(f"MAPE       : {mape_val:.1f} %\n")

# ── 7) 과적합/과소적합 판단 ──
delta_r2   = r2_tr - r2_val
delta_rmse = rmse_val_log - rmse_tr_log

print("=== Over/Underfitting Check ===")
if delta_r2 > 0.1 and delta_rmse > 0.1:
    print("⚠️ 과적합 가능성이 높습니다.")
elif r2_tr < 0.5 and r2_val < 0.5:
    print("⚠️ 과소적합 가능성이 있습니다.")
else:
    print("✅ 일반화 상태가 양호해 보입니다.")



Train samples: 16448 rows
Validation samples: 4112 rows

=== Train Performance ===
R² (log)   : 0.900
RMSE (log) : 0.203
MAE  (log) : 0.148
RMSE ($)   : $37.04
MAE  ($)   : $23.12
MAPE       : 15.1 %

=== Validation Performance ===
R² (log)   : 0.898
RMSE (log) : 0.204
MAE  (log) : 0.148
RMSE ($)   : $35.84
MAE  ($)   : $22.83
MAPE       : 15.1 %

=== Over/Underfitting Check ===
✅ 일반화 상태가 양호해 보입니다.


In [24]:
model = joblib.load('/Users/hyeom/Documents/GitHub/advanced_project/hye_project/03_MachineLearning/final_ensemble_model_2.pkl')
print("Original train samples:", model.n_train_samples)

AttributeError: 'StackingRegressor' object has no attribute 'n_train_samples'