In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor

In [2]:
df = pd.read_csv('data/FIN_DATA_v4.csv')
df.head()

Unnamed: 0,DATE_YM,STATES,SIZE,PRICE_MEDIAN,COUNTS,PRICE_MEDIAN_LAG_1,PRICE_MEDIAN_LAG_2,PRICE_MEDIAN_LAG_3,CPI,BASE_RATE,...,BASE_RATE_LAG_2,CPI_YOY_LAG_2,REAL_RATE_LAG_2,HOUSE_RATE_LAG_2,CPI_LAG_3,BASE_RATE_LAG_3,CPI_YOY_LAG_3,REAL_RATE_LAG_3,HOUSE_RATE_LAG_3,VARIABLED
0,201504,강남구,대형,128750.0,246,130000.0,125000.0,132750.0,94.625,1.73,...,1.99,0.645882,1.344118,3.0,94.643,1.99,0.974074,1.015926,3.0,-0.030132
1,201505,강남구,대형,133000.0,204,128750.0,130000.0,125000.0,94.89,1.74,...,1.82,0.470511,1.349489,2.0,94.587,1.99,0.645882,1.344118,3.0,0.064
2,201506,강남구,대형,131000.0,183,133000.0,128750.0,130000.0,94.909,1.57,...,1.73,0.436241,1.293759,2.0,94.596,1.82,0.470511,1.349489,2.0,0.007692
3,201507,강남구,대형,139250.0,176,131000.0,133000.0,128750.0,95.08,1.48,...,1.74,0.552088,1.187912,3.0,94.625,1.73,0.436241,1.293759,2.0,0.081553
4,201508,강남구,대형,135000.0,157,139250.0,131000.0,133000.0,95.213,1.49,...,1.57,0.701341,0.868659,3.0,94.89,1.74,0.552088,1.187912,3.0,0.015038


In [3]:
df = df.sort_values('DATE_YM').reset_index(drop=True)
df[['DATE_YM']].head()

Unnamed: 0,DATE_YM
0,201504
1,201504
2,201504
3,201504
4,201504


In [4]:
y = df['VARIABLED']
X = df.drop(columns=['VARIABLED'])

In [5]:
lag_cols = [col for col in X.columns if 'LAG' in col]
X = X.drop(columns=lag_cols)

print("Dropped LAG columns:", len(lag_cols))

Dropped LAG columns: 18


In [6]:
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0,DATE_YM,PRICE_MEDIAN,COUNTS,CPI,BASE_RATE,CPI_YOY,REAL_RATE,HOUSE_RATE,STATES_강서구,STATES_관악구,...,STATES_노원구,STATES_도봉구,STATES_서초구,STATES_송파구,STATES_양천구,STATES_용산구,STATES_종로구,STATES_중구,SIZE_소형,SIZE_중형
0,201504,128750.0,246,94.625,1.73,0.436241,1.293759,2,False,False,...,False,False,False,False,False,False,False,False,False,False
1,201504,30900.0,29,94.625,1.73,0.436241,1.293759,2,False,False,...,False,False,False,False,False,False,True,False,True,False
2,201504,60000.0,315,94.625,1.73,0.436241,1.293759,2,False,False,...,False,False,False,True,False,False,False,False,False,True
3,201504,48700.0,102,94.625,1.73,0.436241,1.293759,2,False,False,...,True,False,False,False,False,False,False,False,False,False
4,201504,86000.0,137,94.625,1.73,0.436241,1.293759,2,False,False,...,False,False,False,True,False,False,False,False,False,False


In [7]:
split_ym = 202012

train_mask = X['DATE_YM'] <= split_ym

X_train = X.loc[train_mask].drop(columns=['DATE_YM'])
X_test  = X.loc[~train_mask].drop(columns=['DATE_YM'])

y_train = y.loc[train_mask]
y_test  = y.loc[~train_mask]

print("Train size:", X_train.shape, "Test size:", X_test.shape)

Train size: (2484, 20) Test size: (2088, 20)


In [8]:
xgb = XGBRegressor(
    n_estimators=800,        # 트리 개수 (너무 작으면 성능 약함 / 너무 크면 과적합 가능)
    learning_rate=0.03,      # 학습률 (작게 두고 n_estimators 늘리는 방식이 보통 안정적)
    max_depth=4,             # 트리 깊이
    subsample=0.8,           # 데이터 일부만 사용 (과적합 완화)
    colsample_bytree=0.8,    # 변수 일부만 사용 (과적합 완화)
    reg_alpha=0.0,           # L1 규제
    reg_lambda=1.0,          # L2 규제
    random_state=42,
    objective='reg:squarederror'
)

# eval_set에 test를 넣어서 학습 중 로그 확인 가능
xgb.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=False
)

In [9]:
y_train_pred = xgb.predict(X_train)
y_test_pred  = xgb.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2  = r2_score(y_test, y_test_pred)

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse  = mean_squared_error(y_test, y_test_pred)

train_rmse = np.sqrt(train_mse)
test_rmse  = np.sqrt(test_mse)

# MAPE는 y에 0이 있으면 터질 수 있어서 아주 작은 값으로 방어
eps = 1e-9
train_mape = mean_absolute_percentage_error(np.maximum(np.abs(y_train), eps), np.abs(y_train_pred))
test_mape  = mean_absolute_percentage_error(np.maximum(np.abs(y_test), eps), np.abs(y_test_pred))

print("Train R2:", train_r2)
print("Test  R2:", test_r2)

print("Train MSE:", train_mse)
print("Test  MSE:", test_mse)

print("Train RMSE:", train_rmse)
print("Test  RMSE:", test_rmse)

print("Train MAPE:", train_mape)
print("Test  MAPE:", test_mape)

Train R2: 0.7694186862629465
Test  R2: -0.07010987816900704
Train MSE: 0.009607074334180413
Test  MSE: 0.15151679318823208
Train RMSE: 0.09801568412341166
Test  RMSE: 0.38925158084230316
Train MAPE: 857426.8256039179
Test  MAPE: 2030876.5964954905
