In [4]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

In [5]:
# Location of the input CSV. The default is the path this notebook was
# originally written against; set the FIN_DATA_PATH environment variable to
# run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "FIN_DATA_PATH",
    "/home/hyuksu/projects/ml/project/데이터/FIN_DATA_v4.csv",
)
data = pd.read_csv(DATA_PATH)

In [6]:
# Order rows by DATE_YM and renumber the index 0..n-1 in the same step.
data = data.sort_values("DATE_YM", ignore_index=True)

In [7]:
# Name of the column to predict: it is dropped from the feature matrix and
# used as the label series further down.
TARGET = "VARIABLED"

In [8]:
# One-hot encode the STATES and SIZE columns. Every level gets its own
# indicator column (no reference level is dropped).
data_dum = pd.get_dummies(
    data,
    columns=["STATES", "SIZE"],
    drop_first=False,
)

In [9]:
# Separate the label from the features. Only TARGET is removed, so DATE_YM
# remains a column of X_all (the split masks are built from it).
# NOTE(review): DATE_YM therefore also reaches the model as a feature, and all
# test-period values lie above the training range — confirm this is intended.
y_all = data_dum[TARGET]
X_all = data_dum.drop(TARGET, axis=1)

In [10]:
# Last DATE_YM value that belongs to the training period
# (presumably encoded as YYYYMM — TODO confirm).
DATE_SPLIT = 202012

# Boolean row selectors: up to and including the cut-off -> train, after -> test.
train_mask = X_all["DATE_YM"].le(DATE_SPLIT)
test_mask = X_all["DATE_YM"].gt(DATE_SPLIT)

In [11]:
# Materialise independent copies of each split so later edits to one split
# cannot write through to X_all / y_all.
X_train, y_train = X_all[train_mask].copy(), y_all[train_mask].copy()
X_test, y_test = X_all[test_mask].copy(), y_all[test_mask].copy()

# Quick sanity check of the split sizes.
print("Train:", X_train.shape, "Test:", X_test.shape)

Train: (2484, 41) Test: (2088, 41)


In [12]:
# Hyperparameters collected in one place so they are easy to inspect and tweak.
xgb_params = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "random_state": 42,  # fixed seed for repeatable runs
    "n_jobs": -1,
}
model = XGBRegressor(**xgb_params)

In [13]:
# Fit the regressor on the training split only (rows with DATE_YM <= DATE_SPLIT).
# As the last expression of the cell, the fitted estimator's repr is displayed.
model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


In [14]:
# Predict on both splits: train for an in-sample reference, test for the
# out-of-period evaluation.
pred_train, pred_test = (
    model.predict(features) for features in (X_train, X_test)
)

In [15]:
def mape(y_true, y_pred, eps=1e-9):
    """Mean absolute percentage error, returned as a fraction (not x100).

    Each absolute error is divided by ``max(|y_true|, eps)`` so that a zero
    target does not cause a division by zero. Targets whose magnitude is
    close to zero still produce very large terms and can dominate the mean.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values; must have exactly the same shape as ``y_true``.
    eps : float, default 1e-9
        Lower bound applied to ``|y_true|`` in the denominator.

    Returns
    -------
    float
        Mean of ``|y_true - y_pred| / max(|y_true|, eps)`` over all elements.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Without this check, (n,) vs (n, 1) inputs would silently broadcast to an
    # (n, n) matrix and yield a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("mape() requires at least one sample")

    denominator = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denominator)

# (actual, predicted) pairs in the order train, test; every metric below is
# evaluated on both.
eval_pairs = ((y_train, pred_train), (y_test, pred_test))

# Root mean squared error.
rmse_train, rmse_test = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in eval_pairs
)

# Mean absolute error.
mae_train, mae_test = (
    mean_absolute_error(actual, predicted) for actual, predicted in eval_pairs
)

# Mean absolute percentage error (fraction, via the local mape helper).
mape_train, mape_test = (
    mape(actual, predicted) for actual, predicted in eval_pairs
)

# Coefficient of determination.
r2_train, r2_test = (
    r2_score(actual, predicted) for actual, predicted in eval_pairs
)

In [16]:
# Summary table: one row per split, one column per metric.
result = pd.DataFrame(
    [
        [rmse_train, mae_train, mape_train, r2_train],
        [rmse_test, mae_test, mape_test, r2_test],
    ],
    index=["train", "test"],
    columns=["RMSE", "MAE", "MAPE", "R2"],
)

result

Unnamed: 0,RMSE,MAE,MAPE,R2
train,0.012321,0.009544,144292.468565,0.996356
test,0.146665,0.054459,185109.75638,0.848078
